In [1]:
import os
from dotenv import load_dotenv

import pandas as pd
import numpy as np
import seaborn as sns

import mlflow

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils import resample

In [2]:
year = 2021
month = 1
color = "yellow"

In [3]:
### Read the 01-2021 data from the website
# Download the data
if not os.path.exists(f"./data/{color}_tripdata_{year}-{month:02d}.parquet"):
    os.system(f"wget -P ./data https://d37ci6vzurychx.cloudfront.net/trip-data/{color}_tripdata_{year}-{month:02d}.parque")
# https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet    

In [4]:
# Load the data
df = pd.read_parquet(f"./data/{color}_tripdata_{year}-{month:02d}.parquet")

In [5]:
load_dotenv()
MLFLOW_TRACKING_URI=os.getenv("MLFLOW_TRACKING_URI")
MLFLOW_TRACKING_URI

'http://34.89.155.255:5000'

In [6]:
# Set up the connection to MLflow
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Setup the MLflow experiment 
mlflow.set_experiment("yellow-taxi-trip-duration-rf")

<Experiment: artifact_location='gs://mlflow-artifacts-go/artifacts/2', creation_time=1689497873517, experiment_id='2', last_update_time=1689497873517, lifecycle_stage='active', name='yellow-taxi-trip-duration-rf', tags={}>

In [7]:
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1369769 entries, 0 to 1369768
Data columns (total 19 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   VendorID               1369769 non-null  int64         
 1   tpep_pickup_datetime   1369769 non-null  datetime64[ns]
 2   tpep_dropoff_datetime  1369769 non-null  datetime64[ns]
 3   passenger_count        1271417 non-null  float64       
 4   trip_distance          1369769 non-null  float64       
 5   RatecodeID             1271417 non-null  float64       
 6   store_and_fwd_flag     1271417 non-null  object        
 7   PULocationID           1369769 non-null  int64         
 8   DOLocationID           1369769 non-null  int64         
 9   payment_type           1369769 non-null  int64         
 10  fare_amount            1369769 non-null  float64       
 11  extra                  1369769 non-null  float64       
 12  mta_tax                13697

In [8]:
# Look for missing values
df.isnull().sum()

VendorID                       0
tpep_pickup_datetime           0
tpep_dropoff_datetime          0
passenger_count            98352
trip_distance                  0
RatecodeID                 98352
store_and_fwd_flag         98352
PULocationID                   0
DOLocationID                   0
payment_type                   0
fare_amount                    0
extra                          0
mta_tax                        0
tip_amount                     0
tolls_amount                   0
improvement_surcharge          0
total_amount                   0
congestion_surcharge       98352
airport_fee              1369764
dtype: int64

In [9]:
features = ["PULocationID", "DOLocationID", "trip_distance", "passenger_count", "tpep_pickup_datetime"]
target = 'duration'

In [10]:
# calculate the trip duration in minutes and drop trips that are less than 1 minute and more than 2 hours
def calculate_trip_duration_in_minutes(df):
    df["trip_duration_minutes"] = (df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]).dt.total_seconds() / 60
    df = df[(df["trip_duration_minutes"] >= 1) & (df["trip_duration_minutes"] <= 60)]
    return df

In [11]:
# Time of the day in minutes as feature
def get_days_minutes(date_time):
    return date_time.hour * 60 + date_time.minute

In [12]:
### Preprocessing as a function. Must be applied before pipeline, because the X/Y split is based upon
### the creation of df.trip_duration_minutes
def preprocess(df):
    df = df.copy()
    df = calculate_trip_duration_in_minutes(df)
    df["pickup_time_minutes"] = df["tpep_pickup_datetime"].apply(lambda x: get_days_minutes(x))
    categorical_features = ["PULocationID", "DOLocationID"]
    df[categorical_features] = df[categorical_features].astype(str)
    df['trip_route'] = df["PULocationID"] + "_" + df["DOLocationID"]
    df = df[['trip_route', 'trip_distance', 'pickup_time_minutes', 'passenger_count', 'trip_duration_minutes']]
    return df


In [13]:
df_preprocessed = preprocess(df)
### Resample Data for faster computation



In [14]:
n_sample = len(df_preprocessed) // 3
df_pre_resampled = resample(df_preprocessed, n_samples=n_sample, replace=False, random_state=42)

In [15]:
y=df_pre_resampled["trip_duration_minutes"]
X=df_pre_resampled.drop(columns=["trip_duration_minutes"])
#x_columns = X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [16]:
impute_columns = ["passenger_count"]
encode_columns = ["trip_route"]

In [18]:
# Define Parameter-Grid
param_grid = {"RF__n_estimators": [10, 50, 100, 200],
              "RF__max_depth": [5, 9, 13, 17],
              "RF__min_samples_leaf": [50, 100, 200, 500],
              "RF__max_features":["sqrt"]}

transformer = ColumnTransformer(
    [("imputer", SimpleImputer(strategy="most_frequent"), impute_columns),
    ("encoder", OneHotEncoder(drop=None), encode_columns)],
    remainder="passthrough"
)

pipeline = Pipeline([("transformer", transformer),
                     ("RF", RandomForestRegressor())],
                    )

model = RandomizedSearchCV(pipeline, param_grid, n_iter = 20, scoring="neg_root_mean_squared_error")
model.fit(X_train, y_train)

Traceback (most recent call last):
  File "/Users/gunnaroeh/neuefische/mle-model-deployment-project/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/gunnaroeh/neuefische/mle-model-deployment-project/.venv/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
           ^^^^^^^^^^^^
  File "/Users/gunnaroeh/neuefische/mle-model-deployment-project/.venv/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/gunnaroeh/neuefische/mle-model-deployment-project/.venv/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [None]:
df_cv_results = pd.DataFrame(model.cv_results_)
df_cv_results.columns

In [None]:
cols = ['param_RF__min_samples_leaf', 'param_RF__max_features', 'param_RF__max_depth']
for col in cols:
    df_cv_results[col] = df_cv_results[col].astype("str")

In [None]:
### Look at the combination of parameter values and the resulting score
sns.relplot(data=df_cv_results, y = "mean_test_score", x = "param_RF__n_estimators", 
            hue = 'param_RF__max_depth', 
            kind = 'line')

In [None]:
### Run best Model
best_rf_model = model.best_estimator_

In [None]:
best_rf_model

In [None]:
### Register in MLFlow
### Run best Model
features = list(X_train.columns)
target = y_train.name

with mlflow.start_run():
    tags = {"model": "RandomForestRegressor",
        "developer": "Gunnar",
        "dataset": f"{color}-taxi",
        "year": year,
        "month": month,
        "features": features,
        "target": target}
    mlflow.set_tags(tags)

    transformer = ColumnTransformer(
    [("imputer", SimpleImputer(strategy="most_frequent"), impute_columns),
    ("encoder", OneHotEncoder(drop=None,handle_unknown="ignore"), encode_columns)],
    remainder="passthrough")

    pipeline = Pipeline([("transformer", transformer),
                     ("RF", RandomForestRegressor(max_depth=13, max_features='sqrt', min_samples_leaf=50,
                      n_estimators=10))],
                    )
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)

    mlflow.sklearn.log_model(pipeline, "model")