In [None]:
import os
from dotenv import load_dotenv

import pandas as pd
import numpy as np

import mlflow

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Notebook-wide selection of the trip-data file: year, month and taxi colour.
# These values are interpolated into the download URL and the local parquet path
# used by the download and load cells.
year = 2021
month = 1
color = "yellow"

In [None]:
### Download the trip data for the configured colour/year/month from the website
# Example URL: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet
# The download is skipped when the parquet file is already present in ./data.
if not os.path.exists(f"./data/{color}_tripdata_{year}-{month:02d}.parquet"):
    # The remote file name must end in ".parquet" so that it matches both the object
    # on the server and the local path checked above and read by the load cell.
    wget_status = os.system(f"wget -P ./data https://d37ci6vzurychx.cloudfront.net/trip-data/{color}_tripdata_{year}-{month:02d}.parquet")
    # os.system returns a non-zero status when the command fails; surface that here
    # instead of letting the load cell fail later with a missing-file error.
    if wget_status != 0:
        raise RuntimeError(
            f"Download of {color}_tripdata_{year}-{month:02d}.parquet failed (wget status {wget_status})"
        )

In [None]:
# Load the data
# Reads the locally stored parquet file for the configured colour/year/month into `df`.
df = pd.read_parquet(f"./data/{color}_tripdata_{year}-{month:02d}.parquet")

In [None]:
# Load environment variables via python-dotenv, then look up the MLflow tracking URI.
load_dotenv()
# os.getenv returns None when MLFLOW_TRACKING_URI is not set.
MLFLOW_TRACKING_URI=os.getenv("MLFLOW_TRACKING_URI")
# Echo the value as the cell output.
# NOTE(review): if the URI embeds credentials, this output would expose them in the saved notebook.
MLFLOW_TRACKING_URI

In [None]:
# Set up the connection to MLflow
# MLFLOW_TRACKING_URI is the value read from the environment in the previous cell.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Set up the MLflow experiment by name
mlflow.set_experiment("yellow-taxi-trip-duration-rf")

In [None]:
# Quick look at the raw data.
# df.info() prints its summary itself; df.head() has to be the LAST expression of the
# cell, otherwise its value is discarded and the preview is never displayed.
df.info()
df.head()

In [None]:
# Look for missing values
# Counts the null entries per column of the raw frame.
df.isnull().sum()

In [None]:
# Raw input columns of interest and the name of the prediction target.
# NOTE(review): the preprocessing in this notebook stores the duration in a column named
# "trip_duration_minutes", not "duration", and neither `features` nor `target` is used in
# the cells visible here — confirm whether these names are still needed / correct.
features = ["PULocationID", "DOLocationID", "trip_distance", "passenger_count", "tpep_pickup_datetime"]
target = 'duration'

In [None]:
# Calculate the trip duration in minutes and drop trips outside the allowed range
# (by default shorter than 1 minute or longer than 60 minutes).
def calculate_trip_duration_in_minutes(df, min_minutes=1, max_minutes=60):
    """Return the trips whose duration lies within ``[min_minutes, max_minutes]``.

    The duration is computed as ``tpep_dropoff_datetime - tpep_pickup_datetime`` and
    stored, in minutes, in a ``trip_duration_minutes`` column of the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips with ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime`` columns.
    min_minutes, max_minutes : float, default 1 and 60
        Inclusive bounds on the trip duration in minutes.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows within the bounds, including the
        ``trip_duration_minutes`` column. The input frame is not modified.
    """
    duration = (df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]).dt.total_seconds() / 60
    # Series.between is inclusive on both ends, i.e. min_minutes <= duration <= max_minutes.
    keep = duration.between(min_minutes, max_minutes)
    # Explicit copy: callers can add columns to the result without touching `df`
    # and without triggering pandas' SettingWithCopyWarning.
    trips = df.loc[keep].copy()
    trips["trip_duration_minutes"] = duration.loc[keep].to_numpy()
    return trips

In [None]:
# Time of day, expressed as minutes since midnight, used as a feature
def get_days_minutes(date_time):
    """Return the number of minutes elapsed since midnight for ``date_time``.

    ``date_time`` only needs to expose ``hour`` and ``minute`` attributes.
    """
    hours, minutes = date_time.hour, date_time.minute
    return hours * 60 + minutes

In [None]:
### Preprocessing as a function. Must be applied before the pipeline, because the X/y split
### relies on the trip_duration_minutes column created here.
def preprocess(df):
    """Turn raw trip records into the modelling frame.

    Steps: compute and filter the trip duration, derive the pickup time of day in
    minutes since midnight, and build a ``trip_route`` key from the pickup and
    drop-off location ids.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw trips with ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``PULocationID``, ``DOLocationID``, ``trip_distance`` and ``passenger_count``.

    Returns
    -------
    pandas.DataFrame
        Columns ``trip_route``, ``trip_distance``, ``pickup_time_minutes``,
        ``passenger_count`` and ``trip_duration_minutes``. ``df`` is not modified.
    """
    # Work on a copy so the caller's raw frame is never modified.
    trips = calculate_trip_duration_in_minutes(df.copy())

    # Vectorised datetime access instead of a Python-level call per row.
    pickup = trips["tpep_pickup_datetime"].dt
    pickup_zone = trips["PULocationID"].astype(str)
    dropoff_zone = trips["DOLocationID"].astype(str)

    # .assign returns a new frame, so nothing is written into a filtered slice.
    model_frame = trips.assign(
        pickup_time_minutes=pickup.hour * 60 + pickup.minute,
        trip_route=pickup_zone + "_" + dropoff_zone,
    )
    return model_frame[['trip_route', 'trip_distance', 'pickup_time_minutes', 'passenger_count', 'trip_duration_minutes']]


In [None]:
# Build the modelling frame from the raw trips; preprocess works on a copy of `df`.
df_preprocessed = preprocess(df)

In [None]:
# Separate the predictors from the regression target (trip duration in minutes),
# then hold out 20% of the trips as a test set with a fixed seed.
X = df_preprocessed.drop(columns=["trip_duration_minutes"])
y = df_preprocessed["trip_duration_minutes"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Column groups handed to the ColumnTransformer in the modelling cell:
# `impute_columns` go to the imputer, `encode_columns` to the one-hot encoder.
impute_columns = ["passenger_count"]
encode_columns = ["trip_route"]

In [None]:
# Hyper-parameter search space for the random forest. The "RF__" prefix routes each
# entry to the pipeline step named "RF".
param_grid = {"RF__n_estimators": [10, 50, 100, 200],
              "RF__max_depth": [3, 5, 7, 9],
              "RF__min_samples_leaf": [50, 100, 200, 500],
              "RF__max_features":["sqrt"]}

# Column-wise preprocessing: fill missing passenger counts with the most frequent value,
# one-hot encode the route (routes unseen during fit are ignored at predict time), and
# pass every remaining column through unchanged.
transformer = ColumnTransformer(
    [("imputer", SimpleImputer(strategy="most_frequent"), impute_columns),
    ("encoder", OneHotEncoder(drop=None,handle_unknown="ignore"), encode_columns)],
    remainder="passthrough"
)

# Seed the forest so that repeated runs build the same trees.
pipeline = Pipeline([("transformer", transformer),
                     ("RF", RandomForestRegressor(random_state=42))],
                    )

# Randomised search over 20 sampled parameter combinations; the seed fixes which
# combinations are drawn, so the search is reproducible after Restart & Run All.
model = RandomizedSearchCV(pipeline, param_grid, n_iter = 20, random_state=42)
model.fit(X_train, y_train)