In [1]:
# importing all the necesasry libraries

import pickle
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso

from sklearn.metrics import root_mean_squared_error


from sklearn.model_selection import cross_val_score, KFold

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [2]:
# # Downloading the dataset - Run When using GitHub, as the download path is based on GitHUb CodeSpaces
# !wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet -P /workspaces/mlops-learning/01-intro/data
# !wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet -P /workspaces/mlops-learning/01-intro/data


# # Download 2021 data
# !wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet -P /workspaces/mlops-learning/01-intro/data
# !wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-02.parquet -P /workspaces/mlops-learning/01-intro/data

In [3]:
import mlflow

# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

# Create a new MLflow Experiment - Inside an experiment, there will be Runs
mlflow.set_experiment("taxi-model-management")

2024/08/30 13:55:21 INFO mlflow.tracking.fluent: Experiment with name 'taxi-model-management' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/380033675744979635', creation_time=1725026121906, experiment_id='380033675744979635', last_update_time=1725026121906, lifecycle_stage='active', name='taxi-model-management', tags={}>

In [4]:
# a function to read the data, preprocess it and return it
def read_and_preprocess(filename):
    data = pd.read_parquet(filename)
    
    # create the target variable
    data['ride_duration'] = data['tpep_dropoff_datetime'] - data['tpep_pickup_datetime'] 
    data['ride_duration'] = data['ride_duration'].apply(lambda x: x.total_seconds()/60) 

    # take only the data below 1 hour
    data = data[(data['ride_duration'] >= 1) & (data['ride_duration'] <= 60)]

    # # sample the data to 70k rows
    # if len(data) > 70000:
    #     sampled_data = data.iloc[:70000,:].copy()
    # else:
    #     sampled_data = data.copy()
    sampled_data = data.copy()
    
    # chosing categorical
    categorical = ['PULocationID', 'DOLocationID']

    # convert these numerical categorical features to string categorical features
    sampled_data[categorical] = sampled_data[categorical].astype(str)


    return sampled_data

In [5]:
# When using GitHub CodeSpaces - Path is Set according to that
# df_train = read_and_preprocess('/workspaces/mlops-learning/01-intro/data/yellow_tripdata_2021-01.parquet')
# df_valid = read_and_preprocess('/workspaces/mlops-learning/01-intro/data/yellow_tripdata_2021-02.parquet')

# when not using GitHub CodeSpaces
df_train = read_and_preprocess('../01-intro/data/yellow_tripdata_2021-01.parquet')
df_valid = read_and_preprocess('../01-intro/data/yellow_tripdata_2021-02.parquet')

In [6]:
# chosing categorical and numerical features
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# to use the DictVectorizer, we need to convert the dataframe to dict
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_valid[categorical + numerical].to_dict(orient='records')


dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_valid = dv.fit_transform(val_dicts)

# storing our target variable
target = 'ride_duration'
y_train = df_train[target].values
y_val = df_valid[target].values

In [7]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_valid, label=y_val)

Auto Logging for XGBoost Fails for a Reason - Crashing the Kernel

In [8]:
# # these parameters are taken after running the above cell and
# # checking which parameters got the best rmse
# params = {
#     'max_depth': 36,
#     'learning_rate': 0.4,
#     'reg_alpha': 0.015,
#     'reg_lambda': 0.0053,
#     'min_child_weight': 15.88,
#     'seed': 42
# }

# # we provide the autolog here, just before the run function which gets executed in objective function
# # mlflow.xgboost.autolog() 

# booster = xgb.train(
#             params=params,
#             dtrain=train,
#             num_boost_round=100,
#             evals=[(valid, 'validation')],
#             early_stopping_rounds=10
#         )

Saving the Model

In [9]:
# these parameters are taken after running the above cell and
# checking which parameters got the best rmse
params = {
    'max_depth': 36,
    'learning_rate': 0.4,
    'reg_alpha': 0.015,
    'reg_lambda': 0.0053,
    'min_child_weight': 15.88,
    'seed': 42
}

# starting the run
with mlflow.start_run():
    mlflow.set_tag("model",'XGBoost')

    # logging the parameters
    mlflow.log_params(params)
    
    booster = xgb.train(
                params=params,
                dtrain=train,
                num_boost_round=100,
                evals=[(valid, 'validation')],
                early_stopping_rounds=10
            )
    
    y_pred = booster.predict(valid)

    # logging the predictions
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric('rmse',rmse)

    # saving the model - creates a folder and stores the model in that folder
    mlflow.xgboost.log_model(booster, artifact_path='models_mlflow')

[0]	validation-rmse:6.67760
[1]	validation-rmse:5.43694
[2]	validation-rmse:4.91329
[3]	validation-rmse:4.70343
[4]	validation-rmse:4.61204
[5]	validation-rmse:4.57421
[6]	validation-rmse:4.55375
[7]	validation-rmse:4.54199
[8]	validation-rmse:4.53226
[9]	validation-rmse:4.51629
[10]	validation-rmse:4.51067
[11]	validation-rmse:4.50948
[12]	validation-rmse:4.51238
[13]	validation-rmse:4.50975
[14]	validation-rmse:4.50785
[15]	validation-rmse:4.50801
[16]	validation-rmse:4.50526
[17]	validation-rmse:4.49809
[18]	validation-rmse:4.50009
[19]	validation-rmse:4.50017
[20]	validation-rmse:4.49735
[21]	validation-rmse:4.50112
[22]	validation-rmse:4.50302
[23]	validation-rmse:4.49901
[24]	validation-rmse:4.50344
[25]	validation-rmse:4.50533
[26]	validation-rmse:4.50451
[27]	validation-rmse:4.50134
[28]	validation-rmse:4.50144
[29]	validation-rmse:4.49909


2024/08/30 13:56:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run shivering-stag-673 at: http://127.0.0.1:5000/#/experiments/380033675744979635/runs/401da8cb355c486395c3026968ba5f6d.
2024/08/30 13:56:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/380033675744979635.
