In [None]:
# importing all the necesasry libraries

import pickle
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso

from sklearn.metrics import root_mean_squared_error


from sklearn.model_selection import cross_val_score, KFold

In [2]:
import mlflow

# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

# Create a new MLflow Experiment - Inside an experiment, there will be Runs
mlflow.set_experiment("taxi-prediction-model-management")

<Experiment: artifact_location='mlflow-artifacts:/137411656770551778', creation_time=1725021597969, experiment_id='137411656770551778', last_update_time=1725021597969, lifecycle_stage='active', name='taxi-prediction-model-management', tags={}>

In [3]:
# a function to read the data, preprocess it and return it
def read_and_preprocess(filename):
    data = pd.read_parquet(filename)
    
    # create the target variable
    data['ride_duration'] = data['tpep_dropoff_datetime'] - data['tpep_pickup_datetime'] 
    data['ride_duration'] = data['ride_duration'].apply(lambda x: x.total_seconds()/60) 

    # take only the data below 1 hour
    data = data[(data['ride_duration'] >= 1) & (data['ride_duration'] <= 60)]

    # # sample the data to 70k rows
    # if len(data) > 70000:
    #     sampled_data = data.iloc[:70000,:].copy()
    # else:
    #     sampled_data = data.copy()
    sampled_data = data.copy()
    
    # chosing categorical
    categorical = ['PULocationID', 'DOLocationID']

    # convert these numerical categorical features to string categorical features
    sampled_data[categorical] = sampled_data[categorical].astype(str)


    return sampled_data

In [4]:
# When using GitHub CodeSpaces - Path is Set according to that
# df_train = read_and_preprocess('/workspaces/mlops-learning/01-intro/data/yellow_tripdata_2021-01.parquet')
# df_valid = read_and_preprocess('/workspaces/mlops-learning/01-intro/data/yellow_tripdata_2021-02.parquet')

# when not using GitHub CodeSpaces
df_train = read_and_preprocess('../01-intro/data/yellow_tripdata_2021-01.parquet')
df_valid = read_and_preprocess('../01-intro/data/yellow_tripdata_2021-02.parquet')

In [5]:
# chosing categorical and numerical features
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# to use the DictVectorizer, we need to convert the dataframe to dict
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_valid[categorical + numerical].to_dict(orient='records')


dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_valid = dv.fit_transform(val_dicts)

# storing our target variable
target = 'ride_duration'
y_train = df_train[target].values
y_val = df_valid[target].values

In [6]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [7]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_valid, label=y_val)

In [9]:
# these parameters are taken after running the above cell and
# checking which parameters got the best rmse
params = {
    'max_depth': 36,
    'learning_rate': 0.4,
    'reg_alpha': 0.015,
    'reg_lambda': 0.0053,
    'min_child_weight': 15.88,
    'seed': 42
}

# we provide the autolog here, just before the run function which gets executed in objective function
# mlflow.xgboost.autolog() 

# so the objective will be called 10 times and for each time, a run will be registered in the MLFlow
booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, 'validation')],
            early_stopping_rounds=10
        )

[0]	validation-rmse:6.67760
[1]	validation-rmse:5.43694
[2]	validation-rmse:4.91329
[3]	validation-rmse:4.70343
[4]	validation-rmse:4.61204
[5]	validation-rmse:4.57421
[6]	validation-rmse:4.55375
[7]	validation-rmse:4.54199
[8]	validation-rmse:4.53226
[9]	validation-rmse:4.51629
[10]	validation-rmse:4.51067
[11]	validation-rmse:4.50948
[12]	validation-rmse:4.51238
[13]	validation-rmse:4.50975
[14]	validation-rmse:4.50785
[15]	validation-rmse:4.50801
[16]	validation-rmse:4.50526
[17]	validation-rmse:4.49809
[18]	validation-rmse:4.50009
[19]	validation-rmse:4.50017
[20]	validation-rmse:4.49735
[21]	validation-rmse:4.50112
[22]	validation-rmse:4.50302
[23]	validation-rmse:4.49901
[24]	validation-rmse:4.50344
[25]	validation-rmse:4.50533
[26]	validation-rmse:4.50451
[27]	validation-rmse:4.50134
[28]	validation-rmse:4.50144
[29]	validation-rmse:4.49909


In [8]:
# we provide the autolog here, just before the run function which gets executed in objective function
mlflow.xgboost.autolog() 

In [9]:
# these parameters are taken after running the above cell and
# checking which parameters got the best rmse
params = {
    'max_depth': 36,
    'learning_rate': 0.4,
    'reg_alpha': 0.015,
    'reg_lambda': 0.0053,
    'min_child_weight': 15.88,
    'seed': 42
}

# so the objective will be called 10 times and for each time, a run will be registered in the MLFlow
booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, 'validation')],
            early_stopping_rounds=10
        )

2024/08/30 12:50:06 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'cf498044cda94b008bc35b21541095c3', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


: 