In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.feature_extraction import DictVectorizer

In [3]:
from sklearn.linear_model import LinearRegression

In [4]:
from sklearn.linear_model import Lasso

In [5]:
from sklearn.linear_model import Ridge

In [6]:
from sklearn.metrics import mean_squared_error

In [32]:
#pip install xgboost
#pip install hyperopt

In [35]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [25]:
from mlflow.tracking import MlflowClient

MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [26]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1717038746543, experiment_id='1', last_update_time=1717038746543, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [9]:
#### Modularizing the code 

def read_dataframe(filename):

    df = pd.read_parquet(filename)
    df['duration'] = df.lpep_dropoff_datetime  - df.lpep_pickup_datetime   
    df['duration'] = df['duration'].apply(lambda td: td.total_seconds()/60)

    df = df[((df.duration>=1) & (df.duration<=60))]

    categorical =['PULocationID','DOLocationID' ]
    #numerical =['trip_distance']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [10]:
df_train = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')
df_val = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')

In [11]:
len(df_train), len(df_val)

(73908, 61921)

In [12]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [13]:
categorical =  ['PU_DO'] #['PULocationID','DOLocationID' ]
numerical =['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical ].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)


In [14]:
target ='duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [15]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)

7.480879703714338

In [27]:
with mlflow.start_run():
    
    mlflow.set_tag("developer", "Justus")
    
    mlflow.set_tag("model_name", "Lasso")
    
    mlflow.log_param("train-url", "green_tripdata_2021-01.parquet")
    mlflow.log_param("val-url", "green_tripdata_2021-02.parquet")
    
    alpha = 0.001
    mlflow.log_param("alpha", alpha)

    lr = Lasso(0.001)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    #
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    mlflow.log_metric("rmse", rmse)

In [28]:
with mlflow.start_run():
    
    mlflow.set_tag("developer", "Justus")
    
    mlflow.set_tag("model_name", "Ridge")
    
    mlflow.log_param("train-url", "green_tripdata_2021-01.parquet")
    mlflow.log_param("val-url", "green_tripdata_2021-02.parquet")
    
    alpha = 0.001
    mlflow.log_param("alpha", alpha)


    lr = Ridge(alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)

    rmse =mean_squared_error(y_val, y_pred, squared=False)
    
    mlflow.log_metric("rmse", rmse)

In [20]:
import pickle

In [21]:
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv,lr), f_out)

In [36]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [None]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:linear',
    'seed': 42
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

In [37]:
params = {
    'learning_rate' : 0.20472,
    'max_depth' : 17,
    'min_child_weight' : 1.24026117, 
    'objective': 'reg:linear',
    'reg_alpha': 0.28567,
    'reg_lambda': 0.0042644,
    'seed': 42
    
}

mlflow.xgboost.autolog()

booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )

2024/05/30 19:27:15 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'fd11023e506748aebf05cea5da7118c4', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:11.98179
[1]	validation-rmse:11.83350
[2]	validation-rmse:11.73825
[3]	validation-rmse:11.65759
[4]	validation-rmse:11.59579
[5]	validation-rmse:11.54758
[6]	validation-rmse:11.50366
[7]	validation-rmse:11.46061
[8]	validation-rmse:11.41970
[9]	validation-rmse:11.38580
[10]	validation-rmse:11.35126
[11]	validation-rmse:11.32099
[12]	validation-rmse:11.28905
[13]	validation-rmse:11.26045
[14]	validation-rmse:11.23964
[15]	validation-rmse:11.21157
[16]	validation-rmse:11.19271
[17]	validation-rmse:11.16952
[18]	validation-rmse:11.14505
[19]	validation-rmse:11.12726
[20]	validation-rmse:11.10530
[21]	validation-rmse:11.08596
[22]	validation-rmse:11.06404
[23]	validation-rmse:11.04630
[24]	validation-rmse:11.02613
[25]	validation-rmse:11.00776
[26]	validation-rmse:10.98970
[27]	validation-rmse:10.97370
[28]	validation-rmse:10.95542
[29]	validation-rmse:10.93649
[30]	validation-rmse:10.92504
[31]	validation-rmse:10.90775
[32]	validation-rmse:10.89888
[33]	validation-rmse

[272]	validation-rmse:9.59309
[273]	validation-rmse:9.59043
[274]	validation-rmse:9.58854
[275]	validation-rmse:9.58489
[276]	validation-rmse:9.58243
[277]	validation-rmse:9.57996
[278]	validation-rmse:9.57557
[279]	validation-rmse:9.57268
[280]	validation-rmse:9.57095
[281]	validation-rmse:9.56718
[282]	validation-rmse:9.56401
[283]	validation-rmse:9.56198
[284]	validation-rmse:9.55843
[285]	validation-rmse:9.55499
[286]	validation-rmse:9.55403
[287]	validation-rmse:9.55007
[288]	validation-rmse:9.54594
[289]	validation-rmse:9.54438
[290]	validation-rmse:9.54078
[291]	validation-rmse:9.53760
[292]	validation-rmse:9.53366
[293]	validation-rmse:9.53260
[294]	validation-rmse:9.53077
[295]	validation-rmse:9.52884
[296]	validation-rmse:9.52584
[297]	validation-rmse:9.52364
[298]	validation-rmse:9.52036
[299]	validation-rmse:9.51833
[300]	validation-rmse:9.51638
[301]	validation-rmse:9.51479
[302]	validation-rmse:9.51165
[303]	validation-rmse:9.50897
[304]	validation-rmse:9.50559
[305]	vali

[546]	validation-rmse:9.02737
[547]	validation-rmse:9.02564
[548]	validation-rmse:9.02498
[549]	validation-rmse:9.02268
[550]	validation-rmse:9.02101
[551]	validation-rmse:9.01913
[552]	validation-rmse:9.01716
[553]	validation-rmse:9.01626
[554]	validation-rmse:9.01397
[555]	validation-rmse:9.01198
[556]	validation-rmse:9.01026
[557]	validation-rmse:9.00957
[558]	validation-rmse:9.00779
[559]	validation-rmse:9.00614
[560]	validation-rmse:9.00524
[561]	validation-rmse:9.00332
[562]	validation-rmse:9.00207
[563]	validation-rmse:9.00045
[564]	validation-rmse:8.99892
[565]	validation-rmse:8.99723
[566]	validation-rmse:8.99530
[567]	validation-rmse:8.99398
[568]	validation-rmse:8.99322
[569]	validation-rmse:8.99106
[570]	validation-rmse:8.98980
[571]	validation-rmse:8.98871
[572]	validation-rmse:8.98788
[573]	validation-rmse:8.98639
[574]	validation-rmse:8.98440
[575]	validation-rmse:8.98279
[576]	validation-rmse:8.98128
[577]	validation-rmse:8.98038
[578]	validation-rmse:8.97984
[579]	vali

[820]	validation-rmse:8.69735
[821]	validation-rmse:8.69663
[822]	validation-rmse:8.69484
[823]	validation-rmse:8.69345
[824]	validation-rmse:8.69248
[825]	validation-rmse:8.69144
[826]	validation-rmse:8.69060
[827]	validation-rmse:8.68930
[828]	validation-rmse:8.68848
[829]	validation-rmse:8.68724
[830]	validation-rmse:8.68679
[831]	validation-rmse:8.68621
[832]	validation-rmse:8.68530
[833]	validation-rmse:8.68420
[834]	validation-rmse:8.68310
[835]	validation-rmse:8.68196
[836]	validation-rmse:8.68070
[837]	validation-rmse:8.67947
[838]	validation-rmse:8.67898
[839]	validation-rmse:8.67779
[840]	validation-rmse:8.67611
[841]	validation-rmse:8.67482
[842]	validation-rmse:8.67388
[843]	validation-rmse:8.67296
[844]	validation-rmse:8.67188
[845]	validation-rmse:8.67143
[846]	validation-rmse:8.67044
[847]	validation-rmse:8.66952
[848]	validation-rmse:8.66875
[849]	validation-rmse:8.66771
[850]	validation-rmse:8.66673
[851]	validation-rmse:8.66572
[852]	validation-rmse:8.66499
[853]	vali

