In [31]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

import seaborn as sns
import matplotlib.pyplot as plt

import pickle

In [2]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment('nyc-taxi')

<Experiment: artifact_location='/home/azureuser/MLOps_zoomcamp/notebooks/mlruns/1', creation_time=1685564918529, experiment_id='1', last_update_time=1685564918529, lifecycle_stage='active', name='nyc-taxi', tags={}>

In [3]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)
    
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
    
    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds()/60)
    
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    
    cat = ['PULocationID', 'DOLocationID']
    num = ['trip_distance']
    
    df[cat] = df[cat].astype(str)
    
    df['DO_PU'] = df['PULocationID'] + '_' + df['DOLocationID']
    
    return df

In [4]:
df_train = read_dataframe('data/green_tripdata_2021-01.parquet')
df_val = read_dataframe('data/green_tripdata_2021-02.parquet')

In [5]:
df_train.shape, df_val.shape

((73908, 22), (61921, 22))

In [6]:
df_train.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration,DO_PU
0,2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1.0,43,151,1.0,1.01,5.5,...,0.0,0.0,,0.3,6.8,2.0,1.0,0.0,3.933333,43_151
1,2,2021-01-01 00:25:59,2021-01-01 00:34:44,N,1.0,166,239,1.0,2.53,10.0,...,2.81,0.0,,0.3,16.86,1.0,1.0,2.75,8.75,166_239
2,2,2021-01-01 00:45:57,2021-01-01 00:51:55,N,1.0,41,42,1.0,1.12,6.0,...,1.0,0.0,,0.3,8.3,1.0,1.0,0.0,5.966667,41_42
3,2,2020-12-31 23:57:51,2021-01-01 00:04:56,N,1.0,168,75,1.0,1.99,8.0,...,0.0,0.0,,0.3,9.3,2.0,1.0,0.0,7.083333,168_75
7,2,2021-01-01 00:26:31,2021-01-01 00:28:50,N,1.0,75,75,6.0,0.45,3.5,...,0.96,0.0,,0.3,5.76,1.0,1.0,0.0,2.316667,75_75


In [7]:
cat = ['DO_PU']
num = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[cat + num].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[cat + num].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [8]:
target = 'duration'

y_train = df_train[target].values
y_val = df_val[target].values

In [29]:
with mlflow.start_run():

    # mlflow.set_tag("developer", "GT")
    mlflow.sklearn.autolog()

    mlflow.log_param("LinearRegression", "defaults")

    lr = LinearRegression()
    lr.fit(X_train, y_train)

    lr_pred = lr.predict(X_val)

    rmse = mean_squared_error(lr_pred, y_val, squared=False)
    mlflow.log_metric("rmse", rmse)
    print(rmse)

    mlflow.log_artifact(local_path="models/lr.bin", artifact_path="models_pickle")

In [33]:
with mlflow.start_run():

    # mlflow.set_tag("developer", "GT")
    mlflow.sklearn.autolog()
    
    alpha = 0.01
    mlflow.log_param("lasso", alpha)

    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train, y_train)

    lasso_pred = lasso.predict(X_val)

    rmse = mean_squared_error(lasso_pred, y_val, squared=False)
    print(rmse)
    mlflow.log_metric("rmse", rmse)

    with open('models/lasso.bin', 'wb') as file:
        pickle.dump((dv, lasso), file)

    mlflow.log_artifact(local_path="models/lasso.bin", artifact_path="models_pickle")

11.167275941179728


In [34]:
with mlflow.start_run():

    # mlflow.set_tag("developer", "GT")
    mlflow.sklearn.autolog()
    
    # alpha = 0.01
    # mlflow.log_param("lasso", alpha)

    rf = RandomForestRegressor(n_estimators=100)
    rf.fit(X_train, y_train)

    rf_pred = rf.predict(X_val)

    rmse = mean_squared_error(rf_pred, y_val, squared=False)
    print(rmse)
    mlflow.log_metric("rmse", rmse)

    with open('models/rf.bin', 'wb') as file:
        pickle.dump((dv, rf), file)

    mlflow.log_artifact(local_path="models/rf.bin", artifact_path="models_pickle")

KeyboardInterrupt: 

In [11]:
import xgboost as xgb

In [12]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [13]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [14]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=10,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [24]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:linear',
    'seed': 42
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=Trials()
)

[0]	validation-rmse:10.16913                          
[1]	validation-rmse:7.36294                           
[2]	validation-rmse:6.72951                           
[3]	validation-rmse:6.57960                           
[4]	validation-rmse:6.51276                           
[5]	validation-rmse:6.48991                           
[6]	validation-rmse:6.47815                           
[7]	validation-rmse:6.47018                           
[8]	validation-rmse:6.46347                           
[9]	validation-rmse:6.45846                           
[0]	validation-rmse:15.38525                                                   
[1]	validation-rmse:11.75647                                                   
[2]	validation-rmse:9.57854                                                    
[3]	validation-rmse:8.31310                                                    
[4]	validation-rmse:7.60547                                                    
[5]	validation-rmse:7.21254                       

In [17]:
mlflow.xgboost.autolog(disable=True)

In [19]:
with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")

        # mlflow.xgboost.autolog()
        params = {
            "learning_rate":	0.14080421574286942,
            "max_depth": 50,
            "min_child_weight":	1.721635493649425,
            "objective":	'reg:linear',
            "reg_alpha":	0.02681711890192135,
            "reg_lambda":	0.009206105899311917,
            "seed":	42
        }

        mlflow.log_params(params)

        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)

        with open("models/preprocessor.b", "wb") as file:
            pickle.dump(dv, file)

        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

        mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

[0]	validation-rmse:18.68552
[1]	validation-rmse:16.56750
[2]	validation-rmse:14.78471
[3]	validation-rmse:13.29091
[4]	validation-rmse:12.04843
[5]	validation-rmse:11.01767
[6]	validation-rmse:10.17235
[7]	validation-rmse:9.47653
[8]	validation-rmse:8.91244
[9]	validation-rmse:8.45358
[10]	validation-rmse:8.08293
[11]	validation-rmse:7.78338
[12]	validation-rmse:7.54389
[13]	validation-rmse:7.35218
[14]	validation-rmse:7.19659
[15]	validation-rmse:7.07094
[16]	validation-rmse:6.96973
[17]	validation-rmse:6.88524
[18]	validation-rmse:6.81683
[19]	validation-rmse:6.76176
[20]	validation-rmse:6.71628
[21]	validation-rmse:6.67853
[22]	validation-rmse:6.64571
[23]	validation-rmse:6.61811
[24]	validation-rmse:6.59465
[25]	validation-rmse:6.57538
[26]	validation-rmse:6.55821
[27]	validation-rmse:6.54432
[28]	validation-rmse:6.53186
[29]	validation-rmse:6.52144
[30]	validation-rmse:6.51413
[31]	validation-rmse:6.50734
[32]	validation-rmse:6.50147
[33]	validation-rmse:6.49679
[34]	validation-r

In [42]:
# with open('models/lr.bin', 'wb') as file:
#     pickle.dump((dv, lr), file)

In [20]:
# import mlflow
logged_model = 'runs:/f371a0c3fe2b48bf844a80dc2808f2ec/models_mlflow'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)




In [21]:
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: f371a0c3fe2b48bf844a80dc2808f2ec

In [24]:
xgboost_model = mlflow.xgboost.load_model(logged_model)
xgboost_model



<xgboost.core.Booster at 0x7f3391977b80>

In [25]:
xgboost_model.predict(valid)

array([15.303841 ,  6.9746785, 17.123621 , ..., 13.363762 ,  6.2096205,
        8.306488 ], dtype=float32)

In [27]:
loaded_model.predict(y_val)

ValueError: Please reshape the input data into 2-dimensional matrix.

In [None]:

# Predict on a Pandas DataFrame.
import pandas as pd
loaded_model.predict(pd.DataFrame(data))