# Module 2 - Tracking an Experiment

## Import libraries

In [2]:

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

import pyarrow  # read .parquet files
import pickle

import mlflow

## Load and preprocess data

In [3]:
# Tracking with MLFlow
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [4]:
# Load the data the .parquet file with pandas
df_jan = pd.read_parquet(r"C:\Users\JC\projects\MLOps_Zoomcamp_2022\data\green_tripdata_2021-01.parquet")
df_feb = pd.read_parquet(r"C:\Users\JC\projects\MLOps_Zoomcamp_2022\data\green_tripdata_2021-02.parquet")
df_jan.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1.0,43,151,1.0,1.01,5.5,0.5,0.5,0.0,0.0,,0.3,6.8,2.0,1.0,0.0
1,2,2021-01-01 00:25:59,2021-01-01 00:34:44,N,1.0,166,239,1.0,2.53,10.0,0.5,0.5,2.81,0.0,,0.3,16.86,1.0,1.0,2.75
2,2,2021-01-01 00:45:57,2021-01-01 00:51:55,N,1.0,41,42,1.0,1.12,6.0,0.5,0.5,1.0,0.0,,0.3,8.3,1.0,1.0,0.0
3,2,2020-12-31 23:57:51,2021-01-01 00:04:56,N,1.0,168,75,1.0,1.99,8.0,0.5,0.5,0.0,0.0,,0.3,9.3,2.0,1.0,0.0
4,2,2021-01-01 00:16:36,2021-01-01 00:16:40,N,2.0,265,265,3.0,0.0,-52.0,0.0,-0.5,0.0,0.0,,-0.3,-52.8,3.0,1.0,0.0


In [10]:
df_feb.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2021-02-01 00:34:03,2021-02-01 00:51:58,N,1.0,130,205,5.0,3.66,14.0,0.5,0.5,10.0,0.0,,0.3,25.3,1.0,1.0,0.0
1,2,2021-02-01 00:04:00,2021-02-01 00:10:30,N,1.0,152,244,1.0,1.1,6.5,0.5,0.5,0.0,0.0,,0.3,7.8,2.0,1.0,0.0
2,2,2021-02-01 00:18:51,2021-02-01 00:34:06,N,1.0,152,48,1.0,4.93,16.5,0.5,0.5,0.0,0.0,,0.3,20.55,2.0,1.0,2.75
3,2,2021-02-01 00:53:27,2021-02-01 01:11:41,N,1.0,152,241,1.0,6.7,21.0,0.5,0.5,0.0,0.0,,0.3,22.3,2.0,1.0,0.0
4,2,2021-02-01 00:57:46,2021-02-01 01:06:44,N,1.0,75,42,1.0,1.89,8.5,0.5,0.5,2.45,0.0,,0.3,12.25,1.0,1.0,0.0


In [5]:
# Preprocessing of the data
# Functionize the read and processing of the dataframe
def read_df(path):
    '''
    Takes in the data, calculates the target variable trip_duration in minutes,
    filters out outlier below 1 min and above 60 min 
    
    Parameters:
    path: path to data in .parquet format
    '''
    # Read the .parquet file
    df = pd.read_parquet(path)

    # Create the target variable
    df["trip_duration"] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.trip_duration = df.trip_duration.apply(lambda x: x.total_seconds() / 60)

    # Extra filter on trip duration from 1 min up to 60 min
    df = df[(df.trip_duration >= 1) & (df.trip_duration <= 60)]
    
    return df

In [6]:
# Preprocess the data
# Data from January will be the training data
df_train = read_df(r"C:\Users\JC\projects\MLOps_Zoomcamp_2022\data\green_tripdata_2021-01.parquet")

#Data from February will be the validation data
df_val = read_df(r"C:\Users\JC\projects\MLOps_Zoomcamp_2022\data\green_tripdata_2021-02.parquet")

In [16]:
df_train.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,trip_duration
0,2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1.0,43,151,1.0,1.01,5.5,...,0.5,0.0,0.0,,0.3,6.8,2.0,1.0,0.0,3.933333
1,2,2021-01-01 00:25:59,2021-01-01 00:34:44,N,1.0,166,239,1.0,2.53,10.0,...,0.5,2.81,0.0,,0.3,16.86,1.0,1.0,2.75,8.75
2,2,2021-01-01 00:45:57,2021-01-01 00:51:55,N,1.0,41,42,1.0,1.12,6.0,...,0.5,1.0,0.0,,0.3,8.3,1.0,1.0,0.0,5.966667
3,2,2020-12-31 23:57:51,2021-01-01 00:04:56,N,1.0,168,75,1.0,1.99,8.0,...,0.5,0.0,0.0,,0.3,9.3,2.0,1.0,0.0,7.083333
7,2,2021-01-01 00:26:31,2021-01-01 00:28:50,N,1.0,75,75,6.0,0.45,3.5,...,0.5,0.96,0.0,,0.3,5.76,1.0,1.0,0.0,2.316667


In [7]:
# Function for preprocessing the features (X)
def create_dicts(df, features):
    # Convert to string (object)
    df[features] = df[features].astype(str)
    
    # Convert features to dictionary
    dicts = df[features].to_dict(orient="records")

    return dicts

In [8]:
# Define the features to predict the target variable
features = ["PULocationID", "DOLocationID"]

# Process the features
feature_dicts_train = create_dicts(df_train, features)

# Create a DictVectorizer
dv = DictVectorizer()

# Fit to the feature dict and save it in a feature matrix
X_train = dv.fit_transform(feature_dicts_train)
y_train = df_train["trip_duration"].values

# Process the features
feature_dicts_val = create_dicts(df_val, features)

# Fit to the feature dict and save it in a feature matrix
X_val = dv.transform(feature_dicts_val)
y_val = df_val["trip_duration"].values

## Experiment tracking with MLFlow

### A Simple way


In [36]:
# Connect to MLFLow: Keep track of the history when running a model. 
# A simple way of adding tracking is defining a new run.

with mlflow.start_run():

    mlflow.set_tag("developer", "jana")

    mlflow.log_param("train-data-path", "./data/...")
    mlflow.log_param("valid-data-path", "./data/...")

    alpha = 0.1
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)

See the results on http://127.0.0.1:5000

### Hyperparameter tuning in MLFlow

In [9]:
# Preprocess the data for XGBoost
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

Here, we pass the hyperparameters that are available in XGBoost.

In [10]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")

        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, "validation")],
            early_stopping_rounds=25           
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)
    return {"loss": rmse, "status": STATUS_OK}

#### Defining the search space

The search space is the range, in which the hyperparameters shall be optimized. Here, we pass the hyperparameters that are available in XGBoost.

In [11]:
# Definition of the search space
search_space = {
    "max_depth": scope.int(hp.quniform("max_depth", 4, 100, 1)),
    "learning_rate": hp.loguniform("learning_rate", -3, 0), # range is from exp(-3) and exp(0)
    "reg_alpha": hp.loguniform("reg_alpha", -5, -1),
    "reg_lambda": hp.loguniform("reg_lambda", -6, -1),
    "min_child_weight": hp.loguniform("min_child_weight", -1, 3),
    "objective": "reg:linear",
    "seed": 42
}

We are using the hyperopt library for hyperparameter optimization. Hyperopt uses Baysian optimization algorithms based on Gaussian processes and regression trees. The default algorithm is Tree of Parzen Estimators (tpe).

In [12]:
# Defining the function to minimize (fmin)
best_results = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=5,
    trials=Trials()
)

[0]	validation-rmse:16.69525                         
[1]	validation-rmse:13.83097                         
[2]	validation-rmse:11.99373                         
[3]	validation-rmse:10.91671                         
[4]	validation-rmse:10.28778                         
[5]	validation-rmse:9.74064                          
[6]	validation-rmse:9.36773                          
[7]	validation-rmse:9.17641                          
[8]	validation-rmse:9.02631                          
[9]	validation-rmse:8.92391                          
[10]	validation-rmse:8.65274                         
[11]	validation-rmse:8.57871                         
[12]	validation-rmse:8.46490                         
[13]	validation-rmse:8.41398                         
[14]	validation-rmse:8.37095                         
[15]	validation-rmse:8.32479                         
[16]	validation-rmse:8.28345                         
[17]	validation-rmse:8.25660                         
[18]	validation-rmse:8.22479

### Automatic logging (autolog)

Autolog allows to log metrics, params and model without explicit log statements. Whether you use it as a statement before your training code or you can use library-specific calls (available for scikit-learn, Tensorflow, XGBoost etc.) 

```
mlflow_autolog()
```


Now, we train a model with the best params found in the hyperparameter optimization and use autolog.

In [13]:
# Best hyperparameters from training
best_params = {
    "learning_rate": 0.39620689794623454,
    "max_depth": 67,
    "min_child_weight": 2.7580783277428704,
    "objective": "reg:linear",
    "reg_alpha": 0.05148549288379137,
    "reg_lambda": 0.2048884681763893,
    "seed": 42
}


In [14]:
# Calling autolog for XGBoost
mlflow.xgboost.autolog()

# Train the model
booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, "validation")],
        early_stopping_rounds=25           
        )
 


2022/06/01 09:32:15 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'b1f59010eedd4c9e8715fc284cf225d7', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:15.21008
[1]	validation-rmse:11.96441
[2]	validation-rmse:10.30629
[3]	validation-rmse:9.52358
[4]	validation-rmse:8.92702
[5]	validation-rmse:8.68885
[6]	validation-rmse:8.45796
[7]	validation-rmse:8.30992
[8]	validation-rmse:8.19262
[9]	validation-rmse:8.10092
[10]	validation-rmse:8.02743
[11]	validation-rmse:7.96601
[12]	validation-rmse:7.89609
[13]	validation-rmse:7.84209
[14]	validation-rmse:7.79532
[15]	validation-rmse:7.77007
[16]	validation-rmse:7.74577
[17]	validation-rmse:7.71932
[18]	validation-rmse:7.67236
[19]	validation-rmse:7.66303
[20]	validation-rmse:7.63391
[21]	validation-rmse:7.60215
[22]	validation-rmse:7.58748
[23]	validation-rmse:7.56474
[24]	validation-rmse:7.55627
[25]	validation-rmse:7.55345
[26]	validation-rmse:7.54478
[27]	validation-rmse:7.53962
[28]	validation-rmse:7.53729
[29]	validation-rmse:7.53822
[30]	validation-rmse:7.52944
[31]	validation-rmse:7.52623
[32]	validation-rmse:7.52438
[33]	validation-rmse:7.52300
[34]	validation-rmse:



## Model management

In [35]:
with mlflow.start_run():

    mlflow.set_tag("developer", "jana")

    mlflow.log_param("train-data-path", r"C:\Users\JC\projects\MLOps_Zoomcamp_2022\data\green_tripdata_2021-01.parquet")
    mlflow.log_param("valid-data-path", r"C:\Users\JC\projects\MLOps_Zoomcamp_2022\data\green_tripdata_2021-01.parquet")

    alpha = 0.01
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)
    
    # Save the model as an artifact
    mlflow.log_artifact(local_path="models/lin_reg.bin", artifact_path="models_pickle/")

MlflowException: Invalid artifact path: 'models_pickle/'. Names may be treated as files in certain cases, and must not resolve to other names when treated as such. This name would resolve to 'models_pickle'

In [None]:
# Save the model as an artifact
mlflow.log_artifact(local_path="models/lin_reg.bin", artifact_path="model/pickle")