In [1]:
# importing the necessary libraries
import os
import pickle

import joblib
import mlflow
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import root_mean_squared_error
from sklearn.svm import LinearSVR

In [2]:
# defining a function to read and process the data
def read_data(filename):
    """Load a parquet file of taxi trips and prepare it for modelling.

    Parameters
    ----------
    filename : str
        Path or URL handed to ``pandas.read_parquet``. The file must contain
        the columns ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        The trips whose duration lies between 1 and 60 minutes (inclusive),
        with an added ``duration`` column (minutes) and both location-ID
        columns cast to ``str``.
    """
    data = pd.read_parquet(filename)

    # trip length in minutes; the vectorised .dt accessor avoids one Python
    # function call per row
    trip_time = data["tpep_dropoff_datetime"] - data["tpep_pickup_datetime"]
    data["duration"] = trip_time.dt.total_seconds() / 60

    # Keep trips of 1 to 60 minutes. The explicit copy makes the result an
    # independent frame, so the assignment below (and any column the caller
    # adds later) does not raise SettingWithCopyWarning.
    data = data[data["duration"].between(1, 60)].copy()

    # categorical columns: location IDs are labels, so store them as strings
    categorical = ["PULocationID", "DOLocationID"]
    data[categorical] = data[categorical].astype(str)

    return data

# Fetch and pre-process both months in one pass:
# the 2024-01 file becomes the training set, the 2024-02 file the validation set.
train_data, val_data = map(
    read_data,
    (
        "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet",
        "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet",
    ),
)

val_data

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
0,2,2024-02-01 00:04:45,2024-02-01 00:19:58,1.0,4.39,1.0,N,68,236,1,20.50,1.0,0.5,1.28,0.00,1.0,26.78,2.5,0.00,15.216667
1,2,2024-02-01 00:56:31,2024-02-01 01:10:53,1.0,7.71,1.0,N,48,243,1,31.00,1.0,0.5,9.00,0.00,1.0,45.00,2.5,0.00,14.366667
2,2,2024-02-01 00:07:50,2024-02-01 00:43:12,2.0,28.69,2.0,N,132,261,2,70.00,0.0,0.5,0.00,6.94,1.0,82.69,2.5,1.75,35.366667
3,1,2024-02-01 00:01:49,2024-02-01 00:10:47,1.0,1.10,1.0,N,161,163,1,9.30,3.5,0.5,2.85,0.00,1.0,17.15,2.5,0.00,8.966667
4,1,2024-02-01 00:37:35,2024-02-01 00:51:15,1.0,2.60,1.0,N,246,79,2,15.60,3.5,0.5,0.00,0.00,1.0,20.60,2.5,0.00,13.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3007521,2,2024-02-29 23:20:17,2024-02-29 23:28:25,,0.35,,,164,170,0,21.02,0.0,0.5,0.00,0.00,1.0,25.02,,,8.133333
3007522,2,2024-02-29 23:38:37,2024-02-29 23:49:37,,1.99,,,249,45,0,18.78,0.0,0.5,0.00,0.00,1.0,22.78,,,11.000000
3007523,2,2024-02-29 23:14:40,2024-02-29 23:19:29,,0.84,,,113,249,0,8.82,0.0,0.5,0.00,0.00,1.0,12.82,,,4.816667
3007524,1,2024-02-29 23:12:06,2024-02-29 23:21:53,,0.00,,,164,107,0,14.37,0.0,0.5,0.00,0.00,1.0,18.37,,,9.783333


In [3]:
# Combine pickup and drop-off zone into a single "<pickup>_<dropoff>" route feature
train_data['PU_DO'] = train_data['PULocationID'].str.cat(train_data['DOLocationID'], sep='_')
val_data['PU_DO'] = val_data['PULocationID'].str.cat(val_data['DOLocationID'], sep='_')

val_data

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration,PU_DO
0,2,2024-02-01 00:04:45,2024-02-01 00:19:58,1.0,4.39,1.0,N,68,236,1,...,1.0,0.5,1.28,0.00,1.0,26.78,2.5,0.00,15.216667,68_236
1,2,2024-02-01 00:56:31,2024-02-01 01:10:53,1.0,7.71,1.0,N,48,243,1,...,1.0,0.5,9.00,0.00,1.0,45.00,2.5,0.00,14.366667,48_243
2,2,2024-02-01 00:07:50,2024-02-01 00:43:12,2.0,28.69,2.0,N,132,261,2,...,0.0,0.5,0.00,6.94,1.0,82.69,2.5,1.75,35.366667,132_261
3,1,2024-02-01 00:01:49,2024-02-01 00:10:47,1.0,1.10,1.0,N,161,163,1,...,3.5,0.5,2.85,0.00,1.0,17.15,2.5,0.00,8.966667,161_163
4,1,2024-02-01 00:37:35,2024-02-01 00:51:15,1.0,2.60,1.0,N,246,79,2,...,3.5,0.5,0.00,0.00,1.0,20.60,2.5,0.00,13.666667,246_79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3007521,2,2024-02-29 23:20:17,2024-02-29 23:28:25,,0.35,,,164,170,0,...,0.0,0.5,0.00,0.00,1.0,25.02,,,8.133333,164_170
3007522,2,2024-02-29 23:38:37,2024-02-29 23:49:37,,1.99,,,249,45,0,...,0.0,0.5,0.00,0.00,1.0,22.78,,,11.000000,249_45
3007523,2,2024-02-29 23:14:40,2024-02-29 23:19:29,,0.84,,,113,249,0,...,0.0,0.5,0.00,0.00,1.0,12.82,,,4.816667,113_249
3007524,1,2024-02-29 23:12:06,2024-02-29 23:21:53,,0.00,,,164,107,0,...,0.0,0.5,0.00,0.00,1.0,18.37,,,9.783333,164_107


In [4]:
# Explanatory features: the route label (categorical) and the trip distance (numerical)
categorical = ['PU_DO']
numerical = ['trip_distance']

# one {column: value} dict per trip, the input format DictVectorizer works on
train_dicts = train_data[[*categorical, *numerical]].to_dict(orient='records')
val_dicts = val_data[[*categorical, *numerical]].to_dict(orient='records')

# fit the vectorizer on the training trips only, then reuse it for validation
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [5]:
# The quantity to predict is the trip duration; pull it out as plain arrays
target = 'duration'
y_train = train_data[target].to_numpy()
y_val = val_data[target].to_numpy()

In [6]:
# Baseline: a linear regression fitted on the vectorised training trips
model = LinearRegression().fit(X_train, y_train)

# score the baseline on the validation trips; the RMSE is the cell's output
y_pred = model.predict(X_val)
root_mean_squared_error(y_val, y_pred)

5.509515870357674

In [10]:
# saving the linear model together with the vectorizer that built its inputs
# open() does not create missing directories, so make sure models/ exists first
os.makedirs('models', exist_ok=True)
with open('models/lin_reg.bin', 'wb') as file:
    pickle.dump((dv, model), file)

In [None]:
# setting the tracking URI
# mlflow.set_tracking_uri("sqlite:///mlflow.db")

In [7]:
# creating the function to create an experiment
def create_ML_experiment(experiment_name, tags):
    """Return the ID of the MLflow experiment ``experiment_name``, creating it if needed.

    Parameters
    ----------
    experiment_name : str
        Name of the experiment to look up or create.
    tags : dict
        Tags attached to the experiment when it is created. They are not
        applied to an experiment that already exists.

    Returns
    -------
    The experiment ID reported by MLflow.

    Raises
    ------
    Any exception raised by ``mlflow.create_experiment`` is propagated, so a
    real failure is no longer reported as "already exist".
    """
    # Look the experiment up first instead of catching every exception from
    # create_experiment: a blanket except would hide unrelated errors and then
    # fail on `None.experiment_id`.
    existing = mlflow.get_experiment_by_name(experiment_name)
    if existing is not None:
        print(f"Experiment {experiment_name} already exist")
        return existing.experiment_id

    experiment_id = mlflow.create_experiment(name=experiment_name, tags=tags)
    print(f"Experiment ID: {experiment_name} Created")
    return experiment_id

In [8]:
# Make sure the main experiment exists; the cell output is its experiment ID
create_ML_experiment(
    experiment_name="nyc-taxi-experiment",
    tags={"env": "dev", "version": "1.0.0", "author": "Irene"},
)

Experiment nyc-taxi-experiment already exist


'243624383208551886'

In [15]:
# make "nyc-taxi-experiment" the active experiment for the runs started below
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

<Experiment: artifact_location='file:///c:/Users/STUDSENT/Desktop/machine-learning-Ops/02-experiment-tracking/mlruns/1', creation_time=1747409127292, experiment_id='1', last_update_time=1747409127292, lifecycle_stage='active', name='nyc-taxi-experiment', tags={'author': 'Irene', 'env': 'dev', 'version': '1.0.0'}>

In [9]:
# starting runs to track the models
mlflow.set_experiment("nyc-taxi-experiment")
with mlflow.start_run(run_name="linear regression model"):
    mlflow.set_tags({"env": "dev", "version": "1.0.0", "author": "Irene"})

    # Log where the data came from. The parameters are named "...-path", so
    # they get the source locations, not the DataFrames themselves.
    mlflow.log_param(
        "train-data-path",
        "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet",
    )
    mlflow.log_param(
        "val-data-path",
        "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet",
    )

    linear_model = LinearRegression()
    linear_model.fit(X_train, y_train)

    y_pred = linear_model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # saving the linear model with its vectorizer; open() does not create
    # missing directories, so make sure models/ exists first
    os.makedirs("models", exist_ok=True)
    with open('models/linear_model.bin', 'wb') as file:
        pickle.dump((dv, linear_model), file)

    mlflow.log_artifact(local_path="models/linear_model.bin", artifact_path="models_pickle")

In [10]:
# tracking the second model
mlflow.set_experiment("nyc-taxi-experiment")
# named like the linear-regression run so the two are easy to tell apart in the UI
with mlflow.start_run(run_name="lasso regression model"):
    mlflow.set_tags({"env": "dev", "version": "1.0.0", "author": "Irene"})

    # Log where the data came from. The parameters are named "...-path", so
    # they get the source locations, not the DataFrames themselves.
    mlflow.log_param(
        "train-data-path",
        "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet",
    )
    mlflow.log_param(
        "val-data-path",
        "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet",
    )

    # regularisation strength of the lasso, logged so runs can be compared
    alpha = 0.1
    mlflow.log_param("alpha", alpha)

    lasso_model = Lasso(alpha=alpha)
    lasso_model.fit(X_train, y_train)

    y_pred = lasso_model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # saving the lasso model with its vectorizer; open() does not create
    # missing directories, so make sure models/ exists first
    os.makedirs("models", exist_ok=True)
    with open('models/lasso_model.bin', 'wb') as file:
        pickle.dump((dv, lasso_model), file)

    mlflow.log_artifact(local_path="models/lasso_model.bin", artifact_path="models_pickle")

In [11]:
# A second experiment for the tree-ensemble trials; the cell output is its experiment ID
create_ML_experiment(
    experiment_name="nyc-taxi-trial",
    tags={"env": "dev", "version": "1.0.0", "author": "Irene"},
)

Experiment nyc-taxi-trial already exist


'800818189677310631'

In [None]:
mlflow.set_experiment("nyc-taxi-trial")

# auto logging parameters, metrics, etc
mlflow.sklearn.autolog()

# `dv` is the fitted DictVectorizer that produced X_train / X_val, so it is the
# preprocessor these models depend on. Save it once, outside the loop, rather
# than refitting a new vectorizer for every model.
preprocessor = dv
os.makedirs("models", exist_ok=True)
joblib.dump(preprocessor, "models/preprocessor.bin")

# starting a new run for each model
# (the loop variable is not called `model`, which holds the fitted linear regression)
for model_class in (RandomForestRegressor, GradientBoostingRegressor):
    with mlflow.start_run():
        # the data actually used by this notebook
        mlflow.log_param(
            "train-data-path",
            "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet",
        )
        mlflow.log_param(
            "valid-data-path",
            "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet",
        )

        # attach the saved preprocessor to this run
        mlflow.log_artifact("models/preprocessor.bin", artifact_path="models_pickle")

        # fixed seed so that repeated runs of the same model are comparable
        mlmodel = model_class(random_state=42)
        mlmodel.fit(X_train, y_train)

        y_pred = mlmodel.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("root-mean-square-error", rmse)



In [13]:
from mlflow.tracking import MlflowClient

# Ask the tracking store for all of its experiments and print one summary line each
client = MlflowClient()
experiments = client.search_experiments()

for experiment in experiments:
    print(f"{experiment.experiment_id}: {experiment.name} (tags: {experiment.tags})")


800818189677310631: nyc-taxi-trial (tags: {'author': 'Irene', 'env': 'dev', 'version': '1.0.0'})
243624383208551886: nyc-taxi-experiment (tags: {'author': 'Irene', 'env': 'dev', 'version': '1.0.0'})
0: Default (tags: {})
