In [1]:
#pip install pyarrow


In [2]:
# Import necessary libraries
import pandas as pd
import pickle
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import mlflow


In [23]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('nyc-taxi-experiment')

<Experiment: artifact_location='/workspaces/mlopsProduction/02-experiment-tracking/mlruns/1', creation_time=1716364782674, experiment_id='1', last_update_time=1716364782674, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [4]:
#experiment_id = mlflow.create_experiment("New Experiment Name")
#mlflow.set_experiment(experiment_id)

In [5]:
# Function to read and preprocess data
def read_dataframe(filename):
    df = pd.read_parquet(filename)
    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)]
    df['PULocationID'] = df['PULocationID'].astype('category')
    df['DOLocationID'] = df['DOLocationID'].astype('category')
    return df

In [6]:
# Load and prepare training and validation data
df_train = read_dataframe('data/yellow_tripdata_2023-01.parquet')
df_val = read_dataframe('data/yellow_tripdata_2023-02.parquet')
target = 'duration'
categorical_columns = ['PULocationID', 'DOLocationID']

<div style="background-color: green; color: white; padding: 10px;">

The purpose of the talk is purely focused on MLOps. I have selected only a few columns in order to keep the model and training process simple.

</div>



In [7]:
df_train['duration'].describe()

count    3.009173e+06
mean     1.420486e+01
std      9.939386e+00
min      1.000000e+00
25%      7.216667e+00
50%      1.155000e+01
75%      1.818333e+01
max      6.000000e+01
Name: duration, dtype: float64

In [8]:
# Vectorize categorical features
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(df_train[categorical_columns].to_dict(orient='records'))
X_val = dv.transform(df_val[categorical_columns].to_dict(orient='records'))
y_train = df_train[target].values
y_val = df_val[target].values

In [9]:
print("Dimensionality (number of columns):", X_val.shape[1])

Dimensionality (number of columns): 2


## LinearRegression

In [10]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_rmse = mean_squared_error(y_val, lr.predict(X_val), squared=False)
print("Linear Regression MSE:", lr_rmse)

Linear Regression MSE: 9.963607595829973




In [11]:
from sklearn import linear_model
with mlflow.start_run():
    mlflow.set_tag('developer','Huseyn')
    mlflow.log_param('train-data-path', 'data/yellow_tripdata_2023-01.parquet')
    mlflow.log_param('valid-data-path', 'data/yellow_tripdata_2023-02.parquet')
    alpha=0.01
    ls = linear_model.Lasso(alpha=alpha)
    ls.fit(X_train, y_train)
    ls_rmse = mean_squared_error(y_val, lr.predict(X_val), squared=False)
    mlflow.log_metric('rmse',ls_rmse)
    print("Linear Regression MSE:", ls_rmse)


Linear Regression MSE: 9.963607595829973




In [12]:

alpha_values = [0.01, 0.05, 0.1, 0.5, 1.0]
for alpha in alpha_values:
    with mlflow.start_run():
        mlflow.set_tag('developer', 'Huseyn')
        mlflow.log_param('train-data-path', 'data/yellow_tripdata_2023-01.parquet')
        mlflow.log_param('valid-data-path', 'data/yellow_tripdata_2023-02.parquet')
        mlflow.log_param('alpha', alpha)
        ls = linear_model.Lasso(alpha=alpha)
        ls.fit(X_train, y_train)
        ls_rmse = mean_squared_error(y_val, ls.predict(X_val), squared=False)
        mlflow.log_metric('rmse', ls_rmse)
        print(f"Linear Regression RMSE with alpha={alpha}: {ls_rmse}")




Linear Regression RMSE with alpha=0.01: 9.96360781849841




Linear Regression RMSE with alpha=0.05: 9.963608750857988




Linear Regression RMSE with alpha=0.1: 9.963610009929864




Linear Regression RMSE with alpha=0.5: 9.963623827395615
Linear Regression RMSE with alpha=1.0: 9.96365046141962




In [14]:
import os
import joblib
model_folder = 'models'
model_path = os.path.join(model_folder, 'lasso_model.pkl')
if not os.path.exists(model_folder):
    os.makedirs(model_folder)
joblib.dump(ls, model_path)
print(f"Model saved successfully in {model_path}!")


Model saved successfully in models/lasso_model.pkl!


In [15]:
# Save the models and DictVectorizer
with open('mlruns/models/Lasso_model.pkl', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

## Xgboost model

In [None]:
### xgboost model

In [18]:
import xgboost as xgb
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import root_mean_squared_error

def objective(params):
    model = xgb.XGBRegressor(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        learning_rate=params['learning_rate'],
        subsample=params['subsample'],
        gamma=params['gamma'],
        colsample_bytree=params['colsample_bytree']
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, preds)
    mlflow.log_metric("rmse", rmse)
    return {'loss': -rmse, 'status': STATUS_OK}


In [24]:
space = {
    'max_depth': hp.choice('max_depth', range(1, 3)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.02),
    'n_estimators': hp.choice('n_estimators', range(100, 101)),
    'subsample': hp.uniform('subsample', 0.7, 0.8),
    'gamma': hp.uniform('gamma', 0.0, 0.1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
}


In [25]:
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)
print("Best hyperparameters:", best)


  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

Parameters: { "n_estimators" } are not used.


job exception: Must have at least 1 validation dataset for early stopping.



  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]


ValueError: Must have at least 1 validation dataset for early stopping.

In [None]:
## Tree models

In [16]:
# Train a decision tree model
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)
tree_mse = mean_squared_error(y_val, tree.predict(X_val), squared=False)
print("Decision Tree MSE:", tree_mse)

Decision Tree MSE: 5.220358942859578




In [21]:
import xgboost as xgb

# Preparing training and validation datasets
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)


In [26]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        booster=xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            early_stopping_rounds=50,
            evals=[(valid, "validation")]
        
        )
        y_pred=booster.predict(valid)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)
    return {'loss': rmse, 'status': STATUS_OK}

In [27]:
space = {
    'max_depth': hp.choice('max_depth', range(1, 3)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.02),
    'n_estimators': hp.choice('n_estimators', range(100, 101)),
    'subsample': hp.uniform('subsample', 0.7, 0.8),
    'gamma': hp.uniform('gamma', 0.0, 0.1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
}


In [28]:
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)
print("Best hyperparameters:", best)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

Parameters: { "n_estimators" } are not used.




[0]	validation-rmse:10.06335                          
[1]	validation-rmse:10.04471                          
[2]	validation-rmse:10.04049                          
[3]	validation-rmse:10.02221                          
[4]	validation-rmse:10.01806                          
[5]	validation-rmse:10.01401                          
[6]	validation-rmse:9.99609                           
[7]	validation-rmse:9.99261                           
[8]	validation-rmse:9.97504                           
[9]	validation-rmse:9.97163                           
[10]	validation-rmse:9.96829                          
[11]	validation-rmse:9.96502                          
[12]	validation-rmse:9.94778                          
[13]	validation-rmse:9.93089                          
[14]	validation-rmse:9.91431                          
[15]	validation-rmse:9.89807                          
[16]	validation-rmse:9.88213                          
[17]	validation-rmse:9.87896                          
[18]	valid


Parameters: { "n_estimators" } are not used.




[0]	validation-rmse:10.06454                                                    
[1]	validation-rmse:10.05590                                                    
[2]	validation-rmse:10.05289                                                    
[3]	validation-rmse:10.04453                                                    
[4]	validation-rmse:10.04162                                                    
[5]	validation-rmse:10.03869                                                    
[6]	validation-rmse:10.03063                                                    
[7]	validation-rmse:10.02782                                                    
[8]	validation-rmse:10.02005                                                    
[9]	validation-rmse:10.01732                                                    
[10]	validation-rmse:10.01468                                                   
[11]	validation-rmse:10.01201                                                   
[12]	validation-rmse:10.0045


Parameters: { "n_estimators" } are not used.




[0]	validation-rmse:10.06256                                                    
[1]	validation-rmse:10.04054                                                    
[2]	validation-rmse:10.03557                                                    
[3]	validation-rmse:10.01402                                                    
[4]	validation-rmse:10.00917                                                    
[5]	validation-rmse:10.00500                                                    
[6]	validation-rmse:9.98400                                                     
[7]	validation-rmse:9.97997                                                     
[8]	validation-rmse:9.95944                                                     
[9]	validation-rmse:9.95552                                                     
[10]	validation-rmse:9.95095                                                    
[11]	validation-rmse:9.94716                                                    
[12]	validation-rmse:9.92708


Parameters: { "n_estimators" } are not used.




[0]	validation-rmse:10.06215                                                      
[1]	validation-rmse:10.03833                                                      
[2]	validation-rmse:10.03297                                                      
[3]	validation-rmse:10.00973                                                      
[4]	validation-rmse:10.00451                                                      
[5]	validation-rmse:10.00002                                                      
[6]	validation-rmse:9.97736                                                       
[7]	validation-rmse:9.97303                                                       
[8]	validation-rmse:9.95095                                                       
[9]	validation-rmse:9.94673                                                       
[10]	validation-rmse:9.94263                                                      
[11]	validation-rmse:9.93863                                                      
[12]


Parameters: { "n_estimators" } are not used.




[0]	validation-rmse:10.06575                                                     
[1]	validation-rmse:10.06044                                                     
[2]	validation-rmse:10.05856                                                     
[3]	validation-rmse:10.05336                                                     
[4]	validation-rmse:10.05152                                                     
[5]	validation-rmse:10.04972                                                     
[6]	validation-rmse:10.04464                                                     
[7]	validation-rmse:10.04284                                                     
[8]	validation-rmse:10.03788                                                     
[9]	validation-rmse:10.03613                                                     
[10]	validation-rmse:10.03438                                                    
[11]	validation-rmse:10.03269                                                    
[12]	validation-




[0]	validation-rmse:10.06439                                                     
 10%|█         | 5/50 [12:03<1:46:24, 141.87s/trial, best loss: 8.35393422478502]

Parameters: { "n_estimators" } are not used.




[1]	validation-rmse:10.05535                                                     
[2]	validation-rmse:10.05221                                                     
[3]	validation-rmse:10.04347                                                     
[4]	validation-rmse:10.04044                                                     
[5]	validation-rmse:10.03737                                                     
[6]	validation-rmse:10.02897                                                     
[7]	validation-rmse:10.02603                                                     
[8]	validation-rmse:10.01794                                                     
[9]	validation-rmse:10.01510                                                     
[10]	validation-rmse:10.01236                                                    
[11]	validation-rmse:10.00959                                                    
[12]	validation-rmse:10.00181                                                    
[13]	validation-


Parameters: { "n_estimators" } are not used.




[0]	validation-rmse:10.06348                                                     
[1]	validation-rmse:10.04541                                                     
[2]	validation-rmse:10.04132                                                     
[3]	validation-rmse:10.02358                                                     
[4]	validation-rmse:10.01956                                                     
[5]	validation-rmse:10.01562                                                     
[6]	validation-rmse:9.99823                                                      
[7]	validation-rmse:9.99485                                                      
[8]	validation-rmse:9.97779                                                      
[9]	validation-rmse:9.97447                                                      
[10]	validation-rmse:9.97123                                                     
[11]	validation-rmse:9.96805                                                     
[12]	validation-


Parameters: { "n_estimators" } are not used.




[0]	validation-rmse:10.06467                                                     
[1]	validation-rmse:10.05639                                                     
[2]	validation-rmse:10.05351                                                     
[3]	validation-rmse:10.04550                                                     
[4]	validation-rmse:10.04270                                                     
[5]	validation-rmse:10.03989                                                     
[6]	validation-rmse:10.03216                                                     
[7]	validation-rmse:10.02949                                                     
[8]	validation-rmse:10.02202                                                     
[9]	validation-rmse:10.01935                                                     
[10]	validation-rmse:10.01681                                                    
[11]	validation-rmse:10.01424                                                    
[12]	validation-


Parameters: { "n_estimators" } are not used.




[0]	validation-rmse:10.06286                                                     
[1]	validation-rmse:10.04211                                                     
[2]	validation-rmse:10.03743                                                     
[3]	validation-rmse:10.01711                                                     
[4]	validation-rmse:10.01253                                                     
[5]	validation-rmse:10.00860                                                     
[6]	validation-rmse:9.98874                                                      
[7]	validation-rmse:9.98492                                                      
[8]	validation-rmse:9.96549                                                      
[9]	validation-rmse:9.96111                                                      
[10]	validation-rmse:9.95744                                                     
[11]	validation-rmse:9.95385                                                     
[12]	validation-


Parameters: { "n_estimators" } are not used.




[0]	validation-rmse:10.06250                                                     
[1]	validation-rmse:10.04023                                                     
[2]	validation-rmse:10.03520                                                     
[3]	validation-rmse:10.01341                                                     
[4]	validation-rmse:10.00851                                                     
[5]	validation-rmse:10.00429                                                     
[6]	validation-rmse:9.98307                                                      
[7]	validation-rmse:9.97900                                                      
[8]	validation-rmse:9.95824                                                      
[9]	validation-rmse:9.95428                                                      
[10]	validation-rmse:9.94967                                                     
[11]	validation-rmse:9.94584                                                     
[12]	validation-


Parameters: { "n_estimators" } are not used.




[0]	validation-rmse:10.06408                                                      
[1]	validation-rmse:10.05417                                                      
[2]	validation-rmse:10.05074                                                      
[3]	validation-rmse:10.04121                                                      
[4]	validation-rmse:10.03792                                                      
[5]	validation-rmse:10.03457                                                      
[6]	validation-rmse:10.02543                                                      
[7]	validation-rmse:10.02223                                                      
[8]	validation-rmse:10.01347                                                      
[9]	validation-rmse:10.01040                                                      
[10]	validation-rmse:10.00737                                                     
[11]	validation-rmse:10.00445                                                     
[12]


Parameters: { "n_estimators" } are not used.




[0]	validation-rmse:10.06492                                                      
[1]	validation-rmse:10.05733                                                      
[2]	validation-rmse:10.05467                                                      
[3]	validation-rmse:10.04729                                                      
[4]	validation-rmse:10.04472                                                      
[5]	validation-rmse:10.04213                                                      
[6]	validation-rmse:10.03499                                                      
[7]	validation-rmse:10.03252                                                      
[8]	validation-rmse:10.02560                                                      
[9]	validation-rmse:10.02313                                                      
[10]	validation-rmse:10.02077                                                     
[11]	validation-rmse:10.01839                                                     
[12]


Parameters: { "n_estimators" } are not used.




[0]	validation-rmse:10.06398                                                      
[1]	validation-rmse:10.05378                                                      
[2]	validation-rmse:10.05026                                                      
[3]	validation-rmse:10.04046                                                      
[4]	validation-rmse:10.03707                                                      
[5]	validation-rmse:10.03363                                                      
[6]	validation-rmse:10.02426                                                      
[7]	validation-rmse:10.02097                                                      
[8]	validation-rmse:10.01198                                                      
[9]	validation-rmse:10.00884                                                      
[10]	validation-rmse:10.00574                                                     
[11]	validation-rmse:10.00275                                                     
[12]


Parameters: { "n_estimators" } are not used.




[0]	validation-rmse:10.05990                                                      
[1]	validation-rmse:10.02640                                                      
[2]	validation-rmse:10.01895                                                      
[3]	validation-rmse:9.98658                                                       
[4]	validation-rmse:9.97942                                                       
[5]	validation-rmse:9.97320                                                       
[6]	validation-rmse:9.94203                                                       
[7]	validation-rmse:9.93611                                                       
[8]	validation-rmse:9.90600                                                       
[9]	validation-rmse:9.90032                                                       
[10]	validation-rmse:9.89486                                                      
[11]	validation-rmse:9.88958                                                      
[12]

KeyboardInterrupt: 

In [None]:

class TaxiTripModel(nn.Module):
    def __init__(self):
        super(TaxiTripModel, self).__init__()
        self.layer1 = nn.Linear(X_train.shape[1], 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.relu(self.layer3(x))
        return self.output(x)


In [None]:
# Convert data to PyTorch tensors and create DataLoaders
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)
train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_data = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_data, batch_size=64, shuffle=False)

In [None]:
# Train the PyTorch model
model = TaxiTripModel()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

In [None]:
def train_model(model, train_loader, val_loader, loss_fn, optimizer, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            preds = model(X_batch)
            loss = loss_fn(preds, y_batch.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        model.eval()
        val_loss = sum(loss_fn(model(X_val), y_val.unsqueeze(1)) for X_val, y_val in val_loader) / len(val_loader)
        print(f'Epoch {epoch+1}, Loss: {loss.item()}, Validation Loss: {val_loss.item()}')

In [None]:
train_model(model, train_loader, val_loader, loss_fn, optimizer, num_epochs=10)

# Save the trained PyTorch model and DictVectorizer
with open('models/TaxiTripModel.bin', 'wb') as f_out:
    pickle.dump((dv, model), f_out)