In [52]:
!python -V

Python 3.11.9


In [53]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

In [54]:
import pickle

In [55]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [56]:
import mlflow
import mlflow.sklearn

# MLflow connection settings, named so they are easy to spot and tune.
MLFLOW_TRACKING_URI = "http://localhost:5000"
MLFLOW_EXPERIMENT_NAME = "nyc-taxi-experiment-01"

# Point the client at the tracking server, then select the experiment
# that subsequent runs will be recorded under.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

# NOTE(review): this value is only assigned here; no visible cell passes it
# to an MLflow call.
artifact_location = "./mlruns"

In [57]:
def read_dataframe(filename):
    """Load a trip-record parquet file and prepare it for duration modelling.

    Steps:
      1. Read the parquet file at ``filename`` (anything ``pd.read_parquet``
         accepts: local path or URL).
      2. Add a ``duration`` column: minutes between ``lpep_pickup_datetime``
         and ``lpep_dropoff_datetime``.
      3. Keep only trips whose duration is between 1 and 60 minutes inclusive.
      4. Cast ``PULocationID`` and ``DOLocationID`` to ``str`` so they are
         treated as categorical values downstream.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file.

    Returns
    -------
    pandas.DataFrame
        An independent (copied) frame containing the filtered rows, the new
        ``duration`` column, and string-typed location IDs.
    """
    df = pd.read_parquet(filename)

    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows: assigning columns into the
    # result of a boolean-mask selection otherwise triggers pandas'
    # SettingWithCopyWarning (the write targets a possibly-shared slice).
    in_range = (df['duration'] >= 1) & (df['duration'] <= 60)
    df = df[in_range].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [58]:
# January 2022 trips are used for training, February 2022 for validation.
train_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-01.parquet'
val_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-02.parquet'

df_train, df_val = (read_dataframe(url) for url in (train_url, val_url))

In [59]:
# Row counts of the prepared training and validation frames.
tuple(len(frame) for frame in (df_train, df_val))

(59603, 66097)

In [60]:
from sklearn.pipeline import make_pipeline

In [61]:


# Train a DictVectorizer + LinearRegression pipeline on df_train, evaluate it
# on df_val, and record parameters, the RMSE metric, and the fitted pipeline
# in a single MLflow run.
with mlflow.start_run():
    categorical = ['PULocationID', 'DOLocationID']
    numerical = ['trip_distance']
    target = 'duration'

    # Record which feature columns were used so the run is self-describing.
    mlflow.log_params({
        'categorical': categorical,
        'numerical': numerical,
    })

    # Hyper-parameters are kept in a dict so the exact values passed to the
    # estimator are the same ones logged to MLflow.
    model_params = dict(
        fit_intercept=False
    )
    mlflow.log_params(model_params)

    pipeline = make_pipeline(
        DictVectorizer(),
        LinearRegression(**model_params),
    )

    y_train = df_train[target].values
    y_val = df_val[target].values

    # DictVectorizer consumes one dict per row.
    train_dicts = df_train[categorical + numerical].to_dict(orient='records')
    pipeline.fit(train_dicts, y_train)

    val_dicts = df_val[categorical + numerical].to_dict(orient='records')
    y_pred = pipeline.predict(val_dicts)

    # The `squared=False` keyword of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6. Taking the square root of the MSE
    # explicitly gives the same RMSE on every scikit-learn version.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    print(rmse)
    mlflow.log_metric("rmse", rmse)

    # Persist the whole pipeline (vectorizer + model) as one artifact so it
    # can later be applied directly to raw feature dicts.
    mlflow.sklearn.log_model(pipeline, 'model')

8.290458507986793




In [70]:
# Resolve the model from the most recent MLflow run of this session instead of
# a hard-coded absolute file path, which only exists on one machine and points
# at a run ID that changes on every re-execution.
last_run = mlflow.last_active_run()
if last_run is None:
    raise RuntimeError(
        "No MLflow run found in this session; execute the training cell first."
    )
logged_model = f"runs:/{last_run.info.run_id}/model"

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

In [71]:
# Show the first validation record, i.e. one feature dict in the same
# format the pipeline was fitted on.
val_dicts[0]

{'PULocationID': '43', 'DOLocationID': '238', 'trip_distance': 1.16}

In [72]:
# Score a single hand-written trip with the reloaded model.
sample_trip = {'PULocationID': '43', 'DOLocationID': '238', 'trip_distance': 1.16}
loaded_model.predict(sample_trip)

array([8.03385226])