In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pickle
import os
import mlflow
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
import uuid

In [2]:
# Point MLflow at the SQLite tracking store, then select the experiment.
# set_experiment stays last so its return value is displayed as the cell output.
mlflow.set_tracking_uri(uri="sqlite:////home/kaustubh/mlops_zoomcamp/Mlops-ZoomCamp/04_deployment/web-service-mlflow/mlflow.db")
mlflow.set_experiment(experiment_name="deployment_experiment")

<Experiment: artifact_location='/home/kaustubh/mlops_zoomcamp/Mlops-ZoomCamp/04_deployment/web-service-mlflow/mlruns/1', creation_time=1718293289067, experiment_id='1', last_update_time=1718293289067, lifecycle_stage='active', name='deployment_experiment', tags={}>

In [3]:
# Identifier of the MLflow run whose logged model is loaded below.
run_id="af618559e8f9430482c5aa9b543b8294"

# The tracking store must be configured before the run URI can be resolved.
tracking_uri="sqlite:////home/kaustubh/mlops_zoomcamp/Mlops-ZoomCamp/04_deployment/web-service-mlflow/mlflow.db"
mlflow.set_tracking_uri(uri=tracking_uri)

# Load the run's "model" artifact as a generic PyFuncModel.
logged_model = f'runs:/{run_id}/model'
model = mlflow.pyfunc.load_model(model_uri=logged_model)

In [4]:
def read_dataframe(filename):
    """Load a trip parquet file and keep trips lasting 1 to 60 minutes.

    A ``duration`` column (float, minutes) is added, computed as
    ``lpep_dropoff_datetime - lpep_pickup_datetime``.

    Args:
        filename: Location handed unchanged to ``pd.read_parquet``.

    Returns:
        DataFrame with every original column plus ``duration``, restricted to
        rows where ``1 <= duration <= 60``. Original index labels are kept.
        An input with no rows yields an empty frame instead of raising.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes. Unlike a per-row ``apply``, this always
    # produces a float column, so the numeric range filter below also works
    # when the frame has no rows.
    elapsed = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = elapsed.dt.total_seconds() / 60

    # ``between`` is inclusive on both ends by default.
    return df[df.duration.between(1, 60)]

def prepare_dictonaries(df:pd.DataFrame):
    """Turn a trips frame into a list of per-row feature dicts.

    Note that ``df`` is modified in place: ``PULocationID`` and
    ``DOLocationID`` are cast to strings and a combined ``PU_DO`` column
    (``"<pickup>_<dropoff>"``) is added.

    Args:
        df: Frame with ``PULocationID``, ``DOLocationID`` and
            ``trip_distance`` columns.

    Returns:
        One dict per row with the keys ``PU_DO`` and ``trip_distance``.
    """
    location_cols = ['PULocationID', 'DOLocationID']
    df[location_cols] = df[location_cols].astype('str')

    # String concatenation only works because both ID columns are str now.
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    feature_cols = ['PU_DO', 'trip_distance']
    return df[feature_cols].to_dict(orient='records')


In [22]:
# Run parameters: which taxi dataset and which month to score.
taxi_type = 'green'
year, month = 2023, 1

# Remote parquet to read, and the local parquet the results are written to.
input_file = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{taxi_type}_tripdata_{year:04d}-{month:02d}.parquet"
output_file = f"output/{taxi_type}/{year:04d}-{month:02d}.parquet"

In [23]:
# Create the directory that will hold output_file, including missing parents.
# exist_ok=True makes the cell safe to re-run; the shell `mkdir` it replaces
# errored when the directory already existed and when `output/` was missing.
os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)


In [7]:
# Read and filter the trips from input_file (the remote parquet configured above).
# The unused local `filename` path was dropped: it was never read and suggested
# that a local copy was being loaded.
df = read_dataframe(input_file)

In [8]:
# Build the model's feature dicts; this call also adds PU_DO to df in place.
dicts = prepare_dictonaries(df=df)

In [9]:
# Run the loaded model over every feature dict.
y_pred = model.predict(data=dicts)

In [10]:
# Start from an empty frame; its columns are filled in by a later cell.
df_result = pd.DataFrame(data=None)

In [11]:
# Show what one random UUID4 looks like in string form.
f"{uuid.uuid4()}"

'a9cbaf93-d06a-462f-8636-2c1bb8a02824'

In [12]:
# Number of trips left after filtering.
df.shape[0]

65946

In [13]:
# One freshly generated UUID4 string per row of df.
n = len(df)
ride_ids = [str(uuid.uuid4()) for _ in range(n)]

In [14]:
# Peek at the first five generated ids.
ride_ids[0:5]

['2c8bdff1-0a1e-471e-a269-65f3af2457f8',
 '6226754b-4d57-432d-bd5b-4bafff4801d1',
 'dfa8d629-93ca-4cd8-ae1a-be6da8c3ac9a',
 '0bc0cb0b-e615-4250-a265-7110f4115af5',
 '6e54c013-6734-4894-9562-a8f6942139e0']

In [15]:
# Attach the generated ids to the trips frame, one per row, in row order.
df.loc[:, 'ride_id'] = ride_ids

In [16]:
# Inspect the trips frame, which now carries the duration, PU_DO and ride_id columns.
df

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration,PU_DO,ride_id
0,2,2023-01-01 00:26:10,2023-01-01 00:37:11,N,1.0,166,143,1.0,2.58,14.90,...,0.0,,1.0,24.18,1.0,1.0,2.75,11.016667,166_143,2c8bdff1-0a1e-471e-a269-65f3af2457f8
1,2,2023-01-01 00:51:03,2023-01-01 00:57:49,N,1.0,24,43,1.0,1.81,10.70,...,0.0,,1.0,15.84,1.0,1.0,0.00,6.766667,24_43,6226754b-4d57-432d-bd5b-4bafff4801d1
2,2,2023-01-01 00:35:12,2023-01-01 00:41:32,N,1.0,223,179,1.0,0.00,7.20,...,0.0,,1.0,11.64,1.0,1.0,0.00,6.333333,223_179,dfa8d629-93ca-4cd8-ae1a-be6da8c3ac9a
3,1,2023-01-01 00:13:14,2023-01-01 00:19:03,N,1.0,41,238,1.0,1.30,6.50,...,0.0,,1.0,10.20,1.0,1.0,0.00,5.816667,41_238,0bc0cb0b-e615-4250-a265-7110f4115af5
4,1,2023-01-01 00:33:04,2023-01-01 00:39:02,N,1.0,41,74,1.0,1.10,6.00,...,0.0,,1.0,8.00,1.0,1.0,0.00,5.966667,41_74,6e54c013-6734-4894-9562-a8f6942139e0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68206,2,2023-01-31 22:29:00,2023-01-31 22:42:00,,,49,62,,4070.82,15.70,...,0.0,,1.0,16.70,,,,13.000000,49_62,833adaaa-0f49-4b20-ac07-c8e0b7e115c7
68207,2,2023-01-31 22:40:00,2023-01-31 22:48:00,,,10,205,,2.14,4.41,...,0.0,,1.0,5.41,,,,8.000000,10_205,3e8a535c-bd1b-41ea-a99b-c3a8bec54977
68208,2,2023-01-31 23:46:00,2023-02-01 00:02:00,,,66,37,,3.44,16.53,...,0.0,,1.0,21.04,,,,16.000000,66_37,fa4e1a2f-942f-4446-a5fe-a3f10398a5cd
68209,2,2023-01-31 23:01:00,2023-01-31 23:19:00,,,225,189,,3.03,14.98,...,0.0,,1.0,19.18,,,,18.000000,225_189,ba8082dc-e3f5-404e-99f9-240f32e99979


In [17]:
# Assemble the per-ride result table in one step, so this cell does not depend
# on an empty `df_result` having been created by another cell beforehand.
# The row index is inherited from df's columns.
df_result = pd.DataFrame({
    'ride_id': df['ride_id'],
    'lpep_pickup_datetime': df['lpep_pickup_datetime'],
    'PULocationID': df['PULocationID'],
    'DOLocationID': df['DOLocationID'],
    'actual_duration': df['duration'],
})

# y_pred is assigned as its own statement, exactly as before, so it is matched
# to rows the same way as in the original cell.
df_result['predicted_duration'] = y_pred

# Signed error: positive when the real trip took longer than predicted.
df_result['diff'] = df_result['actual_duration'] - df_result['predicted_duration']

# Record which MLflow run produced these predictions.
df_result['model_version'] = run_id


In [19]:
# Ensure the directory for output_file exists before writing, including parents.
# exist_ok=True keeps the cell re-runnable; the shell `mkdir output` it replaces
# errored on the second run and never created the nested directory.
os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)

In [24]:
# Persist the results; the row index is not written to the file.
df_result.to_parquet(path=output_file, index=False)