In [1]:
# Record the Python version for reproducibility. Note: `!python` runs whichever
# `python` the shell finds on PATH, which may not be this kernel's interpreter.
!python -V

Python 3.12.7


In [1]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error

In [2]:
# Wrap the data loading and preprocessing steps in a reusable function

def read_dataframe(filename):
    """Load a trip-record parquet file and prepare it for duration modelling.

    Adds a ``diff`` column (``lpep_dropoff_datetime`` minus
    ``lpep_pickup_datetime``) and a ``duration`` column (that difference
    expressed in minutes), keeps only trips lasting between 1 and 60 minutes
    inclusive, and casts ``PULocationID`` / ``DOLocationID`` to strings so they
    can be used as categorical features.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered rows, including the added ``diff``
        and ``duration`` columns.
    """
    df = pd.read_parquet(filename)
    df['diff'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = df['diff'].dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the unfiltered frame. Without it
    # the column assignment below writes into a slice and pandas emits
    # SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    # Location IDs are labels, not quantities: store them as strings so they
    # are treated as categories rather than numbers.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype('str')
    return df

In [3]:
# The 2021-01 file is the training set; the 2021-02 file is held out for validation.
df_train = read_dataframe(filename="data/green_tripdata_2021-01.parquet")
df_val = read_dataframe(filename="data/green_tripdata_2021-02.parquet")

In [4]:
# Row counts of the training and validation frames after filtering.
df_train.shape[0], df_val.shape[0]

(73908, 61921)

In [10]:
# Feature engineering: combine pickup and dropoff location IDs into a single
# "<pickup>_<dropoff>" key so the model can learn one effect per route.
df_train['PU_DO'] = df_train['PULocationID'].str.cat(df_train['DOLocationID'], sep='_')
df_val['PU_DO'] = df_val['PULocationID'].str.cat(df_val['DOLocationID'], sep='_')

In [11]:
# Feature selection: the combined pickup/dropoff pair is used as the categorical
# input in place of the two separate IDs (['PULocationID', 'DOLocationID']).
categorical = ['PU_DO']
numerical = ['trip_distance']

dv = DictVectorizer()

# Turn each row into a {feature: value} dict, the input format DictVectorizer expects.
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

# Fit on the training records only; validation reuses the fitted vectorizer
# (transform, not fit_transform).
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [6]:
# Regression target: the `duration` column, extracted as arrays for both splits.
target = 'duration'
y_train, y_val = df_train[target].values, df_val[target].values

In [12]:
# Baseline: ordinary linear regression fitted on the training matrix.
lr = LinearRegression().fit(X_train, y_train)

# Score the trained model on the held-out validation set.
y_val_pred = lr.predict(X_val)

# Validation RMSE, in the same units as the target.
root_mean_squared_error(y_val, y_val_pred)

7.758715209663881

In [13]:
# Second candidate after the linear-regression baseline: Lasso.
# The default regularisation strength is used here; pass e.g. alpha=0.01 to
# Lasso(...) to see how the fit changes with a different alpha.
ls = Lasso().fit(X_train, y_train)

# Predict on both splits so training and validation error can be compared.
y_pred = ls.predict(X_train)
y_val_pred = ls.predict(X_val)

print('Training RMSE is', root_mean_squared_error(y_train, y_pred))
print('Validation RMSE is', root_mean_squared_error(y_val, y_val_pred))

Training RMSE is 11.562050466293025
Validation RMSE is 12.212583224318818


In [14]:
# Third candidate: Ridge regression with alpha=10.
rd = Ridge(alpha=10).fit(X_train, y_train)

# Predict on both splits so training and validation error can be compared.
y_pred = rd.predict(X_train)
y_val_pred = rd.predict(X_val)

print('Training RMSE is', root_mean_squared_error(y_train, y_pred))
print('Validation RMSE is', root_mean_squared_error(y_val, y_val_pred))

Training RMSE is 7.6660279773989375
Validation RMSE is 8.846837413677452


#### Suppose we want to keep the linear regression (baseline) model: save it together with its fitted DictVectorizer

In [16]:
# Persist the fitted DictVectorizer together with the linear model in one
# pickle, as a (dv, lr) tuple, so both can be loaded back from a single file.
model_path = Path('models/lin_reg.bin')

# open() raises FileNotFoundError when the target directory is missing (e.g. on
# a fresh checkout), so make sure `models/` exists before writing.
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open('wb') as f_out:  # 'wb' = write binary
    pickle.dump((dv, lr), f_out)