In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
from sklearn.feature_extraction import DictVectorizer

In [8]:
from sklearn.linear_model import LinearRegression

In [9]:
from sklearn.linear_model import Lasso

In [10]:
from sklearn.linear_model import Ridge

In [11]:
from sklearn.metrics import mean_squared_error

In [28]:
#### Modularizing the code 

def read_dataframe(filename):

    df = pd.read_parquet(filename)
    df['duration'] = df.lpep_dropoff_datetime  - df.lpep_pickup_datetime   
    df['duration'] = df['duration'].apply(lambda td: td.total_seconds()/60)

    df = df[((df.duration>=1) & (df.duration<=60))]

    categorical =['PULocationID','DOLocationID' ]
    #numerical =['trip_distance']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [29]:
df_train = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')
df_val = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')

In [30]:
len(df_train), len(df_val)

(73908, 61921)

In [31]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [32]:
categorical =  ['PU_DO'] #['PULocationID','DOLocationID' ]
numerical =['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical ].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)


In [33]:
target ='duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [38]:

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)

7.480879703714338

In [35]:
lr = Lasso(0.001)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)

9.23346989760693

In [36]:
lr = Ridge(0.001)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)

7.479495067043056

In [37]:
import pickle

In [39]:
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv,lr), f_out)