In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from pathlib import Path

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

import warnings; warnings.filterwarnings('ignore')

In [2]:
# Taxi parquet files are expected under data/taxi, three directory levels
# above the current working directory.
DATA = Path.cwd().parents[2] / 'data' / 'taxi'

# Load the January and February 2022 yellow-taxi trip records.
jan, feb = (
    pd.read_parquet(DATA / parquet_name)
    for parquet_name in (
        'yellow_tripdata_2022-01.parquet',
        'yellow_tripdata_2022-02.parquet',
    )
)

In [6]:
# Number of columns in the January frame.
len(jan.columns)

19

In [7]:
# Peek at the first three January trips.
jan.iloc[:3]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0


In [31]:
# Trip duration in minutes: drop-off minus pick-up, converted from seconds.
# The vectorised `.dt` accessor is used instead of a per-row `apply`.
jan['duration'] = (
    (jan['tpep_dropoff_datetime'] - jan['tpep_pickup_datetime'])
    .dt.total_seconds()
    / 60
)

In [32]:
# Sample standard deviation of trip duration in minutes
# (ddof=1 is the pandas default, spelled out here for clarity).
jan['duration'].std(ddof=1)

46.44530513776802

In [33]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
no_outliers = jan.loc[
    (jan['duration'] <= 60) & (jan['duration'] >= 1)
]

# Percentage of January trips that survive the filter.
(len(no_outliers) / len(jan)) * 100

98.27547930522405

In [38]:
# Categorical features (pickup / drop-off zone IDs) and the regression target.
cats = ['PULocationID', 'DOLocationID']
target = ['duration']

# Cast the zone IDs to strings so that DictVectorizer treats them as
# categories rather than numbers. `no_outliers` was produced by boolean
# filtering of `jan`, so assigning columns into it is chained assignment
# (SettingWithCopyWarning); rebind the name to a new frame instead.
no_outliers = no_outliers.astype({col: str for col in cats})

# One {column: value} dict per trip, the input format DictVectorizer expects.
id_dict = no_outliers[cats].to_dict('records')

In [39]:
# Fit the vectorizer on the training records and encode them in one pass.
# String-valued entries become one indicator column per (feature, value) pair.
dv = DictVectorizer()
X_train = dv.fit_transform(id_dict)

# (number of trips, number of encoded feature columns)
X_train.shape

(2421440, 515)

In [41]:
# Regression target: trip duration in minutes for the filtered January trips.
y_train = no_outliers[target].values

# Baseline: ordinary least squares on the encoded zone IDs.
lr = LinearRegression()
lr.fit(X_train, y_train)

# RMSE on the training data. The `squared=False` keyword of
# `mean_squared_error` was deprecated in scikit-learn 1.4 and removed in 1.6,
# so take the square root explicitly; this works on every version.
y_preds = lr.predict(X_train)
np.sqrt(mean_squared_error(y_train, y_preds))

6.986191065500608

In [3]:
def process_dataset(df):
    """Return the trips of ``df`` that lasted between 1 and 60 minutes.

    A ``duration`` column (drop-off minus pick-up, in minutes) is added to
    the returned frame. The input frame is left untouched, and the result is
    an independent DataFrame, so callers may assign columns into it without
    triggering chained-assignment warnings or altering ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain datetime columns ``tpep_pickup_datetime`` and
        ``tpep_dropoff_datetime``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` with ``1 <= duration <= 60`` (both bounds inclusive),
        keeping their original index, plus the ``duration`` column.
    """
    trip_minutes = (
        df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    ).dt.total_seconds() / 60
    keep = (trip_minutes >= 1) & (trip_minutes <= 60)
    # `.assign` builds a new frame, so neither `df` nor a view of it is mutated.
    return df.loc[keep].assign(duration=trip_minutes[keep])

def fit_dv_train(train):
    """Fit a DictVectorizer on the pickup / drop-off zone IDs of ``train``.

    The ID columns are cast to strings on a temporary selection only, so the
    caller's frame is not modified.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``PULocationID`` and ``DOLocationID`` columns.
        (Inside this function the parameter name hides the module-level
        ``train`` function.)

    Returns
    -------
    tuple
        ``(X_train, dv)``: the encoded feature matrix returned by
        ``fit_transform`` and the fitted ``DictVectorizer``.
    """
    cats = ['PULocationID', 'DOLocationID']
    # String values make the vectorizer treat the IDs as categories.
    records = train[cats].astype(str).to_dict('records')
    dv = DictVectorizer()
    X_train = dv.fit_transform(records)
    return X_train, dv

def transform_dv_eval(eval, dv):
    """Encode the zone IDs of an evaluation frame with an already-fitted vectorizer.

    The ID columns are cast to strings on a temporary selection only, so the
    caller's frame is not modified.

    Parameters
    ----------
    eval : pandas.DataFrame
        Must contain ``PULocationID`` and ``DOLocationID`` columns.
        (The parameter name hides both the built-in ``eval`` and the
        module-level ``eval`` function inside this body; it is kept for
        backward compatibility.)
    dv : object
        Fitted vectorizer exposing ``transform(list_of_dicts)``.

    Returns
    -------
    Whatever ``dv.transform`` returns for the per-row ``{column: str(value)}``
    records.
    """
    cats = ['PULocationID', 'DOLocationID']
    records = eval[cats].astype(str).to_dict('records')
    return dv.transform(records)

def train(df):
    """Fit a linear-regression duration model on raw trip data.

    The frame is passed through ``process_dataset``, its zone IDs are encoded
    by ``fit_dv_train``, and a ``LinearRegression`` is fitted against the
    ``duration`` column.

    Returns
    -------
    tuple
        ``(model, dv)``: the fitted regressor and the fitted vectorizer.
    """
    trips = process_dataset(df)
    features, dv = fit_dv_train(trips)
    durations = trips['duration'].values
    # `fit` returns the estimator itself, so construction and fitting chain.
    regressor = LinearRegression().fit(features, durations)
    return regressor, dv

def eval(df, dv, model):
    """Return the RMSE (in minutes) of ``model`` on raw trip data ``df``.

    The frame is passed through ``process_dataset`` and encoded with the
    already-fitted vectorizer ``dv`` via ``transform_dv_eval``; predictions
    are then compared with the ``duration`` column.

    Note: this function's name shadows the built-in ``eval`` for the rest of
    the notebook; it is kept as-is so existing calls continue to work.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw trips with pickup / drop-off timestamps and zone IDs.
    dv : fitted vectorizer
        Passed through to ``transform_dv_eval``.
    model : fitted estimator
        Must expose ``predict(X)``.

    Returns
    -------
    float
        Root mean squared error between actual and predicted durations.
    """
    processed_df = process_dataset(df)
    X = transform_dv_eval(processed_df, dv)
    Y = processed_df['duration'].values
    y_preds = model.predict(X)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root explicitly works on every version.
    return np.sqrt(mean_squared_error(Y, y_preds))

In [7]:
# Fit on January trips, then report the RMSE on February trips.
model, dv = train(jan)
eval(df=feb, dv=dv, model=model)

7.786409085078911

In [6]:
# NOTE(review): the saved output of this cell shows a
# (LinearRegression, DictVectorizer) tuple and carries a lower execution
# count than the cell that unpacks `train(jan)` into `model, dv`, so the
# output looks stale. Re-run after the unpacking cell to display the fitted
# regressor alone.
model

(LinearRegression(), DictVectorizer())