In [1]:
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [2]:
# Load the January 2022 yellow-taxi trip records; this frame is the training set.
df = pd.read_parquet("yellow_tripdata_2022-01.parquet", engine="pyarrow")

# Number of raw trip records.
df.shape[0]

2463931

In [3]:
# Remember the raw row count so the share of rows kept after filtering
# can be computed later.
old_len = df.shape[0]
old_len

2463931

In [4]:
# Show the column labels of the loaded frame.
df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

In [5]:
# Number of columns in the Jan 2022 Yellow Taxi Trip data

df.shape[1]

19

In [6]:
# Trip duration in minutes: drop-off minus pick-up, converted from seconds.
df['duration'] = (
    df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
).dt.total_seconds() / 60

In [7]:
# Standard deviation of the trip duration (minutes), before any filtering

df['duration'].std()

46.44530513776499

In [8]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
df = df[df['duration'].between(1, 60)].copy()

In [9]:
# Row count of the current frame.
df.shape[0]

2421440

In [10]:
# Share of the original records that remain after dropping the outliers

df.shape[0] / old_len

0.9827547930522406

In [11]:
# Columns treated as categorical features (pick-up and drop-off location IDs).
categorical = ['PULocationID', 'DOLocationID']

# Fill missing IDs with the sentinel -1, then cast the columns to integers.
df[categorical] = df[categorical].fillna(-1).astype('int')

In [12]:
# Cast the location IDs to strings so they are handled as categories
# rather than as numeric values.
df[categorical] = df[categorical].astype(str)

In [13]:
# One {column: value} dict per trip, restricted to the categorical columns.
train_dicts = df.loc[:, categorical].to_dict('records')

In [14]:
# Fit the vectorizer on the training dicts and encode them in the same call.
# `dv` is kept so the identical mapping can be applied to other data later.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [15]:
# Dimensionality after one-hot encoding (OHE): (rows, encoded feature columns)

X_train.shape

(2421440, 515)

In [16]:
# Regression target: trip duration in minutes, as a NumPy array.
y_train = df['duration'].to_numpy()

In [17]:
# Number of feature names learned by the fitted vectorizer.
len(dv.feature_names_)

515

In [18]:
# Fit a linear regression of trip duration on the encoded location features.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [19]:
# Predictions on the same data the model was fitted on.
y_pred = lr.predict(X_train)

# RMSE on train.
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root of the MSE
# explicitly; this form works on every scikit-learn version.
mean_squared_error(y_train, y_pred) ** 0.5

6.986190135965746

In [20]:
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename, min_duration=1, max_duration=60, columns=None):
    """Load a yellow-taxi trip parquet file and prepare it for modelling.

    The steps mirror the preprocessing applied to the training frame:
    compute the trip duration in minutes, drop trips outside the allowed
    duration range, and convert the categorical ID columns to strings.

    Parameters
    ----------
    filename : str or path-like
        Parquet file to read (passed to ``pd.read_parquet``).
    min_duration, max_duration : float, default 1 and 60
        Inclusive bounds, in minutes, on the trip duration to keep.
    columns : list of str, optional
        Columns to fill (missing -> -1) and cast to ``str``. Defaults to the
        module-level ``categorical`` list.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of the data with an added ``duration`` column.
    """
    # Resolve the default at call time so a later change to `categorical`
    # in the notebook is picked up.
    columns = list(categorical if columns is None else columns)

    df = pd.read_parquet(filename, engine='pyarrow')

    # Duration in minutes between pick-up and drop-off.
    trip_time = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # `between` is inclusive on both ends, matching `>= min` and `<= max`.
    df = df[df.duration.between(min_duration, max_duration)].copy()

    if columns:
        # Missing IDs become the sentinel -1; going through int first drops
        # the trailing ".0" that a float column would otherwise carry into
        # the string form.
        df[columns] = df[columns].fillna(-1).astype('int').astype('str')

    return df

In [21]:
# Load the February 2022 file through read_data; it is used as the validation set.
df_val = read_data('yellow_tripdata_2022-02.parquet')

In [22]:
# One {column: value} dict per validation trip, categorical columns only.
val_dicts = df_val.loc[:, categorical].to_dict('records')

In [23]:
# Encode the validation dicts with the already-fitted vectorizer (transform
# only, no refit). Per the scikit-learn docs, feature names not seen during
# fitting are silently ignored by DictVectorizer.transform.
X_val = dv.transform(val_dicts)

In [24]:
# Predict durations for the validation set.
# NOTE: this rebinds `y_pred`, replacing the training-set predictions.
y_pred = lr.predict(X_val)

In [25]:
# Validation target: trip duration in minutes, as a NumPy array.
y_val = df_val['duration'].to_numpy()

In [26]:
# RMSE on validation.
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root of the MSE
# explicitly; this form works on every scikit-learn version.

mean_squared_error(y_val, y_pred) ** 0.5

7.786389487779153