In [1]:
import numpy as np
import pandas as pd

In [2]:
import pickle

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [46]:
# January 2021 FHV trips are used for training, February 2021 for validation.
train_path = '../data/fhv_tripdata_2021-01.parquet'
val_path = '../data/fhv_tripdata_2021-02.parquet'

train_df = pd.read_parquet(train_path)
val_df = pd.read_parquet(val_path)


### Q1. Downloading the data
We'll use the same NYC taxi dataset, but instead of "Green Taxi Trip Records", we'll use "For-Hire Vehicle Trip Records"

Download the data for January and February 2021

Note that you need "For-Hire Vehicle Trip Records", not "High Volume For-Hire Vehicle Trip Records".

Read the data for January. How many records are there?

In [47]:
# Q1: the RangeIndex line of this summary reports how many January records were read;
# the per-column non-null counts also show how sparse the location IDs are.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1154112 entries, 0 to 1154111
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   dispatching_base_num    1154112 non-null  object        
 1   pickup_datetime         1154112 non-null  datetime64[ns]
 2   dropOff_datetime        1154112 non-null  datetime64[ns]
 3   PUlocationID            195845 non-null   float64       
 4   DOlocationID            991892 non-null   float64       
 5   SR_Flag                 0 non-null        object        
 6   Affiliated_base_number  1153227 non-null  object        
dtypes: datetime64[ns](2), float64(2), object(3)
memory usage: 61.6+ MB


In [48]:
# (rows, columns) of the raw January frame.
train_df.shape

(1154112, 7)

### Q2. Computing duration
Now let's compute the duration variable. It should contain the duration of a ride in minutes.

What's the average trip duration in January?

In [49]:
# Trip length as a timedelta (drop-off minus pickup), then the same value in minutes.
trip_length = train_df['dropOff_datetime'] - train_df['pickup_datetime']
train_df['duration'] = trip_length
train_df['duration_min'] = trip_length.dt.total_seconds() / 60

In [50]:
# Q2: average January trip duration in minutes (outliers still included).
train_df['duration_min'].mean()

19.167224093791006

### Data preparation
Check the distribution of the duration variable. There are some outliers.

Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

How many records did you drop?



In [51]:
# Keep only trips lasting between 1 and 60 minutes, both bounds included.
is_plausible_duration = train_df['duration_min'].between(1., 60.)
train_df_filtered = train_df[is_plausible_duration]

### Q3. Missing values
The features we'll use for our model are the pickup and dropoff location IDs.

But they have a lot of missing values there. Let's replace them with "-1"

What's the fraction of missing values for the pickup location ID? (Or the fraction of "-1"s after you filled the NAs)

In [52]:
# Share of duration-filtered trips with no pickup location ID
# (mean of the boolean NA mask == NA count / row count).
train_df_filtered['PUlocationID'].isna().mean()

0.8352732770722617

In [53]:
%%time
# Same missing-pickup share, this time over every January trip (no duration filter).
train_df['PUlocationID'].isna().mean()

CPU times: user 7.49 ms, sys: 2.15 ms, total: 9.64 ms
Wall time: 7.01 ms


0.8303067639882438

### Q4. One-hot encoding
Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

- Turn the dataframe into a list of dictionaries
- Fit a dictionary vectorizer
- Get a feature matrix from it

In [54]:
# Columns that get one-hot encoded through DictVectorizer in the cells below.
categorical = ['PUlocationID', 'DOlocationID']
# NOTE(review): 'trip_distance' is not among the columns listed by train_df.info(),
# and `numerical` is not referenced in the visible cells — confirm whether this is
# a leftover before using it to select columns.
numerical = ['trip_distance']

In [55]:
# Replace missing location IDs with the -1 sentinel, then store the IDs as strings
# so DictVectorizer treats them as categories rather than numbers.
location_ids = train_df[categorical].fillna(-1)
train_df[categorical] = location_ids.astype('str')

In [56]:
# Re-derive the 1-60 minute subset: the earlier train_df_filtered was created before
# the location IDs were filled and cast to strings, so it must be rebuilt from train_df.
train_df_filtered = train_df.query('duration_min >= 1. and duration_min <= 60.')

In [57]:
%%time
# Exploratory fit on the *unfiltered* January frame (train_df, not train_df_filtered).
# Both `dv` and `X_train` are re-created from the filtered frame in a later cell,
# so this matrix is not the one the model is trained on.
dv = DictVectorizer()

X_train = dv.fit_transform(train_df[categorical].to_dict(orient='records'))
X_train.shape

CPU times: user 2.2 s, sys: 80.4 ms, total: 2.28 s
Wall time: 2.29 s


(1154112, 525)

In [58]:
# train_df_filtered['PU_DO'] = train_df_filtered['PUlocationID'] + '_' + train_df_filtered['DOlocationID']

In [59]:
# categorical = ['PU_DO']

In [60]:
# Fit the vectorizer on the duration-filtered trips only; this is the training matrix.
dv = DictVectorizer()
X_train = dv.fit_transform(
    train_df_filtered[categorical].to_dict(orient='records')
)

# Regression target: trip duration in minutes, as a plain NumPy array.
y_train = train_df_filtered.duration_min.to_numpy()

X_train.shape

(1109826, 525)

### Q5. Training a model
Now let's use the feature matrix from the previous step to train a model.

- Train a plain linear regression model with default parameters
- Calculate the RMSE of the model on the training data
What's the RMSE on train?

In [61]:
%%time
# Fit a plain linear regression on the one-hot encoded pickup/drop-off IDs.
model = LinearRegression()

model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)

# RMSE = sqrt(MSE). Computed explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6, where
# passing it raises a TypeError.
train_loss = np.sqrt(mean_squared_error(y_train, y_pred_train))

print(f'RMSE: {train_loss}')

RMSE: 10.528519433270043
CPU times: user 2.02 s, sys: 364 ms, total: 2.39 s
Wall time: 2.39 s


### Q6. Evaluating the model
Now let's apply this model to the validation dataset.

What's the RMSE on validation?

In [69]:
def transform_data(df, categorical_cols=None):
    """Derive trip duration, encode location IDs, and drop duration outliers.

    The input frame is left untouched: all work happens on a copy, so the
    function can be re-run on the same frame without side effects.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip records with 'pickup_datetime' and 'dropOff_datetime' columns plus
        every column named in ``categorical_cols``.
    categorical_cols : list of str, optional
        Columns whose missing values are replaced by -1 and which are then cast
        to strings. When omitted, the notebook-level ``categorical`` list is used.

    Returns
    -------
    pandas.DataFrame
        A new frame with added 'duration' (timedelta) and 'duration_min'
        (minutes as float) columns, string-typed categorical columns, and only
        the rows where 1 <= duration_min <= 60.
    """
    # The global is only looked up when no explicit column list is supplied.
    cols = list(categorical if categorical_cols is None else categorical_cols)

    # Work on a copy so the caller's frame is not mutated, and so assigning
    # columns to an already-filtered frame does not raise SettingWithCopyWarning.
    df = df.copy()

    df['duration'] = df['dropOff_datetime'] - df['pickup_datetime']
    df['duration_min'] = df['duration'].dt.total_seconds() / 60

    # Missing IDs become the sentinel -1 before the string cast
    # (float columns therefore end up as e.g. '-1.0', '173.0').
    df[cols] = df[cols].fillna(-1).astype('str')

    return df.query('duration_min >= 1. and duration_min <= 60.')

In [70]:
# Rebinds val_df to the transformed, duration-filtered February frame; the raw
# frame is no longer reachable under this name after this cell runs.
val_df = transform_data(val_df)

In [71]:
# Sanity check: first rows should show the derived duration columns and the
# filled location IDs.
val_df.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration,duration_min
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173.0,82.0,,B00021,0 days 00:10:40,10.666667
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173.0,56.0,,B00021,0 days 00:14:34,14.566667
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82.0,129.0,,B00021,0 days 00:07:57,7.95
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,-1.0,225.0,,B00037,0 days 00:13:48,13.8
5,B00037,2021-02-01 00:00:37,2021-02-01 00:09:35,-1.0,61.0,,B00037,0 days 00:08:58,8.966667


In [72]:
%%time

# Encode the validation trips with the vectorizer fitted on the training data
# (transform only, no refit) so the feature columns match what the model saw.
X_val = dv.transform(
    val_df[categorical].to_dict(orient='records')
)

# Validation target: trip duration in minutes.
y_val = val_df.duration_min.to_numpy()

X_val.shape

CPU times: user 1.86 s, sys: 70.1 ms, total: 1.93 s
Wall time: 1.93 s


(990113, 525)

In [73]:
%%time

# Score the January-trained model on the February trips.
y_pred = model.predict(X_val)

# RMSE = sqrt(MSE). Computed explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6, where
# passing it raises a TypeError.
val_loss = np.sqrt(mean_squared_error(y_val, y_pred))

print(f'RMSE: {val_loss}')

RMSE: 11.01428568518046
CPU times: user 9.57 ms, sys: 5.34 ms, total: 14.9 ms
Wall time: 24.7 ms
