In [58]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [59]:
# Load the 2021-01 FHV trip file; this frame is later used to fit the model.
# Forward slashes are accepted by Windows and POSIX alike, whereas a backslash
# is only treated as a path separator on Windows.
rides = pd.read_parquet('data/fhv_tripdata_2021-01.parquet')
rides.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [60]:
# Total number of rows loaded. len() counts every row, whereas
# DataFrame.count() counts non-null cells per column and would under-report
# the total if every column contained at least one missing value.
records = len(rides)
records

1154112

In [61]:
# Trip length as a timedelta: drop-off timestamp minus pickup timestamp.
rides['duration'] = rides['dropOff_datetime'].sub(rides['pickup_datetime'])
rides.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,0 days 00:17:00
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,0 days 00:17:00
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,0 days 01:50:00
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,0 days 00:08:17
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,0 days 00:15:13


In [62]:
# Express the trip duration in minutes.
# The value is derived straight from the two timestamp columns with the
# vectorized .dt accessor: this avoids a Python-level lambda call per row and
# keeps the cell safe to re-run (converting an already-converted float column
# a second time would raise).
rides['duration'] = (
    rides['dropOff_datetime'] - rides['pickup_datetime']
).dt.total_seconds() / 60
rides.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,17.0
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,110.0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,15.216667


In [63]:
# Mean of the duration column over the frame as it stands at this point.
avg_duration = rides.duration.mean()
avg_duration

19.1672240937939

In [64]:
# Replace missing pickup / drop-off location IDs with the sentinel value -1.
rides = rides.fillna({'PUlocationID': -1, 'DOlocationID': -1})
rides.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,-1.0,-1.0,,B00009,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,-1.0,-1.0,,B00009,17.0
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,-1.0,-1.0,,B00013,110.0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,-1.0,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,-1.0,61.0,,B00037,15.216667


In [65]:
# Number of rides whose pickup location is the -1 placeholder written by the
# fillna step, then its share of all loaded rows.
# An explicit comparison is used instead of value_counts()[-1]: indexing a
# Series with -1 can be read as a label or as "last position" depending on the
# index dtype, and it raises KeyError when no value was missing at all.
nulls_pickup = (rides['PUlocationID'] == -1).sum()
nulls_pickup / records

0.8303067639882438

In [66]:
# Keep only rides lasting between 1 and 60 minutes (both bounds inclusive).
# .copy() detaches the result from the unfiltered frame so that the column
# assignments made further down operate on an independent DataFrame instead
# of a slice (which is what triggers pandas' SettingWithCopyWarning).
rides = rides[rides['duration'].between(1, 60)].copy()
rides.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,-1.0,-1.0,,B00009,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,-1.0,-1.0,,B00009,17.0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,-1.0,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,-1.0,61.0,,B00037,15.216667
5,B00037,2021-01-01 00:59:02,2021-01-01 01:08:05,-1.0,71.0,,B00037,9.05


In [67]:
# Columns that are turned into string categories and passed to the vectorizer.
categorical = [
    'DOlocationID',
    'PUlocationID',
]

In [68]:
# Cast the location-ID columns to strings so they are handled as categories
# rather than as numbers.
# astype() with a column -> dtype mapping returns a new frame, which avoids
# assigning columns into what may still be a filtered slice of the raw data.
rides = rides.astype({col: str for col in categorical})

In [69]:
# Fit the vectorizer on one {column: value} dict per ride and keep the
# resulting matrix; the fitted `vc` is reused for the validation data.
vc = DictVectorizer()
feature_dict = rides.loc[:, categorical].to_dict(orient='records')
feature_matrix = vc.fit_transform(feature_dict)
feature_matrix.shape

(1109826, 525)

In [70]:
# Regression target: the duration column as a NumPy array.
target = 'duration'
y_train = rides[target].to_numpy()
y_train

array([17.        , 17.        ,  8.28333333, ..., 16.2       ,
       19.43333333, 36.        ])

In [71]:
# Fit a linear regression on the vectorized training features.
# fit() returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(feature_matrix, y_train)
lr

LinearRegression()

In [72]:
# Root-mean-squared error of the model on the data it was trained on.
# The `squared=False` flag of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE
# explicitly gives the same number on every scikit-learn version.
y_pred = lr.predict(feature_matrix)
RMSE = mean_squared_error(y_train, y_pred) ** 0.5
RMSE

10.52851910721103

In [73]:
# Load the 2021-02 FHV trip file, used below as the validation set.
# Forward slashes are accepted by Windows and POSIX alike, whereas a backslash
# is only treated as a path separator on Windows.
val_rides = pd.read_parquet('data/fhv_tripdata_2021-02.parquet')
val_rides.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00013,2021-02-01 00:01:00,2021-02-01 01:33:00,,,,B00014
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173.0,82.0,,B00021
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173.0,56.0,,B00021
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82.0,129.0,,B00021
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,,225.0,,B00037


In [80]:
# Trip duration in minutes for the validation rides, computed with the
# vectorized .dt accessor instead of a per-row Python lambda.
val_rides['duration'] = (
    val_rides['dropOff_datetime'] - val_rides['pickup_datetime']
).dt.total_seconds() / 60
# Same 1-60 minute window (inclusive) that is applied to the training rides.
# .copy() makes the filtered frame independent, so later column assignments
# do not write into a slice of the unfiltered data.
val_rides = val_rides[val_rides['duration'].between(1, 60)].copy()
val_rides.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173.0,82.0,,B00021,10.666667
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173.0,56.0,,B00021,14.566667
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82.0,129.0,,B00021,7.95
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,-1.0,225.0,,B00037,13.8
5,B00037,2021-02-01 00:00:37,2021-02-01 00:09:35,-1.0,61.0,,B00037,8.966667


In [81]:
# Replace missing pickup / drop-off location IDs with the sentinel value -1,
# matching the treatment of the training rides.
# fillna() with a per-column mapping returns a new frame, so nothing is
# assigned into the filtered slice produced by the previous cell.
val_rides = val_rides.fillna({'PUlocationID': -1, 'DOlocationID': -1})
val_rides.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173.0,82.0,,B00021,10.666667
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173.0,56.0,,B00021,14.566667
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82.0,129.0,,B00021,7.95
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,-1.0,225.0,,B00037,13.8
5,B00037,2021-02-01 00:00:37,2021-02-01 00:09:35,-1.0,61.0,,B00037,8.966667


In [82]:
# Cast the location-ID columns to strings, exactly as done for the training
# rides, so both sets yield the same feature names.
# astype() with a column -> dtype mapping returns a new frame, which avoids
# assigning columns into what may still be a filtered slice.
val_rides = val_rides.astype({col: str for col in categorical})

In [83]:
# Encode the validation rides with the already-fitted vectorizer
# (transform only, no re-fit).
val_feature_dict = val_rides.loc[:, categorical].to_dict(orient='records')
val_feature_matrix = vc.transform(val_feature_dict)
val_feature_matrix.shape

(990113, 525)

In [84]:
# Target values for the validation rides (despite the "_train" in the name,
# these come from val_rides).
val_y_train = val_rides[target].to_numpy()
val_y_train

array([10.66666667, 14.56666667,  7.95      , ..., 25.38333333,
       18.05      , 16.        ])

In [85]:
# Root-mean-squared error of the fitted model on the validation rides.
# The `squared=False` flag of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE
# explicitly gives the same number on every scikit-learn version.
# Note: this rebinds RMSE, replacing the training-set value computed earlier.
val_y_pred = lr.predict(val_feature_matrix)
RMSE = mean_squared_error(val_y_train, val_y_pred) ** 0.5
RMSE

11.014283190951092