In [1]:
import pandas as pd

In [2]:
import datetime

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

In [4]:
from sklearn.metrics import mean_squared_error

In [5]:
jan_data = pd.read_parquet('../data/fhv_tripdata_2021-01.parquet')

In [6]:
feb_data = pd.read_parquet('../data/fhv_tripdata_2021-02.parquet')

## Q1 

In [7]:
jan_data.shape

(1154112, 7)

## Q2 

In [8]:
jan_data.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [9]:
for data in [jan_data, feb_data]:
    data['duration'] = data.dropOff_datetime - data.pickup_datetime
    data['duration'] = data['duration'].apply(lambda x: x.total_seconds()/60)

In [10]:
jan_data['duration'].describe()

count    1.154112e+06
mean     1.916722e+01
std      3.986922e+02
min      1.666667e-02
25%      7.766667e+00
50%      1.340000e+01
75%      2.228333e+01
max      4.233710e+05
Name: duration, dtype: float64

# Data Cleaning 

In [11]:
filtred_jan = jan_data[(jan_data.duration >= 1) & (jan_data.duration <= 60)]
filtred_feb = feb_data[(feb_data.duration >= 1) & (feb_data.duration <= 60)]

In [12]:
jan_data.shape[0] - filtred_jan.shape[0]

44286

In [13]:
filtred_jan.columns

Index(['dispatching_base_num', 'pickup_datetime', 'dropOff_datetime',
       'PUlocationID', 'DOlocationID', 'SR_Flag', 'Affiliated_base_number',
       'duration'],
      dtype='object')

In [14]:
for filtred_data in [filtred_jan, filtred_feb]:
        filtred_data.loc[:,['PUlocationID', 'DOlocationID']] = \
                filtred_data.loc[:,['PUlocationID', 'DOlocationID']].fillna(-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtred_data.loc[:,['PUlocationID', 'DOlocationID']] = \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtred_data.loc[:,['PUlocationID', 'DOlocationID']] = \


In [15]:
filtred_jan.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,-1.0,-1.0,,B00009,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,-1.0,-1.0,,B00009,17.0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,-1.0,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,-1.0,61.0,,B00037,15.216667
5,B00037,2021-01-01 00:59:02,2021-01-01 01:08:05,-1.0,71.0,,B00037,9.05


## Q3 

In [16]:
filtred_jan[filtred_jan.PUlocationID == -1].shape[0]/filtred_jan.shape[0]

0.8352732770722617

# 1-hot encode

In [17]:
for filtred_data in [filtred_jan, filtred_feb]:
    filtred_data[['PUlocationID','DOlocationID']] = filtred_data[['PUlocationID','DOlocationID']].astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtred_data[['PUlocationID','DOlocationID']] = filtred_data[['PUlocationID','DOlocationID']].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtred_data[['PUlocationID','DOlocationID']] = filtred_data[['PUlocationID','DOlocationID']].astype('str')


In [18]:
filtred_jan['PUlocationID'][0]

'-1.0'

In [19]:
locations_dict = filtred_jan[['PUlocationID','DOlocationID']].to_dict('records')
locat_vector = DictVectorizer()
X_train = locat_vector.fit_transform(locations_dict)

## Q4 

In [20]:
len(locat_vector.feature_names_)

525

In [21]:
X_train.shape

(1109826, 525)

# Train model 

In [22]:
y_train = filtred_jan.duration

In [23]:
X_train.shape, y_train.shape

((1109826, 525), (1109826,))

In [24]:
lin_reg = LinearRegression()

In [25]:
lin_reg.fit(X_train, y_train)

LinearRegression()

## Q5

In [26]:
mean_squared_error(y_train, lin_reg.predict(X_train), squared=False)

10.5285191072072

# Eval model 

In [27]:
filtred_feb.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173.0,82.0,,B00021,10.666667
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173.0,56.0,,B00021,14.566667
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82.0,129.0,,B00021,7.95
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,-1.0,225.0,,B00037,13.8
5,B00037,2021-02-01 00:00:37,2021-02-01 00:09:35,-1.0,61.0,,B00037,8.966667


In [28]:
X_val = locat_vector.transform(filtred_feb[['PUlocationID','DOlocationID']].to_dict("records"))
y_val = filtred_feb.duration

In [29]:
X_val.shape, y_val.shape

((990113, 525), (990113,))

## Q6 

In [30]:
mean_squared_error(y_val, lin_reg.predict(X_val), squared=False)

11.014283163400654