In [187]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
from datetime import datetime
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge

In [188]:
train = pd.read_parquet('fhv_tripdata_2021-01.parquet', engine='pyarrow')

### Q1

In [189]:
train.shape[0]

1154112

In [190]:
train.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


### Q2

In [191]:
train['duration'] = train.dropOff_datetime - train.pickup_datetime
train['duration'] = train['duration'].apply(lambda td: td.total_seconds()/60)
train.duration.mean()

19.1672240937939

In [192]:
train.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,17.0
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,110.0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,15.216667


In [193]:
df = train[(train.duration >= 1) & (train.duration <= 60)]
print(f"The number of records dropped of is {train.shape[0] - df.shape[0]}")

The number of records dropped of is 44286


In [194]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1109826 entries, 0 to 1154111
Data columns (total 8 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   dispatching_base_num    1109826 non-null  object        
 1   pickup_datetime         1109826 non-null  datetime64[ns]
 2   dropOff_datetime        1109826 non-null  datetime64[ns]
 3   PUlocationID            182818 non-null   float64       
 4   DOlocationID            961919 non-null   float64       
 5   SR_Flag                 0 non-null        object        
 6   Affiliated_base_number  1109053 non-null  object        
 7   duration                1109826 non-null  float64       
dtypes: datetime64[ns](2), float64(3), object(3)
memory usage: 76.2+ MB


In [195]:
categorical = ['PUlocationID', 'DOlocationID']

df[categorical] = df[categorical].fillna(-1).astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


### Q3

In [196]:
print((len(df[df['PUlocationID'] == -1])/len(df['PUlocationID'])) * 100)
#print((len(df[df['DOlocationID'] == -1])/len(df['DOlocationID'])) * 100)

83.52732770722618


In [197]:
df[categorical] = df[categorical].astype(str)

In [198]:
dv = DictVectorizer()
df_dict = df[categorical].to_dict(orient='records')
X_train = dv.fit_transform(df_dict)

### Q4

In [199]:
target = 'duration'
y_train = df[target]
X_train.shape[1]

525

### Q5

In [200]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)
mean_squared_error(y_train, y_pred, squared=False)

10.528519107206316

In [201]:
categorical = ['PUlocationID', 'DOlocationID']
def read_dataframe(filename):
    df = pd.read_parquet(filename, engine='pyarrow')
    df['duration'] = df.dropOff_datetime - df.pickup_datetime
    df['duration'] = df['duration'].apply(lambda td: td.total_seconds()/60)
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
    df[categorical] = df[categorical].fillna(-1).astype('int').astype(str)
    df[categorical] = df[categorical].astype(str)
    return df
    

In [202]:
df_val = read_dataframe('fhv_tripdata_2021-02.parquet')

### Q6

In [203]:
val_dict = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dict)
target = 'duration'
y_val = df_val[target]
y_pred = lr.predict(X_val)
mean_squared_error(y_val, y_pred, squared=False)

11.014283149347039