In [14]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [15]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder

In [16]:
from sklearn.metrics import mean_squared_error

In [17]:
import pickle

In [18]:
def read_df(filename):
    df = pd.read_parquet(filename)
    df['duration'] = df.dropOff_datetime-df.pickup_datetime
    df.duration = df.duration.apply(lambda x: x.total_seconds()/60)
    return df

In [19]:
df_train = read_df('https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet')
df_val = read_df('https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet')

### Question 1
##### Number of records in Jan 2021 FHV data

In [20]:
len(df_train.index)

1154112

In [21]:
df_train.describe(include='all', datetime_is_numeric=True).loc['count']

dispatching_base_num        1154112
pickup_datetime             1154112
dropOff_datetime            1154112
PUlocationID               195845.0
DOlocationID               991892.0
SR_Flag                           0
Affiliated_base_number      1153227
duration                  1154112.0
Name: count, dtype: object

Answer: 1154112 rows (B)

### Question 2
##### Average duration in Jan 2021 FHV

In [22]:
df_train.duration.mean()

19.167224093791006

In [23]:
df_train.describe(include='all', datetime_is_numeric=True).loc['mean']

dispatching_base_num                                NaN
pickup_datetime           2021-01-16 21:13:00.469540864
dropOff_datetime          2021-01-16 21:32:10.502986752
PUlocationID                                  139.85969
DOlocationID                                  135.89803
SR_Flag                                             NaN
Affiliated_base_number                              NaN
duration                                      19.167224
Name: mean, dtype: object

Answer: 19.16 min (B)

### Question 3
#### Fraction of missing values

In [24]:
df_train.isnull().sum() * 100 / len(df_train)

dispatching_base_num        0.000000
pickup_datetime             0.000000
dropOff_datetime            0.000000
PUlocationID               83.030676
DOlocationID               14.055828
SR_Flag                   100.000000
Affiliated_base_number      0.076682
duration                    0.000000
dtype: float64

Supposing we are talking about column 'PUlocationID'->
Answer: 83% (D)

### Question 4
#### Dimensionality after OHE

In [32]:
categorical = ['PUlocationID', 'DOlocationID']
df_train[categorical] = df_train[categorical].astype(str)
df_val[categorical] = df_val[categorical].astype(str)

In [26]:
encoder = OneHotEncoder()
array_enc = encoder.fit_transform(df_train[['PUlocationID','DOlocationID']]).toarray()
len(array_enc[0]) #number of columns

525

Answer: 525 (D)

### Question 5
#### RMSE on train

In [27]:
df_train.duration.describe(percentiles = [0.95, 0.96, 0.97, 0.98, 0.99])

count    1.154112e+06
mean     1.916722e+01
std      3.986922e+02
min      1.666667e-02
50%      1.340000e+01
95%      4.725000e+01
96%      5.146667e+01
97%      5.775000e+01
98%      6.613333e+01
99%      9.030000e+01
max      4.233710e+05
Name: duration, dtype: float64

In [28]:
df = df_train[(df_train.PUlocationID + df_train.DOlocationID != 'nannan')]
df = df[(df.duration > 1) & (df.duration < 57.7)]
array_enc = encoder.fit_transform(df[['PUlocationID','DOlocationID']]).toarray()

In [29]:
y_train = df['duration'].values
#y_val = df_val['duration'].values

In [30]:
lr = Lasso()
lr.fit(array_enc, y_train)
y_pred = lr.predict(array_enc)

In [31]:
mean_squared_error(y_train, y_pred, squared = False)

10.539380804487939

Answer: Aprox. 10.52 min (B)

### Question 6
#### RMSE on validation

In [33]:
df_val = df_val[df_val.PUlocationID.isin(df_train.PUlocationID.unique())]
df_val = df_val[df_val.DOlocationID.isin(df_train.DOlocationID.unique())]
df_val.duration.describe(percentiles = [0.95, 0.96, 0.97, 0.98, 0.99])

count    1.037689e+06
mean     2.070700e+01
std      1.611086e+02
min      1.666667e-02
50%      1.410000e+01
95%      5.225000e+01
96%      5.745000e+01
97%      6.366667e+01
98%      7.550400e+01
99%      1.050000e+02
max      1.109190e+05
Name: duration, dtype: float64

In [34]:
encoder = OneHotEncoder()

df = df_train #[(df_train.PUlocationID + df_train.DOlocationID != 'nannan')]
df = df[(df.duration > 1) & (df.duration < 57)]
array_enc_train = encoder.fit_transform(df[['PUlocationID','DOlocationID']]).toarray()

y_train = df['duration'].values

df = df_val #[(df_val.PUlocationID + df_val.DOlocationID != 'nannan')]
df = df[(df.duration > 1) & (df.duration < 57)]
array_enc_val = encoder.fit_transform(df[['PUlocationID','DOlocationID']]).toarray()

y_val = df['duration'].values

In [35]:
lr = Lasso()
lr.fit(array_enc_train, y_train)
y_pred = lr.predict(array_enc_val)

In [36]:
mean_squared_error(y_val, y_pred, squared = False)

11.502750407905614

Answer: Aprox. 11.01 (B)