In [1]:
import math
import os

import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [3]:
# Run the helper shell script with bash before any cell reads data; presumably it
# fetches the parquet files into ../data that later cells load — TODO confirm
# against get_data.sh.
!bash get_data.sh

In [4]:
def prepare_file(file, data_dir='../data/'):
    """Load one trip-data parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off minus pickup, in minutes), keeps only
    trips lasting between 1 and 60 minutes inclusive, and replaces missing
    ``PUlocationID`` values with ``-1``. Two summary lines are printed: how many
    rows the duration filter removed, and the share of remaining trips whose
    pickup location was missing.

    Parameters
    ----------
    file : str
        File name of the parquet file inside ``data_dir``.
    data_dir : str, optional
        Directory that holds the file. Defaults to ``'../data/'``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame, independent of the frame that was read, with the
        extra ``duration`` column and no missing ``PUlocationID`` values.
    """
    df = pd.read_parquet(os.path.join(data_dir, file))

    # Vectorised timedelta -> minutes conversion instead of a per-row lambda.
    df['duration'] = (df.dropOff_datetime - df.pickup_datetime).dt.total_seconds() / 60

    entries_total = df.shape[0]
    # .copy() gives an independent frame, so the fill below is a plain column
    # assignment rather than a write into a slice of the unfiltered data.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()
    print(f"{entries_total - df.shape[0]} (potential) outliers were dropped.", '\n')

    # Assign the filled column back: fillna(inplace=True) on a column selection
    # is chained assignment and is not guaranteed to update the frame.
    df['PUlocationID'] = df['PUlocationID'].fillna(-1)

    entries_kept = df.shape[0]
    missing_pickups = int((df['PUlocationID'] == -1).sum())
    # Guard against an empty frame (every row filtered out) instead of dividing by zero.
    missing_share = missing_pickups / entries_kept if entries_kept else float('nan')
    print(f"The proportion of trips with missing pickup location is {missing_share}.", '\n')

    return df


In [5]:
### Question 1: number of records in the 2021-01 file
file = 'fhv_tripdata_2021-01.parquet'
df = pd.read_parquet('../data/'+file)
print(f"Q1: The number of entries in {file} is {df.shape[0]}", '\n')

### Question 2: average trip duration in minutes, computed before any filtering
# Vectorised timedelta -> minutes conversion instead of a per-row lambda.
df['duration'] = (df.dropOff_datetime - df.pickup_datetime).dt.total_seconds() / 60
print(f"Q2: The average duration of trips in {file} is {df.duration.mean()} minutes.", '\n')

# Keep trips lasting 1 to 60 minutes inclusive. .copy() makes the filtered frame
# independent, so the fill below is not a write into a slice of the full data.
entries_total = df.shape[0]
df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()
print(f"{entries_total - df.shape[0]} (potential) outliers were dropped.", '\n')

### Question 3: share of the remaining trips with no pickup location
# Assign the filled column back: fillna(inplace=True) on a column selection is
# chained assignment and is not guaranteed to update df.
df['PUlocationID'] = df['PUlocationID'].fillna(-1)
print(f"Q3: The proportion of trips with missing pickup location is {df[df.PUlocationID == -1].shape[0]/df.shape[0]}.", '\n')


Q1: The number of entries in fhv_tripdata_2021-01.parquet is 1154112 

Q2: The average duration of trips in fhv_tripdata_2021-01.parquet is 19.167224093791006 minutes. 

44286 (potential) outliers were dropped. 

Q3: The proportion of trips with missing pickup location is 0.8352732770722617. 



In [6]:
# Column the model predicts: trip duration in minutes.
target = 'duration'
# Location-ID columns used as model inputs.
features = ['PUlocationID', 'DOlocationID']
# Cast the IDs to strings and emit one {column: value} dict per training row.
train_dicts = (
    df.loc[:, features]
    .astype(str)
    .to_dict(orient='records')
)

In [7]:
# Validation split: the 2021-02 file goes through prepare_file (duration column,
# duration filter, missing pickup IDs set to -1).
file_val = 'fhv_tripdata_2021-02.parquet'
df_val = prepare_file(file_val)
# Same string-cast, one-dict-per-row encoding as the training records.
val_dicts = (
    df_val.loc[:, features]
    .astype(str)
    .to_dict(orient='records')
)

47579 (potential) outliers were dropped. 

The proportion of trips with missing pickup location is 0.8571354986754037. 



In [8]:
# Fit the vectorizer on the training records only, then reuse the fitted
# vectorizer for the validation records so both matrices use the feature
# columns learned from the training data.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [9]:
# Q4: report the number of columns in the training feature matrix.
print(f"Q4: The dimensionality of the feature matrix is {X_train.shape[1]}.")

Q4: The dimensionality of the feature matrix is 525.


In [10]:
# Unfitted linear regression model with scikit-learn's default settings.
lr = LinearRegression()

In [11]:
# Extract the target column from each split as an array: training first, validation second.
y_train, y_val = (split[target].values for split in (df, df_val))

In [12]:
# Fit on the training matrix, then predict for both splits with the fitted model.
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)
y_pred_val = lr.predict(X_val)
# RMSE is taken as sqrt(MSE): the `squared=False` shortcut was deprecated in
# scikit-learn 1.4 and removed in 1.6, so passing it fails on current releases.
RMSE = math.sqrt(mean_squared_error(y_train, y_pred))
RMSE_val = math.sqrt(mean_squared_error(y_val, y_pred_val))

In [13]:
# Q5: report the RMSE measured on the data the model was fitted on.
print(f"Q5: The RMSE on the training set is {RMSE}.")

Q5: The RMSE on the training set is 10.528519405244424.


In [14]:
# Q6: report the RMSE measured on the validation split.
print(f"Q6: The RMSE on the validation set is {RMSE_val}.")

Q6: The RMSE on the validation set is 11.014288029210666.
