In [1]:
#Setup Libs
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import pickle

In [2]:
#Data Paths
DATA_PATH_TRAIN = '../data/raw/fhv_tripdata_2021-01.parquet'
DATA_PATH_VAL = '../data/raw/fhv_tripdata_2021-02.parquet'

In [3]:
print("How many records are there for January ? {}".format(pd.read_parquet(DATA_PATH_TRAIN).shape[0]))

How many records are there for January ? 1154112


In [4]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df['duration'] = df.dropOff_datetime - df.pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]
    
    return df

In [5]:
df_train = read_dataframe(DATA_PATH_TRAIN)
df_val = read_dataframe(DATA_PATH_VAL)

df_train.shape[0]

1109826

- Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).
- How many records did you drop?

In [6]:
print("number of droped records : {}".format(1154112-1109826))

number of droped records : 44286


In [7]:
df_train.describe().apply(lambda s: s.apply('{0:.5f}'.format))

Unnamed: 0,PUlocationID,DOlocationID,duration
count,182818.0,961919.0,1109826.0
mean,139.86163,135.68915,16.24725
std,74.76379,80.34968,11.5515
min,1.0,1.0,1.0
25%,75.0,67.0,7.85
50%,146.0,132.0,13.23333
75%,206.0,213.0,21.46667
max,265.0,265.0,60.0


In [8]:
print("What's the average trip duration in January? 16.24")

What's the average trip duration in January? 16.24


In [9]:
# Missing Values
df_train['PUlocationID'].fillna(-1,inplace=True)
df_train['DOlocationID'].fillna(-1,inplace=True)
df_val['PUlocationID'].fillna(-1,inplace=True)
df_val['DOlocationID'].fillna(-1,inplace=True)

In [10]:
# What's the fractions of missing values for the pickup location ID? I.e. fraction of "-1"s after you filled the NAs.
df_train[df_train['PUlocationID']==-1].shape[0]/df_train.shape[0]

0.8352732770722617

In [12]:
categorical = ['PUlocationID', 'DOlocationID']
df_train[categorical] = df_train[categorical].astype(str)
df_val[categorical] = df_val[categorical].astype(str)

df_train['PU_DO'] = df_train['PUlocationID'] + '_' + df_train['PUlocationID']
df_val['PU_DO'] = df_val['PUlocationID'] + '_' + df_val['PUlocationID']

dv = DictVectorizer()

train_dicts = df_train[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [13]:
#What's the dimensionality of this matrix? (The number of columns).
X_train

<1109826x525 sparse matrix of type '<class 'numpy.float64'>'
	with 2219652 stored elements in Compressed Sparse Row format>

In [17]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [16]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred, squared=False)

10.52851910721103

In [18]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)

11.014283190951092