In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression


from sklearn.metrics import mean_squared_error

In [3]:
df = pd.read_parquet('./data/fhv_tripdata_2021-01.parquet')

## Q1. Downloading the data; How many records are there?

In [4]:
len(df)

1154112

## Q2. Computing duration; What's the average trip duration in January? 

In [5]:
df['duration'] = df.dropOff_datetime - df.pickup_datetime
df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

In [6]:
df.duration.mean()

19.1672240937939

## Data preparation

In [7]:
df = df[((df.duration >= 1) & (df.duration <= 60))]

In [8]:
len(df)

1109826

How many records did you drop?

In [9]:
1154112-1109826

44286

## Q3. Missing values; What's the fractions of missing values for the pickup location ID?

In [10]:
mean_PUlocationID = df['PUlocationID'].mean()
mean_PUlocationID

139.86163288078853

In [11]:
df.isnull().sum()

dispatching_base_num            0
pickup_datetime                 0
dropOff_datetime                0
PUlocationID               927008
DOlocationID               147907
SR_Flag                   1109826
Affiliated_base_number        773
duration                        0
dtype: int64

In [12]:
df['PUlocationID'] = df['PUlocationID'].fillna(-1)
df['DOlocationID'] = df['DOlocationID'].fillna(-1)

In [13]:
mean_PUlocationID_ = df['PUlocationID'].mean()
mean_PUlocationID_

22.203675170702436

In [14]:
1-(mean_PUlocationID_/mean_PUlocationID)

0.8412454172501489

In [15]:
927008/1154112

0.8032218710142517

## Q4. One-hot encoding;What's the dimensionality of this matrix

In [16]:
categorical = ['PUlocationID', 'DOlocationID']
df[categorical] = df[categorical].astype(str)

train_dicts = df[categorical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_train

<1109826x525 sparse matrix of type '<class 'numpy.float64'>'
	with 2219652 stored elements in Compressed Sparse Row format>

In [17]:
categorical = ['PUlocationID', 'DOlocationID']

## Q5. Training a model; What's the RMSE on train?

In [18]:
target = 'duration'
y_train = df[target].values

In [19]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [20]:
y_pred = lr.predict(X_train)

In [21]:
mean_squared_error(y_train, y_pred, squared=False)

10.528519107213159

## Q6. Evaluating the model; What's the RMSE on validation?

In [22]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)
    
    df['duration'] = df.dropOff_datetime - df.pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)
    
    df = df[((df.duration >= 1) & (df.duration <= 60))]
    
    categorical = ['PUlocationID', 'DOlocationID']
    df[categorical] = df[categorical].astype(str)
   
    return df

In [23]:
df_train = read_dataframe('./data/fhv_tripdata_2021-01.parquet')
df_val = read_dataframe('./data/fhv_tripdata_2021-02.parquet')

In [24]:
categorical = ['PUlocationID', 'DOlocationID']  

dv = DictVectorizer()

train_dicts = df_train[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [25]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [26]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)

11.01428312810924