In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error


<h3>Q1. Downloading the data</h3>

In [2]:
# Q1: load the 2023-01 yellow-taxi trip file and report how many columns it has.
df_train = pd.read_parquet('../test_data/yellow_tripdata_2023-01.parquet')
print(f'number of columns in dataframe: {len(df_train.columns)}')

number of columns in dataframe: 19


<h3>Q2. Computing duration</h3>

In [3]:
def calculate_duration(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``duration`` column holding each trip's length in minutes.

    The duration is ``tpep_dropoff_datetime - tpep_pickup_datetime``
    converted to fractional minutes. The column is written onto ``df``
    itself (the input frame is modified) and that same object is returned.

    Args:
        df: Frame with datetime-like ``tpep_pickup_datetime`` and
            ``tpep_dropoff_datetime`` columns.

    Returns:
        The same ``df`` object, now carrying a float ``duration`` column.
    """
    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    # Vectorised conversion: avoids one Python-level call per row and always
    # yields a float column, even when the frame has no rows.
    df['duration'] = elapsed.dt.total_seconds() / 60

    return df

In [4]:
# Q2: attach the duration column, then measure its spread.
df_train = calculate_duration(df_train)

# np.std is called with its defaults here (no ddof argument is passed).
std_dev = np.std(df_train.duration)
print(f'the standard deviation of the trips duration in January: {std_dev}')

the standard deviation of the trips duration in January: 42.59434429744777


<h3>Q3. Dropping outliers</h3>

In [5]:
def drop_outliers(df: pd.DataFrame, min_duration: float = 1, max_duration: float = 60) -> pd.DataFrame:
    """Keep only rows whose ``duration`` lies in ``[min_duration, max_duration]``.

    Both bounds are inclusive. Rows with a missing duration fail both
    comparisons and are therefore dropped as well.

    Args:
        df: Frame with a numeric ``duration`` column.
        min_duration: Smallest duration to keep (default 1).
        max_duration: Largest duration to keep (default 60).

    Returns:
        A new, independent frame with the surviving rows; ``df`` is untouched.
    """
    in_range = (df['duration'] >= min_duration) & (df['duration'] <= max_duration)

    # .copy() detaches the result from ``df`` so callers can assign columns on
    # it without pandas emitting SettingWithCopyWarning.
    return df[in_range].copy()

In [6]:
# Q3: remember the row count before filtering; it is the denominator below.
raw_count = len(df_train)

df_train = drop_outliers(df_train)

# Share of rows that survived the filter, scaled by 100 (i.e. a percentage).
fraction = 100 * len(df_train) / raw_count
print(f'Fraction of the records remaining after screening outliers {fraction}')

Fraction of the records remaining after screening outliers 98.1220282212598


<h3>Q4. One-hot encoding</h3>

In [7]:
# Q4: the model's features are the pickup and dropoff location IDs.
categorical = ['PULocationID', 'DOLocationID']

# Cast the IDs to strings before vectorizing.
# DataFrame.astype returns a new frame, so rebinding df_train avoids assigning
# columns into the filtered slice produced by drop_outliers (which is what
# raises SettingWithCopyWarning).
df_train = df_train.astype({col: str for col in categorical})

train_dicts = df_train[categorical].to_dict(orient='records')

# Fit the vectorizer on the training records; the fitted `dv` is reused later
# to encode the validation data.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

print(f'The dimensionality of the feature matrix: {X_train.shape[1]}')

The dimensionality of the feature matrix: 515


<h3>Q5. Training a model</h3>

In [8]:
# Q5: regress trip duration on the encoded location features.
target = 'duration'
y_train = df_train[target].to_numpy()

lr = LinearRegression().fit(X_train, y_train)

# Score on the same rows the model was fitted to, i.e. the training error.
y_pred = lr.predict(X_train)
rmse_train = root_mean_squared_error(y_train, y_pred)

print(f'RMSE on train: {rmse_train}')

RMSE on train: 7.649262183753913


<h3>Q6. Evaluating the model</h3>

In [11]:
# Q6: run the 2023-02 file through the same preparation steps as the
# training data.
df_val = pd.read_parquet('../test_data/yellow_tripdata_2023-02.parquet')
df_val = calculate_duration(df_val)
df_val = drop_outliers(df_val)

categorical = ['PULocationID', 'DOLocationID']
# DataFrame.astype returns a new frame, so rebinding df_val avoids assigning
# columns into the filtered slice produced by drop_outliers (which is what
# raises SettingWithCopyWarning).
df_val = df_val.astype({col: str for col in categorical})

val_dicts = df_val[categorical].to_dict(orient='records')
# transform, not fit_transform: reuse the vectorizer fitted on the training
# records so the validation matrix has the same columns as X_train.
X_val = dv.transform(val_dicts)
y_val = df_val[target].values

y_pred = lr.predict(X_val)
rmse_pred = root_mean_squared_error(y_val, y_pred)

print(f'RMSE on valid: {rmse_pred}')

RMSE on valid: 7.811812092681157
