## Load the libraries

In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Load the dataset

In [3]:
# January 2022 yellow-taxi trip records; this frame is used for training below.
train_path = "data/yellow_tripdata_2022-01.parquet"
df = pd.read_parquet(train_path)

In [4]:
# Preview the first rows of the raw trip records.
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


In [5]:
# (number of rows, number of columns) of the raw frame.
df.shape

(2463931, 19)

There are **19** columns inside the dataframe.

## Computing the duration

In [6]:
# Trip duration in minutes. Dropoff minus pickup yields a Timedelta column;
# the vectorized `.dt.total_seconds()` accessor converts it in one pass rather
# than calling a Python lambda once per row (slow on millions of rows).
trip_length = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
df['duration'] = trip_length.dt.total_seconds() / 60

In [7]:
# Summary statistics of the duration column, rendered as fixed-point strings.
duration_stats = df['duration'].describe()
duration_stats.apply(lambda value: format(value, 'f'))

count    2463931.000000
mean          14.212203
std           46.445305
min        -3442.400000
25%            6.316667
50%           10.183333
75%           16.166667
max         8513.183333
Name: duration, dtype: object

In [8]:
# Standard deviation of trip duration (minutes), rounded to two decimals.
duration_col = df['duration']
std = duration_col.std().round(2)

In [9]:
# Report the rounded standard deviation computed in the previous cell.
std_message = f"Standard deviation of duration is {std}"
print(std_message)

Standard deviation of duration is 46.45


## Dropping Outliers

In [10]:
# Row count of the unfiltered frame, for comparison after the outlier filter.
rows_before_message = f"Total number of rows before dropping outliers is {len(df)}"
print(rows_before_message)

Total number of rows before dropping outliers is 2463931


In [11]:
# Remember the unfiltered row count; the retention percentage below needs it.
before_rows = df.shape[0]
print(before_rows)

2463931


In [12]:
## dropping outliers

# Keep trips lasting between 1 and 60 minutes (both bounds inclusive).
# .copy() gives the filtered frame its own data, so the later column
# assignment on it (the string cast of the location IDs) does not raise
# SettingWithCopyWarning.
df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

In [13]:
# Row count once trips outside the 1-60 minute window are removed.
rows_after_message = f"Total number of rows after dropping outliers is {len(df)}"
print(rows_after_message)

Total number of rows after dropping outliers is 2421440


In [14]:
# Filtered row count, paired with before_rows for the retention percentage.
after_rows = df.shape[0]
print(after_rows)

2421440


In [15]:
# Share of rows that survived the outlier filter, as a whole-number percent.
rows_left_perc = round(after_rows / before_rows * 100)
print(rows_left_perc)

98


In [16]:
# Human-readable version of the retention percentage.
retention_message = f"{rows_left_perc}% of rows left after dropping outliers"
print(retention_message)

98% of rows left after dropping outliers


## One-hot Encoding 

In [17]:
# Categorical feature columns fed to the vectorizer.
cat_vars = [
    'PULocationID',  # pickup location ID
    'DOLocationID',  # dropoff location ID
]

In [18]:
# Cast the location IDs to strings before building the feature dicts.
# NOTE(review): DictVectorizer one-hot encodes string values but passes numbers
# through as a single numeric feature -- presumably the reason for this cast.
df[cat_vars] = df[cat_vars].astype(str)

In [19]:
# One {column: value} dict per trip, restricted to the categorical columns.
train_features = df[cat_vars]
train_dicts = train_features.to_dict(orient='records')

In [20]:
# Vectorizer that turns the per-trip feature dicts into a feature matrix;
# it is fitted on the training dicts and reused for validation further down.
dv = DictVectorizer()

In [21]:
# Learn the feature vocabulary from the training dicts and encode them in one step.
X_train = dv.fit_transform(train_dicts)

In [22]:
# Display the matrix summary (shape, dtype, number of stored elements).
X_train

<2421440x515 sparse matrix of type '<class 'numpy.float64'>'
	with 4842880 stored elements in Compressed Sparse Row format>

In [23]:
# The column count of the encoded matrix is the number of features.
dimensionality_message = f"The dimensionality of this matrix is {X_train.shape[1]}"
print(dimensionality_message)

The dimensionality of this matrix is 515


## Training Models

In [24]:
# Regression target: the duration column of the filtered training frame,
# extracted as an array.
target = "duration"
duration_series = df[target]
y_train = duration_series.values

In [25]:
# Plain linear regression with default settings.
lr = LinearRegression()

In [26]:
# Fit the regression on the encoded location features against trip duration.
lr.fit(X_train, y_train)

In [27]:
# Predictions on the training data itself, used for the training RMSE below.
y_pred = lr.predict(X_train)

In [28]:
# RMSE on the training data. Taking the square root of the MSE directly avoids
# `squared=False`, which scikit-learn deprecated in 1.4 and scheduled for
# removal; the built-in round() works whether the metric comes back as a NumPy
# scalar or a plain Python float.
rmse = round(mean_squared_error(y_train, y_pred) ** 0.5, 2)

In [29]:
# Report the training error.
train_rmse_message = f"RMSE on train is {rmse}"
print(train_rmse_message)

RMSE on train is 6.99


## Evaluating the models

In [30]:
def read_data(file_path, min_duration=1, max_duration=60):
    """Load one month of yellow-taxi trips and prepare it for modelling.

    Reads the parquet file, adds a ``duration`` column (dropoff minus pickup,
    in minutes), keeps only trips whose duration lies within
    ``[min_duration, max_duration]`` and casts the pickup/dropoff location
    IDs to strings.

    Parameters
    ----------
    file_path : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.
    min_duration, max_duration : float, optional
        Inclusive bounds (minutes) of the durations to keep. The defaults,
        1 and 60, reproduce the filter applied to the training data.

    Returns
    -------
    pandas.DataFrame
        A filtered frame that owns its data, with the extra ``duration``
        column and string-typed ``PULocationID`` / ``DOLocationID`` columns.
    """
    cat_vars = ['PULocationID', 'DOLocationID']

    df = pd.read_parquet(file_path)

    # Vectorized Timedelta -> minutes; avoids a Python-level lambda per row.
    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # .copy() makes the filtered frame independent of the original, so the
    # column assignment below does not raise SettingWithCopyWarning.
    in_range = (df['duration'] >= min_duration) & (df['duration'] <= max_duration)
    df = df[in_range].copy()

    df[cat_vars] = df[cat_vars].astype(str)

    return df
    

In [31]:
# February 2022 trips form the validation set; read_data applies the same
# duration filter and string cast that were applied to the training frame.
val_path = "data/yellow_tripdata_2022-02.parquet"
val_df = read_data(val_path)

In [32]:
# One {column: value} dict per validation trip, mirroring the training input.
val_features = val_df[cat_vars]
val_dicts = val_features.to_dict(orient='records')

In [33]:
# Encode validation trips with the vectorizer fitted on the training data
# (transform only, no refit), so the columns line up with X_train.
# NOTE(review): per the scikit-learn docs, features not seen during fit are
# silently ignored -- confirm that is acceptable for unseen location IDs.
X_val = dv.transform(val_dicts)

In [34]:
# Validation target: the duration column of the validation frame, as an array.
val_target = val_df[target]
y_val = val_target.values

In [35]:
# Predictions on the validation data.
# NOTE: this rebinds `y_pred`, replacing the training predictions computed earlier.
y_pred = lr.predict(X_val)

In [36]:
# RMSE on the validation data. Taking the square root of the MSE directly
# avoids `squared=False`, which scikit-learn deprecated in 1.4 and scheduled
# for removal; the built-in round() works whether the metric comes back as a
# NumPy scalar or a plain Python float.
rmse = round(mean_squared_error(y_val, y_pred) ** 0.5, 2)

In [37]:
# Report the validation error.
val_rmse_message = f"RMSE on validation is {rmse}"
print(val_rmse_message)

RMSE on validation is 7.79
