In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error

In [22]:
# Locations of the two monthly yellow_tripdata parquet files:
# 2023-01 is used as the training set, 2023-02 as the test set.
DATA_DIR = "./data"
TRAIN_FILE = f"{DATA_DIR}/yellow_tripdata_2023-01.parquet"
TEST_FILE = f"{DATA_DIR}/yellow_tripdata_2023-02.parquet"

In [23]:
# Read both monthly files into DataFrames (training file first, then test file).
train_df, test_df = (pd.read_parquet(path) for path in (TRAIN_FILE, TEST_FILE))

### Q1. How many columns are in the January dataset?

In [24]:
# Column count of the training frame (second entry of its shape tuple).
train_df.shape[1]

19

### Q2. Computing duration

In [25]:
# Trip duration in minutes: drop-off timestamp minus pick-up timestamp,
# converted from a timedelta to seconds and then divided by 60.
pickup_time = pd.to_datetime(train_df["tpep_pickup_datetime"])
dropoff_time = pd.to_datetime(train_df["tpep_dropoff_datetime"])
train_df["duration"] = (dropoff_time - pickup_time).dt.total_seconds() / 60

In [26]:
# Summary statistics of the computed trip durations (in minutes).
train_df.loc[:, "duration"].describe()

count    3.066766e+06
mean     1.566900e+01
std      4.259435e+01
min     -2.920000e+01
25%      7.116667e+00
50%      1.151667e+01
75%      1.830000e+01
max      1.002918e+04
Name: duration, dtype: float64

### Q3. Dropping outliers

In [28]:
# Keep only trips whose duration is between 1 and 60 minutes (both bounds inclusive)
# and record the row counts before and after so the kept fraction can be reported.
len_before = len(train_df)
# Series.between() is inclusive on both ends by default, i.e. `>= 1` and `<= 60`.
# .copy() makes the filtered result an independent frame, so later column
# assignments on train_df do not raise SettingWithCopyWarning.
train_df = train_df[train_df["duration"].between(1, 60)].copy()
len_after = len(train_df)

In [29]:
# Fraction of the original rows that remain after the duration filter.
kept_fraction = len_after / len_before
kept_fraction

0.9812202822125979

### Q4. One-hot encoding

In [36]:
# Pick-up and drop-off location IDs are categorical features: cast them to
# strings so that DictVectorizer one-hot encodes them rather than treating
# them as numeric values.
columns_ohe = ["PULocationID", "DOLocationID"]
# Rebind train_df to the frame returned by astype() instead of assigning the
# columns back into it: train_df is the result of a boolean filter, and writing
# into such a frame is the pattern that triggers SettingWithCopyWarning.
train_df = train_df.astype({column: str for column in columns_ohe})

In [38]:
# Represent every training row as a {column: value} dict, then fit the
# vectorizer on those records and encode them in the same call.
train_records = train_df[columns_ohe].to_dict(orient="records")
dict_vectorizer = DictVectorizer()
train_df_ohe = dict_vectorizer.fit_transform(train_records)

In [42]:
# Number of columns (features) in the encoded training matrix.
n_rows, n_features = train_df_ohe.shape
n_features

515

### Q5. Training a model

In [43]:
# Fit a plain linear regression on the one-hot encoded training features and
# report its RMSE on that same training data.
X_train, y_train = train_df_ohe, train_df["duration"].values
lr = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator itself
y_pred = lr.predict(X_train)
rmse = root_mean_squared_error(y_train, y_pred)
rmse

7.649261931416412

### Q6. Evaluating the model

In [52]:
# Prepare the test frame with the same steps that were applied to train_df.
# 1) Trip duration in minutes (drop-off minus pick-up).
test_df["duration"] = (
    pd.to_datetime(test_df["tpep_dropoff_datetime"]) - pd.to_datetime(test_df["tpep_pickup_datetime"])
).dt.total_seconds() / 60
# 2) Same 1-60 minute outlier filter as the training data (between() is inclusive
#    on both ends). Without it the model is scored on trips outside the range it
#    was trained on, and the RMSE on a fresh run is dominated by extreme durations.
#    .copy() keeps the column assignment below from writing into a filtered slice.
test_df = test_df[test_df["duration"].between(1, 60)].copy()
# 3) Location IDs as strings so the fitted DictVectorizer sees the same
#    categorical values it was trained on.
test_df[columns_ohe] = test_df[columns_ohe].astype(str)

In [55]:
# Encode the test rows with the already-fitted vectorizer (transform only, no
# refit) and extract the target durations as an array.
test_records = test_df[columns_ohe].to_dict(orient="records")
X_test = dict_vectorizer.transform(test_records)
y_test = test_df["duration"].values

In [58]:
# RMSE of the fitted model's predictions on the test features.
y_pred = lr.predict(X_test)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse

7.8118162035401735