In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [None]:
# Q1. Downloading the data
# Read the 2023-01 yellow trip data parquet file from the local ./data directory.
df = pd.read_parquet(path='./data/yellow_tripdata_2023-01.parquet')
# Bare last expression: the notebook renders the frame as the cell output.
df

In [None]:
# Q2. Computing duration
# Trip duration in minutes: dropoff minus pickup gives a timedelta column, and the
# vectorized .dt accessor converts the whole column to seconds at once instead of
# calling a Python lambda for every row.
df["duration"] = (
    df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
).dt.total_seconds() / 60
# Standard deviation of the duration; np.std defaults to ddof=0.
np.std(df["duration"])

In [None]:
# Q3. Dropping outliers
# Keep only the trips whose duration lies between 1 and 60 minutes
# (between() includes both endpoints by default).
new_df = df[df["duration"].between(1, 60)]
# Percentage of the original rows that remain after filtering.
len(new_df) / len(df) * 100

In [None]:
# Q4. One-hot encoding
features = ["PULocationID", "DOLocationID"]
# Cast the location IDs to strings so DictVectorizer treats each distinct value as
# its own category instead of a numeric feature.
# astype() returns a new frame, so rebinding new_df avoids assigning columns into a
# filtered slice of df, which raises pandas' SettingWithCopyWarning.
new_df = new_df.astype({col: str for col in features})

# One {feature: value} dict per row, the input format DictVectorizer expects.
train_dicts = new_df[features].to_dict(orient='records')

# Learn the feature vocabulary from the training rows and encode them.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
# (number of rows, number of encoded feature columns)
X_train.shape

In [None]:
# Q5. Training a model
# Target vector: the duration column of the filtered training frame.
y_train = new_df["duration"].to_numpy()

# fit() returns the estimator itself, so construction and fitting are chained.
lr = LinearRegression().fit(X_train, y_train)

# Predict on the same rows the model was fitted on, then report the
# root mean squared error between the targets and those predictions.
y_pred = lr.predict(X_train)
root_mean_squared_error(y_true=y_train, y_pred=y_pred)

In [None]:
# Q6. Evaluating the model
# Prepare the 2023-02 file with the same steps applied to the training data.
df_valid = pd.read_parquet('./data/yellow_tripdata_2023-02.parquet')

# Trip duration in minutes, computed with the vectorized .dt accessor rather than
# a per-row Python lambda.
df_valid["duration"] = (
    df_valid["tpep_dropoff_datetime"] - df_valid["tpep_pickup_datetime"]
).dt.total_seconds() / 60

# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
df_valid = df_valid[(df_valid["duration"] >= 1) & (df_valid["duration"] <= 60)]

# Cast the feature columns to strings via astype(), which returns a new frame;
# assigning columns into the filtered slice instead would raise
# pandas' SettingWithCopyWarning.
df_valid = df_valid.astype({col: str for col in features})

# One {feature: value} dict per row, ready for the DictVectorizer.
valid_dicts = df_valid[features].to_dict(orient='records')

In [None]:
# Encode the validation rows with the vectorizer that was fitted on the training
# data. Use transform(), not fit_transform(): refitting here would rebuild the
# feature vocabulary from the validation data, so the columns of X_valid would no
# longer line up with the coefficients lr learned during training.
X_valid = dv.transform(valid_dicts)
y_valid = df_valid["duration"].values
y_pred = lr.predict(X_valid)

# Root mean squared error of the predictions on the validation rows.
root_mean_squared_error(y_valid, y_pred)