In [2]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error


In [3]:
def read_dataframe(filename):
    """Load a yellow-taxi trip file and prepare it for duration modelling.

    Parameters
    ----------
    filename : str or path-like
        Path to a ``.csv`` or ``.parquet`` file containing the columns
        ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows whose trip duration lies between
        1 and 60 minutes (both inclusive), with an added ``duration``
        column (minutes, float) and the two location-ID columns cast to
        ``str``.

    Raises
    ------
    ValueError
        If the file extension is neither ``.csv`` nor ``.parquet``.
    """
    path = str(filename)  # lets pathlib.Path arguments work with endswith()

    if path.endswith(".csv"):
        df = pd.read_csv(filename)
        # CSV carries timestamps as text, so they are parsed explicitly.
        df["tpep_dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"])
        df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])
    elif path.endswith(".parquet"):
        # Used as read; presumably the timestamp columns already have a
        # datetime dtype in the parquet file -- TODO confirm for new sources.
        df = pd.read_parquet(filename)
    else:
        # Previously fell through and failed later with UnboundLocalError.
        raise ValueError(
            f"Unsupported file type for {path!r}: expected .csv or .parquet"
        )

    # Trip duration in minutes, computed vectorised rather than per row.
    trip_time = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    df["duration"] = trip_time.dt.total_seconds() / 60

    # Keep trips of 1..60 minutes. The explicit copy makes the result
    # independent of the unfiltered frame, so the column assignment below
    # does not raise SettingWithCopyWarning.
    df = df[(df["duration"] >= 1) & (df["duration"] <= 60)].copy()

    # Location IDs are categories, not quantities: store them as strings.
    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)

    return df

In [4]:
# January 2023 trips are the training set, February 2023 the validation set.
df_train, df_val = (
    read_dataframe("../data/yellow_tripdata_2023-01.parquet"),
    read_dataframe("../data/yellow_tripdata_2023-02.parquet"),
)

In [4]:
# Combine pickup and drop-off IDs into a single "<pickup>_<dropoff>" route key
# on both the training and the validation frame.
for frame in (df_train, df_val):
    frame["PU_DO"] = frame["PULocationID"] + "_" + frame["DOLocationID"]

In [5]:
categorical = ["PU_DO"]
numerical = ["trip_distance"]

# One {column: value} record per trip, built the same way for both splits.
train_dicts, val_dicts = (
    frame[categorical + numerical].to_dict(orient="records")
    for frame in (df_train, df_val)
)

# The vectorizer is fitted on the training records only; the validation
# records are encoded with that same fitted mapping.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [6]:
# Regression targets: the trip duration column of each split as an array.
y_train, y_val = (frame["duration"].values for frame in (df_train, df_val))

In [10]:
# Fit on the training split, then predict durations for the validation split.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_val)

# Validation RMSE; kept as the cell's final expression so it is displayed.
root_mean_squared_error(y_val, y_pred)

5.247519676439711

In [12]:
# Number of columns (features) in the training matrix produced by the vectorizer.
X_train.shape[1]

21802

In [5]:
# NOTE: this cell rebinds `categorical`, `train_dicts`, `dv` and `X_train`,
# replacing the values set by the PU_DO cells earlier in the notebook.
categorical = ['PULocationID', 'DOLocationID']

# Make sure both ID columns are strings before building the records.
df_train[categorical] = df_train[categorical].astype(str)

# One {column: value} record per training trip.
train_dicts = df_train[categorical].to_dict(orient='records')

# Fit the vectorizer on those records and encode them in one step.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Report the number of columns in the encoded matrix.
print(f"Q4 Answer - Dimensionality of feature matrix: {X_train.shape[1]}")

Q4 Answer - Dimensionality of feature matrix: 515


In [6]:
# Target values for the training split.
y_train = df_train['duration'].values

# Fit a plain linear regression on the encoded training matrix.
lr = LinearRegression().fit(X_train, y_train)

# Score the model on the very data it was trained on.
y_pred_train = lr.predict(X_train)
rmse_train = root_mean_squared_error(y_train, y_pred_train)
print(f"Q5 Answer - RMSE on train: {rmse_train:.2f}")

Q5 Answer - RMSE on train: 7.65


In [8]:
# Same string cast as applied to the training frame.
df_val[categorical] = df_val[categorical].astype(str)

# Targets and records for the validation split; the records are encoded
# with the vectorizer that is already fitted (transform only, no refit).
y_val = df_val['duration'].values
val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

# Predict with the fitted model and report the validation error.
y_pred_val = lr.predict(X_val)
rmse_val = root_mean_squared_error(y_val, y_pred_val)
print(f"Q6 Answer - RMSE on validation: {rmse_val:.2f}")

Q6 Answer - RMSE on validation: 7.81
