In [2]:
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [3]:
def read_dataframe(filename, month):
    """Load one month of trip records and prepare it for modelling.

    Reads ``filename`` (CSV or Parquet), adds a ``duration`` column holding
    the trip length in minutes, prints three summary lines labelled with
    ``month``, keeps only trips lasting between 1 and 60 minutes (both ends
    inclusive) and casts the pickup/drop-off location IDs to strings.

    Parameters
    ----------
    filename : str
        Path ending in ``.csv`` or ``.parquet``. The file must provide the
        ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``, ``PULocationID``
        and ``DOLocationID`` columns.
    month : str
        Label used only in the printed summary lines.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows whose duration is within
        [1, 60] minutes, including the added ``duration`` column.

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.csv`` nor ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        # CSV carries timestamps as text, so parse them before subtracting.
        df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
        df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    elif filename.endswith('.parquet'):
        # NOTE(review): assumes the parquet file already stores both
        # timestamp columns as datetimes -- confirm for new data sources.
        df = pd.read_parquet(filename)
    else:
        # Without this branch `df` would be unbound below and surface as a
        # confusing UnboundLocalError.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected '.csv' or '.parquet'"
        )
    # Reported before `duration` is added, so this is the raw column count.
    print(f"Number of columns for {month}: {df.shape[1]}")

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60
    print(f"Standard deviation for column duration for {month}: {df['duration'].std()}")

    # Compute the range mask once; it drives both the report and the filter.
    in_range = (df['duration'] >= 1) & (df['duration'] <= 60)
    print(f"Fraction of the records left for {month}: {in_range.mean()}")

    # .copy() detaches the result from the unfiltered frame so the column
    # assignment below cannot trigger SettingWithCopyWarning.
    df = df[in_range].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [4]:
# January trips form the training set; February trips are used for validation.
train_path = '../data/yellow_tripdata_2023-01.parquet'
val_path = '../data/yellow_tripdata_2023-02.parquet'

df_train = read_dataframe(train_path, 'January')
df_val = read_dataframe(val_path, 'February')

Number of columns for January: 19
Standard deviation for column duration for January: 42.594351241920904
Fraction of the records left for January: 0.9812202822125979
Number of columns for February: 19
Standard deviation for column duration for February: 42.84210176105097
Fraction of the records left for February: 0.9800944077722545


In [7]:
# Features are the pickup and drop-off location IDs; the target is the trip
# duration column produced by read_dataframe.
categorical = ['PULocationID', 'DOLocationID']
target = 'duration'

# Fit the vectorizer on the training month only, then reuse the same fitted
# mapping for validation so both matrices share one feature space.
dv = DictVectorizer()

train_dicts = df_train[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
print(f"The dimensionality of the matrix for January: {X_train.shape[1]}")

val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

y_train = df_train[target].values
y_val = df_val[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

# RMSE is taken as the square root of the MSE instead of passing
# `squared=False`: that keyword was deprecated in scikit-learn 1.4 and
# removed in 1.6, where it raises TypeError.
y_pred_train = lr.predict(X_train)
print(f"RMSE on train: {mean_squared_error(y_train, y_pred_train) ** 0.5}")
y_pred_val = lr.predict(X_val)
print(f"RMSE on validation: {mean_squared_error(y_val, y_pred_val) ** 0.5}")

The dimensionality of the matrix for January: 515
RMSE on train: 7.649261027792376
RMSE on validation: 7.811832836304415


Number of columns for January: 19 \
Standard deviation for column duration for January: 42.59 \
Fraction of the records left for January: 0.98 \
The dimensionality of the matrix for January: 515 \
RMSE on train: 7.65 \
RMSE on validation: 7.81