In [1]:
import datetime
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [17]:
# Load the training CSV; both trip timestamp columns are parsed as datetimes
df = pd.read_csv(
    "../data/processed/2024/train.csv",
    parse_dates=["Trip_Start", "Trip_End"],
)

In [27]:
# Work on a reproducible 10% random subsample of the rows.
# NOTE: this rebinds `df`, so re-running the cell without reloading the CSV
# samples the already-sampled frame again.
df = df.sample(
    frac=0.1,
    random_state=42,
)

In [28]:
# 1. Trip duration in seconds: elapsed time between Trip_Start and Trip_End
df = df.assign(
    Trip_Duration=lambda frame: (frame['Trip_End'] - frame['Trip_Start']).dt.total_seconds()
)


In [29]:
# 2. Calendar features derived from the trip start timestamp:
#    hour of day and day of week (pandas convention: Monday=0 ... Sunday=6)
df = df.assign(
    Start_Hour=lambda frame: frame['Trip_Start'].dt.hour,
    Start_DayOfWeek=lambda frame: frame['Trip_Start'].dt.dayofweek,
)


In [30]:
# Keep only rows where the target and every raw input column are present.
# Only missing values are removed here; no range/validity filtering is applied.
df = df.dropna(
    axis='index',
    how='any',
    subset=['Trip_Duration', 'Year_of_Birth', 'Gender', 'Origin_Id', 'Destination_Id'],
)


In [31]:
# Feature matrix (six input columns) and regression target (trip duration in seconds)
X = df.loc[:, ['Year_of_Birth', 'Gender', 'Origin_Id', 'Destination_Id', 'Start_Hour', 'Start_DayOfWeek']]
y = df.loc[:, 'Trip_Duration']


In [32]:
# Column groups used by the preprocessing step: numerical columns are scaled,
# categorical columns are one-hot encoded.
# NOTE(review): Origin_Id / Destination_Id look like identifiers but are
# treated as numeric magnitudes here — confirm this is intended.
numerical_features = [
    "Year_of_Birth",
    "Origin_Id",
    "Destination_Id",
    "Start_Hour",
    "Start_DayOfWeek",
]
categorical_features = ["Gender"]


In [33]:
# Preprocessing pipeline: standardize the numerical columns and one-hot encode
# the categorical ones (the first category of each is dropped as the baseline).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore' keeps predict() from raising when a category
        # that was absent from the training data shows up later (e.g. when the
        # saved model is used on new data). Such a value is encoded as all
        # zeros, i.e. the same as the dropped baseline category, and
        # scikit-learn emits a warning.
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ]
)

In [34]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [35]:
# Display the (rows, columns) shape of the training feature matrix
X_train.shape

(344468, 6)

In [36]:
# Baseline model: the preprocessing step followed by a plain linear regression
lr_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression()),
])

In [37]:
# Fit the baseline on the training split, then predict the held-out rows
# (fit() returns the pipeline itself, so the two calls can be chained)
y_pred_lr = lr_pipeline.fit(X_train, y_train).predict(X_test)


In [38]:
# Score the baseline on the test split: MAE and RMSE (square root of the MSE),
# both in the target's unit (seconds)
mae_lr = mean_absolute_error(y_true=y_test, y_pred=y_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_lr))

In [39]:
# Report the baseline's test-set errors, one metric per line
print(
    f"Linear Regression MAE: {mae_lr}",
    f"Linear Regression RMSE: {rmse_lr}",
    sep="\n",
)

Linear Regression MAE: 286.9239296198802
Linear Regression RMSE: 508.7994380443073


In [40]:
# Advanced Model: Random Forest Regressor
# The preprocessor is cloned so this pipeline fits its own ColumnTransformer.
# Pipeline fits its steps in place without copying them, so passing the same
# `preprocessor` object used by lr_pipeline would re-fit that shared instance.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    # Shallow forest (depth 3, 100 trees) trained on 2 worker processes/threads;
    # random_state fixes the bootstrap/feature sampling for reproducibility.
    ('regressor', RandomForestRegressor(n_estimators=100, max_depth=3, random_state=42, n_jobs=2))
])

# Train on the training split and predict the held-out rows
rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)

# Same metrics as the baseline so the two models are directly comparable
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))

print(f"Random Forest Regressor MAE: {mae_rf}")
print(f"Random Forest Regressor RMSE: {rmse_rf}")


Random Forest Regressor MAE: 273.4928774507655
Random Forest Regressor RMSE: 504.2735714430493


In [42]:
# Persist the fitted Random Forest pipeline (preprocessing + model).
# The file name records the test MAE and the date the model was saved.
# `joblib`, `datetime` and `Path` are imported in the notebook's top import cell.
current_date = datetime.datetime.now().strftime("%Y-%m-%d")

model_filename = f"../models/random_forest_mae-{mae_rf:.4f}_{current_date}.pkl"

# Create the output directory first: joblib.dump does not create missing
# folders and would otherwise fail on a fresh checkout.
Path(model_filename).parent.mkdir(parents=True, exist_ok=True)

# Save the trained model to a file
joblib.dump(rf_pipeline, model_filename)

print(f"Model saved successfully as {model_filename}")


Model saved successfully as ../models/random_forest_mae-273.4929_2025-01-29.pkl
