In [1]:
# In 03_duration_modeling.ipynb
import pickle
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# --- Haversine Formula for Distance ---
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points using the haversine formula.

    All four arguments are coordinates in decimal degrees. They may be scalars,
    NumPy arrays, or pandas Series; the computation is element-wise and follows
    NumPy broadcasting rules.

    Args:
        lat1: Latitude of the first point(s), in degrees.
        lon1: Longitude of the first point(s), in degrees.
        lat2: Latitude of the second point(s), in degrees.
        lon2: Longitude of the second point(s), in degrees.

    Returns:
        Distance in kilometers, with the same shape/type as the broadcast inputs.
    """
    R = 6371  # Earth radius in kilometers
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    a = np.sin(dlat / 2)**2 + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon / 2)**2
    # Rounding error can push `a` a hair outside [0, 1] (e.g. for antipodal
    # points), which would make sqrt(1 - a) return NaN. Clamp before the roots.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

# --- Load and Prep Data ---
# Rows missing any of these columns are dropped before modeling.
REQUIRED_TRIP_COLUMNS = ['start_station_name', 'end_station_name', 'start_lat', 'start_lng', 'end_lat', 'end_lng']
# Only trips with a duration strictly between these bounds (in seconds) are kept.
MIN_DURATION_SEC = 60
MAX_DURATION_SEC = 10800  # 3 hours

# Built as one chain so the cell is idempotent (no in-place mutation of a
# previously loaded frame). The trailing .copy() makes `df` an independent
# frame rather than a slice of the unfiltered data, so adding columns to it
# later does not raise SettingWithCopyWarning.
df = (
    pd.read_csv('../data/Divvy_Trips_2022_Q1.csv')  # Or your chosen file
    .dropna(subset=REQUIRED_TRIP_COLUMNS)
    .assign(
        started_at=lambda trips: pd.to_datetime(trips['started_at']),
        ended_at=lambda trips: pd.to_datetime(trips['ended_at']),
        # Later keyword arguments see the columns assigned above, so both
        # timestamps are already datetimes here.
        duration_sec=lambda trips: (trips['ended_at'] - trips['started_at']).dt.total_seconds(),
    )
    .loc[lambda trips: (trips['duration_sec'] > MIN_DURATION_SEC) & (trips['duration_sec'] < MAX_DURATION_SEC)]
    .copy()
)

# --- Feature Engineering ---
# Straight-line (great-circle) distance between the start and end coordinates,
# plus two calendar features taken from the trip start timestamp.
trip_start = df['started_at']
start_point = (df['start_lat'], df['start_lng'])
end_point = (df['end_lat'], df['end_lng'])

df['distance_km'] = haversine(*start_point, *end_point)
df['hour_of_day'] = trip_start.dt.hour
df['day_of_week_num'] = trip_start.dt.dayofweek

# --- Model Training ---
target = 'duration_sec'
station_columns = ['start_station_name', 'end_station_name']
features = ['distance_km', 'hour_of_day', 'day_of_week_num'] + station_columns

# Work on a copy limited to the modeling columns, then one-hot encode the
# two station-name columns.
model_df = pd.get_dummies(df[features + [target]].copy(), columns=station_columns)

y = model_df[target]
X = model_df.drop(columns=[target])

# 80/20 split with a fixed seed so the reported score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

duration_model = RandomForestRegressor(
    n_estimators=50,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)
duration_model.fit(X_train, y_train)

# Evaluate (RMSE in seconds)
preds = duration_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"Duration Model RMSE: {rmse:.2f} seconds")

# --- Save Assets ---
# Make sure the output directory exists; without this, a fresh checkout fails
# with FileNotFoundError on the first write below.
Path('../output').mkdir(parents=True, exist_ok=True)

# The fitted model is written with joblib. The list of training column names
# is pickled alongside it (presumably so a consumer can rebuild the same
# one-hot column layout at prediction time -- confirm against the loader).
joblib.dump(duration_model, '../output/duration_model.pkl')
with open('../output/duration_model_columns.pkl', 'wb') as f:
    pickle.dump(list(X.columns), f)
print("Duration model and columns saved successfully.")

Duration Model RMSE: 604.84 seconds
Duration model and columns saved successfully.
