LINEAR REGRESSION AND RANDOM FOREST

In [None]:
# FlowCastAI - Traffic Volume Prediction
# --------------------------------------
# Models used: Linear Regression, Random Forest

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Read the raw traffic and weather tables from CSV files in the working directory
traffic_df, weather_df = (
    pd.read_csv(csv_name) for csv_name in ('Traffic_Data.csv', 'Weather_Data.csv')
)

# Reshape traffic data: wide -> long format for hourly volumes.
# Each of the 24 columns hour_00 .. hour_23 becomes its own row, so the result
# holds one traffic_volume value per original row and hour column.
hour_cols = [f'hour_{i:02d}' for i in range(24)]
traffic_long = traffic_df.melt(
    id_vars=['station_key', 'year', 'month', 'day', 'day_of_week', 'public_holiday', 'school_holiday'],
    value_vars=hour_cols,
    var_name='hour',
    value_name='traffic_volume'
)

# Convert date and hour columns to usable format
# The year/month/day columns are assembled into a single datetime column.
traffic_long['date'] = pd.to_datetime(traffic_long[['year', 'month', 'day']])
# After melt, 'hour' holds the source column name (e.g. 'hour_07'); keep only
# its digits as an integer. expand=False makes str.extract return a Series
# rather than a one-column DataFrame, so the column assignment does not rely
# on the frame happening to have exactly one column.
traffic_long['hour'] = traffic_long['hour'].str.extract(r'(\d+)', expand=False).astype(int)

# Normalise the weather dates: cast to string, trim surrounding whitespace,
# then parse day-first. Values that cannot be parsed become NaT (errors='coerce').
weather_df['date'] = pd.to_datetime(
    weather_df['date'].astype(str).str.strip(),
    errors='coerce',
    dayfirst=True,
)

# Inner join: keep only the dates that appear in both traffic and weather data
df = traffic_long.merge(weather_df, on='date', how='inner')

# Discard rows missing the target or any of the weather features used below
df = df.dropna(subset=['traffic_volume', 'daily_rain', 'max_temp', 'min_temp'])

# Model inputs (time, holiday flags, weather) and the prediction target
feature_cols = ['hour', 'day_of_week', 'public_holiday', 'school_holiday', 'daily_rain', 'max_temp', 'min_temp']
y = df['traffic_volume']

# Dummy-encode the two holiday flags, dropping the first level of each.
# NOTE(review): 'hour' and 'day_of_week' are passed through without dummy
# encoding -- confirm that treating them as plain columns is intended.
X = pd.get_dummies(
    df[feature_cols],
    columns=['public_holiday', 'school_holiday'],
    drop_first=True,
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ---- Train models ----

# Linear Regression: fit on the training split, then predict the held-out rows
# (fit() returns the fitted estimator, so construction and fitting are chained)
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Random Forest: 100 trees, seeded so repeated runs build the same forest
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# ---- Evaluation ----

def get_scores(name, y_true, y_pred):
    """Return one summary row of evaluation metrics for a model.

    Args:
        name: Label stored under the 'Model' key.
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        A dict with keys 'Model', 'R2 Score' (rounded to 4 decimal places)
        and 'MSE' (rounded to 2 decimal places).
    """
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    return {'Model': name, 'R2 Score': round(r2, 4), 'MSE': round(mse, 2)}

# Score each model's test-set predictions, one summary row per model
results = [
    get_scores(label, y_test, predictions)
    for label, predictions in (
        ("Linear Regression", y_pred_lr),
        ("Random Forest", y_pred_rf),
    )
]

# Tabulate the rows so the models can be compared side by side
results_df = pd.DataFrame(results)

# Print results
print("Model Evaluation Summary:")
print(results_df)

Model Evaluation Summary:
               Model  R2 Score      MSE
0  Linear Regression    0.0126  4156.05
1      Random Forest    0.3513  2730.60
