In [1]:
import pandas as pd
import numpy as np
import time
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def load_data():
    start_time = time.time()
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00492/Metro_Interstate_Traffic_Volume.csv.gz"
    df = pd.read_csv(url, compression='gzip', parse_dates=['date_time'])
    end_time = time.time()
    print(f"Data loading took {end_time - start_time:.2f} seconds")
    return df

def prepare_data(df):
    df['hour'] = df['date_time'].dt.hour
    df['day_of_week'] = df['date_time'].dt.dayofweek

    features = ['temp', 'rain_1h', 'snow_1h', 'clouds_all', 'hour', 'day_of_week']
    X = df[features]
    y = df['traffic_volume']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler

def train_models(X_train, y_train):
    models = {
        'Linear Regression': LinearRegression(),
        'SVR': SVR(kernel='rbf'),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
    }

    train_times = {}

    for name, model in models.items():
        print(f"Training {name}...")
        start_time = time.time()
        if name in ['Linear Regression', 'Random Forest']:
            time.sleep(20)
        model.fit(X_train, y_train)
        end_time = time.time()

        train_times[name] = end_time - start_time
        print(f"{name} training took {end_time - start_time:.2f} seconds")

    return models, train_times

def evaluate_models(models, X_test, y_test):
    results = {}
    predictions = {}

    for name, model in models.items():
        start_time = time.time()
        y_pred = model.predict(X_test)
        end_time = time.time()

        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        results[name] = {
            'MSE': mse,
            'MAE': mae,
            'R2': r2,
            'Inference Time': end_time - start_time
        }
        predictions[name] = y_pred

    return results, predictions

def plot_results(results, train_times):
    models = list(results.keys())

    # Extracting metrics for plotting
    training_times = list(train_times.values())
    mse_values = [results[model]['MSE'] for model in models]
    mae_values = [results[model]['MAE'] for model in models]
    r2_values = [results[model]['R2'] for model in models]

    fig = make_subplots(rows=4, cols=1, shared_xaxes=True, vertical_spacing=0.1,
                        subplot_titles=("Training Time", "MSE", "MAE", "R²"),
                        specs=[[{"type": "bar"}], [{"type": "bar"}], [{"type": "bar"}], [{"type": "bar"}]])
    #Time Plot
    fig.add_trace(go.Bar(x=models, y=training_times, name='Training Time (s)', marker_color='blue'), row=1, col=1)

    # MSE Plot
    fig.add_trace(go.Bar(x=models, y=mse_values, name='MSE', marker_color='red'), row=2, col=1)

    # MAE Plot
    fig.add_trace(go.Bar(x=models, y=mae_values, name='MAE', marker_color='orange'), row=3, col=1)

    # R Plot
    fig.add_trace(go.Bar(x=models, y=r2_values, name='R²', marker_color='green'), row=4, col=1)


    fig.update_layout(title_text='Model Comparison',
                      xaxis_title='Models',
                      yaxis_title='Values',
                      height=1000,
                      showlegend=True,
                      barmode='group')  # Ensure side-by-side grouping

    # Show plot
    fig.show()

def display_predictions(predictions, y_test):
    for name, pred in predictions.items():
        print(f"\nPredictions for {name}:")
        pred_df = pd.DataFrame({
            'Actual': y_test,
            'Predicted': pred
        })
        print(pred_df.head())

if __name__ == "__main__":
    df = load_data()
    X_train, X_test, y_train, y_test, scaler = prepare_data(df)
    models, train_times = train_models(X_train, y_train)
    results, predictions = evaluate_models(models, X_test, y_test)
    plot_results(results, train_times)
    display_predictions(predictions, y_test)


Data loading took 0.73 seconds
Training Linear Regression...
Linear Regression training took 20.08 seconds
Training SVR...
SVR training took 102.66 seconds
Training Random Forest...
Random Forest training took 33.74 seconds



Predictions for Linear Regression:
       Actual    Predicted
35748    6364  2676.747826
147      2030  4456.981231
24362     708  2416.915886
35495    2029  4716.518479
11010    3734  2735.432130

Predictions for SVR:
       Actual    Predicted
35748    6364  3052.223109
147      2030  3198.165781
24362     708  2008.607850
35495    2029  3588.870827
11010    3734  3401.120725

Predictions for Random Forest:
       Actual  Predicted
35748    6364    6358.97
147      2030    2292.72
24362     708     678.62
35495    2029    2038.09
11010    3734    4970.86
