### Model Stats CSV
Show the shape of the CSV file.<br>
Show the percentage of "is_better" for each dataset and overall.

In [1]:
import pandas as pd

# Load the model statistics table and report its (rows, columns) shape.
df = pd.read_csv('./model_stats.csv')
print(df.shape)

# Relative frequency of each 'scores_is_better' value inside every
# (dataset_name, dataset_group) pair, scaled to a percentage.
grouped_outcomes = df.groupby(['dataset_name', 'dataset_group'])['scores_is_better']
per_dataset_share = grouped_outcomes.value_counts(normalize=True)
print("\nPercentage of 'is_better' for each dataset:")
print(per_dataset_share * 100)

# Same percentage, computed over all rows at once.
overall_share = df['scores_is_better'].value_counts(normalize=True)
print("\nPercentage of 'is_better' overall:")
print(overall_share * 100)

(4320, 550)

Percentage of 'is_better' for each dataset:
dataset_name  dataset_group  scores_is_better
Gluonts       m1_monthly     True                70.138889
                             False               29.861111
              m1_quarterly   True                93.055556
                             False                6.944444
M3            Monthly        True                82.638889
                             False               17.361111
              Quarterly      True                59.027778
                             False               40.972222
Tourism       Monthly        False               90.277778
                             True                 9.722222
              Quarterly      False               81.944444
                             True                18.055556
Name: proportion, dtype: float64

Percentage of 'is_better' overall:
scores_is_better
True     55.439815
False    44.560185
Name: proportion, dtype: float64


### Feature Engineering (UNUSED)
Create new features:
- For each layer and weight statistic, calculate the difference between consecutive training steps
- ...

In [2]:
import pandas as pd
import re

# Load the model statistics table.
df = pd.read_csv('./model_stats.csv')

# Training steps (in order), layers and weight statistics to build
# step-to-step difference features from.
steps = [
    'step_10', 'step_25', 'step_50', 'step_100',
    'step_200', 'step_300', 'step_400', 'step_500',
]
layers = ['mlp.0', 'mlp.1', 'mlp.2']  # Adapt if you have more
stats = [
    'weight_mean',
    'weight_std',
    'weight_var',
    'weight_frobenius_norm',
    'weight_spectral_norm',
    'weight_alpha_hat',
]  # etc.

# Columns whose name starts with 'weights_step_<digits>_mlp.<digits>.<word>'.
pattern = re.compile(r'weights_step_(\d+)_mlp\.(\d+)\.(\w+)')
relevant_cols = list(filter(pattern.match, df.columns))

# Collect every difference column in a dict first, keyed by the new column name.
features = {}

for stat in stats:
    for layer in layers:
        # Walk over adjacent (earlier, later) step pairs.
        for earlier, later in zip(steps, steps[1:]):
            earlier_col = f'weights_{earlier}_{layer}.{stat}'
            later_col = f'weights_{later}_{layer}.{stat}'
            # Skip pairs for which either source column is absent from the table.
            if earlier_col not in df.columns or later_col not in df.columns:
                continue
            features[f'diff_{later}_{earlier}_{layer}_{stat}'] = df[later_col] - df[earlier_col]

In [3]:
# # Concatenate all new features at once
# features_df = pd.DataFrame(features)
# df = pd.concat([df, features_df], axis=1)

# # Save or return the enhanced dataframe
# df.to_csv('./new_model_stats.csv', index=False)

## Stages Summary

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

In [5]:
# LINE PLOTS for CLASSIFICATION METRICS
# Reads the stage-wise classification summary and saves one "metric vs. stage"
# PNG per metric into output/images/classification/.
df_classification = pd.read_csv('./output/classification/stagewise_summary.csv')

# Metrics to plot; each one is read from its '<metric>_mean' column.
base_metrics = ['acc_score', 'roc_auc_score', 'log_loss_score', 'f1_score']

# Set to True to draw the '<metric>_std' column as error bars around the mean.
# Off by default, so only the mean curve is drawn and the std column is not read.
show_std_error_bars = False

output_dir = os.path.join("output", "images", "classification")
os.makedirs(output_dir, exist_ok=True)

for metric in base_metrics:
    mean_col = f"{metric}_mean"
    std_col = f"{metric}_std"

    # Explicit figure/axes instead of the implicit pyplot "current figure",
    # so each iteration only ever touches the figure it created.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.errorbar(
        df_classification['stage'],
        df_classification[mean_col],
        yerr=df_classification[std_col] if show_std_error_bars else None,
        fmt='o-',
        capsize=5,
        label=metric,
    )
    ax.set_ylabel(metric)
    ax.set_xlabel('Stage')
    ax.set_title(f"Evolution of '{metric}' (Classification)")
    ax.grid(True)
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    ax.legend()

    # Figures are written to disk rather than displayed inline.
    fig.savefig(os.path.join(output_dir, f"{metric}_evolution.png"))
    # Close the figure so figures do not accumulate in memory across iterations.
    plt.close(fig)

In [6]:
# LINE PLOTS for REGRESSION METRICS
# Reads the stage-wise regression summary and saves one "metric vs. stage"
# PNG per metric into output/images/regression/.
df_regression = pd.read_csv('./output/regression/stagewise_summary.csv')

# Metrics to plot; each one is read from its '<metric>_mean' column.
base_metrics = ['mae_score', 'mse_score', 'r2_score', 'pearson', 'kendall', 'spearman']

# Set to True to draw the '<metric>_std' column as error bars around the mean.
# Off by default, so only the mean curve is drawn and the std column is not read.
show_std_error_bars = False

output_dir = os.path.join("output", "images", "regression")
os.makedirs(output_dir, exist_ok=True)

for metric in base_metrics:
    mean_col = f"{metric}_mean"
    std_col = f"{metric}_std"

    # Explicit figure/axes instead of the implicit pyplot "current figure",
    # so each iteration only ever touches the figure it created.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.errorbar(
        df_regression['stage'],
        df_regression[mean_col],
        yerr=df_regression[std_col] if show_std_error_bars else None,
        fmt='o-',
        capsize=5,
        label=metric,
    )
    ax.set_ylabel(metric)
    ax.set_xlabel('Stage')
    ax.set_title(f"Evolution of '{metric}' (Regression)")
    ax.grid(True)
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    ax.legend()

    # Figures are written to disk rather than displayed inline.
    fig.savefig(os.path.join(output_dir, f"{metric}_evolution.png"))
    # Close the figure so figures do not accumulate in memory across iterations.
    plt.close(fig)

### Insights
* **Step\_10 Performance Drop**: Sharp degradation at step\_10 (highest MAE/MSE, lowest R² and correlations) indicates unstable learning early on.
* **Improvement Over Time**: MAE and MSE decline with more stages, showing better error control as training progresses.
* **Negative R² Throughout**: R² remains < 0 at all stages—model fits worse than a naive mean predictor.
* **Rank Correlation Improves**: Spearman and Kendall scores rise steadily to \~0.76, indicating good relative ranking of predictions.
* **Plateaus Detected**: Metrics stabilize after step\_100 and again after step\_400; performance gains flatten despite increased computation.
* **High Early Variance**: Step\_10 shows largest standard deviations—early predictions are inconsistent.
* **Time vs Benefit**: Time increases \~6× (3.45s → 22.04s) but error and correlation gains diminish after step\_100.
* **Actionable Insight**: Consider early stopping around step\_100–200; review model design to address poor absolute accuracy (negative R²).
* **Model Strength**: Performs well in ranking predictions, suitable for tasks where order matters more than exact values.