### Model Stats CSV
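Load `model_stats.csv` and show its shape (rows, columns).<br>
Show the percentage of each `scores_is_better` value, both overall and for each dataset (grouped by `dataset_name` and `dataset_group`).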

In [None]:
import pandas as pd

# Load the model statistics table and report its dimensions (rows, columns).
df = pd.read_csv('./model_stats.csv')
print(df.shape)

# Share (in %) of each 'scores_is_better' value within every
# (dataset_name, dataset_group) pair.
print("\nPercentage of 'is_better' for each dataset:")
per_dataset_pct = (
    df.groupby(['dataset_name', 'dataset_group'])['scores_is_better']
    .value_counts(normalize=True)
    .mul(100)
)
print(per_dataset_pct)

# Same share, computed over the whole table.
print("\nPercentage of 'is_better' overall:")
overall_pct = df['scores_is_better'].value_counts(normalize=True).mul(100)
print(overall_pct)

### Feature Engineering
Create new features:
- For each layer and weight statistic, calculate the difference between consecutive steps (current step minus previous step)
- ...

In [None]:
import pandas as pd
import re

# Reload the statistics table so this cell does not depend on earlier cells.
df = pd.read_csv('./model_stats.csv')

# Steps, layers and weight statistics to compare.
# Consecutive entries of `steps` are paired up below; extend the lists as needed.
steps = ['step_10', 'step_25', 'step_50', 'step_100', 'step_200', 'step_300', 'step_400', 'step_500']
layers = ['mlp.0', 'mlp.1', 'mlp.2']  # Adapt if you have more
stats = ['weight_mean', 'weight_std', 'weight_var', 'weight_frobenius_norm', 'weight_spectral_norm', 'weight_alpha_hat']  # etc.

# Columns whose name starts with 'weights_step_<n>_mlp.<k>.<stat>'.
# NOTE(review): relevant_cols is not read by the loop below; it is kept
# because other cells may use it.
pattern = re.compile(r'weights_step_(\d+)_mlp\.(\d+)\.(\w+)')
relevant_cols = list(filter(pattern.match, df.columns))

# Collect every difference column in a dict first, keyed by the new column name.
features = {}
available_cols = set(df.columns)

for stat in stats:
    for layer in layers:
        # Walk over neighbouring (previous, current) step pairs.
        for prev_step, curr_step in zip(steps, steps[1:]):
            col_prev = f'weights_{prev_step}_{layer}.{stat}'
            col_curr = f'weights_{curr_step}_{layer}.{stat}'

            # Skip pairs for which either source column is absent from the CSV.
            if col_prev not in available_cols or col_curr not in available_cols:
                continue

            new_col = f'diff_{curr_step}_{prev_step}_{layer}_{stat}'
            features[new_col] = df[col_curr] - df[col_prev]

In [None]:
# NOTE(review): this cell is entirely commented out, so running it does nothing.
# If re-enabled, it would turn the `features` dict into a DataFrame, append it
# column-wise to `df`, and write the result to ./new_model_stats.csv.
# It relies on `features` and `df` already being defined by an earlier cell.

# # Concatenate all new features at once
# features_df = pd.DataFrame(features)
# df = pd.concat([df, features_df], axis=1)

# # Save or return the enhanced dataframe
# df.to_csv('./new_model_stats.csv', index=False)

## Stages Summary

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Locations of the stage-wise summary tables for the two tasks.
classification_summary_csv = './output/classification/stagewise_summary.csv'
regression_summary_csv = './output/regression/stagewise_summary.csv'

# Load both summaries; the plotting cells below read these two frames.
df_classification = pd.read_csv(classification_summary_csv)
df_regression = pd.read_csv(regression_summary_csv)

In [None]:
# Plot how each classification metric evolves across stages, one figure per metric
metrics = ['acc_score', 'roc_auc_score', 'log_loss_score', 'f1_score', 'time']

for metric in metrics:
    # Explicit figure/axes pair so every call targets this figure only.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(df_classification['stage'], df_classification[metric], marker='o')
    ax.set(
        ylabel=metric.capitalize(),
        xlabel='Stage',
        title=f'Evolution of {metric.capitalize()} (Classification)',
    )
    ax.grid(True)
    # Rotate the stage labels so they do not overlap.
    plt.setp(ax.get_xticklabels(), rotation='vertical')
    fig.tight_layout()

    # Optional: save the plot as an image
    # fig.savefig(f'./output/images/classification_{metric}_evolution.png')
    plt.show()
    plt.close(fig)  # Release the figure to free up memory


In [None]:
# Plot how each regression metric evolves across stages, one figure per metric
metrics = ['mae_score', 'mse_score', 'r2_score', 'pearson', 'kendall', 'spearman', 'time']

for metric in metrics:
    # Explicit figure/axes pair so every call targets this figure only.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(df_regression['stage'], df_regression[metric], marker='o')
    ax.set(
        ylabel=metric.capitalize(),
        xlabel='Stage',
        title=f'Evolution of {metric.capitalize()} (Regression)',
    )
    ax.grid(True)
    # Rotate the stage labels so they do not overlap.
    plt.setp(ax.get_xticklabels(), rotation='vertical')
    fig.tight_layout()

    # Optional: save the plot as an image
    # fig.savefig(f'./output/images/regression_{metric}_evolution.png')
    plt.show()
    plt.close(fig)  # Release the figure to free up memory