# Comprehensive Evaluation of Test-Time Scaling Methods

This notebook provides:
1. Comparative analysis of all scaling methods
2. Statistical significance testing
3. Visualization of results
4. Robustness evaluation
5. Conclusions and recommendations

In [None]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
from utils.preprocessing import ScalingManager
import joblib
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

## 1. Load Test Data and Baseline Model

In [None]:
# Load the test features/labels and the serialized baseline model from
# ../data/processed (paths are relative to this notebook's directory).
X_test = np.load('../data/processed/X_test.npy')
y_test = np.load('../data/processed/y_test.npy')
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model artifacts produced by a trusted pipeline.
model = joblib.load('../data/processed/baseline_model.pkl')

# Create the scaling manager used by every cell below.
# NOTE(review): no fit/load call on the manager is visible in this notebook;
# presumably ScalingManager() restores previously fitted scalers on its own --
# confirm, otherwise transform() would run on unfitted scalers.
scaling_manager = ScalingManager()

## 2. Comprehensive Performance Analysis

In [None]:
def get_metrics(y_true, y_pred):
    """Summarise classification quality as a dict of four scores.

    Precision, recall and F1 are computed with ``average='weighted'``;
    accuracy comes from ``accuracy_score``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'`` (in that order).
    """
    weighted = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    scores = {'accuracy': accuracy_score(y_true, y_pred)}
    # Only the first three entries are reported; the trailing support value
    # is not used.
    scores.update(zip(('precision', 'recall', 'f1'), weighted[:3]))
    return scores

# Names of the scalers evaluated through scaling_manager.transform.
scaling_methods = ['standard', 'quantile', 'robust', 'minmax']
# Maps method name -> metrics dict returned by get_metrics.
results = {}

# Evaluate each scaling method: scale the test set, predict with the
# already-loaded model, and record the four summary metrics.
for method in scaling_methods:
    X_test_scaled = scaling_manager.transform(X_test, method)  # Transform the data
    y_pred = model.predict(X_test_scaled)
    results[method] = get_metrics(y_test, y_pred)

# Add test-time z-score results
# NOTE(review): presumably test_time_zscore standardises with statistics taken
# from X_test itself rather than from training data -- confirm in
# utils.preprocessing.
X_test_zscore = scaling_manager.test_time_zscore(X_test)
y_pred = model.predict(X_test_zscore)
results['test_time_zscore'] = get_metrics(y_test, y_pred)

# Convert results to DataFrame
# Transposed so that each row is a scaling method and each column a metric.
results_df = pd.DataFrame(results).T
print("Performance Metrics:")
print(results_df)

## 3. Statistical Significance Testing

In [None]:
def get_predictions(X):
    """Return the notebook-level ``model``'s predictions for ``X``.

    Thin convenience wrapper around ``model.predict``.
    """
    predictions = model.predict(X)
    return predictions

# Perform McNemar's test for statistical significance
def mcnemar_test(pred1, pred2, y_true):
    """McNemar's chi-squared test (with continuity correction) on paired predictions.

    The test looks only at the discordant samples, i.e. those that exactly one
    of the two methods classifies correctly.

    Args:
        pred1: Predictions of the first method (array-like).
        pred2: Predictions of the second method (array-like), aligned with
            ``pred1``.
        y_true: Ground-truth labels (array-like), aligned with both.

    Returns:
        ``(statistic, p_value)`` as Python floats. The statistic is
        ``(|b - c| - 1)**2 / (b + c)`` and the p-value is the upper tail of a
        chi-squared distribution with one degree of freedom. When there are no
        discordant samples the methods are indistinguishable on this data, so
        ``(0.0, 1.0)`` is returned.
    """
    # Coerce so that plain lists compare element-wise instead of as a whole.
    y_true = np.asarray(y_true)
    correct1 = np.asarray(pred1) == y_true
    correct2 = np.asarray(pred2) == y_true

    b = int(np.sum(~correct1 & correct2))  # method1 wrong, method2 right
    c = int(np.sum(correct1 & ~correct2))  # method1 right, method2 wrong

    discordant = b + c
    if discordant == 0:
        # Without this guard the statistic would be 1/0 = inf and the p-value
        # 0.0, i.e. identical predictions would look maximally significant.
        return 0.0, 1.0

    statistic = (abs(b - c) - 1) ** 2 / discordant
    p_value = float(stats.chi2.sf(statistic, df=1))

    return statistic, p_value

# Standard scaling is the reference; every other method is tested against it.
baseline_pred = model.predict(scaling_manager.transform(X_test, 'standard'))

significance_results = {}
for method in results:
    if method == 'standard':
        continue  # the baseline is not compared with itself

    # Test-time z-scoring has its own entry point; all other methods go
    # through the generic transform.
    X_scaled = (
        scaling_manager.test_time_zscore(X_test)
        if method == 'test_time_zscore'
        else scaling_manager.transform(X_test, method)
    )

    pred = model.predict(X_scaled)
    statistic, p_value = mcnemar_test(baseline_pred, pred, y_test)
    significance_results[method] = {'statistic': statistic, 'p_value': p_value}

print("\nStatistical Significance Results (vs Standard Scaling):")
print(pd.DataFrame(significance_results).T)

## 4. Visualization of Results

In [None]:
# Plot performance metrics
# DataFrame.plot opens its own figure unless it is handed an Axes, so a
# preceding plt.figure(figsize=...) stays blank and its size is ignored.
# Create the Axes explicitly and pass it to pandas instead.
fig, ax = plt.subplots(figsize=(12, 6))
results_df.plot(kind='bar', width=0.8, rot=45, ax=ax)  # rot=45 tilts the method labels
ax.set_title('Performance Comparison of Scaling Methods')
ax.set_xlabel('Scaling Method')
ax.set_ylabel('Score')
# Anchor the legend outside the axes so it does not cover the bars.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

## 5. Robustness Analysis

In [None]:
def add_noise(X, noise_level=0.1, rng=None):
    """Return ``X`` plus element-wise zero-mean Gaussian noise.

    Args:
        X: Array-like of numeric values; converted with ``np.asarray``.
        noise_level: Standard deviation of the noise, in the same units as
            ``X``.
        rng: Optional ``np.random.Generator`` to draw the noise from. When
            omitted, NumPy's global random state is used, so
            ``np.random.seed`` controls reproducibility.

    Returns:
        A new ndarray with the same shape as ``X``; the input is not modified.
    """
    X = np.asarray(X)
    # Both np.random and a Generator expose normal(loc, scale, size).
    sampler = np.random if rng is None else rng
    noise = sampler.normal(0, noise_level, X.shape)
    return X + noise

# Test robustness with different noise levels
noise_levels = [0.05, 0.1, 0.2]
NOISE_SEED = 42  # fixed so that Restart & Run All reproduces the same table
robustness_results = {}

# add_noise draws from NumPy's global random state; seed it once so the
# perturbed inputs (and therefore the accuracies below) are repeatable.
np.random.seed(NOISE_SEED)

for noise_level in noise_levels:
    # NOTE(review): the noise is added in raw feature units, before scaling;
    # if features differ widely in scale the same noise_level perturbs them
    # unevenly -- confirm this is intended.
    noisy_X = add_noise(X_test, noise_level)
    method_results = {}

    # All methods are scored on the same noisy copy so they are comparable.
    for method in scaling_methods:
        X_scaled = scaling_manager.transform(noisy_X, method)
        y_pred = model.predict(X_scaled)
        method_results[method] = accuracy_score(y_test, y_pred)

    # Add test-time z-score
    X_zscore = scaling_manager.test_time_zscore(noisy_X)
    y_pred = model.predict(X_zscore)
    method_results['test_time_zscore'] = accuracy_score(y_test, y_pred)

    robustness_results[f'noise_{noise_level}'] = method_results

# Rows are scaling methods, columns are noise levels.
robustness_df = pd.DataFrame(robustness_results)
print("Robustness Analysis (Accuracy with Different Noise Levels):")
print(robustness_df)

## 6. Conclusions and Recommendations

The items below are prompts for the conclusions to record once the cells above have been run; replace each bullet with the observed result:

1. **Performance Comparison**:
   - Compare the overall performance metrics
   - Note which method performed best for each metric

2. **Statistical Significance**:
   - Discuss which methods showed significant differences
   - Interpret p-values and their implications

3. **Robustness**:
   - Evaluate which methods were most resistant to noise
   - Consider trade-offs between performance and robustness

4. **Recommendations**:
   - Suggest the best scaling method(s) for different scenarios
   - Consider computational cost and ease of implementation
   - Provide guidelines for when to use each method