# Experiment 2: Downstream Model Impact Assessment

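This experiment evaluates how imputation quality affects downstream prediction models.
We compare four imputation methods (ordinary and hierarchical variants of linear-regression and Gaussian-process imputers) by training SVR models on each imputed training set and measuring prediction performance on a held-out test set.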

In [None]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

from models.linear_regression import LinearRegression
from models.gaussian_process import GaussianProcessRegression
from imputation.chained_imputer import ChainedImputer

# Seed NumPy's global RNG so the np.random draws in the following cells
# (group labels, features, noise, mask indices) are repeatable when the
# notebook is run top to bottom.
np.random.seed(42)
# Use seaborn's 'whitegrid' theme for the figures produced below.
sns.set_style('whitegrid')

## 1. Generate Dataset

In [None]:
# Generate a synthetic regression dataset with a hierarchical (per-group)
# structure: every sample belongs to one group, and each group shifts the
# target by a fixed offset.
n_samples = 300

# Group-specific additive offsets on the target. The group labels are taken
# from this mapping so the sampled labels and the offsets cannot drift apart.
group_effects = {'A': 2.0, 'B': -1.5, 'C': 1.0, 'D': -0.5}
group_labels = list(group_effects)
n_groups = len(group_labels)

# Assign each sample to a group uniformly at random
groups = np.random.choice(group_labels, size=n_samples)

# Generate features (independent standard-normal draws)
X1 = np.random.randn(n_samples)
X2 = np.random.randn(n_samples)
X3 = np.random.randn(n_samples)  # Will be masked
X4 = np.random.randn(n_samples)  # Does not enter the target below

# Target: linear terms in x1..x3, a quadratic term in x1 and an x2*x3
# interaction, plus the group offset and Gaussian noise (std 0.5).
y = 3.0 * X1 + 2.0 * X2 + 1.5 * X3 + 0.5 * X1**2 + 0.3 * X2 * X3
# Look up every sample's group offset once and add them in a single
# vectorised step instead of a per-row Python loop.
y += np.array([group_effects[label] for label in groups])
y += np.random.randn(n_samples) * 0.5

# Create DataFrame
data = pd.DataFrame({
    'group': groups,
    'x1': X1,
    'x2': X2,
    'x3': X3,
    'x4': X4,
    'y': y
})

print(f"Dataset shape: {data.shape}")
print(data.head())

## 2. Split Data and Mask Values

In [None]:
# Hold out 30% of the rows as the test set (fixed split via random_state).
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

print(f"Train size: {len(train_data)}")
print(f"Test size: {len(test_data)}")

# Introduce missingness in the training data only: choose 30% of the
# training rows (by index label, without replacement) and blank out x3 there.
# The test set is left complete.
mask_ratio = 0.3
n_mask = int(len(train_data) * mask_ratio)
mask_indices = np.random.choice(train_data.index, size=n_mask, replace=False)

# Build the masked copy in one expression: x3 becomes NaN wherever the row's
# index label was selected above; all other columns are carried over as-is.
train_data_masked = train_data.assign(
    x3=train_data['x3'].mask(train_data.index.isin(mask_indices))
)

print(f"\nMasked {n_mask} values in training data")

## 3. Perform Imputation with Four Methods

In [None]:
# Imputation methods under comparison, described as
# (display name, base-model class, random-effects columns).
# "Hierarchical" variants pass the 'group' column as a random effect;
# "Ordinary" variants pass None.
method_specs = [
    ('Ordinary LR', LinearRegression, None),
    ('Hierarchical LR', LinearRegression, ['group']),
    ('Ordinary GP', GaussianProcessRegression, None),
    ('Hierarchical GP', GaussianProcessRegression, ['group']),
]

# Each imputer gets its own freshly constructed base model; all share the
# same single-imputation setting and seed.
methods = {
    name: ChainedImputer(
        base_model=model_cls(),
        random_effects=effects,
        n_imputations=1,
        random_state=42
    )
    for name, model_cls, effects in method_specs
}

# Run every imputer on the masked training data.
imputed_datasets = {}
for method_name, imputer in methods.items():
    print(f"Imputing with {method_name}...")
    imputed = imputer.fit_transform(train_data_masked)
    # Keep the first element of the result.
    # NOTE(review): presumably fit_transform returns one dataset per
    # imputation (n_imputations=1 here) - confirm against ChainedImputer.
    imputed_datasets[method_name] = imputed[0]

print("\nImputation complete!")

## 4. Train SVR Models on Imputed Datasets

In [None]:
# Define SVR hyperparameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1],
    'epsilon': [0.01, 0.1, 0.2]
}

# Feature columns (exclude group and target)
feature_cols = ['x1', 'x2', 'x3', 'x4']

# The held-out test set is the same for every imputation method, so extract
# it once instead of rebuilding the arrays on every loop iteration.
X_test = test_data[feature_cols].values
y_test = test_data['y'].values

# One dict per imputation method: test MSE, test R² and the selected
# hyperparameters.
results = []

for method_name, train_imputed in imputed_datasets.items():
    print(f"\n{'='*50}")
    print(f"Training SVR with {method_name}")
    print('='*50)

    # Prepare training data from this method's imputed training set
    X_train = train_imputed[feature_cols].values
    y_train = train_imputed['y'].values

    # Standardize features: statistics come from the training data only and
    # are then applied to the test data.
    # NOTE(review): the scaler is fit on the full training set before
    # GridSearchCV splits it, so each CV validation fold contributes to the
    # scaling statistics. Wrapping scaler + SVR in a Pipeline would avoid
    # this; left as-is here so the reported parameter names are unchanged.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Grid search with 5-fold cross-validation, selecting by (negated) MSE
    svr = SVR(kernel='rbf')
    grid_search = GridSearchCV(
        svr, param_grid, cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1, verbose=0
    )
    grid_search.fit(X_train_scaled, y_train)

    # Best model found by the grid search
    best_svr = grid_search.best_estimator_
    print(f"Best parameters: {grid_search.best_params_}")

    # Evaluate on the held-out test set
    y_pred = best_svr.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test MSE: {mse:.4f}")
    print(f"Test R²: {r2:.4f}")

    results.append({
        'Method': method_name,
        'MSE': mse,
        'R2': r2,
        'Best_Params': grid_search.best_params_
    })

## 5. Compare Results

In [None]:
# Collect the per-method metrics into a table (one row per imputation method).
results_df = pd.DataFrame(results)

# Horizontal rule used to frame the report.
rule = "=" * 60

# Summary table of test-set metrics.
print("\n" + rule)
print("DOWNSTREAM MODEL PERFORMANCE COMPARISON")
print(rule)
summary_table = results_df[['Method', 'MSE', 'R2']]
print(summary_table.to_string(index=False))
print("\n" + rule)

# Winners: the row with the smallest MSE and the row with the largest R².
lowest_mse_row = results_df['MSE'].idxmin()
print(f"Best method (lowest MSE): {results_df.loc[lowest_mse_row, 'Method']}")
highest_r2_row = results_df['R2'].idxmax()
print(f"Best method (highest R²): {results_df.loc[highest_r2_row, 'Method']}")
print(rule)

## 6. Visualization

In [None]:
# Side-by-side bar charts of test MSE and test R² per imputation method.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# One colour per bar, shared by both panels.
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']


def _draw_metric_bars(ax, column, ylabel, title):
    """Draw one bar per method for ``results_df[column]`` on ``ax`` and apply
    the formatting shared by both panels."""
    ax.bar(results_df['Method'], results_df[column], color=colors)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', alpha=0.3)


# Left panel: MSE comparison
_draw_metric_bars(
    ax1, 'MSE', 'Mean Squared Error', 'SVR Performance: MSE Comparison'
)

# Right panel: R² comparison, with a dashed reference line at zero
_draw_metric_bars(
    ax2, 'R2', 'R² Score', 'SVR Performance: R² Comparison'
)
ax2.axhline(y=0, color='k', linestyle='--', alpha=0.3)

# Write the figure to disk, then display it.
plt.tight_layout()
plt.savefig('experiment_2_results.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nVisualization saved as 'experiment_2_results.png'")

## Conclusion

This experiment examines how imputation quality affects downstream prediction models. Better imputation methods are expected to lead to better SVR performance on the held-out test set, as measured by lower MSE and higher R² scores.