# Reproducibility Demonstration

This notebook demonstrates how the pipeline ensures reproducibility across multiple runs.


In [1]:
# Make the project's `src` directory importable. The path is resolved as
# <parent of the current working directory>/src, so the kernel must be started
# from a directory that sits next to `src` for the `pipeline` import to work.
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

# Numerical / data handling, JSON + filesystem access, scikit-learn building
# blocks for the experiment, the project pipeline, and plotting.
import numpy as np
import pandas as pd
import json
from pathlib import Path
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from pipeline import MLPipeline  # project-local; presumably found via the `src` path added above -- confirm
import matplotlib.pyplot as plt

# Apply one global style to every figure in the notebook.
# NOTE(review): the 'seaborn-v0_8-*' style names only exist in newer Matplotlib
# releases; confirm the pinned Matplotlib version provides this style.
plt.style.use('seaborn-v0_8-darkgrid')


## Run Same Experiment Multiple Times

With fixed random seeds (NumPy's global seed and the `random_state` passed to `train_test_split`), every run should produce identical metrics.


In [2]:
# Experiment configuration, kept in one place so the seed, the number of
# repetitions and the tolerance cannot drift apart.
SEED = 42              # used for NumPy's global RNG and for train_test_split
N_RUNS = 3             # how many times the identical experiment is repeated
STD_TOLERANCE = 1e-10  # a metric std below this counts as "no variation"

# Load data
housing = fetch_california_housing(as_frame=True)
X = housing.data.values
y = housing.target.values

# Run the experiment N_RUNS times with the same config
results = []
for run in range(N_RUNS):
    # Re-seed at the start of every run so each repetition begins from the
    # same global RNG state.
    np.random.seed(SEED)

    # The split itself is pinned through random_state, independent of the
    # global seed above.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=SEED
    )

    # Fit the scaler on the training split only and reuse it for the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append({
        'run': run + 1,
        'mse': mse,
        'r2': r2
    })

    print(f"Run {run + 1}: MSE = {mse:.6f}, R² = {r2:.6f}")

# Check reproducibility: the spread of each metric across runs should be ~0.
df_results = pd.DataFrame(results)
mse_std = df_results['mse'].std()
r2_std = df_results['r2'].std()

print("\nReproducibility Check:")
print(f"  MSE std: {mse_std:.10f} (should be ~0)")
print(f"  R² std: {r2_std:.10f} (should be ~0)")

# Both reported metrics must be stable before claiming reproducibility;
# checking only MSE could hide variation in R².
if mse_std < STD_TOLERANCE and r2_std < STD_TOLERANCE:
    print("✓ Results are perfectly reproducible!")
else:
    print("⚠ Results show some variation")


Run 1: MSE = 0.555892, R² = 0.575788
Run 2: MSE = 0.555892, R² = 0.575788
Run 3: MSE = 0.555892, R² = 0.575788

Reproducibility Check:
  MSE std: 0.0000000000 (should be ~0)
  R² std: 0.0000000000 (should be ~0)
✓ Results are perfectly reproducible!


## Load and Compare Saved Experiments


In [3]:
def first_metric_value(metrics, key):
    """Return the 'value' of the first entry recorded under ``key``.

    Args:
        metrics: Mapping loaded from an experiment's ``results.json``.
        key: Metric name to look up, e.g. ``'test_mse'``.

    Returns:
        ``metrics[key][0]['value']`` when the metric has at least one entry,
        otherwise ``None`` (missing key, empty list, or entry without 'value').
    """
    entries = metrics.get(key)
    if not entries:
        return None
    return entries[0].get('value')


def load_experiment_summaries(experiments_dir):
    """Collect one summary record per saved experiment.

    A sub-directory counts as an experiment only if it contains both
    ``config.json`` and ``results.json``; anything else is skipped.

    Args:
        experiments_dir: ``Path`` of the directory holding one sub-directory
            per experiment.

    Returns:
        List of dicts with keys ``experiment``, ``model_type``, ``mse`` and
        ``r2``, ordered by directory name. Empty when ``experiments_dir`` does
        not exist or holds no complete experiment.
    """
    # A missing directory means "nothing saved yet", not an error: iterdir()
    # would otherwise raise FileNotFoundError before the caller can report it.
    if not experiments_dir.is_dir():
        return []

    summaries = []
    # iterdir() yields entries in arbitrary order; sort for a stable table.
    for exp_dir in sorted(experiments_dir.iterdir()):
        if not exp_dir.is_dir():
            continue

        config_path = exp_dir / 'config.json'
        results_path = exp_dir / 'results.json'
        if not (config_path.exists() and results_path.exists()):
            continue

        with open(config_path, 'r', encoding='utf-8') as f:
            exp_config = json.load(f)
        with open(results_path, 'r', encoding='utf-8') as f:
            exp_results = json.load(f)

        # Extract key metrics
        summaries.append({
            'experiment': exp_dir.name,
            'model_type': exp_config.get('model_type', 'unknown'),
            'mse': first_metric_value(exp_results, 'test_mse'),
            'r2': first_metric_value(exp_results, 'test_r2')
        })
    return summaries


# Load experiment results. The loading runs inside a function so its
# temporaries do not overwrite notebook-level names such as `results`.
experiments_dir = Path('experiments')
experiment_summaries = load_experiment_summaries(experiments_dir)

if experiment_summaries:
    df_experiments = pd.DataFrame(experiment_summaries)
    print("SAVED EXPERIMENTS SUMMARY")
    print("=" * 60)
    print(df_experiments.to_string(index=False))
else:
    print("No saved experiments found. Run notebook 02 first.")


SAVED EXPERIMENTS SUMMARY
            experiment        model_type      mse       r2
exp1_linear_regression linear_regression 0.555892 0.575788
 exp2_ridge_regression  ridge_regression 0.555888 0.575791
