# Generalization Comparison and Conclusions

## Overview

This notebook synthesizes results from similarity analysis and strategic splitting experiments, providing comprehensive conclusions about dataset structure and its impact on model generalization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Import optimized similarity-aware split function
from similarity_split_optimized import similarity_aware_split_optimized

# Notebook-wide configuration: plotting theme and the seed for NumPy's legacy global RNG.
PLOT_STYLE = 'seaborn-v0_8-darkgrid'
RANDOM_SEED = 42

plt.style.use(PLOT_STYLE)
np.random.seed(RANDOM_SEED)

## Load All Saved Results

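Load the pickled results written by notebooks 01 (similarity analysis) and 02 (splitting strategies) from the `results/` directory. A missing file is reported and skipped rather than treated as an error.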

In [None]:
import pickle
import os

# Saved results live in a `results` directory that is a sibling of the current working directory.
results_dir = os.path.join(os.path.dirname(os.getcwd()), 'results')

# One entry per upstream notebook:
# (key in all_results, pickle file name, message on success, message when the file is missing)
result_sources = [
    ('similarity', '01_similarity_results.pkl',
     "✓ Loaded similarity analysis results",
     "⚠ Similarity results not found. Run notebook 01 first."),
    ('splitting', '02_splitting_results.pkl',
     "✓ Loaded splitting strategy results",
     "⚠ Splitting results not found. Run notebook 02 first."),
]

# Load all results; a missing file is reported and skipped, any other error propagates.
# NOTE: pickle.load executes arbitrary code from the file, so only load files produced by
# the project's own notebooks.
all_results = {}
for result_key, pickle_name, loaded_msg, missing_msg in result_sources:
    pickle_path = os.path.join(results_dir, pickle_name)
    try:
        with open(pickle_path, 'rb') as f:
            all_results[result_key] = pickle.load(f)
    except FileNotFoundError:
        print(missing_msg)
    else:
        print(loaded_msg)

print(f"\nLoaded {len(all_results)} result files.")

## Comprehensive Comparison Analysis

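Reload and preprocess the Adult dataset (same source and encoding steps as the previous notebooks) so that the splitting strategies can be compared alongside the loaded results.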

In [None]:
# Load data (same source as previous notebooks)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
           'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
           'hours-per-week', 'native-country', 'income']

try:
    # skipinitialspace=True strips the blank that follows each comma, so a missing value is
    # parsed as '?' (not ' ?'). The NA marker must be '?' for dropna() to remove those rows.
    # NOTE(review): if earlier notebooks used na_values=' ?' together with skipinitialspace,
    # they kept the rows with missing values; re-run them so saved results use the same rows.
    df = pd.read_csv(url, names=columns, na_values='?', skipinitialspace=True)
    df = df.dropna()
except Exception as e:
    print(f"Failed to load from primary URL: {e}")
    print("Trying alternative source...")
    try:
        from sklearn.datasets import fetch_openml
        adult = fetch_openml(name='adult', version=2, as_frame=True, parser='pandas')
        df = adult.frame
        # Rename the OpenML columns positionally to the names used throughout this notebook.
        df.columns = columns
        df = df.dropna()
        print("Successfully loaded UCI Adult Income dataset from OpenML")
    except Exception as e2:
        # Chain the secondary failure so the full traceback is preserved for debugging.
        raise RuntimeError(
            f"Failed to load dataset from both sources. "
            f"Primary error: {e}, Secondary error: {e2}. "
            f"Please ensure internet connection is available or download the dataset manually."
        ) from e2

print(f"Dataset shape: {df.shape}")
print("No synthetic data is used in this project.")

# Preprocess: label-encode categorical features, encode the target, then standardize.
from sklearn.preprocessing import LabelEncoder
# Treat every non-numeric column as categorical. Selecting only 'object' columns misses the
# `category` dtype produced by the fetch_openml(parser='pandas') fallback, which would leave
# raw strings in the feature matrix and make StandardScaler fail.
categorical_cols = df.select_dtypes(exclude=['number']).columns
df_encoded = df.copy()
for col in categorical_cols:
    if col != 'income':  # the target is encoded separately below
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df[col].astype(str))

# Encode the target labels as integers.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(df['income'])
feature_cols = [col for col in df_encoded.columns if col != 'income']
X = df_encoded[feature_cols].values

# Standardize every feature column (zero mean, unit variance).
# NOTE(review): the scaler is fit on the full dataset before any train/test split.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"Dataset shape: {X_scaled.shape}")
print(f"Target distribution: {np.bincount(y)}")

## Compare All Splitting Strategies

In [None]:
# Similarity-aware train/test split over the full scaled dataset, holding out 20% for testing.
# The split itself is delegated to similarity_aware_split_optimized (similarity_split_optimized.py),
# which returns the train and test index sets.
train_idx_sim, test_idx_sim = similarity_aware_split_optimized(X_scaled, y, test_size=0.2)

# Materialize the feature and label partitions from the returned indices.
X_train_sim = X_scaled[train_idx_sim]
X_test_sim = X_scaled[test_idx_sim]
y_train_sim = y[train_idx_sim]
y_test_sim = y[test_idx_sim]


In [None]:
# Save final comparison results
import pickle
import os

results_dir = os.path.join(os.path.dirname(os.getcwd()), 'results')
os.makedirs(results_dir, exist_ok=True)

# `results` is expected to map each splitting method to a metrics dict with a 'gap' entry.
# No visible cell defines it, so on a fresh kernel this cell would stop with a NameError.
# Fall back to an empty comparison (the summary below already handles that case) and say so.
# NOTE(review): the cell that evaluates each strategy and fills `results` appears to be missing.
try:
    results
except NameError:
    print("⚠ `results` is not defined; saving an empty splitting comparison.")
    results = {}

# Extract the gap per method once; both summary fields are derived from it.
gaps = {method: metrics['gap'] for method, metrics in results.items()}

final_comparison = {
    'splitting_comparison': results,
    'similarity_analysis': all_results.get('similarity', {}),
    'summary': {
        # Method with the smallest gap, and the largest gap across methods.
        'best_method': min(gaps, key=gaps.get) if gaps else None,
        'worst_gap': max(gaps.values()) if gaps else None
    }
}

output_path = os.path.join(results_dir, '03_final_comparison.pkl')
with open(output_path, 'wb') as f:
    pickle.dump(final_comparison, f)

print(f"\nFinal comparison saved to: {output_path}")
results

## Key Findings and Conclusions

### 1. Sample Similarity Reveals Dataset Structure
- Dense regions indicate redundant information
- Isolated samples may be outliers or unique cases
- Understanding relationships helps identify potential issues

### 2. Strategic Splitting Impacts Generalization
- Similarity-aware splits can reveal harder generalization scenarios
- Stratified splits maintain class balance
- Random splits may underestimate generalization challenges

### 3. Research Implications
- Dataset structure matters for model evaluation
- Similarity analysis informs data collection strategies
- Strategic splitting provides more realistic performance estimates