# Generalization Comparison and Conclusions

## Overview

This notebook synthesizes results from similarity analysis and strategic splitting experiments, providing comprehensive conclusions about dataset structure and its impact on model generalization.

In [None]:
# Imports and global configuration for the analysis.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# session, including deprecation and numerical warnings that could explain
# odd results. Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Apply a matplotlib style sheet by name.
# NOTE(review): presumably needs a matplotlib release that ships the
# 'seaborn-v0_8-*' style names - TODO confirm against the pinned environment.
plt.style.use('seaborn-v0_8-darkgrid')
# Seed NumPy's legacy global random state so any np.random calls are repeatable.
np.random.seed(42)


## Load All Saved Results

Load results from previous notebooks using pickle.

In [2]:
import os
import pickle

# Results live in a 'results' directory next to the notebook's working directory.
results_dir = os.path.join(os.path.dirname(os.getcwd()), 'results')

# One row per saved artifact:
# (key in all_results, pickle file name, message on success, message when missing)
saved_outputs = [
    (
        'similarity',
        '01_similarity_results.pkl',
        "✓ Loaded similarity analysis results",
        "⚠ Similarity results not found. Run notebook 01 first.",
    ),
    (
        'splitting',
        '02_splitting_results.pkl',
        "✓ Loaded splitting strategy results",
        "⚠ Splitting results not found. Run notebook 02 first.",
    ),
]

# Only artifacts that exist on disk end up in all_results; a missing file is
# reported and skipped, any other error propagates.
all_results = {}
for result_key, file_name, loaded_msg, missing_msg in saved_outputs:
    try:
        with open(os.path.join(results_dir, file_name), 'rb') as handle:
            all_results[result_key] = pickle.load(handle)
        print(loaded_msg)
    except FileNotFoundError:
        print(missing_msg)

print(f"\nLoaded {len(all_results)} result files.")

✓ Loaded similarity analysis results
✓ Loaded splitting strategy results

Loaded 2 result files.


## Comprehensive Comparison and Analysis

Analyze and visualize results from notebooks 01 and 02.


In [None]:
# Extract and organize results from notebooks 01 and 02
import pickle
import os


def load_pickled_results(path, default=None):
    """Load a pickled object from ``path``.

    Parameters
    ----------
    path : str
        Location of the pickle file.
    default : object, optional
        Value returned when the file does not exist.

    Returns
    -------
    object
        The unpickled object, or ``default`` if ``path`` is missing.

    Notes
    -----
    Only a missing file is tolerated; a corrupt or unreadable pickle still
    raises so the problem is not silently hidden.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load result files produced by your own runs of notebooks 01/02.
    try:
        with open(path, 'rb') as handle:
            return pickle.load(handle)
    except FileNotFoundError:
        return default


results_dir = os.path.join(os.path.dirname(os.getcwd()), 'results')

# Load splitting results from notebook 02
results = load_pickled_results(os.path.join(results_dir, '02_splitting_results.pkl'))
if results is None:
    print("✗ Error: Could not find splitting results from notebook 02")
    print("  Please run notebook 02 first to generate results")
    results = {}
else:
    print("✓ Successfully loaded splitting results from notebook 02")
    print(f"  Models: {list(results.keys())}")
    # An existing-but-empty results dict has no first entry to inspect;
    # fall back to an empty mapping instead of raising IndexError.
    first_entry = next(iter(results.values()), {})
    print(f"  Splitting strategies: {list(first_entry.keys())}")

# Load similarity analysis results from notebook 01
similarity_results = load_pickled_results(os.path.join(results_dir, '01_similarity_results.pkl'))
if similarity_results is None:
    print("\n⚠ Similarity analysis results not found. Run notebook 01 for complete analysis.")
    similarity_results = {}
else:
    print("\n✓ Successfully loaded similarity analysis results from notebook 01")



Similarity-Aware:
  Train Accuracy: 0.9995, Test Accuracy: 0.8782
  Generalization Gap: 0.1212
  Train/Test F1: 0.9989/0.7272

Random Split:
  Train Accuracy: 0.9996, Test Accuracy: 0.8612
  Generalization Gap: 0.1384
  Train/Test F1: 0.9991/0.6885

Stratified Split:
  Train Accuracy: 0.9995, Test Accuracy: 0.8572
  Generalization Gap: 0.1423
  Train/Test F1: 0.9989/0.6811


In [None]:
# Persist the combined analysis (splitting + similarity results) together
# with a compact summary, then echo the per-model numbers.
import os
import pickle

results_dir = os.path.join(os.path.dirname(os.getcwd()), 'results')
os.makedirs(results_dir, exist_ok=True)
output_path = os.path.join(results_dir, '03_final_comparison.pkl')

# Summary header: which models and which splits were evaluated. The split
# names are read from the first model's entry.
if results:
    first_model = list(results.keys())[0]
    summary = {
        'models_tested': list(results.keys()),
        'splits_tested': list(results[first_model].keys()),
    }
else:
    summary = {'models_tested': [], 'splits_tested': []}

final_comparison = {
    'splitting_comparison': results,
    'similarity_analysis': similarity_results,
    'summary': summary,
}

# For every model, record the split with the smallest generalization gap.
# Models with no split entries get explicit None placeholders.
for model_name in results:
    per_split = results[model_name]
    if per_split:
        winner_name, winner_metrics = min(per_split.items(), key=lambda item: item[1]['gap'])
        summary[f'{model_name}_best_split'] = winner_name
        summary[f'{model_name}_min_gap'] = winner_metrics['gap']
    else:
        summary[f'{model_name}_best_split'] = None
        summary[f'{model_name}_min_gap'] = None

with open(output_path, 'wb') as out_file:
    pickle.dump(final_comparison, out_file)

print(f"\nFinal comparison saved to: {output_path}")

if results:
    banner = "="*80
    print("\n" + banner)
    print("SUMMARY OF RESULTS FROM NOTEBOOK 02")
    print(banner)
    for model_name, splits_res in results.items():
        print(f"\n{model_name}:")
        for split_name, metrics in splits_res.items():
            print(f"  {split_name:20s}: Gap={metrics['gap']:.4f}, Test Acc={metrics['test_accuracy']:.4f}")



Final comparison saved to: /workspaces/dataset-structure-similarity-analysis/results/03_final_comparison.pkl


{'Similarity-Aware': {'train_accuracy': 0.9994625513455411,
  'test_accuracy': 0.8782248157248157,
  'train_f1': 0.9988814317673378,
  'test_f1': 0.7272101823185414,
  'gap': 0.12123773562072537,
  'train_size': 26049,
  'test_size': 6512},
 'Random Split': {'train_accuracy': 0.9995777027027027,
  'test_accuracy': 0.861200675571933,
  'train_f1': 0.9991220368744512,
  'test_f1': 0.6884906960716747,
  'gap': 0.13837702713076971,
  'train_size': 26048,
  'test_size': 6513},
 'Stratified Split': {'train_accuracy': 0.9994625307125307,
  'test_accuracy': 0.8572086596038692,
  'train_f1': 0.9988832163369495,
  'test_f1': 0.6810699588477366,
  'gap': 0.1422538711086615,
  'train_size': 26048,
  'test_size': 6513}}

## Key Findings and Conclusions

### 1. Sample Similarity Reveals Dataset Structure
- Dense regions indicate redundant information
- Isolated samples may be outliers or unique cases
- Understanding relationships helps identify potential issues

### 2. Strategic Splitting Impacts Generalization
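- Similarity-aware splits can reveal harder generalization scenarios; in the run recorded above, however, the similarity-aware split had the smallest generalization gap (0.121 vs. 0.138 for random and 0.142 for stratified) and the highest test accuracy (0.878), so it was not the harder scenario for this dataset
- Stratified splits maintain class balance
- Random splits may underestimate generalization challenges, although the results recorded above do not show this effect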

### 3. Research Implications
- Dataset structure matters for model evaluation
- Similarity analysis informs data collection strategies
- Strategic splitting provides more realistic performance estimates

In [None]:
# Comprehensive Visualization and Analysis
#
# Expects `results` (loaded in an earlier cell) to be nested as
#   {model_name: {split_name: {metric_key: value}}}
# with metric keys 'train_accuracy', 'test_accuracy', 'train_f1', 'test_f1', 'gap'.
#
# Model and split names are discovered from `results` instead of being
# hard-coded, so the cell keeps working when notebook 02 names its splits
# differently.
# NOTE(review): the output recorded in this notebook shows split keys such as
# 'Random Split' and 'Stratified Split' - confirm the naming used by notebook 02.

# (label shown in the table, key in the metrics dict)
METRIC_ROWS = [
    ('Train Accuracy', 'train_accuracy'),
    ('Test Accuracy', 'test_accuracy'),
    ('Train F1', 'train_f1'),
    ('Test F1', 'test_f1'),
    ('Generalization Gap', 'gap'),
]


def find_split(splits, keyword):
    """Return the first split whose name contains ``keyword`` (case-insensitive), or None."""
    return next((s for s in splits if keyword in str(s).lower()), None)


def order_splits(results, models):
    """Return the split names shared by every model, ordered random -> stratified -> similarity -> others."""
    if not models:
        return []
    shared = [s for s in results[models[0]] if all(s in results[m] for m in models)]
    preferred = ('random', 'stratified', 'similarity')

    def rank(name):
        lowered = str(name).lower()
        for position, keyword in enumerate(preferred):
            if keyword in lowered:
                return position
        return len(preferred)

    # sorted() is stable, so splits with equal rank keep their original order.
    return sorted(shared, key=rank)


def grouped_bars(ax, group_labels, series, rotation=0, **bar_kwargs):
    """Draw one bar per (group, series) pair, side by side and without overlap.

    ``series`` maps a legend label to a list of values, one value per group.
    """
    positions = np.arange(len(group_labels))
    # Share 0.7 of each unit slot between the series (0.35 per bar for two series).
    bar_width = 0.7 / max(len(series), 1)
    for index, (label, values) in enumerate(series.items()):
        offset = (index - (len(series) - 1) / 2) * bar_width
        ax.bar(positions + offset, values, bar_width, label=label, **bar_kwargs)
    ax.set_xticks(positions)
    ax.set_xticklabels(group_labels, rotation=rotation)


def plot_generalization_comparison(results, models, splits):
    """Build the 2x2 comparison figure and return it.

    ``splits[0]`` is used as the baseline for the gap-reduction panel.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 11))

    def metric_by_model(metric_key):
        return {m: [results[m][s][metric_key] for s in splits] for m in models}

    # Subplot 1: Test Accuracy Comparison Across Splits and Models
    ax1 = axes[0, 0]
    grouped_bars(ax1, splits, metric_by_model('test_accuracy'), rotation=15, alpha=0.8)
    ax1.set_xlabel('Splitting Strategy', fontsize=11)
    ax1.set_ylabel('Test Accuracy', fontsize=11)
    ax1.set_title('Effect of Splitting on Test Accuracy', fontsize=12, fontweight='bold')
    ax1.legend()
    ax1.grid(True, alpha=0.3, axis='y')

    # Subplot 2: Generalization Gap Comparison
    ax2 = axes[0, 1]
    gaps_by_model = metric_by_model('gap')
    grouped_bars(ax2, splits, gaps_by_model, rotation=15, alpha=0.8, edgecolor='black')
    ax2.set_xlabel('Splitting Strategy', fontsize=11)
    ax2.set_ylabel('Generalization Gap (Train - Test)', fontsize=11)
    ax2.set_title('Generalization Gap Comparison', fontsize=12, fontweight='bold')
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')
    # Add note about lower gaps being better
    ax2.text(0.5, 0.95, '(Lower is better)', transform=ax2.transAxes,
             ha='center', va='top', fontsize=9, style='italic',
             bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.3))

    # Subplot 3: Train vs Test Accuracy by Model (one train and one test bar per split)
    ax3 = axes[1, 0]
    train_test_series = {}
    for split in splits:
        train_test_series[f'{split} (Train)'] = [results[m][split]['train_accuracy'] for m in models]
        train_test_series[f'{split} (Test)'] = [results[m][split]['test_accuracy'] for m in models]
    grouped_bars(ax3, models, train_test_series, alpha=0.7)
    ax3.set_xlabel('Model', fontsize=11)
    ax3.set_ylabel('Accuracy', fontsize=11)
    ax3.set_title('Train vs Test Accuracy by Model', fontsize=12, fontweight='bold')
    ax3.legend(fontsize=8, loc='lower right')
    ax3.grid(True, alpha=0.3, axis='y')

    # Subplot 4: gap reduction of every split relative to the baseline split
    ax4 = axes[1, 1]
    baseline_split = splits[0]
    reduction_by_model = {}
    for model_name, gaps in gaps_by_model.items():
        gaps = np.array(gaps, dtype=float)
        baseline_gap = gaps[0]
        # A zero baseline gap has no meaningful percentage change; report zeros.
        if baseline_gap != 0:
            reduction_by_model[model_name] = (baseline_gap - gaps) / baseline_gap * 100
        else:
            reduction_by_model[model_name] = np.zeros(len(gaps))
    grouped_bars(ax4, splits, reduction_by_model, rotation=15, alpha=0.8)
    ax4.axhline(y=0, color='black', linestyle='-', linewidth=0.8, alpha=0.3)
    ax4.set_xlabel('Splitting Strategy', fontsize=11)
    ax4.set_ylabel('Gap Reduction (%)', fontsize=11)
    if 'random' in str(baseline_split).lower():
        ax4.set_title('Gap Reduction vs Random Split', fontsize=12, fontweight='bold')
    else:
        ax4.set_title(f'Gap Reduction vs {baseline_split}', fontsize=12, fontweight='bold')
    ax4.legend()
    ax4.grid(True, alpha=0.3, axis='y')

    return fig


def print_detailed_comparison(results, models, splits):
    """Print one metric table per split with a column per model.

    A 'Difference' column (second model minus first) is added only when
    exactly two models are compared.
    """
    show_difference = len(models) == 2

    print("\n" + "="*80)
    print("DETAILED COMPARISON: " + " vs ".join(str(m).upper() for m in models))
    print("="*80)

    for split_name in splits:
        heading = str(split_name).upper()
        if not heading.endswith('SPLIT'):
            heading += ' SPLIT'
        print(f"\n{heading}:")
        print("-" * 80)
        header = f"{'Metric':<25} " + " ".join(f"{str(m):<25}" for m in models)
        if show_difference:
            header += f" {'Difference':<15}"
        print(header)
        print("-" * 80)

        for metric_label, metric_key in METRIC_ROWS:
            values = [results[m][split_name][metric_key] for m in models]
            row = f"{metric_label:<25} " + " ".join(f"{v:<25.4f}" for v in values)
            if show_difference:
                row += f" {values[1] - values[0]:+.4f}"
            print(row)


def print_key_findings(results, models, splits):
    """Print findings computed from ``results`` rather than fixed statements."""
    print("\n" + "="*80)
    print("KEY FINDINGS:")
    print("="*80)

    print("\n1. Model Performance:")
    for model_name in models:
        mean_gap = np.mean([results[model_name][s]['gap'] for s in splits])
        mean_test_acc = np.mean([results[model_name][s]['test_accuracy'] for s in splits])
        print(f"   - {model_name}: mean gap {mean_gap:.4f}, mean test accuracy {mean_test_acc:.4f}")

    print("\n2. Effect of Similarity-Aware Splitting:")
    random_split = find_split(splits, 'random')
    similarity_split = find_split(splits, 'similarity')
    if random_split is None or similarity_split is None:
        print("   - Skipped: results do not contain both a random and a similarity-aware split")
    else:
        for model_name in models:
            random_gap = results[model_name][random_split]['gap']
            sim_gap = results[model_name][similarity_split]['gap']
            gap_change = ((sim_gap - random_gap) / random_gap) * 100 if random_gap != 0 else 0
            print(f"   - {model_name}: {gap_change:+.2f}% change in gap")
            if gap_change > 0:
                print(f"     (Similarity-aware split INCREASES gap - harder test set)")
            elif gap_change < 0:
                print(f"     (Similarity-aware split DECREASES gap - easier test set)")
            else:
                print(f"     (Similarity-aware split leaves the gap unchanged)")

    print("\n3. Best Splitting Strategy:")
    for model_name in models:
        if not results[model_name]:
            print(f"   - {model_name}: no split results available")
            continue
        best_split = min(results[model_name].items(), key=lambda x: x[1]['gap'])
        print(f"   - {model_name}: {best_split[0]} (gap: {best_split[1]['gap']:.4f})")


models_list = list(results.keys()) if results else []
splits_list = order_splits(results, models_list)

if not models_list or not splits_list:
    # `results` is {} when notebook 02's output was not found; report it
    # instead of failing with a KeyError further down.
    print("⚠ No splitting results available to visualize. Run notebook 02 first.")
else:
    fig = plot_generalization_comparison(results, models_list, splits_list)
    plt.tight_layout()
    plt.show()

    # Print detailed comparison
    print_detailed_comparison(results, models_list, splits_list)
    print_key_findings(results, models_list, splits_list)
