---
## 1. Import Libraries and Load Data

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Sklearn imports
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)
from imblearn.over_sampling import SMOTE

# Optional dependency: attempt the XGBoost import and record the outcome
# in `xgboost_available` instead of failing when the package is absent.
try:
    from xgboost import XGBClassifier
except ImportError:
    xgboost_available = False
    print("XGBoost not installed. Install with: pip install xgboost")
else:
    xgboost_available = True

# Visualization settings
# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-*' and newer releases no longer accept the old names, so
# use whichever variant this installation lists. If neither is present,
# fall back to seaborn's own whitegrid theme.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    sns.set_style('whitegrid')
sns.set_palette('husl')
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

print("✅ Libraries imported successfully!")

In [None]:
# Read the census CSV into `df` (the path is relative to the working
# directory), report its dimensions, and preview the first ten rows.
df = pd.read_csv('../../data/data/adultcensusincome.csv')

print(
    "Dataset loaded successfully!",
    f"Shape: {df.shape}",
    "\nFirst few rows:",
    sep="\n",
)
df.head(n=10)

---
## 2. Exploratory Data Analysis (EDA)

In [None]:
# Overview of the raw frame: df.info(), a per-column missing-value table,
# and the df.describe() summary.
print("DATASET INFORMATION", "=" * 70, sep="\n")
df.info()

print("\n" + "=" * 70, "MISSING VALUES", "=" * 70, sep="\n")
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100).round(2)
missing_df = pd.DataFrame(dict(Missing=missing, Percentage=missing_pct))
# Only list the columns that actually contain missing entries.
print(missing_df.loc[missing_df['Missing'] > 0])

print("\n" + "=" * 70, "STATISTICAL SUMMARY", "=" * 70, sep="\n")
df.describe()

In [None]:
# One line per column: running number, name, dtype, and distinct-value count.
print("COLUMN DETAILS", "=" * 70, sep="\n")
for i, (col, dtype) in enumerate(df.dtypes.items(), start=1):
    unique_count = df[col].nunique()
    print(
        "{:2}. {:25s} | {:10s} | {:6d} unique values".format(
            i, col, str(dtype), unique_count
        )
    )

In [None]:
# Identify target column (income-related)
# A column qualifies when its name contains any candidate keyword, compared
# case-insensitively; the first qualifying column in df.columns is used.
target_candidates = ['income', 'Income', 'salary', 'earnings', 'income-bracket', 'income_bracket']
target_col = None

for col in df.columns:
    # str() keeps the check from failing on non-string column labels.
    col_name = str(col).lower()
    if any(candidate.lower() in col_name for candidate in target_candidates):
        target_col = col
        break

if target_col:
    print(f"✅ Target column identified: '{target_col}'")
    print("\nTarget distribution:")
    print(df[target_col].value_counts())
    print("\nTarget distribution (%):")
    print(df[target_col].value_counts(normalize=True) * 100)
else:
    # Nothing matched: list the columns so the target can be picked by hand.
    print("⚠️  Target column not automatically identified.")
    print("Available columns:")
    print(df.columns.tolist())

In [None]:
# Visualize target distribution
if target_col:
    # Count each class once; both charts and the imbalance check reuse it.
    class_counts = df[target_col].value_counts()

    plt.figure(figsize=(12, 5))

    # Left panel: absolute number of rows per income class.
    plt.subplot(1, 2, 1)
    class_counts.plot(kind='bar', color=['skyblue', 'lightcoral'])
    plt.title('Income Distribution', fontsize=14, fontweight='bold')
    plt.xlabel('Income Category', fontweight='bold')
    plt.ylabel('Count', fontweight='bold')
    plt.xticks(rotation=45)

    # Right panel: the same counts as percentage shares.
    plt.subplot(1, 2, 2)
    class_counts.plot(kind='pie', autopct='%1.1f%%', colors=['skyblue', 'lightcoral'])
    plt.title('Income Distribution (%)', fontsize=14, fontweight='bold')
    plt.ylabel('')

    plt.tight_layout()
    plt.show()

    # Check for class imbalance: size of the largest class relative to the
    # smallest one.
    imbalance_ratio = class_counts.max() / class_counts.min()
    print(f"\n⚠️  Class imbalance ratio: {imbalance_ratio:.2f}:1")
    if imbalance_ratio > 2:
        print("   Consider using SMOTE or class weighting")

In [None]:
# Categorical overview: every object-typed column except the target.
categorical_cols = list(df.select_dtypes(include=['object']).columns)
try:
    categorical_cols.remove(target_col)
except ValueError:
    # The target is not among the object columns (or was never identified).
    pass

print(f"\nCATEGORICAL VARIABLES ({len(categorical_cols)}):", "=" * 70, sep="\n")
# Only the first five columns are shown, each with its ten most frequent values.
for col in categorical_cols[:5]:
    print(f"\n{col}:", df[col].value_counts().head(10), sep="\n")

In [None]:
# Analyze numerical variables
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()

if numerical_cols:
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.ravel()

    # The 2x3 grid holds six histograms, so plot at most that many columns.
    plotted_cols = numerical_cols[:len(axes)]
    for i, col in enumerate(plotted_cols):
        axes[i].hist(df[col].dropna(), bins=30, edgecolor='black', alpha=0.7)
        axes[i].set_title(f'{col} Distribution', fontweight='bold')
        axes[i].set_xlabel(col)
        axes[i].set_ylabel('Frequency')

    # With fewer than six numeric columns the remaining panels would be
    # drawn as empty frames; hide them instead.
    for unused_ax in axes[len(plotted_cols):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

---
## 3. Data Preprocessing

In [None]:
# Create preprocessing copy so the raw `df` stays untouched
df_processed = df.copy()

print("PREPROCESSING STEPS")
print("="*70)

# 1. Handle missing values
print("\n1. Handling missing values...")
initial_rows = len(df_processed)

# Treat a cell holding only '?' as missing. The anchored pattern tolerates
# any surrounding whitespace, so ' ?', '?' and '? ' are all caught here,
# before the whitespace clean-up in step 3 runs.
df_processed = df_processed.replace(r'^\s*\?\s*$', np.nan, regex=True)

# Drop rows with missing values (or fill strategically)
df_processed = df_processed.dropna()
print(f"   Rows removed: {initial_rows - len(df_processed)}")
print(f"   Remaining rows: {len(df_processed)}")

# 2. Remove duplicates
print("\n2. Removing duplicates...")
duplicates = df_processed.duplicated().sum()
df_processed = df_processed.drop_duplicates()
print(f"   Duplicates removed: {duplicates}")

# 3. Clean string columns (strip leading/trailing whitespace)
print("\n3. Cleaning string columns...")
for col in df_processed.select_dtypes(include=['object']).columns:
    df_processed[col] = df_processed[col].str.strip()
print("   ✅ Whitespace removed from categorical columns")

print(f"\n✅ Preprocessing complete. Final shape: {df_processed.shape}")

In [None]:
# Encode target variable (binary classification)
print("\nENCODING TARGET VARIABLE")
print("="*70)

# Create binary target (1 if income > 50K, 0 otherwise)
if target_col:
    unique_values = df_processed[target_col].unique()
    print(f"Unique values in {target_col}: {unique_values}")

    # Any label containing '>50' maps to 1, which covers both '>50K' and
    # '>50K.'; every other label maps to 0.
    df_processed['income_binary'] = df_processed[target_col].apply(
        lambda x: 1 if '>50' in str(x) else 0
    )

    print("\nEncoded target distribution:")
    print(df_processed['income_binary'].value_counts())
    print("\n0 = <=50K, 1 = >50K")

    # Name of the encoded column, read by the feature-preparation step.
    target_encoded = 'income_binary'
else:
    print("⚠️  Please manually specify target column")

In [None]:
# One-hot encode every remaining object column of the cleaned frame.
print("\nENCODING CATEGORICAL VARIABLES", "=" * 70, sep="\n")

# Object-typed columns, minus the raw target label.
categorical_cols = df_processed.select_dtypes(include=['object']).columns.tolist()
if categorical_cols.count(target_col):
    categorical_cols.remove(target_col)

print(
    f"\nCategorical columns to encode: {len(categorical_cols)}",
    categorical_cols,
    sep="\n",
)

# drop_first=True omits the first level of each variable from the dummies.
df_encoded = pd.get_dummies(
    data=df_processed,
    columns=categorical_cols,
    drop_first=True,
)

print(
    f"\nShape before encoding: {df_processed.shape}",
    f"Shape after encoding: {df_encoded.shape}",
    f"New feature count: {len(df_encoded.columns)}",
    sep="\n",
)

In [None]:
# Split the encoded frame into the feature matrix X and the label vector y.
print("\nPREPARING FEATURES AND TARGET", "=" * 70, sep="\n")

# X excludes the binary target and, if it is still present, the raw label.
columns_to_drop = [target_encoded] + (
    [target_col] if target_col in df_encoded.columns else []
)

y = df_encoded[target_encoded]
X = df_encoded.drop(columns=columns_to_drop)

print(
    f"Feature matrix shape: {X.shape}",
    f"Target vector shape: {y.shape}",
    "\nClass distribution:",
    y.value_counts(),
    "\nClass distribution (%):",
    y.value_counts(normalize=True).mul(100),
    sep="\n",
)

---
## 4. Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep
# the same class proportions, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

print("TRAIN-TEST SPLIT", "=" * 70, sep="\n")
print(f"Training set: {len(X_train):,} samples ({len(X_train) / len(X) * 100:.1f}%)")
print(f"Test set: {len(X_test):,} samples ({len(X_test) / len(X) * 100:.1f}%)")
print(f"\nFeatures: {X_train.shape[1]}")

print("\nTraining set class distribution:", y_train.value_counts(), sep="\n")
print("\nTest set class distribution:", y_test.value_counts(), sep="\n")

In [None]:
# Feature scaling
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics enter the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✅ Features scaled using StandardScaler")
print(f"Training data shape: {X_train_scaled.shape}")
print(f"Test data shape: {X_test_scaled.shape}")

---
## 5. Model Building

### 5.1 Logistic Regression

In [None]:
# Logistic Regression: fit on the standardized training features, score on
# the held-out split, and plot the confusion matrix.
print("TRAINING LOGISTIC REGRESSION", "=" * 70, sep="\n")

lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)

# Hard labels for both splits, plus column 1 of predict_proba on the test
# split, which feeds the ROC-AUC computation.
y_train_pred_lr, y_test_pred_lr = (
    lr_model.predict(features) for features in (X_train_scaled, X_test_scaled)
)
y_test_proba_lr = lr_model.predict_proba(X_test_scaled)[:, 1]

# Accuracy on both splits; the remaining metrics on the test split only.
train_acc_lr = accuracy_score(y_train, y_train_pred_lr)
test_acc_lr, precision_lr, recall_lr, f1_lr = (
    metric(y_test, y_test_pred_lr)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
roc_auc_lr = roc_auc_score(y_test, y_test_proba_lr)

print("\n" + "\n".join(
    f"{label}: {score:.4f}"
    for label, score in (
        ("Training Accuracy", train_acc_lr),
        ("Test Accuracy", test_acc_lr),
        ("Precision", precision_lr),
        ("Recall", recall_lr),
        ("F1-Score", f1_lr),
        ("ROC-AUC", roc_auc_lr),
    )
))

print("\nClassification Report:", classification_report(y_test, y_test_pred_lr), sep="\n")

# Confusion matrix heatmap: rows are actual classes, columns are predictions.
cm_lr = confusion_matrix(y_test, y_test_pred_lr)
plt.figure(figsize=(6, 5))
sns.heatmap(
    cm_lr,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['<=50K', '>50K'],
    yticklabels=['<=50K', '>50K'],
)
plt.title('Confusion Matrix - Logistic Regression', fontsize=14, fontweight='bold')
plt.xlabel('Predicted', fontweight='bold')
plt.ylabel('Actual', fontweight='bold')
plt.tight_layout()
plt.show()

### 5.2 Decision Tree

In [None]:
# Decision Tree: fit on the unscaled training features, score on the
# held-out split, and plot the confusion matrix.
print("TRAINING DECISION TREE", "=" * 70, sep="\n")

dt_model = DecisionTreeClassifier(
    random_state=42,
    max_depth=10,
    min_samples_split=20,
).fit(X_train, y_train)

# Hard labels for both splits, plus column 1 of predict_proba on the test
# split, which feeds the ROC-AUC computation.
y_train_pred_dt, y_test_pred_dt = (
    dt_model.predict(features) for features in (X_train, X_test)
)
y_test_proba_dt = dt_model.predict_proba(X_test)[:, 1]

# Accuracy on both splits; the remaining metrics on the test split only.
train_acc_dt = accuracy_score(y_train, y_train_pred_dt)
test_acc_dt, precision_dt, recall_dt, f1_dt = (
    metric(y_test, y_test_pred_dt)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
roc_auc_dt = roc_auc_score(y_test, y_test_proba_dt)

print("\n" + "\n".join(
    f"{label}: {score:.4f}"
    for label, score in (
        ("Training Accuracy", train_acc_dt),
        ("Test Accuracy", test_acc_dt),
        ("Precision", precision_dt),
        ("Recall", recall_dt),
        ("F1-Score", f1_dt),
        ("ROC-AUC", roc_auc_dt),
    )
))

print("\nClassification Report:", classification_report(y_test, y_test_pred_dt), sep="\n")

# Confusion matrix heatmap: rows are actual classes, columns are predictions.
cm_dt = confusion_matrix(y_test, y_test_pred_dt)
plt.figure(figsize=(6, 5))
sns.heatmap(
    cm_dt,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=['<=50K', '>50K'],
    yticklabels=['<=50K', '>50K'],
)
plt.title('Confusion Matrix - Decision Tree', fontsize=14, fontweight='bold')
plt.xlabel('Predicted', fontweight='bold')
plt.ylabel('Actual', fontweight='bold')
plt.tight_layout()
plt.show()

### 5.3 Random Forest

In [None]:
# Random Forest: fit on the unscaled training features, score on the
# held-out split, and plot the confusion matrix.
print("TRAINING RANDOM FOREST", "=" * 70, sep="\n")

rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    min_samples_split=20,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Hard labels for both splits, plus column 1 of predict_proba on the test
# split, which feeds the ROC-AUC computation.
y_train_pred_rf, y_test_pred_rf = (
    rf_model.predict(features) for features in (X_train, X_test)
)
y_test_proba_rf = rf_model.predict_proba(X_test)[:, 1]

# Accuracy on both splits; the remaining metrics on the test split only.
train_acc_rf = accuracy_score(y_train, y_train_pred_rf)
test_acc_rf, precision_rf, recall_rf, f1_rf = (
    metric(y_test, y_test_pred_rf)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
roc_auc_rf = roc_auc_score(y_test, y_test_proba_rf)

print("\n" + "\n".join(
    f"{label}: {score:.4f}"
    for label, score in (
        ("Training Accuracy", train_acc_rf),
        ("Test Accuracy", test_acc_rf),
        ("Precision", precision_rf),
        ("Recall", recall_rf),
        ("F1-Score", f1_rf),
        ("ROC-AUC", roc_auc_rf),
    )
))

print("\nClassification Report:", classification_report(y_test, y_test_pred_rf), sep="\n")

# Confusion matrix heatmap: rows are actual classes, columns are predictions.
cm_rf = confusion_matrix(y_test, y_test_pred_rf)
plt.figure(figsize=(6, 5))
sns.heatmap(
    cm_rf,
    annot=True,
    fmt='d',
    cmap='Oranges',
    xticklabels=['<=50K', '>50K'],
    yticklabels=['<=50K', '>50K'],
)
plt.title('Confusion Matrix - Random Forest', fontsize=14, fontweight='bold')
plt.xlabel('Predicted', fontweight='bold')
plt.ylabel('Actual', fontweight='bold')
plt.tight_layout()
plt.show()

### 5.4 Gradient Boosting

In [None]:
# Gradient Boosting: fit on the unscaled training features, score on the
# held-out split, and plot the confusion matrix.
print("TRAINING GRADIENT BOOSTING", "=" * 70, sep="\n")

gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)

# Hard labels for both splits, plus column 1 of predict_proba on the test
# split, which feeds the ROC-AUC computation.
y_train_pred_gb, y_test_pred_gb = (
    gb_model.predict(features) for features in (X_train, X_test)
)
y_test_proba_gb = gb_model.predict_proba(X_test)[:, 1]

# Accuracy on both splits; the remaining metrics on the test split only.
train_acc_gb = accuracy_score(y_train, y_train_pred_gb)
test_acc_gb, precision_gb, recall_gb, f1_gb = (
    metric(y_test, y_test_pred_gb)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
roc_auc_gb = roc_auc_score(y_test, y_test_proba_gb)

print("\n" + "\n".join(
    f"{label}: {score:.4f}"
    for label, score in (
        ("Training Accuracy", train_acc_gb),
        ("Test Accuracy", test_acc_gb),
        ("Precision", precision_gb),
        ("Recall", recall_gb),
        ("F1-Score", f1_gb),
        ("ROC-AUC", roc_auc_gb),
    )
))

print("\nClassification Report:", classification_report(y_test, y_test_pred_gb), sep="\n")

# Confusion matrix heatmap: rows are actual classes, columns are predictions.
cm_gb = confusion_matrix(y_test, y_test_pred_gb)
plt.figure(figsize=(6, 5))
sns.heatmap(
    cm_gb,
    annot=True,
    fmt='d',
    cmap='Purples',
    xticklabels=['<=50K', '>50K'],
    yticklabels=['<=50K', '>50K'],
)
plt.title('Confusion Matrix - Gradient Boosting', fontsize=14, fontweight='bold')
plt.xlabel('Predicted', fontweight='bold')
plt.ylabel('Actual', fontweight='bold')
plt.tight_layout()
plt.show()

---
## 6. Model Comparison

In [None]:
# Create comparison dataframe: one row per model, one column per metric
comparison_df = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'Gradient Boosting'],
    'Train_Accuracy': [train_acc_lr, train_acc_dt, train_acc_rf, train_acc_gb],
    'Test_Accuracy': [test_acc_lr, test_acc_dt, test_acc_rf, test_acc_gb],
    'Precision': [precision_lr, precision_dt, precision_rf, precision_gb],
    'Recall': [recall_lr, recall_dt, recall_rf, recall_gb],
    'F1_Score': [f1_lr, f1_dt, f1_rf, f1_gb],
    'ROC_AUC': [roc_auc_lr, roc_auc_dt, roc_auc_rf, roc_auc_gb]
})

# Calculate overfitting as the gap between training and test accuracy
comparison_df['Overfitting'] = comparison_df['Train_Accuracy'] - comparison_df['Test_Accuracy']

print("\n" + "="*100)
print("MODEL COMPARISON")
print("="*100)
print(comparison_df.to_string(index=False))

# Highlight best models: row label of the top score for each headline metric
best_acc_idx = comparison_df['Test_Accuracy'].idxmax()
best_f1_idx = comparison_df['F1_Score'].idxmax()
best_auc_idx = comparison_df['ROC_AUC'].idxmax()

print("\n🏆 BEST MODELS:")
print(f"   Highest Accuracy: {comparison_df.loc[best_acc_idx, 'Model']} ({comparison_df.loc[best_acc_idx, 'Test_Accuracy']:.4f})")
print(f"   Highest F1-Score: {comparison_df.loc[best_f1_idx, 'Model']} ({comparison_df.loc[best_f1_idx, 'F1_Score']:.4f})")
print(f"   Highest ROC-AUC: {comparison_df.loc[best_auc_idx, 'Model']} ({comparison_df.loc[best_auc_idx, 'ROC_AUC']:.4f})")

In [None]:
# 2x2 dashboard: three horizontal-bar panels (accuracy, F1, ROC-AUC) and one
# grouped-bar panel contrasting precision with recall.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# The three single-metric panels differ only in the column plotted, the
# colour, the labels, and the x-axis range.
for panel_ax, metric_col, bar_color, axis_label, panel_title, x_range in (
    (axes[0, 0], 'Test_Accuracy', 'skyblue', 'Accuracy', 'Test Accuracy Comparison', [0.7, 1.0]),
    (axes[0, 1], 'F1_Score', 'lightcoral', 'F1-Score', 'F1-Score Comparison', [0.5, 1.0]),
    (axes[1, 0], 'ROC_AUC', 'lightgreen', 'ROC-AUC', 'ROC-AUC Comparison', [0.7, 1.0]),
):
    panel_ax.barh(comparison_df['Model'], comparison_df[metric_col], color=bar_color)
    panel_ax.set_xlabel(axis_label, fontweight='bold')
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.set_xlim(x_range)
    # Write each score just to the right of its bar.
    for i, v in enumerate(comparison_df[metric_col]):
        panel_ax.text(v + 0.005, i, f'{v:.4f}', va='center')

# Precision-Recall trade-off: two bars per model, centred on the model's tick.
x = np.arange(len(comparison_df))
width = 0.35
pr_ax = axes[1, 1]
pr_ax.bar(x - width/2, comparison_df['Precision'], width, label='Precision', color='mediumpurple')
pr_ax.bar(x + width/2, comparison_df['Recall'], width, label='Recall', color='orange')
pr_ax.set_ylabel('Score', fontweight='bold')
pr_ax.set_title('Precision vs Recall', fontsize=14, fontweight='bold')
pr_ax.set_xticks(x)
pr_ax.set_xticklabels(comparison_df['Model'], rotation=45, ha='right')
pr_ax.legend()
pr_ax.set_ylim([0, 1])

plt.tight_layout()
plt.show()

---
## 7. ROC Curve Analysis

In [None]:
# Overlay the test-set ROC curve of every model on one figure.
# The curve points are computed first; roc_curve's third return value
# (the thresholds) is not needed for plotting.
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_test_proba_lr)
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_test_proba_dt)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_test_proba_rf)
fpr_gb, tpr_gb, _ = roc_curve(y_test, y_test_proba_gb)

plt.figure(figsize=(10, 8))

for curve_name, curve_fpr, curve_tpr, curve_auc in (
    ('Logistic Regression', fpr_lr, tpr_lr, roc_auc_lr),
    ('Decision Tree', fpr_dt, tpr_dt, roc_auc_dt),
    ('Random Forest', fpr_rf, tpr_rf, roc_auc_rf),
    ('Gradient Boosting', fpr_gb, tpr_gb, roc_auc_gb),
):
    plt.plot(curve_fpr, curve_tpr, label=f'{curve_name} (AUC = {curve_auc:.4f})', linewidth=2)

# Dashed diagonal: the reference line of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier (AUC = 0.5000)', linewidth=1)

plt.xlabel('False Positive Rate', fontsize=12, fontweight='bold')
plt.ylabel('True Positive Rate', fontsize=12, fontweight='bold')
plt.title('ROC Curves - Model Comparison', fontsize=14, fontweight='bold')
plt.legend(loc='lower right', fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

---
## 8. Feature Importance Analysis

In [None]:
# Rank the encoded features by the fitted random forest's importances,
# highest first.
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': rf_model.feature_importances_}
).sort_values(by='Importance', ascending=False)

# The twenty highest-ranked features feed both the table and the chart.
top_features = feature_importance.head(20)

print("FEATURE IMPORTANCE (Random Forest) - Top 20", "=" * 70, sep="\n")
print(top_features.to_string(index=False))

# Horizontal bar chart of the same twenty features.
bar_positions = range(len(top_features))
plt.figure(figsize=(12, 8))
plt.barh(bar_positions, top_features['Importance'], color='steelblue')
plt.yticks(bar_positions, top_features['Feature'])
plt.xlabel('Importance Score', fontsize=12, fontweight='bold')
plt.title('Top 20 Feature Importance (Random Forest)', fontsize=14, fontweight='bold')
# Flip the y-axis so the most important feature sits at the top.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

---
## 9. Summary and Business Insights

In [None]:
# Final report: dataset sizes, the model with the highest ROC-AUC, headline
# insights, and canned recommendations driven by that model's scores.
print("\n" + "="*80)
print("SESSION 6: CLASSIFICATION ANALYSIS - SUMMARY")
print("="*80)

print("\n📊 DATASET:")
print(f"   • Total samples: {len(df):,}")
print(f"   • After preprocessing: {len(df_processed):,}")
print(f"   • Features (after encoding): {X.shape[1]}")
print(f"   • Training samples: {len(X_train):,}")
print(f"   • Test samples: {len(X_test):,}")

print("\n🎯 MODELS TESTED:")
print("   1. Logistic Regression")
print("   2. Decision Tree")
print("   3. Random Forest")
print("   4. Gradient Boosting")

# Every figure below refers to the comparison row with the highest ROC-AUC.
print("\n🏆 BEST MODEL (by ROC-AUC):")
best_idx = comparison_df['ROC_AUC'].idxmax()
best_model = comparison_df.loc[best_idx, 'Model']
print(f"   • Model: {best_model}")
print(f"   • Test Accuracy: {comparison_df.loc[best_idx, 'Test_Accuracy']:.4f}")
print(f"   • Precision: {comparison_df.loc[best_idx, 'Precision']:.4f}")
print(f"   • Recall: {comparison_df.loc[best_idx, 'Recall']:.4f}")
print(f"   • F1-Score: {comparison_df.loc[best_idx, 'F1_Score']:.4f}")
print(f"   • ROC-AUC: {comparison_df.loc[best_idx, 'ROC_AUC']:.4f}")

print("\n📈 KEY INSIGHTS:")
# feature_importance is sorted in descending order, so head(3) is the top 3.
top_3_features = feature_importance.head(3)['Feature'].tolist()
print(f"   • Top 3 predictive features: {', '.join(top_3_features)}")
print(f"   • Model can predict income bracket with {comparison_df.loc[best_idx, 'Test_Accuracy']*100:.2f}% accuracy")
print(f"   • Precision: {comparison_df.loc[best_idx, 'Precision']*100:.1f}% (of predicted high-earners, this % are correct)")
print(f"   • Recall: {comparison_df.loc[best_idx, 'Recall']*100:.1f}% (of actual high-earners, this % are identified)")

print("\n💼 BUSINESS APPLICATIONS:")
print("   ✓ Targeted marketing for high-income individuals")
print("   ✓ Credit risk assessment and loan approvals")
print("   ✓ Customer segmentation for premium services")
print("   ✓ Resource allocation for customer acquisition")

print("\n💡 RECOMMENDATIONS:")
# A train/test accuracy gap above 0.05 is reported as overfitting.
if comparison_df.loc[best_idx, 'Overfitting'] > 0.05:
    print("   ⚠️  Model shows overfitting - consider regularization or pruning")
else:
    print("   ✅ Model generalizes well to unseen data")

# ROC-AUC bands: above 0.85, above 0.75, and everything else.
if comparison_df.loc[best_idx, 'ROC_AUC'] > 0.85:
    print("   ✅ Excellent discrimination ability - production-ready")
elif comparison_df.loc[best_idx, 'ROC_AUC'] > 0.75:
    print("   ⚠️  Good performance - consider feature engineering for improvement")
else:
    print("   ⚠️  Model needs improvement - explore advanced techniques")

print("\n🎓 SKILLS DEMONSTRATED:")
print("   ✅ Binary classification problem formulation")
print("   ✅ Data preprocessing and encoding")
print("   ✅ Handling class imbalance")
print("   ✅ Multiple classifier algorithms")
print("   ✅ Comprehensive evaluation metrics")
print("   ✅ ROC-AUC analysis")
print("   ✅ Feature importance interpretation")
print("   ✅ Model comparison and selection")
print("   ✅ Business insights extraction")

print("\n" + "="*80)

In [None]:
# Save results
import os

# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing into it.
os.makedirs('../../data/outputs', exist_ok=True)

comparison_df.to_csv('../../data/outputs/session_6_model_comparison.csv', index=False)
feature_importance.to_csv('../../data/outputs/session_6_feature_importance.csv', index=False)

print("✅ Results saved to data/outputs/")
print("   • session_6_model_comparison.csv")
print("   • session_6_feature_importance.csv")