# Adult Income Prediction - Model Evaluation

This notebook evaluates the trained income prediction model, including performance metrics, feature importance, and error analysis.

In [None]:
# Import libraries
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '..'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from src.data_loader import load_raw_data
from src.train import preprocess_data
from src.predict import load_model
from src.config import config

# Set style
# Global plot appearance for every figure in this notebook.
# The 'seaborn-v0_8' style-sheet name requires matplotlib 3.6 or newer.
# NOTE(review): the palette is set after the style sheet, presumably so the
# style sheet's own colour cycle does not override it — keep this order.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

In [None]:
# Load data and model
# Read the raw dataset and run it through preprocess_data from src.train.
# NOTE(review): every metric below is computed on all rows returned by
# load_raw_data(); if that includes the rows the model was trained on, the
# scores will be optimistic — confirm whether a held-out split should be used.
df_raw = load_raw_data()
df_processed = preprocess_data(df_raw)

# Separate the target column (config.TARGET) from the feature columns
y = df_processed[config.TARGET]
X = df_processed.drop(columns=config.TARGET)

# Restore the persisted model and report what was loaded
model = load_model()
print("Model loaded successfully", f"Model type: {type(model)}", sep="\n")

In [None]:
# Make predictions
# Hard class labels for every row of X
y_pred = model.predict(X)

# Probability scores: column 1 of predict_proba is kept as the score used by
# the ROC cell. Assumes a binary model whose second class is the positive
# ('>50K') class — TODO confirm against model.classes_.
class_probabilities = model.predict_proba(X)
y_pred_proba = class_probabilities[:, 1]

# Quick sanity check on the output sizes
print("Predictions completed")
print(f"Predicted classes shape: {y_pred.shape}")
print(f"Predicted probabilities shape: {y_pred_proba.shape}")

In [None]:
# Classification report
# Per-class precision / recall / F1, labelled with the two income brackets.
class_names = ['<=50K', '>50K']
report_text = classification_report(y, y_pred, target_names=class_names)
print("Classification Report:")
print(report_text)

# Accuracy
# Fraction of rows whose predicted label equals the true label.
is_correct = y_pred == y
accuracy = is_correct.mean()
print(f"Overall Accuracy: {accuracy:.4f}")

In [None]:
# Confusion matrix
# Counts of (actual, predicted) label pairs over the evaluated rows.
cm = confusion_matrix(y, y_pred)
class_names = ['<=50K', '>50K']

# Draw on an explicit Axes: y-axis is the actual class, x-axis the predicted one.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# ROC curve
# Sweep the decision threshold over the predicted scores and summarise the
# resulting curve by its area (AUC).
fpr, tpr, thresholds = roc_curve(y, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))

# Model curve
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal reference line (equal true- and false-positive rates)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

# A little headroom above 1.0 so the curve does not touch the frame
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Feature importance
# Pull the two fitted steps out of the loaded pipeline: the final estimator
# (expected to be the Random Forest) and the preprocessing transformer.
rf_model = model.named_steps['classifier']
preprocessor = model.named_steps['preprocessor']

# Get feature names after preprocessing
# The preprocessor inside the loaded pipeline is ALREADY fitted. Do not call
# fit / fit_transform on it here: refitting on X would overwrite the state
# learned at training time (e.g. the one-hot categories), so the column names
# could stop lining up with the classifier's importances, and every later
# model.predict call would silently run through a changed preprocessor.
# transform() only applies the existing fit. X_transformed is kept so any
# later cell that references it keeps working.
X_transformed = preprocessor.transform(X)
feature_names = preprocessor.get_feature_names_out()

# Get feature importances
importances = rf_model.feature_importances_

# Guard against a silent misalignment between names and importances.
assert len(feature_names) == len(importances), (
    f"Feature name count ({len(feature_names)}) does not match "
    f"importance count ({len(importances)})"
)

# One row per transformed feature, most important first
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values('importance', ascending=False)

# Plot top 20 features
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=feature_importance_df.head(20), x='importance', y='feature', ax=ax)
ax.set_title('Top 20 Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()

print("Top 10 most important features:")
print(feature_importance_df.head(10))