# Model Evaluation

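Evaluate the male and female models on their held-out test sets using accuracy, precision, recall, F1-score, and AUC-ROC, then visualize the results with confusion matrices, ROC curves, a side-by-side metric comparison, and feature-importance charts.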

### Step 1: Load Trained Models

In [None]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, roc_curve, classification_report)

# Load the serialized bundle that holds both models and their test splits.
# NOTE(review): joblib.load unpickles arbitrary objects, so this file must
# come from a trusted source.
training_data = joblib.load('models/training_data.pkl')

# Unpack the bundle into the names used by the rest of the notebook.
male_model, female_model = training_data['male_model'], training_data['female_model']
X_test_male, X_test_female = training_data['X_test_male'], training_data['X_test_female']
y_test_male, y_test_female = training_data['y_test_male'], training_data['y_test_female']
feature_cols = training_data['feature_cols']

print('✓ Models loaded successfully!')

### Step 2: Generate Predictions

In [None]:
# Male model: predicted labels, then column 1 of the class-probability matrix
# (presumably the positive class — confirm against male_model.classes_).
y_pred_male, y_pred_proba_male = (
    male_model.predict(X_test_male),
    male_model.predict_proba(X_test_male)[:, 1],
)

# Female model: same two outputs, computed on the female test split.
y_pred_female, y_pred_proba_female = (
    female_model.predict(X_test_female),
    female_model.predict_proba(X_test_female)[:, 1],
)

print('✓ Predictions generated')

### Step 3: Calculate Male Model Metrics

In [None]:
# Section banner: rule, title, rule
print('═' * 60, 'MALE MODEL PERFORMANCE', '═' * 60, sep='\n')

# The four label-based metrics share the (y_true, y_pred) signature, so they
# are evaluated in one pass; the order matches the names on the left.
male_accuracy, male_precision, male_recall, male_f1 = (
    metric(y_test_male, y_pred_male)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
# AUC is computed from the predicted probabilities rather than the hard labels.
male_auc = roc_auc_score(y_test_male, y_pred_proba_male)

print(
    f'Accuracy:   {male_accuracy:.4f} ({male_accuracy*100:.2f}%)',
    f'Precision:  {male_precision:.4f}',
    f'Recall:     {male_recall:.4f}',
    f'F1-Score:   {male_f1:.4f}',
    f'AUC-ROC:    {male_auc:.4f}',
    sep='\n',
)

# Per-class precision / recall / F1 / support table
print('\nDetailed Classification Report:')
print(classification_report(y_test_male, y_pred_male))

### Step 4: Calculate Female Model Metrics

In [None]:
# Section banner: rule, title, rule
print('═' * 60, 'FEMALE MODEL PERFORMANCE', '═' * 60, sep='\n')

# The four label-based metrics share the (y_true, y_pred) signature, so they
# are evaluated in one pass; the order matches the names on the left.
female_accuracy, female_precision, female_recall, female_f1 = (
    metric(y_test_female, y_pred_female)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
# AUC is computed from the predicted probabilities rather than the hard labels.
female_auc = roc_auc_score(y_test_female, y_pred_proba_female)

print(
    f'Accuracy:   {female_accuracy:.4f} ({female_accuracy*100:.2f}%)',
    f'Precision:  {female_precision:.4f}',
    f'Recall:     {female_recall:.4f}',
    f'F1-Score:   {female_f1:.4f}',
    f'AUC-ROC:    {female_auc:.4f}',
    sep='\n',
)

# Per-class precision / recall / F1 / support table
print('\nDetailed Classification Report:')
print(classification_report(y_test_female, y_pred_female))

### Step 5: Confusion Matrix Visualization

In [None]:
# savefig does not create missing parent directories, and this is the first
# cell that writes into figures/, so make sure the directory exists up front.
os.makedirs('figures', exist_ok=True)

# One row, two panels: male model on the left, female model on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Male confusion matrix (rows = actual class, columns = predicted class)
cm_male = confusion_matrix(y_test_male, y_pred_male)
sns.heatmap(cm_male, annot=True, fmt='d', cmap='Blues', ax=axes[0], cbar=False)
axes[0].set_title('Male Model - Confusion Matrix', fontweight='bold', fontsize=12)
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('Actual')

# Female confusion matrix (rows = actual class, columns = predicted class)
cm_female = confusion_matrix(y_test_female, y_pred_female)
sns.heatmap(cm_female, annot=True, fmt='d', cmap='Oranges', ax=axes[1], cbar=False)
axes[1].set_title('Female Model - Confusion Matrix', fontweight='bold', fontsize=12)
axes[1].set_xlabel('Predicted')
axes[1].set_ylabel('Actual')

# Save before show() so the written file contains the fully laid-out figure
plt.tight_layout()
plt.savefig('figures/confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()
print('✓ Confusion matrix visualization saved!')

### Step 6: ROC Curve Visualization

In [None]:
def _draw_roc_panel(panel, fpr, tpr, auc, curve_color, panel_title):
    """Draw one ROC curve plus the chance diagonal on the given axes."""
    panel.plot(fpr, tpr, color=curve_color, lw=2, label=f'AUC = {auc:.3f}')
    # Diagonal reference line, labelled 'Random' in the legend
    panel.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--', label='Random')
    panel.set_xlabel('False Positive Rate')
    panel.set_ylabel('True Positive Rate')
    panel.set_title(panel_title, fontweight='bold', fontsize=12)
    panel.legend()
    panel.grid(alpha=0.3)


# One row, two panels: male model on the left, female model on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Male ROC curve (the thresholds returned by roc_curve are not needed)
fpr_male, tpr_male, _ = roc_curve(y_test_male, y_pred_proba_male)
_draw_roc_panel(axes[0], fpr_male, tpr_male, male_auc,
                'steelblue', 'Male Model - ROC Curve')

# Female ROC curve
fpr_female, tpr_female, _ = roc_curve(y_test_female, y_pred_proba_female)
_draw_roc_panel(axes[1], fpr_female, tpr_female, female_auc,
                'salmon', 'Female Model - ROC Curve')

plt.tight_layout()
plt.savefig('figures/roc_curves.png', dpi=300, bbox_inches='tight')
plt.show()
print('✓ ROC curve visualization saved!')

### Step 7: Performance Comparison

In [None]:
# Collect the five headline metrics of both models into one table
comparison_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC'],
    'Male Model': [male_accuracy, male_precision, male_recall, male_f1, male_auc],
    'Female Model': [female_accuracy, female_precision, female_recall, female_f1, female_auc]
})

print('═' * 60)
print('PERFORMANCE COMPARISON')
print('═' * 60)
print(comparison_df.to_string(index=False))

# Grouped bar chart: one pair of bars (male, female) per metric
fig, ax = plt.subplots(figsize=(10, 6))

x = np.arange(len(comparison_df))
width = 0.35

# Shift each series half a bar width off the tick so the pair sits side by side
bars1 = ax.bar(x - width/2, comparison_df['Male Model'], width, label='Male Model', color='steelblue')
bars2 = ax.bar(x + width/2, comparison_df['Female Model'], width, label='Female Model', color='salmon')

ax.set_ylabel('Score')
ax.set_title('Model Performance Comparison', fontweight='bold', fontsize=14)
ax.set_xticks(x)
ax.set_xticklabels(comparison_df['Metric'])
ax.legend()
ax.grid(alpha=0.3, axis='y')

# Keep the zoomed-in 0.7–1.0 view when every score is above 0.7. Otherwise a
# fixed 0.7 floor would hide or truncate the weaker bars, so lower the floor
# to just below the smallest score (never below 0).
lowest_score = float(comparison_df[['Male Model', 'Female Model']].to_numpy().min())
y_floor = 0.7 if lowest_score > 0.7 else max(0.0, lowest_score - 0.05)
ax.set_ylim(y_floor, 1.0)

plt.tight_layout()
plt.savefig('figures/performance_comparison.png', dpi=300, bbox_inches='tight')
plt.show()
print('\n✓ Performance comparison visualization saved!')

### Step 8: Feature Importance

In [None]:
def _top10_importance(model):
    """Return the ten highest-importance features of *model* as a DataFrame."""
    ranked = pd.DataFrame(
        {'Feature': feature_cols, 'Importance': model.feature_importances_}
    ).sort_values('Importance', ascending=False)
    return ranked.head(10)


# Top-10 importance tables, one per model
male_importance = _top10_importance(male_model)
female_importance = _top10_importance(female_model)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (axes, table, bar colour, title) for the male and female panels, in draw order
panel_specs = (
    (axes[0], male_importance, 'steelblue', 'Male Model - Top 10 Important Features'),
    (axes[1], female_importance, 'salmon', 'Female Model - Top 10 Important Features'),
)
for panel_ax, top_features, bar_color, panel_title in panel_specs:
    panel_ax.barh(top_features['Feature'], top_features['Importance'], color=bar_color)
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.set_xlabel('Importance')
    # Flip the y-axis so the most important feature is drawn at the top
    panel_ax.invert_yaxis()

plt.tight_layout()
plt.savefig('figures/feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()
print('✓ Feature importance visualization saved!')

### Step 9: Save Evaluation Results

In [None]:
# DataFrame.to_csv raises if the target directory is missing, so create
# outputs/ before writing anything into it.
os.makedirs('outputs', exist_ok=True)

# Save the metric comparison table
comparison_df.to_csv('outputs/model_performance_comparison.csv', index=False)

# Save the feature-importance tables (these hold only the top-10 rows kept
# when the tables were built)
male_importance.to_csv('outputs/male_feature_importance.csv', index=False)
female_importance.to_csv('outputs/female_feature_importance.csv', index=False)

print('═' * 60)
print('EVALUATION COMPLETE')
print('═' * 60)
print('✓ All evaluation results saved!')
print('\nNext: Proceed to 06_SHAP_Explainability.ipynb for model interpretation.')

### Next Notebook

Proceed to **06_SHAP_Explainability.ipynb** for detailed model interpretation using SHAP values.