In [None]:
# STEP 1: Install Required Libraries
%pip install imbalanced-learn fairlearn matplotlib seaborn --quiet

# STEP 2: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE
from fairlearn.metrics import MetricFrame, selection_rate, true_positive_rate
import warnings

# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation warnings from pandas / seaborn / scikit-learn.
warnings.filterwarnings("ignore")
sns.set(style="whitegrid")

# STEP 3: Load Dataset
file_path = 'train.csv'
data = pd.read_csv(file_path)
# Column headers may carry stray whitespace; normalise them so the
# name-based lookups below ('Loan_Status', 'Gender', ...) are reliable.
data.columns = data.columns.str.strip()

# 🧭 Initial Exploration
print("📄 Dataset Preview:")
display(data.head())

print("\n📊 Data Types and Missing Values:")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in display() (that would emit an extra "None").
data.info()

# 🔍 Missing Value Visualization
# Each highlighted cell marks a missing entry in the raw data.
plt.figure(figsize=(10, 4))
sns.heatmap(data.isnull(), cbar=False, cmap='viridis')
plt.title("Missing Value Heatmap")
plt.show()

# 📊 Class Balance Visualization
# Name the column explicitly via x=/data=: recent seaborn releases treat the
# first positional argument as `data`, not `x`, so passing the Series
# positionally no longer plots per-class counts.
plt.figure(figsize=(5, 3))
sns.countplot(x='Loan_Status', data=data, palette='Set2')
plt.title("Loan Approval Distribution")
plt.show()

# 🧠 Preprocessing
# Map the '3+' bucket to 3 so 'Dependents' can be converted to a numeric
# column; anything else that is not numeric becomes NaN and is imputed below.
data['Dependents'] = data['Dependents'].replace('3+', 3)
data['Dependents'] = pd.to_numeric(data['Dependents'], errors='coerce')

# Impute the categorical / flag columns with their most frequent value FIRST.
# 'Credit_History' is numeric, so if the numeric mean-fill ran before this
# loop it would receive a fractional mean and the mode fill would be a no-op.
# Assign the result back instead of using fillna(inplace=True) on a column
# selection, which is deprecated chained assignment in pandas 2.x and does
# not update the frame under Copy-on-Write.
for col in ['Gender', 'Married', 'Self_Employed', 'Credit_History']:
    data[col] = data[col].fillna(data[col].mode()[0])

# Remaining numeric gaps are filled with the column mean.
# NOTE(review): these statistics are computed on the full dataset before the
# train/test split — confirm whether that leakage is acceptable here.
data = data.fillna(data.mean(numeric_only=True))

# 🧬 Encode Categorical Columns
# A single encoder is re-fitted per column, so after the loop `label_encoder`
# only remembers the classes of the last column ('Loan_Status').
label_encoder = LabelEncoder()
for col in ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']:
    data[col] = label_encoder.fit_transform(data[col])

# 🎯 Features & Target
# Everything except the row identifier and the label is used as a feature;
# the encoded 'Gender' column doubles as the sensitive attribute.
target_col = 'Loan_Status'
X = data.drop(['Loan_ID', target_col], axis=1)
y = data[target_col]
sensitive_attr = data['Gender']

# 🧪 Train-Test Split
# Splitting all three objects in one call keeps their rows aligned.
split_parts = train_test_split(X, y, sensitive_attr, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test, gender_train, gender_test = split_parts

# ⚖️ Feature Scaling
# Fit the scaler on the training rows only, then apply it to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# 🤖 Model Training (Before SMOTE)
# Baseline: logistic regression fitted on the training data as-is.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"\n📊 Original Accuracy: {accuracy:.2f}")
print("🧮 Confusion Matrix (Before SMOTE):")
baseline_cm_display = ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
baseline_cm_display.plot(cmap='Blues')
plt.title("Confusion Matrix Before SMOTE")
plt.show()

# 📈 Fairness Evaluation (Before SMOTE)
# Each metric is evaluated per value of the test-set sensitive attribute.
baseline_metrics = {
    'accuracy': accuracy_score,
    'selection_rate': selection_rate,
    'true_positive_rate': true_positive_rate,
}
metric_frame = MetricFrame(
    metrics=baseline_metrics,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=gender_test,
)

print("\n📊 Fairness Metrics (Before SMOTE):")
display(metric_frame.by_group)

# 💉 Apply SMOTE
# Only the training partition is resampled; predictions below are still made
# on the untouched X_test.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled

# 🎯 Train Model After SMOTE
model_resampled = LogisticRegression(max_iter=1000).fit(X_train_resampled, y_train_resampled)
y_pred_resampled = model_resampled.predict(X_test)

conf_matrix_res = confusion_matrix(y_test, y_pred_resampled)
accuracy_res = accuracy_score(y_test, y_pred_resampled)

print(f"\n📊 Accuracy After SMOTE: {accuracy_res:.2f}")
print("🧮 Confusion Matrix (After SMOTE):")
resampled_cm_display = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_res)
resampled_cm_display.plot(cmap='Oranges')
plt.title("Confusion Matrix After SMOTE")
plt.show()

# 📈 Fairness Evaluation (After SMOTE)
# Same per-group metrics as for the baseline, now on the resampled model.
resampled_metrics = {
    'accuracy': accuracy_score,
    'selection_rate': selection_rate,
    'true_positive_rate': true_positive_rate,
}
metric_frame_res = MetricFrame(
    metrics=resampled_metrics,
    y_true=y_test,
    y_pred=y_pred_resampled,
    sensitive_features=gender_test,
)

print("\n📊 Fairness Metrics (After SMOTE):")
display(metric_frame_res.by_group)

# 📌 Feature Importance
# Uses the coefficients of the baseline (pre-SMOTE) model. Values are signed
# and sorted from most positive to most negative, not by magnitude.
feature_names = X.columns
importance = model.coef_[0]
feature_importance = pd.Series(data=importance, index=feature_names)
feature_importance = feature_importance.sort_values(ascending=False)

print("\n📌 Feature Importance:")
display(feature_importance)

plt.figure(figsize=(8, 5))
importance_ax = sns.barplot(
    x=feature_importance.values,
    y=feature_importance.index,
    palette='viridis',
)
importance_ax.set_title("Feature Importance from Logistic Regression")
importance_ax.set_xlabel("Importance")
importance_ax.set_ylabel("Feature")
plt.tight_layout()
plt.show()

# 📊 Visual Comparison of Fairness Metrics - Improved Side-by-Side Bars
before_by_group = metric_frame.by_group
# Align the post-SMOTE rows to the same group order so each pair of bars
# always describes the same group.
after_by_group = metric_frame_res.by_group.reindex(before_by_group.index)

# Derive tick labels from the groups actually present instead of hard-coding
# them: 'Gender' was label-encoded, and LabelEncoder numbers classes in sorted
# order (Female -> 0, Male -> 1). Unknown codes fall back to their raw value.
gender_labels = {0: 'Female', 1: 'Male'}
groups = [gender_labels.get(code, str(code)) for code in before_by_group.index]
x = np.arange(len(groups))  # label locations
width = 0.35  # width of the bars

# (metric column in by_group, panel title)
panels = [
    ('accuracy', 'Accuracy by Gender'),
    ('true_positive_rate', 'TPR by Gender'),
    ('selection_rate', 'Selection Rate by Gender'),
]

fig, axes = plt.subplots(1, len(panels), figsize=(18, 5))

for ax, (metric, title) in zip(axes, panels):
    ax.bar(x - width / 2, before_by_group[metric], width, label='Before SMOTE', color='skyblue')
    ax.bar(x + width / 2, after_by_group[metric], width, label='After SMOTE', color='salmon')
    ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels(groups)
    ax.set_ylim(0, 1)  # all three metrics are rates in [0, 1]
    ax.set_ylabel("Rate")
    ax.legend()

# No emoji in the figure title: matplotlib's default font has no glyph for it,
# so it would render as a missing-glyph box.
plt.suptitle("Fairness Metrics Comparison: Before vs After SMOTE", fontsize=16)
plt.tight_layout()
plt.show()

# ✅ Final Summary
# Recap of overall accuracy and the per-group fairness tables for both models.
summary_lines = [
    "\n✅ Summary:",
    f"Original Accuracy: {accuracy:.2f}",
    f"SMOTE Accuracy:    {accuracy_res:.2f}",
]
print("\n".join(summary_lines))

fairness_before = metric_frame.by_group
fairness_after = metric_frame_res.by_group
print("\nFairness Metrics Before SMOTE:\n", fairness_before)
print("\nFairness Metrics After SMOTE:\n", fairness_after)
