In [None]:
# ==========================================
#   CREDIT SCORING MODEL - CODEALPHA TASK 1
# ==========================================

# Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, 
    f1_score, roc_auc_score
)

# ------------------------------------------
# 1. Build Synthetic Dataset (replace with real data as needed)
# ------------------------------------------

def make_synthetic_data(n_samples: int = 800, seed=42) -> pd.DataFrame:
    """Build a synthetic credit dataset with a rule-based ``default`` label.

    Args:
        n_samples: Number of rows to generate.
        seed: Seed for the random generator. A fixed seed makes the dataset
            (and therefore every metric computed from it) reproducible
            between runs; pass ``None`` for a different dataset each call.

    Returns:
        A DataFrame with the integer feature columns ``income``, ``debts``,
        ``payment_history`` and ``late_payments`` plus the binary target
        ``default`` (1 = risky, 0 = safe).
    """
    rng = np.random.default_rng(seed)

    # Upper bounds are exclusive, e.g. late_payments falls in 0..11.
    frame = pd.DataFrame({
        "income": rng.integers(20000, 150000, n_samples),
        "debts": rng.integers(0, 50000, n_samples),
        "payment_history": rng.integers(300, 850, n_samples),  # credit score style
        "late_payments": rng.integers(0, 12, n_samples),
    })

    # Synthetic label: a row is risky (1) as soon as ANY single warning sign
    # is present, otherwise safe (0).
    is_risky = (
        (frame["debts"] > 30000)
        | (frame["late_payments"] > 4)
        | (frame["payment_history"] < 550)
    )
    frame["default"] = np.where(is_risky, 1, 0)
    return frame


data = make_synthetic_data()

print("Dataset Preview:")
print(data.head())

# ------------------------------------------
# 2. Split Data
# ------------------------------------------

# Features are every column except the target label.
X = data.drop(columns="default")
y = data["default"]

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# risky/safe ratio the same in both splits, so the test metrics are not skewed
# by an unlucky draw when the classes are imbalanced. (Stratification needs at
# least two rows of each class.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ------------------------------------------
# 3. Feature Scaling
# ------------------------------------------

# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits; the test split never influences the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ------------------------------------------
# 4. Train and Evaluate 3 Models
# ------------------------------------------

# Seed shared by the estimators that use internal randomness. Without it the
# tree-based models give different scores on every run, which makes the
# comparison (and the "best model" pick) unstable.
RANDOM_STATE = 42

models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(
        max_depth=6, random_state=RANDOM_STATE
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=200, random_state=RANDOM_STATE
    ),
}

# Maps model name -> {metric name -> score on the held-out test split}.
results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)

    # Hard class predictions feed the threshold-based metrics; the predicted
    # probability of the positive class (column 1) feeds ROC-AUC.
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "ROC-AUC": roc_auc_score(y_test, y_prob),
    }

# ------------------------------------------
# 5. Display Results
# ------------------------------------------

# Print one report per model: a header line, one "metric: score" line per
# metric (four decimal places), then a blank separator line.
print("\nModel Performance Comparison:\n")
for model_name, metrics in results.items():
    report_lines = [f"--- {model_name} ---"]
    report_lines.extend(
        f"{metric_name}: {value:.4f}" for metric_name, value in metrics.items()
    )
    print("\n".join(report_lines), end="\n\n")

# ------------------------------------------
# 6. Best Model Selection
# ------------------------------------------
# Pick the model with the highest ROC-AUC; on a tie the model that appears
# first in `results` wins. `best_model` is the (name, metrics) pair.
best_name = max(results, key=lambda candidate: results[candidate]["ROC-AUC"])
best_model = (best_name, results[best_name])
print("Best Model:", best_model[0])
