In [5]:
!pip install -q kaggle xgboost imbalanced-learn shap

# 1) Kaggle Setup

import os
from pathlib import Path

KAGGLE_DIR = Path('/root/.kaggle')
KAGGLE_DIR.mkdir(parents=True, exist_ok=True)
kaggle_json_path = KAGGLE_DIR / 'kaggle.json'

if not kaggle_json_path.exists():
    try:
        from google.colab import files
        print("Please upload your kaggle.json API token.")
        uploaded = files.upload()
        if 'kaggle.json' in uploaded:
            with open(kaggle_json_path, 'wb') as f:
                f.write(uploaded['kaggle.json'])
            os.chmod(kaggle_json_path, 0o600)
            print(" kaggle.json uploaded and saved.")
        else:
            print(" kaggle.json not uploaded.")
    except Exception as e:
        print(" Error uploading kaggle.json:", e)
else:
    print(" kaggle.json already present.")


# 2) Download dataset from Kaggle

dataset_slug = "laotse/credit-risk-dataset"
download_dir = Path("kaggle_dataset")
download_dir.mkdir(exist_ok=True)

import subprocess

def download_kaggle_dataset(slug, out_dir="kaggle_dataset"):
    try:
        cmd = ["kaggle", "datasets", "download", "-d", slug, "-p", out_dir, "--unzip"]
        subprocess.check_call(cmd)
        print(f" Downloaded and unzipped dataset: {slug}")
        return True
    except Exception as e:
        print(" Kaggle download failed:", e)
        return False

downloaded = download_kaggle_dataset(dataset_slug, str(download_dir))


# 3) Load dataset

import pandas as pd

data_files = list(download_dir.glob("*.csv"))
if not data_files:
    raise FileNotFoundError("No CSV dataset found. Please upload manually.")

df = pd.read_csv(data_files[0])
print(" Loaded:", data_files[0].name)
print("Shape:", df.shape)
display(df.head())


# 4) Inspect and clean data

print("\nMissing values:\n", df.isnull().sum())

target_col = "loan_status"

features = [
    "person_age",
    "person_income",
    "person_emp_length",
    "loan_amnt",
    "loan_int_rate",
    "loan_percent_income",
    "cb_person_cred_hist_length",
    "person_home_ownership",
    "loan_intent",
    "loan_grade",
    "cb_person_default_on_file",
]

X = df[features]
y = df[target_col]

# 5) Handle missing values manually

# Fill numerical missing with median, categorical with mode
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object", "category"]).columns

for col in num_cols:
    X[col].fillna(X[col].median(), inplace=True)

for col in cat_cols:
    X[col].fillna(X[col].mode()[0], inplace=True)

print(" Missing values handled.")


# 6) Encode categorical features manually

X = pd.get_dummies(X, columns=cat_cols, drop_first=True)
print(" Categorical encoding complete.")
print("New shape after encoding:", X.shape)

# 7) Scale numeric features manually

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
print(" Numerical features scaled.")


# 8) Train-test split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print("Train/Test shapes:", X_train.shape, X_test.shape)


# 9) Handle class imbalance (SMOTE)

from imblearn.over_sampling import SMOTE
from collections import Counter

print("Before SMOTE:", Counter(y_train))
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)
print("After SMOTE:", Counter(y_train_bal))


# 10) Train models

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix, classification_report
)

models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42, n_jobs=-1)
}

results = {}
fitted_models = {}

for name, model in models.items():
    print(f"\nðŸš€ Training {name} ...")
    model.fit(X_train_bal, y_train_bal)
    fitted_models[name] = model

    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_proba)

    results[name] = {"Accuracy": acc, "Precision": prec, "Recall": rec, "ROC_AUC": roc}

    print(f"{name} Results:")
    print(f"Accuracy={acc:.3f}, Precision={prec:.3f}, Recall={rec:.3f}, ROC_AUC={roc:.3f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))


# 11) Compare models and save the best

best_model_name = max(results, key=lambda m: results[m]["ROC_AUC"])
best_model = fitted_models[best_model_name]
print(f"\n Best model: {best_model_name}")
print(results[best_model_name])

import joblib
joblib.dump({"scaler": scaler, "model": best_model, "columns": X.columns.tolist()}, "best_credit_model_manual.joblib")
print(" Saved best model to best_credit_model_manual.joblib")





 kaggle.json already present.
 Downloaded and unzipped dataset: laotse/credit-risk-dataset
 Loaded: credit_risk_dataset.csv
Shape: (32581, 12)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4



Missing values:
 person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64
 Missing values handled.
 Categorical encoding complete.
New shape after encoding: (32581, 22)
 Numerical features scaled.
Train/Test shapes: (26064, 22) (6517, 22)
Before SMOTE: Counter({0: 20378, 1: 5686})


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col].fillna(X[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) inst

After SMOTE: Counter({0: 20378, 1: 20378})

ðŸš€ Training LogisticRegression ...
LogisticRegression Results:
Accuracy=0.812, Precision=0.549, Recall=0.771, ROC_AUC=0.867
Confusion Matrix:
 [[4196  899]
 [ 326 1096]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.82      0.87      5095
           1       0.55      0.77      0.64      1422

    accuracy                           0.81      6517
   macro avg       0.74      0.80      0.76      6517
weighted avg       0.85      0.81      0.82      6517


ðŸš€ Training RandomForest ...
RandomForest Results:
Accuracy=0.925, Precision=0.898, Recall=0.738, ROC_AUC=0.926
Confusion Matrix:
 [[4976  119]
 [ 372 1050]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.98      0.95      5095
           1       0.90      0.74      0.81      1422

    accuracy                           0.92      6517
   macro avg       0.91      0

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Results:
Accuracy=0.932, Precision=0.950, Recall=0.728, ROC_AUC=0.948
Confusion Matrix:
 [[5040   55]
 [ 387 1035]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.99      0.96      5095
           1       0.95      0.73      0.82      1422

    accuracy                           0.93      6517
   macro avg       0.94      0.86      0.89      6517
weighted avg       0.93      0.93      0.93      6517


 Best model: XGBoost
{'Accuracy': 0.9321773822310879, 'Precision': 0.9495412844036697, 'Recall': 0.7278481012658228, 'ROC_AUC': np.float64(0.9476116928844225)}
 Saved best model to best_credit_model_manual.joblib


In [6]:

import joblib
import numpy as np
import pandas as pd

# Load your trained model
saved = joblib.load("best_credit_model_manual.joblib")
model = saved["model"]
scaler = saved["scaler"]
columns = saved["columns"]

# Example applicant data
applicant_data = {
    "person_age": 28,
    "person_income": 55000,
    "person_emp_length": 5.0,
    "loan_amnt": 10000,
    "loan_int_rate": 12.5,
    "loan_percent_income": 0.18,
    "cb_person_cred_hist_length": 5,
    "person_home_ownership": "RENT",
    "loan_intent": "PERSONAL",
    "loan_grade": "C",
    "cb_person_default_on_file": "N"
}

# Convert to DataFrame
applicant_df = pd.DataFrame([applicant_data])

# One-hot encode with same columns as training
applicant_df = pd.get_dummies(applicant_df)
for col in columns:
    if col not in applicant_df.columns:
        applicant_df[col] = 0
applicant_df = applicant_df[columns]

# Scale numeric columns
num_cols = ["person_age", "person_income", "person_emp_length",
            "loan_amnt", "loan_int_rate", "loan_percent_income",
            "cb_person_cred_hist_length"]
applicant_df[num_cols] = scaler.transform(applicant_df[num_cols])

# Predict probability
prob_default = model.predict_proba(applicant_df)[:, 1][0]

# Try different thresholds
for threshold in [0.3, 0.4, 0.5]:
    creditworthy = "YES" if prob_default < threshold else "NO"
    print(f"Threshold={threshold:.1f} â†’ Probability of Default={prob_default:.3f} â†’ Creditworthy? {creditworthy}")


Threshold=0.3 â†’ Probability of Default=0.068 â†’ Creditworthy? YES
Threshold=0.4 â†’ Probability of Default=0.068 â†’ Creditworthy? YES
Threshold=0.5 â†’ Probability of Default=0.068 â†’ Creditworthy? YES
