## Load & Inspect the Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier
from sklearn.model_selection import ParameterSampler
from tqdm.notebook import tqdm

# Load dataset
df = pd.read_csv("prosperLoanData.csv")

# Drop rows with no ProsperScore
df = df.dropna(subset=["ProsperScore"])
df["ProsperScore"] = df["ProsperScore"].astype(int)


## Data Preprocessing

In [2]:
drop_cols = [
    "ListingKey", "ListingNumber", "LoanKey", "LoanNumber", "MemberKey", "GroupKey",
    "LoanOriginationDate", "ClosedDate", "ListingCreationDate", "DateCreditPulled",
    "FirstRecordedCreditLine"
]
df = df.drop(columns=drop_cols, errors="ignore")

cat_cols = ["CreditGrade", "ProsperRating (Alpha)", "BorrowerState", "Occupation", "EmploymentStatus", "IncomeRange"]
num_cols = ["StatedMonthlyIncome", "DebtToIncomeRatio", "DelinquenciesLast7Years"]

df = df[cat_cols + num_cols + ["ProsperScore"]]
df = df.dropna(thresh=5)  # Keep rows with at least 5 non-null values
df = df.dropna(subset=["ProsperScore"])  # Ensure target is not null
df = df.fillna("Unknown")  # Fill remaining categorical NAs
df = df[cat_cols + num_cols + ["ProsperScore"]]

X = pd.get_dummies(df.drop("ProsperScore", axis=1), drop_first=True)
y = df["ProsperScore"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

y_train0 = y_train - 1  # Shift labels to start from 0
y_test0 = y_test - 1




## Model Training

In [3]:
sample_weights = compute_sample_weight("balanced", y_train0)

param_dist = {
    "n_estimators": [100, 200],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}

base_params = dict(
    objective="multi:softprob",
    eval_metric="mlogloss",
    num_class=y.nunique(),
    n_jobs=-1
)

best_score = -1
best_params = None

for trial in tqdm(list(ParameterSampler(param_dist, n_iter=10, random_state=42)), desc="Hyperparam tuning"):
    model = XGBClassifier(**{**base_params, **trial})
    model.fit(X_train, y_train0, sample_weight=sample_weights, eval_set=[(X_test, y_test0)], verbose=False)
    score = model.score(X_test, y_test0)
    if score > best_score:
        best_score = score
        best_params = {**base_params, **trial}

final_model = XGBClassifier(**best_params)
final_model.fit(X_train, y_train0, sample_weight=sample_weights)
y_pred_raw = final_model.predict(X_test)




Hyperparam tuning:   0%|          | 0/10 [00:00<?, ?it/s]

## Model Evaluation

In [None]:
# Custom relaxed accuracy: allow ±2 off
def relaxed_accuracy(y_true, y_pred, margin=2):
    return np.mean(np.abs(y_true - y_pred) <= margin)

relaxed_acc = relaxed_accuracy(y_test0, y_pred_raw)

print("✅ Relaxed Accuracy (±2 allowed):", round(relaxed_acc, 4))
print("\n🧠 Standard Accuracy:", accuracy_score(y_test0, y_pred_raw))
print("\n📝 Classification Report:")
print(classification_report(y_test0, y_pred_raw))


✅ Relaxed Accuracy (±2 allowed): 0.7585

🧠 Standard Accuracy: 0.2814212480113134

📝 Classification Report:
              precision    recall  f1-score   support

           0       0.11      0.54      0.18       198
           1       0.27      0.43      0.33      1153
           2       0.21      0.24      0.23      1528
           3       0.31      0.27      0.29      2519
           4       0.26      0.08      0.12      1963
           5       0.30      0.12      0.18      2456
           6       0.28      0.25      0.27      2120
           7       0.34      0.36      0.35      2411
           8       0.34      0.53      0.42      1382
           9       0.49      0.41      0.45       950
          10       0.11      0.43      0.18       291

    accuracy                           0.28     16971
   macro avg       0.27      0.33      0.27     16971
weighted avg       0.30      0.28      0.27     16971



In [5]:
from xgboost import Booster
import json

# Save the XGBoost model using native method
final_model.get_booster().save_model("risk_model.json")

# Save the feature column order as JSON
with open("feature_columns.json", "w") as f:
    json.dump(X.columns.tolist(), f)
