# Jupyter Notebook

In [1]:
import pandas as pd

train = pd.read_csv("train.csv")

if "id" in train.columns:
    train = train.drop(columns=["id"])

## Feature preparation & scaling

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Separate target and features

# I guess makes sense to drop the "grade_subgrade" column since it has 30 unique values -> very many columns when one-hot encoding
# Or if not drop, then at least group the categories somehow. Maybe this is better. This way we at least account for this feature.
X = train.drop(columns=["loan_paid_back", "grade_subgrade"])
y = train["loan_paid_back"]

train_small = train.sample(100_000, random_state=42)
X_small = train_small.drop(columns=["loan_paid_back"])
y_small = train_small["loan_paid_back"]

# Categorical & numeric columns already detected earlier
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numeric_cols = X.select_dtypes(include=["float64", "int64"]).columns.tolist()

for col in categorical_cols:
    print(col, X[col].nunique())

# Preprocessor pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
    ]
)

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(
    X_small, y_small, test_size=0.2, random_state=42
)

# Fit preprocessing
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)

X_train_processed.shape, X_val_processed.shape


gender 3
marital_status 4
education_level 5
employment_status 5
loan_purpose 8


((80000, 30), (20000, 30))

## KNN Classifier

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.pipeline import Pipeline

k_values = [75,125,250]

results_knn = {}

for k in k_values:
    print(f"Starting training with k={k}")
    knn = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", KNeighborsClassifier(n_neighbors=k))
    ])

    knn.fit(X_train, y_train)
    predictions_proba = knn.predict_proba(X_val)[:, 1]
    predictions = knn.predict(X_val)
    
    auc = roc_auc_score(y_val, predictions_proba)
    accuracy = accuracy_score(y_val, predictions)
    precision = precision_score(y_val, predictions)
    recall = recall_score(y_val, predictions)
    
    results_knn[k] = {
        "AUC": auc,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall
    }
    print(f"KNN (k={k}) - AUC: {auc:.5f}, Accuracy: {accuracy:.5f}, Precision: {precision:.5f}, Recall: {recall:.5f}")

results_knn

Starting training with k=75
KNN (k=75) - AUC: 0.89901, Accuracy: 0.88950, Precision: 0.88497, Recall: 0.99029
Starting training with k=125
KNN (k=125) - AUC: 0.89968, Accuracy: 0.88655, Precision: 0.88078, Recall: 0.99217
Starting training with k=250
KNN (k=250) - AUC: 0.89948, Accuracy: 0.87720, Precision: 0.87102, Recall: 0.99324


{75: {'AUC': 0.8990111035175692,
  'Accuracy': 0.8895,
  'Precision': 0.8849706129303107,
  'Recall': 0.9902912621359223},
 125: {'AUC': 0.8996806909165844,
  'Accuracy': 0.88655,
  'Precision': 0.8807829181494662,
  'Recall': 0.99217037269026},
 250: {'AUC': 0.8994824179751943,
  'Accuracy': 0.8772,
  'Precision': 0.8710244438341115,
  'Recall': 0.9932352020043846}}

## Support Vector Machine (SVM)

In [3]:
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

# Sample 25,000 rows for SVM training (SVM is computationally expensive)
svm_sample_size = 25000
svm_sample_indices = X_train.sample(n=min(svm_sample_size, len(X_train)), random_state=42).index
X_train_svm = X_train.loc[svm_sample_indices]
y_train_svm = y_train.loc[svm_sample_indices]

svm_models = [
    ("Linear SVM", SVC(kernel="linear", probability=True, C=1))
]

results_svm = {}

for name, model in svm_models:
    print(f"Starting with {name}")
    classifier = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", model)
    ])

    print(f"Starting fitting")
    classifier.fit(X_train_svm, y_train_svm)
    print(f"Starting predicting")
    preds_proba = classifier.predict_proba(X_val)[:, 1]
    preds = classifier.predict(X_val)
    print(f"Starting metric calculations")
    auc = roc_auc_score(y_val, preds_proba)
    accuracy = accuracy_score(y_val, preds)
    precision = precision_score(y_val, preds)
    recall = recall_score(y_val, preds)
    
    results_svm[name] = {
        "AUC": auc,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall
    }
    print(f"{name} - AUC: {auc:.5f}, Accuracy: {accuracy:.5f}, Precision: {precision:.5f}, Recall: {recall:.5f}")

results_svm

Starting with Linear SVM
Starting fitting
Starting predicting
Starting metric calculations
Linear SVM - AUC: 0.89206, Accuracy: 0.89745, Precision: 0.89886, Recall: 0.98202


{'Linear SVM': {'AUC': 0.8920575562015888,
  'Accuracy': 0.89745,
  'Precision': 0.8988648090815273,
  'Recall': 0.9820231756968368}}

## Gradient Boosting

In [17]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.pipeline import Pipeline

params = [
    {"n_estimators": 200, "learning_rate": 0.05, "max_depth": 3},
    {"n_estimators": 300, "learning_rate": 0.1,  "max_depth": 3},
    {"n_estimators": 500, "learning_rate": 0.1,  "max_depth": 4},
]

results_gb = {}

for p in params:
    model = GradientBoostingClassifier(
        n_estimators=p["n_estimators"],
        learning_rate=p["learning_rate"],
        max_depth=p["max_depth"]
    )

    classifier = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", model)
    ])

    classifier.fit(X_train, y_train)
    preds_proba = classifier.predict_proba(X_val)[:, 1]
    preds = classifier.predict(X_val)
    
    auc = roc_auc_score(y_val, preds_proba)
    accuracy = accuracy_score(y_val, preds)
    precision = precision_score(y_val, preds)
    recall = recall_score(y_val, preds)
    
    name = f"GB: est={p['n_estimators']}, lr={p['learning_rate']}, depth={p['max_depth']}"
    results_gb[name] = {
        "AUC": auc,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall
    }
    print(f"{name} - AUC: {auc:.5f}, Accuracy: {accuracy:.5f}, Precision: {precision:.5f}, Recall: {recall:.5f}")

results_gb

GB: est=200, lr=0.05, depth=3 - AUC: 0.91680, Accuracy: 0.90265, Precision: 0.90674, Recall: 0.97870
GB: est=300, lr=0.1, depth=3 - AUC: 0.91963, Accuracy: 0.90465, Precision: 0.90952, Recall: 0.97783
GB: est=500, lr=0.1, depth=4 - AUC: 0.92057, Accuracy: 0.90565, Precision: 0.91144, Recall: 0.97670


{'GB: est=200, lr=0.05, depth=3': {'AUC': 0.9167968965569432,
  'Accuracy': 0.90265,
  'Precision': 0.9067432683379758,
  'Recall': 0.978703413717507},
 'GB: est=300, lr=0.1, depth=3': {'AUC': 0.9196331814754317,
  'Accuracy': 0.90465,
  'Precision': 0.9095199254253088,
  'Recall': 0.9778264954588162},
 'GB: est=500, lr=0.1, depth=4': {'AUC': 0.9205683125765743,
  'Accuracy': 0.90565,
  'Precision': 0.9114449380406827,
  'Recall': 0.9766990291262136}}

# XGBoost

In [1]:
# Install xgboost in the current Python environment (if not already installed)
# This ensures it's installed in the same environment that Jupyter is using
# Using %pip ensures installation in the kernel's Python environment
%pip install xgboost --quiet
print("XGBoost installation check complete.")


Note: you may need to restart the kernel to use updated packages.
XGBoost installation check complete.


In [5]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

# Prepare data for XGBoost native categorical handling
# Convert categorical columns to pandas category dtype (XGBoost native format)
X_train_xgb = X_train.copy()
X_val_xgb = X_val.copy()

# Get categorical columns (same as defined earlier)
categorical_cols_xgb = X_train_xgb.select_dtypes(include=["object"]).columns.tolist()

# Convert categorical columns to category dtype for XGBoost native handling
for col in categorical_cols_xgb:
    X_train_xgb[col] = X_train_xgb[col].astype("category")
    X_val_xgb[col] = X_val_xgb[col].astype("category")

# XGBoost parameters to try
xgb_params = [
    {"n_estimators": 200, "learning_rate": 0.05, "max_depth": 3, "subsample": 0.8},
    {"n_estimators": 300, "learning_rate": 0.1, "max_depth": 4, "subsample": 0.8},
    {"n_estimators": 500, "learning_rate": 0.1, "max_depth": 5, "subsample": 0.9},
]

results_xgb = {}

for p in xgb_params:
    # enable_categorical=True tells XGBoost to use native categorical handling
    model = XGBClassifier(
        n_estimators=p["n_estimators"],
        learning_rate=p["learning_rate"],
        max_depth=p["max_depth"],
        subsample=p["subsample"],
        random_state=42,
        eval_metric="logloss",
        enable_categorical=True  # Enable native categorical feature handling
    )

    print(f"Training XGBoost: est={p['n_estimators']}, lr={p['learning_rate']}, depth={p['max_depth']}")
    model.fit(X_train_xgb, y_train)
    
    preds_proba = model.predict_proba(X_val_xgb)[:, 1]
    preds = model.predict(X_val_xgb)
    
    auc = roc_auc_score(y_val, preds_proba)
    accuracy = accuracy_score(y_val, preds)
    precision = precision_score(y_val, preds)
    recall = recall_score(y_val, preds)
    
    name = f"XGBoost: est={p['n_estimators']}, lr={p['learning_rate']}, depth={p['max_depth']}"
    results_xgb[name] = {
        "AUC": auc,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall
    }
    print(f"{name} - AUC: {auc:.5f}, Accuracy: {accuracy:.5f}, Precision: {precision:.5f}, Recall: {recall:.5f}\n")

results_xgb


Training XGBoost: est=200, lr=0.05, depth=3
XGBoost: est=200, lr=0.05, depth=3 - AUC: 0.91694, Accuracy: 0.90285, Precision: 0.90681, Recall: 0.97889

Training XGBoost: est=300, lr=0.1, depth=4
XGBoost: est=300, lr=0.1, depth=4 - AUC: 0.91876, Accuracy: 0.90210, Precision: 0.90768, Recall: 0.97670

Training XGBoost: est=500, lr=0.1, depth=5
XGBoost: est=500, lr=0.1, depth=5 - AUC: 0.91744, Accuracy: 0.90285, Precision: 0.90957, Recall: 0.97526



{'XGBoost: est=200, lr=0.05, depth=3': {'AUC': 0.9169449046493046,
  'Accuracy': 0.90285,
  'Precision': 0.9068121155854706,
  'Recall': 0.9788913247729408},
 'XGBoost: est=300, lr=0.1, depth=4': {'AUC': 0.9187580173637266,
  'Accuracy': 0.9021,
  'Precision': 0.9076779789277606,
  'Recall': 0.9766990291262136},
 'XGBoost: est=500, lr=0.1, depth=5': {'AUC': 0.9174411807737729,
  'Accuracy': 0.90285,
  'Precision': 0.9095688748685594,
  'Recall': 0.9752583777012214}}