In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from catboost import CatBoostClassifier
import pickle
import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings
# from pandas / scikit-learn / CatBoost are hidden as well — confirm that is intended.
warnings.filterwarnings("ignore")

In [None]:
# Read the training split from its CSV export into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this directory layout exists.
train_csv_path = "C:/Users/ML Projects/predicting_loan_payback/datasets/train.csv"
df = pd.read_csv(train_csv_path)

In [None]:
# Show the first two rows as a quick check of the parsed columns.
df.head(n=2)

In [None]:
# (row count, column count) of the training frame as loaded.
df.shape

In [None]:
# Number of missing values in each column (`isna` performs the same check as `isnull`).
df.isna().sum()

In [None]:
# Remove the `id` column from the training frame, then preview the result.
# Re-running this cell on its own raises KeyError because `id` is already gone.
df = df.drop(columns=["id"])
df.head(n=2)

In [None]:
# Engineered feature: credit score divided by the debt-to-income ratio.
# The small constant keeps the denominator non-zero when the ratio is exactly 0.
dti_denominator = df["debt_to_income_ratio"] + 1e-6
df["credit_dti_interaction"] = df["credit_score"] / dti_denominator
df.head(n=2)

In [None]:
# Columns removed from the training frame before the feature matrix is built.
unused_columns = [
    'marital_status',
    'gender',
    'loan_purpose',
    'education_level',
    'interest_rate',
    'grade_subgrade',
    'annual_income',
    'loan_amount',
]
df = df.drop(columns=unused_columns)
df.head(n=2)

In [None]:
# Separate the label column from the remaining feature columns.
target_column = "loan_paid_back"
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows as a test set. Stratifying on `y` keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

In [None]:
# Find the object/category-typed columns and their positional indices;
# the indices are what gets passed to CatBoost as `cat_features`.
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns
cat_idx = list(map(X_train.columns.get_loc, cat_cols))

print("Categorical columns:", list(cat_cols))

In [None]:
# Hyperparameters for the CatBoost classifier, collected in one place.
catboost_params = {
    # Objective being optimised and the metric tracked on the eval set.
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    # Boosting budget and step size.
    "iterations": 5000,
    "learning_rate": 0.015,
    # Tree depth and L2 regularisation.
    "depth": 8,
    "l2_leaf_reg": 5,
    "random_state": 42,
    # Row subsampling: Bernoulli bootstrap with a 0.7 sample rate.
    "bootstrap_type": "Bernoulli",
    "subsample": 0.7,
    # Class weighting and initial approximation.
    "auto_class_weights": "Balanced",
    "boost_from_average": True,
    # Early-stopping patience (rounds) and logging period (iterations).
    "early_stopping_rounds": 200,
    "verbose": 200,
}

model = CatBoostClassifier(**catboost_params)

In [None]:
# Carve a validation fold out of the training data and use it for early stopping
# and best-iteration selection. The hold-out test set must not be passed as
# `eval_set`: with `use_best_model=True` the kept iteration would be chosen on the
# very rows the metrics are later reported on, which inflates those metrics.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)

# `cat_idx` stays valid because the column layout of X_fit matches X_train.
model.fit(
    X_fit, y_fit,
    cat_features=cat_idx,
    eval_set=(X_val, y_val),
    use_best_model=True
)

In [None]:
# Score the hold-out split: column 1 of `predict_proba` is used as the score
# for ROC AUC, and the hard labels feed the confusion matrix and report.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_prob)

print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)
print("ROC AUC:", test_auc)

In [None]:
# Feature importances of the fitted model, requested in the prettified (tabular) form.
feature_importance = model.get_feature_importance(prettified=True)
feature_importance

In [None]:
# Read the scoring split from its CSV export into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this directory layout exists.
test_csv_path = "C:/Users/ML Projects/predicting_loan_payback/datasets/test.csv"
test_df = pd.read_csv(test_csv_path)

In [None]:
# Show the first two rows of the scoring frame.
test_df.head(n=2)

In [None]:
# (row count, column count) of the scoring frame as loaded.
test_df.shape

In [None]:
# Keep the row identifiers aside before `id` is dropped; the submission file needs them.
test_ids = test_df.loc[:, "id"]

In [None]:
# Same engineered feature as on the training frame: credit score divided by the
# debt-to-income ratio, with a small constant keeping the denominator non-zero.
test_dti_denominator = test_df["debt_to_income_ratio"] + 1e-6
test_df["credit_dti_interaction"] = test_df["credit_score"] / test_dti_denominator
test_df.head(n=2)

In [None]:
# Remove the same columns from the scoring frame that were removed from the training frame.
test_unused_columns = [
    'marital_status',
    'gender',
    'loan_purpose',
    'education_level',
    'interest_rate',
    'grade_subgrade',
    'annual_income',
    'loan_amount',
]
test_df = test_df.drop(columns=test_unused_columns)
test_df.head(n=2)

In [None]:
# Remove the `id` column from the scoring frame, then preview the result.
test_df = test_df.drop(columns=["id"])
test_df.head(n=2)

In [None]:
# Select the scoring columns by name, in the order the model was fitted with.
# This raises KeyError if a fitted feature is missing and drops any extra
# column, instead of relying on the train and test CSVs sharing a column layout.
test_features = test_df[model.feature_names_]

# Column 1 of `predict_proba` is the predicted probability of the positive class.
test_pred = model.predict_proba(test_features)[:, 1]

In [None]:
# Pair each saved row id with its predicted probability and write the result to disk.
submission_columns = {"id": test_ids, "loan_paid_back": test_pred}
submission = pd.DataFrame(submission_columns)

submission.to_csv("submission.csv", index=False)
print("submission.csv created successfully!")

In [None]:
# Serialise the fitted model with pickle and write the bytes to a local file.
# Pickle files should only ever be loaded from a trusted source.
model_bytes = pickle.dumps(model)
with open("catboost_credit_model.pkl", mode="wb") as model_file:
    model_file.write(model_bytes)

print("Model saved successfully!")