loan_prediction_full.py

In [None]:
# == IMPORT ==
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, roc_curve
)
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import joblib

In [None]:
# Configure plot styling: seaborn "whitegrid" theme with the "muted" palette,
# then set the default figure size to 6x4 inches.
sns.set(style="whitegrid", palette="muted")
plt.rc("figure", figsize=(6, 4))

In [None]:
# == LOAD DATA ==
print("== LOAD DATA ==")
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
csv_path = r"C:\Users\Avita\Documents\matematika\Rakamin\Data Science_IDX Partners\python\data.csv"
df = pd.read_csv(csv_path, low_memory=False)

# Report the size of the loaded frame and preview its first rows.
n_rows, n_cols = df.shape
print("Jumlah baris:", n_rows, "Jumlah kolom:", n_cols)
print("Contoh data:")
print(df.head())

In [None]:
# == DATA UNDERSTANDING ==
print("\n== DATA UNDERSTANDING ==")

# Dtypes of the first 20 columns.
print("Tipe data tiap kolom:")
column_types = df.dtypes
print(column_types.head(20))

# Descriptive statistics, one row per column, limited to the first 20 columns.
print("\nStatistik deskriptif:")
stats_by_column = df.describe(include="all").T
print(stats_by_column.head(20))

In [None]:
# == CLEANING DATA ==
print("\n== CLEANING DATA ==")

# Build the binary label from loan_status: 0 = good loan, 1 = bad loan.
good_statuses = ['Fully Paid', 'Current']
bad_statuses = [
    'Charged Off',
    'Default',
    'Late (31-120 days)',
    'Late (16-30 days)',
    'Does not meet the credit policy. Status:Charged Off',
]
status_to_target = {
    **dict.fromkeys(good_statuses, 0),
    **dict.fromkeys(bad_statuses, 1),
}

# Statuses not listed above (and missing values) come out of map() as NaN and
# are then filled with 0, i.e. they are counted as good loans.
mapped_status = df['loan_status'].map(status_to_target)
df['target'] = mapped_status.fillna(0).astype(int)

In [None]:
# Show how many rows fall into each target class after the mapping.
target_counts = df['target'].value_counts()
print("Distribusi target setelah mapping:\n", target_counts)

In [None]:
# Remove columns that carry no data at all (every value is NaN).
df = df.dropna(axis="columns", how="all")

In [None]:
# Save a small random sample of the cleaned data for manual inspection.
# Seeded like the rest of the pipeline (random_state=42) so reruns write the
# same rows, and capped at the frame length because DataFrame.sample raises
# when asked for more rows than exist (sampling without replacement).
sample_size = min(10, len(df))
df.sample(n=sample_size, random_state=42).to_csv("sample_cleaned_head.csv", index=False)

In [None]:
# == DATA UNDERSTANDING VISUALIZATION ==
# Bar chart of the number of rows per target class.
ax = sns.countplot(data=df, x="target")
ax.set_title("Distribusi Target (0=Good, 1=Bad Loan)")
plt.show()

In [None]:
# == FEATURE ENGINEERING VISUAL ==
# Annual income per target class.
income_p95 = df['annual_inc'].quantile(0.95)
ax = sns.boxplot(data=df, x="target", y="annual_inc")
ax.set_title("Annual Income vs Target")
# Clip the y-axis at the 95th percentile so extreme incomes do not dominate the view.
ax.set_ylim(0, income_p95)
plt.show()

In [None]:
# == EDA ==
# One histogram (50 bins, KDE overlay) per key numeric column, shown in the
# order: annual_inc, funded_amnt, int_rate, loan_amnt.
hist_specs = [
    ('annual_inc', "Distribusi Pendapatan Tahunan (annual_inc)"),
    ('funded_amnt', "Distribusi Funded Amount"),
    ('int_rate', "Distribusi Interest Rate (int_rate)"),
    ('loan_amnt', "Distribusi Loan Amount (loan_amnt)"),
]
for column, title in hist_specs:
    sns.histplot(df[column], bins=50, kde=True)
    plt.title(title)
    plt.show()

In [None]:
# Pairwise correlation between the numeric features and the target.
num_cols = ['loan_amnt','funded_amnt','int_rate','annual_inc','dti','target']
corr_matrix = df[num_cols].corr()
ax = sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
ax.set_title("Heatmap Korelasi Fitur Numerik")
plt.show()

In [None]:
# Trim annual_inc outliers: keep only rows strictly below the 99th percentile.
# NOTE(review): rows with a missing annual_inc also fail this comparison and
# are dropped here — confirm that is intended.
q99 = df['annual_inc'].quantile(0.99)
below_cap = df['annual_inc'].lt(q99)
df = df.loc[below_cap]

In [None]:
# == DATA PREPARATION ==
print("\n== DATA PREPARATION ==")
features = ['loan_amnt', 'int_rate', 'annual_inc', 'dti']

# Label vector and feature matrix; missing feature values are replaced with 0.
y = df['target']
X = df.loc[:, features].fillna(0)

In [None]:
# Split BEFORE scaling so the held-out rows never influence the scaler's
# mean/variance (fitting on the full data leaks test-set statistics).
# Same test_size / random_state / stratify as before, so the same rows land
# in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the scaling parameters from the training split only, then apply the
# same transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for any later code that still expects the fully scaled matrix; it now
# uses the train-fitted parameters.
X_scaled = scaler.transform(X)

In [None]:
# Class distribution of the training labels before SMOTE.
ax = sns.countplot(x=y_train)
ax.set_title("Distribusi Target Sebelum SMOTE")
plt.show()

In [None]:
# Handle class imbalance by resampling the training split with SMOTE;
# the test split is left as it is.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled

In [None]:
# Class distribution of the training labels after SMOTE.
ax = sns.countplot(x=y_train_res)
ax.set_title("Distribusi Target Sesudah SMOTE")
plt.show()

In [None]:
# == MODELING ==
# Section banner printed before the model-training cells.
section_header = "\n== MODELING =="
print(section_header)

In [None]:
# Logistic Regression: fitted on the SMOTE-resampled training data and
# evaluated on the test split.
print("\n-- Logistic Regression --")
logreg = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)
logreg.fit(X_train_res, y_train_res)

# Hard predictions drive the per-class precision/recall/F1 report.
y_pred_log = logreg.predict(X_test)
log_report = classification_report(y_test, y_pred_log)
print(log_report)

# ROC AUC is computed from the second probability column (class 1).
proba_log = logreg.predict_proba(X_test)[:, 1]
roc_log = roc_auc_score(y_test, proba_log)
print("ROC AUC Logistic Regression:", roc_log)

In [None]:
# Confusion matrix for Logistic Regression (rows = actual, columns = predicted).
cm_log = confusion_matrix(y_test, y_pred_log)
ax = sns.heatmap(cm_log, annot=True, fmt="d", cmap="Blues")
ax.set_title("Confusion Matrix - Logistic Regression")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Random Forest: fitted on the SMOTE-resampled training data and evaluated on
# the test split.
print("\n-- Random Forest --")
rf_params = {
    "n_estimators": 200,
    "max_depth": 15,
    "class_weight": "balanced",
    "random_state": 42,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train_res, y_train_res)

# Hard predictions drive the per-class precision/recall/F1 report.
y_pred_rf = rf.predict(X_test)
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)

# ROC AUC is computed from the second probability column (class 1).
proba_rf = rf.predict_proba(X_test)[:, 1]
roc_rf = roc_auc_score(y_test, proba_rf)
print("ROC AUC Random Forest:", roc_rf)

In [None]:
# Confusion matrix for Random Forest (rows = actual, columns = predicted).
cm_rf = confusion_matrix(y_test, y_pred_rf)
ax = sns.heatmap(cm_rf, annot=True, fmt="d", cmap="Greens")
ax.set_title("Confusion Matrix - Random Forest")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# ROC curve points (false/true positive rates) for both models, computed from
# each model's class-1 probabilities on the test split; thresholds are discarded.
scores_log = logreg.predict_proba(X_test)[:, 1]
scores_rf = rf.predict_proba(X_test)[:, 1]
fpr_log, tpr_log, _ = roc_curve(y_test, scores_log)
fpr_rf, tpr_rf, _ = roc_curve(y_test, scores_rf)

In [None]:
# Draw both ROC curves on the current axes, with each model's AUC in the legend.
ax = plt.gca()
ax.plot(fpr_log, tpr_log, label=f"Logistic Regression (AUC={roc_log:.3f})")
ax.plot(fpr_rf, tpr_rf, label=f"Random Forest (AUC={roc_rf:.3f})")
ax.plot([0, 1], [0, 1], "k--")  # dashed diagonal as the chance-level reference
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
plt.show()

In [None]:
# == MODEL EVALUATION ==
print("\n== RINGKASAN HASIL EVALUASI ==")
# Keep the model with the higher test ROC AUC; Logistic Regression is chosen
# only when strictly better, so Random Forest wins ties.
best_model, best_name, best_score = (
    (logreg, "Logistic Regression", roc_log)
    if roc_log > roc_rf
    else (rf, "Random Forest", roc_rf)
)

In [None]:
# Report the winning model and its ROC AUC rounded to three decimals.
best_summary = f"Best model: {best_name} dengan ROC AUC {best_score:.3f}"
print(best_summary)

In [None]:
# Persist the best model.
joblib.dump(best_model, "best_model.joblib")
print("Model terbaik disimpan ke best_model.joblib")

# The model was trained on StandardScaler-transformed features, so the fitted
# scaler is saved as well; without it, new raw inputs cannot be put on the
# scale the model expects at inference time.
joblib.dump(scaler, "scaler.joblib")
print("Scaler disimpan ke scaler.joblib")