In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Load the loan dataset from a CSV file located in the working directory.
df=pd.read_csv("loan_data.csv")


In [None]:
# Preview the first rows to sanity-check how the file was parsed.
df.head()

In [None]:
# Column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# (rows, columns) of the raw data.
df.shape

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Number of rows that repeat an earlier row.
df.duplicated().sum()

In [None]:
# Pairwise scatter plots / histograms across the frame's columns.
sns.pairplot(df)

In [None]:
# Class balance of the target column.
df["loan_status"].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode three categorical columns in place. One encoder instance is
# refitted for every column, so after this cell it only remembers the classes
# of the last column processed.
label_encoder = LabelEncoder()
for encoded_col in ('person_education', 'person_home_ownership', 'loan_intent'):
    df[encoded_col] = label_encoder.fit_transform(df[encoded_col])

In [None]:
# Dummy-encode each column, drop the first level and write the remaining
# indicator back over the original column.
# NOTE(review): assumes each column has exactly two levels, so that a single
# indicator column is left to assign — confirm against the data.
for binary_col in ("previous_loan_defaults_on_file", "person_gender"):
    df[binary_col] = pd.get_dummies(df[binary_col], drop_first=True)

In [None]:
# Cast the listed columns (including the target) to float, one at a time.
float_cols = [
    "person_gender",
    "person_education",
    "person_emp_exp",
    "person_home_ownership",
    "loan_intent",
    "credit_score",
    "previous_loan_defaults_on_file",
    "loan_status",
]
for float_col in float_cols:
    df[float_col] = df[float_col].astype(float)

In [None]:
# Re-check the dtypes after the encoding and float casts above.
df.info()

In [None]:
# Box plot of the frame's columns; tick labels are rotated so the column
# names do not overlap.
df.plot.box()
plt.xticks(rotation=90)
plt.show()

In [None]:
def outlier(df, fts, threshold=2):
    """Drop rows whose absolute z-score reaches ``threshold`` on any feature.

    Features are processed in the order given, and the mean and standard
    deviation of each feature are computed on the rows that survived the
    previous filters, so the result depends on the order of ``fts``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied and never modified.
    fts : iterable of str
        Names of the columns to filter on.
    threshold : float, default 2
        Rows with ``|z| >= threshold`` on a feature are removed. Rows whose
        value for a filtered feature is missing are removed as well, because
        their z-score does not compare as smaller than the threshold.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of ``df``; the original index labels are kept.
    """
    df_no_outliers = df.copy()
    for ft in fts:
        mean = np.mean(df_no_outliers[ft])
        std = np.std(df_no_outliers[ft])
        # A constant (or empty) column has no spread: every z-score would be
        # NaN and the comparison below would silently drop all rows. There
        # are no outliers to remove in that case, so skip the feature.
        if not std > 0:
            continue
        z_score = (df_no_outliers[ft] - mean) / std
        df_no_outliers = df_no_outliers[np.abs(z_score) < threshold]
    return df_no_outliers

In [None]:
# Screen only the columns that were numeric to begin with. The columns that
# were label- or dummy-encoded earlier (person_gender, person_education,
# person_home_ownership, loan_intent, previous_loan_defaults_on_file) are left
# out: their numeric codes are arbitrary, so a z-score on them is meaningless
# and could delete every row of a rare category.
fts=['person_age', 'person_income', 'person_emp_exp', 'loan_amnt',
     'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
     'credit_score']
df=outlier(df,fts)


In [None]:
# (rows, columns) left after the outlier filter.
df.shape

In [None]:
# Distribution of person_income with a kernel density estimate overlaid.
sns.histplot(data=df, x='person_income', kde=True)

In [None]:
# Separate the target column from the predictors.
y = df["loan_status"]
X = df.drop(columns=["loan_status"])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions of the train and test splits equal to those of the full
# data, so the hold-out metrics are not skewed by a lucky or unlucky draw of
# the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=24, stratify=y
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the scaled training split with SMOTE. Only the training data is
# resampled; the test split is passed to the models as-is later on.
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train_scaled, y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

In [None]:
# Baseline random forest trained on the resampled training data.
# NOTE(review): class_weight='balanced' presumably has little effect if the
# resampling above already equalised the classes — confirm.
model=RandomForestClassifier(class_weight='balanced',random_state=24)
model.fit(X_resampled,y_resampled)

In [None]:
# Baseline model's predictions on the scaled hold-out split.
y_pred=model.predict(X_test_scaled)

In [None]:
# Hold-out accuracy of the predictions currently stored in y_pred.
print("Accuracy:",accuracy_score(y_test,y_pred))

In [None]:
# Hyper-parameter candidates sampled by the randomized search below.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Randomized search: 10 sampled settings, 3-fold CV, ranked by ROC AUC.
# NOTE(review): the folds are cut from data that was already oversampled, so
# the CV scores are likely optimistic — consider resampling inside each fold.
grid = RandomizedSearchCV(RandomForestClassifier(random_state=42),
                          param_distributions=param_grid,
                          n_iter=10,
                          scoring='roc_auc',
                          cv=3,
                          random_state=42)
grid.fit(X_resampled, y_resampled)
print("Best Params:", grid.best_params_)

# With the default refit=True the search has already refitted the winning
# configuration on the full (X_resampled, y_resampled) set, so fitting it a
# second time would only repeat the same training.
best_model = grid.best_estimator_

In [None]:
# Overwrite y_pred with the tuned model's predictions on the hold-out split.
y_pred=best_model.predict(X_test_scaled)

In [None]:
# Score the tuned model — the same estimator that produced y_pred — so the ROC
# curve and AUC describe the model whose classification report and confusion
# matrix are shown, not the untuned baseline.
y_probs = best_model.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_probs)

print(roc_auc_score(y_test,y_probs))

In [None]:
print("C_report:",classification_report(y_test,y_pred))
print("Accuracy:",accuracy_score(y_test,y_pred))
# `auc` is shown in the ROC plot legend as "AUC", so it must hold the area
# under the ROC curve computed from the predicted probabilities, not the
# accuracy of the hard predictions.
auc=roc_auc_score(y_test,y_probs)

In [None]:
# Draw the ROC curve against the chance diagonal on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {auc:.2f})")
ax.plot([0, 1], [0, 1], linestyle='--', color='red', label="Random Guess (AUC = 0.5)")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve with AUC Score")
ax.legend(loc="lower right")
ax.grid(True)
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the hold-out labels against whatever y_pred currently
# holds, rendered with a blue colour map.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')

In [None]:
# Report importances of the tuned model, i.e. the estimator whose predictions
# are evaluated in the report and confusion matrix, rather than the baseline.
importances = best_model.feature_importances_
# The scaled arrays carry no column names; X.columns supplies them in the
# same column order the model was trained on.
features = X.columns
forest_importances = pd.Series(importances, index=features)

forest_importances.sort_values(ascending=False).plot(kind='bar', figsize=(10,5))
plt.title("Feature Importances")
plt.show()

In [None]:
# Overfitting check: compare accuracy on the (resampled) training data with
# accuracy on the hold-out split. The tuned model is used because it is the
# estimator the rest of the evaluation reports on.
y_train_pred = best_model.predict(X_resampled)
y_test_pred = best_model.predict(X_test_scaled)
print("Accuracy of train model:",accuracy_score(y_resampled,y_train_pred))
print("Accuracy of test model:",accuracy_score(y_test,y_test_pred))
