In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, precision_score, recall_score, roc_curve
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from catboost import CatBoostClassifier
import warnings

In [22]:
df = pd.read_csv("./data/loan_approval_dataset.csv")

In [23]:
#top 5 row
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [24]:
df.shape

(4269, 13)

# Preparing dataset

In [25]:
X = df.drop(columns=[' loan_status'],axis=1)
X = X.drop(columns=['loan_id'])

In [26]:
X.shape

(4269, 11)

In [27]:
X.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000


In [28]:
status_mapping = {' Rejected': 0, ' Approved': 1}
y = df[' loan_status'].map(status_mapping)

In [29]:
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [30]:
X = preprocessor.fit_transform(X)

In [31]:
X.shape

(4269, 13)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((3415, 13), (854, 13))

### Evaluation metrics

In [43]:
def evaluate_metric(trues, predicted):
    #f1_score, accuracy_score, confusion_matrix, precision_score, recall_score, roc_curve
    f1_scores = f1_score(trues, predicted)
    accuracy_scores = accuracy_score(trues, predicted)
    precision_scores = precision_score(trues, predicted)
    recall_scores = recall_score(trues, predicted)
    return f1_scores, accuracy_scores, precision_scores, recall_scores

In [17]:
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier()
}
model_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    #train score
    f1_train_scores, accuracy_train_scores, precision_train_scores, recall_train_scores = evaluate_metric(y_train, y_train_pred)
    #Test score
    f1_test_scores, accuracy_test_scores, precision_test_scores, recall_test_scores = evaluate_metric(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- F1_train_scores: {:.4f}".format(f1_train_scores))
    print("- Accuracy_train_scores: {:.4f}".format(accuracy_train_scores))
    print("- Precision_train_scores: {:.4f}".format(precision_train_scores))
    print("- Recall_train_scores: {:.4f}".format(recall_train_scores))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- F1_test_scores: {:.4f}".format(f1_test_scores))
    print("- Accuracy_test_scores: {:.4f}".format(accuracy_test_scores))
    print("- Precision_test_scores: {:.4f}".format(precision_test_scores))
    print("- Recall_test_scores: {:.4f}".format(recall_test_scores))
    
    print('='*35)
    print('\n')

Logistic Regression
Model performance for Training set
- F1_train_scores: 0.9358
- Accuracy_train_scores: 0.9204
- Precision_train_scores: 0.9358
- Recall_train_scores: 0.9358
----------------------------------
Model performance for Test set
- F1_test_scores: 0.9248
- Accuracy_test_scores: 0.9052
- Precision_test_scores: 0.9205
- Recall_test_scores: 0.9291




  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K-Neighbors Classifier
Model performance for Training set
- F1_train_scores: 0.9543
- Accuracy_train_scores: 0.9432
- Precision_train_scores: 0.9538
- Recall_train_scores: 0.9547
----------------------------------
Model performance for Test set
- F1_test_scores: 0.9158
- Accuracy_test_scores: 0.8958
- Precision_test_scores: 0.9290
- Recall_test_scores: 0.9030


Decision Tree Classifier
Model performance for Training set
- F1_train_scores: 1.0000
- Accuracy_train_scores: 1.0000
- Precision_train_scores: 1.0000
- Recall_train_scores: 1.0000
----------------------------------
Model performance for Test set
- F1_test_scores: 0.9823
- Accuracy_test_scores: 0.9778
- Precision_test_scores: 0.9814
- Recall_test_scores: 0.9832


Random Forest Classifier
Model performance for Training set
- F1_train_scores: 1.0000
- Accuracy_train_scores: 1.0000
- Precision_train_scores: 1.0000
- Recall_train_scores: 1.0000
----------------------------------
Model performance for Test set
- F1_test_scores: 0.982

In [20]:
cbcr = CatBoostClassifier(verbose=False)
cbc_model = cbcr.fit(X_train, y_train)
y_pred = cbcr.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(f"Accuracy : {score}")

Accuracy : 0.9765807962529274


In [21]:
pred_df = pd.DataFrame({'Actual Value':y_test,'Predicted Value':y_pred})
pred_df

Unnamed: 0,Actual Value,Predicted Value
1703,0,0
1173,1,1
308,0,0
1322,1,1
3271,1,1
...,...,...
912,1,1
443,1,1
1483,1,1
668,0,0


In [31]:
param={
    'iterations': [100, 200, 300],  # Number of boosting iterations
    'learning_rate': [0.01, 0.1, 0.2],  # Step size shrinkage
    'depth': [4, 6, 8],  # Depth of the trees
    }

In [44]:
model = CatBoostClassifier(verbose=False)
gs = GridSearchCV(model, param, cv=3)
gs.fit(X, y)

model.set_params(**gs.best_params_)
model.fit(X, y)

<catboost.core.CatBoostClassifier at 0x2764996f160>

In [45]:
pred = model.predict(X_test)

In [46]:
accuracy_score(y_test, pred)*100

100.0