import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

read the data

In [None]:
# Load the loan-approval records; the path is resolved against the current
# working directory.
dataset = pd.read_csv('loan_approval_dataset.csv')

# Show how many missing values each column contains.
print(dataset.isna().sum())

# Positional split: the last column becomes the target vector `y`,
# every column before it goes into the feature matrix `x`.
target_position = dataset.shape[1] - 1
x = dataset.iloc[:, :target_position].values
y = dataset.iloc[:, target_position].values
x, y

loan_id                      0
 no_of_dependents            0
 education                   0
 self_employed               0
 income_annum                0
 loan_amount                 0
 loan_term                   0
 cibil_score                 0
 residential_assets_value    0
 commercial_assets_value     0
 luxury_assets_value         0
 bank_asset_value            0
 loan_status                 0
dtype: int64


(array([[1, 2, ' Graduate', ..., 17600000, 22700000, 8000000],
        [2, 0, ' Not Graduate', ..., 2200000, 8800000, 3300000],
        [3, 3, ' Graduate', ..., 4500000, 33300000, 12800000],
        ...,
        [4267, 2, ' Not Graduate', ..., 12400000, 18100000, 7300000],
        [4268, 1, ' Not Graduate', ..., 700000, 14100000, 5800000],
        [4269, 1, ' Graduate', ..., 11800000, 35700000, 12000000]],
       dtype=object),
 array([' Approved', ' Rejected', ' Rejected', ..., ' Rejected',
        ' Approved', ' Approved'], dtype=object))

encode the independent variables (one-hot encode the categorical columns)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Build the feature matrix straight from `dataset` instead of slicing `x` in
# place: `x = x[:, 1:]` would strip one more column every time this cell is
# re-run, silently shifting which columns get one-hot encoded below.
# Column 0 (loan_id) is only a row identifier and the last column is the target.
x = dataset.iloc[:, 1:-1].values

# Positions, inside the matrix built above, of the two text columns
# (education and self_employed) that are one-hot encoded.
CATEGORICAL_COLUMNS = [1, 2]

ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), CATEGORICAL_COLUMNS)],
    remainder='passthrough',  # every other column is carried through unchanged
)
x = ct.fit_transform(x)
x

array([[1.0, 0.0, 1.0, ..., 17600000, 22700000, 8000000],
       [0.0, 1.0, 0.0, ..., 2200000, 8800000, 3300000],
       [1.0, 0.0, 1.0, ..., 4500000, 33300000, 12800000],
       ...,
       [0.0, 1.0, 1.0, ..., 12400000, 18100000, 7300000],
       [0.0, 1.0, 1.0, ..., 700000, 14100000, 5800000],
       [1.0, 0.0, 1.0, ..., 11800000, 35700000, 12000000]], dtype=object)

encode dependent variable

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the text labels in `y` with integer codes. In the outputs recorded in
# this notebook ' Approved' is encoded as 0 and ' Rejected' as 1.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)
y

array([0, 1, 1, ..., 1, 0, 0])

split the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# partition the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

standardise the data

In [None]:
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training rows only and then applied to both
# partitions, so no statistics from the test rows enter the fit.
sc = StandardScaler().fit(x_train)
x_train, x_test = sc.transform(x_train), sc.transform(x_test)

create 3 classification models and the hyperparameter grid to search for each

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

# Fixed seed for the tree ensembles. Without it the random forest is trained
# with fresh randomness on every run, so the "best" grid-search configuration
# can differ from one execution of the notebook to the next.
SEED = 42

# Base estimators; the grid search clones them and overrides the parameters
# listed in the matching entry of `hyperparameters`.
models = [
    RandomForestClassifier(random_state=SEED),
    XGBClassifier(random_state=SEED),
    SVC(kernel='linear'),  # `kernel` is replaced by the grid values during the search
]

# hyperparameters[i] is the parameter grid searched for models[i].
hyperparameters = [
    # RandomForestClassifier
    {
        'n_estimators': [50, 100, 200, 400],
        'criterion': ['gini', 'entropy']
    },
    # XGBClassifier
    {
        'n_estimators': [50, 100, 200, 400],
        'max_depth': [3, 5, 7, 9]
    },
    # SVC
    {
        'kernel': ['linear', 'poly', 'rbf'],
        'C': [1, 5, 10, 20],
    }
]


tune the models: run a 5-fold cross-validated grid search over each model's hyperparameter grid

In [None]:
from sklearn.model_selection import GridSearchCV

# Fitted searches, in the same order as `models`. Keeping them lets the winning
# configuration be read back from code (`best_params_`, `best_estimator_`)
# instead of being copied by hand from the printed output.
grid_searches = []

for i, model in enumerate(models):
    # 5-fold cross-validation over hyperparameters[i], ranked by accuracy.
    grid_search = GridSearchCV(model, hyperparameters[i], cv=5, scoring='accuracy')
    grid_search.fit(x_train, y_train)
    grid_searches.append(grid_search)

    # Print the class name only: the full estimator repr (XGBClassifier in
    # particular) spans many lines and buries the actual results.
    print('details for model', i, 'which is', type(model).__name__)
    print('best params for the model are', grid_search.best_params_)
    print('best score for the model is', grid_search.best_score_)
    print('-------------------------------------------------------')

details for model 0 which is RandomForestClassifier()
best params for the model are {'criterion': 'entropy', 'n_estimators': 100}
best score for the model is 0.9792093704245974
-------------------------------------------------------
details for model 1 which is XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, rand

create the models with the best parameters and train them

In [None]:
# NOTE(review): these settings are typed in by hand. The grid-search output
# recorded in this notebook reports {'criterion': 'entropy', 'n_estimators': 100}
# for the random forest, which is not what is used below -- confirm which run
# these values were taken from.
#
# The tree ensembles are seeded so that the fitted models, and therefore the
# evaluation metrics, are the same on every run.
models = [
    RandomForestClassifier(criterion='gini', n_estimators=50, random_state=42),
    XGBClassifier(max_depth=5, n_estimators=100, random_state=42),
    SVC(kernel='rbf', C=20),
]

# Fit every model on the (already standardised) training partition.
for model in models:
    model.fit(x_train, y_train)

evaluate the models

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Score every fitted model on the held-out test partition.
# precision_score / recall_score are called with their defaults, i.e. they
# score the class encoded as 1.
# NOTE(review): per the label encoding recorded earlier in this notebook that
# is ' Rejected' -- confirm this is the intended positive class.
for i, model in enumerate(models):
    y_pred = model.predict(x_test)
    acc, precision, recall = (
        scorer(y_test, y_pred)
        for scorer in (accuracy_score, precision_score, recall_score)
    )

    print('details for model', i, 'which is', model)
    for message, value in (
        ('accuracy score for the model =', acc),
        ('precision score for the model =', precision),
        ('recall score for the model =', recall),
    ):
        print(message, value)
    print('-------------------------------------------------------')


details for model 0 which is RandomForestClassifier(n_estimators=50)
accuracy score for the model = 0.9824355971896955
precision score for the model = 0.9843260188087775
recall score for the model = 0.9691358024691358
-------------------------------------------------------
details for model 1 which is XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
     