#### **About Dataset**


The loan approval dataset is a collection of financial records and associated information used to determine the eligibility of individuals or organizations for obtaining loans from a lending institution. It includes various factors such as cibil score, income, employment status, loan term, loan amount, assets value, and loan status. This dataset is commonly used in machine learning and data analysis to develop models and algorithms that predict the likelihood of loan approval based on the given features.

## $Importing  Some  Important  Libraries$

In [305]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.api.types import is_numeric_dtype
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import AdaBoostClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score
import pandas as pd
import pickle

## $Importing Data$

In [306]:
df = pd.read_csv(r"C:\Users\Lenovo\Downloads\loan_approval_dataset.csv")
df.head() 

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


## $DataPre-Processing$

#### **1)** Dropping irrelevant Columns.

In [307]:
df.drop('loan_id', axis=1, inplace=True)
df.head() 

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


#### **2.** Handling null values.

In [308]:
df.isna().sum().sum() 

0

* No null values.

#### **3.** Checking Duplicate values.

In [309]:
df.duplicated().sum()

0

* No duplicate values.

#### **4.** Checking Datatypes.

In [310]:
df.dtypes

 no_of_dependents             int64
 education                   object
 self_employed               object
 income_annum                 int64
 loan_amount                  int64
 loan_term                    int64
 cibil_score                  int64
 residential_assets_value     int64
 commercial_assets_value      int64
 luxury_assets_value          int64
 bank_asset_value             int64
 loan_status                 object
dtype: object

* education, self_employed, loan_status Columns have object datatype values. 

In [311]:
df[' education'] = df[' education'].replace({' Graduate' : 1 , ' Not Graduate' : 0})
df[' self_employed'] = df[' self_employed'].replace({' No': 0 , ' Yes':1})
df[' loan_status'] = df[' loan_status'].replace({' Approved' : 1, ' Rejected' : 0})

In [312]:
df.dtypes

 no_of_dependents            int64
 education                   int64
 self_employed               int64
 income_annum                int64
 loan_amount                 int64
 loan_term                   int64
 cibil_score                 int64
 residential_assets_value    int64
 commercial_assets_value     int64
 luxury_assets_value         int64
 bank_asset_value            int64
 loan_status                 int64
dtype: object

#### **5.** Checking Target Column.

In [313]:
df[' loan_status'].value_counts() 

 loan_status
1    2656
0    1613
Name: count, dtype: int64

* Thier is Imbalance in Our Target Column 'loan_status'.

#### **6.** Statistical Analysis of Data.

In [314]:
df.describe() 

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
count,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0
mean,2.498712,0.502225,0.503631,5059124.0,15133450.0,10.900445,599.936051,7472617.0,4973155.0,15126310.0,4976692.0,0.62216
std,1.69591,0.500054,0.500045,2806840.0,9043363.0,5.709187,172.430401,6503637.0,4388966.0,9103754.0,3250185.0,0.484904
min,0.0,0.0,0.0,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0,0.0
25%,1.0,0.0,0.0,2700000.0,7700000.0,6.0,453.0,2200000.0,1300000.0,7500000.0,2300000.0,0.0
50%,3.0,1.0,1.0,5100000.0,14500000.0,10.0,600.0,5600000.0,3700000.0,14600000.0,4600000.0,1.0
75%,4.0,1.0,1.0,7500000.0,21500000.0,16.0,748.0,11300000.0,7600000.0,21700000.0,7100000.0,1.0
max,5.0,1.0,1.0,9900000.0,39500000.0,20.0,900.0,29100000.0,19400000.0,39200000.0,14700000.0,1.0


-  Data contains features with vastly different scales.

#### **7.** Checking Highly Correlated Columns.

In [315]:
corr_matrix = df.corr()
threshold = 0.95

high_corr_pairs = []
for i in range(len(corr_matrix.columns)):
    for j in range(i + 1, len(corr_matrix.columns)):
        if abs(corr_matrix.iloc[i, j]) > threshold:
            high_corr_pairs.append((corr_matrix.columns[i], corr_matrix.columns[j], corr_matrix.iloc[i, j]))

high_corr_df = pd.DataFrame(high_corr_pairs, columns=['Feature1', 'Feature2', 'Correlation'])

high_corr_df

Unnamed: 0,Feature1,Feature2,Correlation


* No Highly  Correlated Columns are available.

## *Splitting the Data into dependent and independent Features.*

In [316]:
df.shape  

(4269, 12)

In [317]:
x = df.drop(' loan_status', axis=1)
y = df[' loan_status']

print(x.shape)
print(y.shape) 

(4269, 11)
(4269,)


## *Scaling the Dataset*

In [274]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
x_scaled_df = pd.DataFrame(x_scaled, columns=x.columns)

class_distribution = y.value_counts(normalize=True)
correlation_with_target = x_scaled_df.corrwith(y)
class_distribution, correlation_with_target.sort_values(ascending=False).head(10)

( loan_status
 1    0.62216
 0    0.37784
 Name: proportion, dtype: float64,
  cibil_score                 0.770518
  loan_amount                 0.016150
  commercial_assets_value     0.008246
  education                   0.004918
  self_employed               0.000345
  bank_asset_value           -0.006778
  residential_assets_value   -0.014367
  income_annum               -0.015189
  luxury_assets_value        -0.015465
  no_of_dependents           -0.018114
 dtype: float64)

In [275]:
x_scaled_df.describe() 

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
count,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0
mean,4.9932730000000006e-17,-4.0778400000000005e-17,-6.324812000000001e-17,1.681069e-16,7.323467e-17,-9.736882000000001e-17,2.62979e-16,3.828176e-17,-1.664424e-18,-5.908706e-17,4.9932730000000006e-17
std,1.000117,1.000117,1.000117,1.000117,1.000117,1.000117,1.000117,1.000117,1.000117,1.000117,1.000117
min,-1.473548,-1.004461,-1.007288,-1.731375,-1.64045,-1.559151,-1.739665,-1.164503,-1.133237,-1.628783,-1.531382
25%,-0.8838247,-1.004461,-1.007288,-0.8405896,-0.8220748,-0.8584443,-0.8522468,-0.8108132,-0.8370048,-0.8378082,-0.8236472
50%,0.2956212,0.9955592,0.9927645,0.01456473,-0.0700541,-0.1577371,0.0003709143,-0.2879675,-0.2901149,-0.05781874,-0.1159123
75%,0.8853442,0.9955592,0.9927645,0.869719,0.7040849,0.8933237,0.8587888,0.5885679,0.5985812,0.7221707,0.6533647
max,1.475067,0.9955592,0.9927645,1.724873,2.694728,1.594031,1.740407,3.325819,3.287456,2.64468,2.991967


In [276]:
x_scaled_df.head() 

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,-0.294102,0.995559,-1.007288,1.617979,1.633052,0.192617,1.032792,-0.780058,2.877289,0.832028,0.930304
1,-1.473548,-1.004461,0.992765,-0.34175,-0.324414,-0.508091,-1.061051,-0.733924,-0.631921,-0.694993,-0.515936
2,0.295621,0.995559,-1.007288,1.439822,1.610933,1.594031,-0.54484,-0.0573,-0.107818,1.99652,2.407316
3,0.295621,0.995559,-1.007288,1.119139,1.721525,-0.508091,-0.771045,1.649637,-0.381263,0.897943,0.899533
4,1.475067,-1.004461,0.992765,1.689242,1.002681,1.594031,-1.264055,0.757724,0.735304,1.568075,0.007172


In [277]:
y.head() 

0    1
1    0
2    0
3    0
4    0
Name:  loan_status, dtype: int64

In [278]:
y.value_counts() 

 loan_status
1    2656
0    1613
Name: count, dtype: int64

## *Splitting the Data into Train and Test Data.*

In [318]:
x_train ,x_test,y_train ,y_test = train_test_split(x_scaled_df, y, test_size=0.3, random_state=42)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(2988, 11)
(1281, 11)
(2988,)
(1281,)


## *Performing Over Sampler to balance the imbalanced Data.*

In [319]:
ros = RandomOverSampler(random_state=42)
x_train_resampled, y_train_resampled = ros.fit_resample(x_train,y_train)

In [320]:
print(x_train.shape, y_train.shape)

(2988, 11) (2988,)


In [321]:
print(x_train_resampled.shape, y_train_resampled.shape) 

(3692, 11) (3692,)


In [322]:
y_train.value_counts() 

 loan_status
1    1846
0    1142
Name: count, dtype: int64

In [323]:
y_train_resampled.value_counts() 

 loan_status
0    1846
1    1846
Name: count, dtype: int64

## *Building and Training Different Classification Models*

In [324]:
# Logistic Regression
log_reg = LogisticRegression(random_state=42)
log_reg.fit(x_train_resampled, y_train_resampled)

# Decision Tree Classifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train_resampled, y_train_resampled)

# Random Forest
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(x_train_resampled, y_train_resampled)

# AdaBoost Clasiifier
adab = AdaBoostClassifier(n_estimators=100)
adab.fit(x_train_resampled,y_train_resampled)

# kNN Classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train_resampled,y_train_resampled)

# Gradient Boosting
gb_clf = GradientBoostingClassifier(random_state=42)
gb_clf.fit(x_train_resampled, y_train_resampled)

# Bagging Classifier (using Random Forest as base)
bagging_clf = BaggingClassifier(estimator=RandomForestClassifier(), random_state=42)
bagging_clf.fit(x_train_resampled, y_train_resampled)

print("Model Building Completed")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Building Completed


## *Generating Predictions for all Classification Models.*

In [325]:
ypred_logreg = log_reg.predict(x_test)
ypred_dt = dt.predict(x_test)
ypred_rf_clf = rf_clf.predict(x_test)
ypred_adab = adab.predict(x_test)
ypred_knn = knn.predict(x_test)
ypred_gb_clf = gb_clf.predict(x_test)
ypred_bg_rf = bagging_clf.predict(x_test) 

## *Evaluation of the Models*

In [326]:
def evaluate_model(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return accuracy, precision, recall, f1

results = {
    "Model": ["Logistic Regression", "Decision Tree Classifier", "Random Forest", "AdaBoost Classifier", "kNN Classifier", "Gradient Boosting", "Bagging Classifier (using Random Forest Classifier)"],
    "Accuracy": [evaluate_model(y_test, ypred_logreg)[0],
                 evaluate_model(y_test, ypred_dt)[0],
                 evaluate_model(y_test, ypred_rf_clf)[0],
                 evaluate_model(y_test, ypred_adab)[0],
                 evaluate_model(y_test, ypred_knn)[0],
                 evaluate_model(y_test, ypred_gb_clf)[0],
                 evaluate_model(y_test, ypred_bg_rf)[0]],
    "Precision": [evaluate_model(y_test, ypred_logreg)[1],
                 evaluate_model(y_test, ypred_dt)[1],
                 evaluate_model(y_test, ypred_rf_clf)[1],
                 evaluate_model(y_test, ypred_adab)[1],
                 evaluate_model(y_test, ypred_knn)[1],
                 evaluate_model(y_test, ypred_gb_clf)[1],
                 evaluate_model(y_test, ypred_bg_rf)[1]],
    "Recall": [evaluate_model(y_test, ypred_logreg)[2],
                 evaluate_model(y_test, ypred_dt)[2],
                 evaluate_model(y_test, ypred_rf_clf)[2],
                 evaluate_model(y_test, ypred_adab)[2],
                 evaluate_model(y_test, ypred_knn)[2],
                 evaluate_model(y_test, ypred_gb_clf)[2],
                 evaluate_model(y_test, ypred_bg_rf)[2]],
    "F1 Score": [evaluate_model(y_test, ypred_logreg)[3],
                 evaluate_model(y_test, ypred_dt)[3],
                 evaluate_model(y_test, ypred_rf_clf)[3],
                 evaluate_model(y_test, ypred_adab)[3],
                 evaluate_model(y_test, ypred_knn)[3],
                 evaluate_model(y_test, ypred_gb_clf)[3],
                 evaluate_model(y_test, ypred_bg_rf)[3]]
}

results_df = pd.DataFrame(results)

results_df 

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.69477,0.730473,0.819753,0.772542
1,Decision Tree Classifier,0.977361,0.982695,0.981481,0.982088
2,Random Forest,0.979703,0.983951,0.983951,0.983951
3,AdaBoost Classifier,0.969555,0.975339,0.976543,0.975941
4,kNN Classifier,0.532397,0.654466,0.551852,0.598794
5,Gradient Boosting,0.9758,0.981459,0.980247,0.980852
6,Bagging Classifier (using Random Forest Classi...,0.973458,0.987437,0.97037,0.978829


## *Training and Testing Scores of all Models :-*

In [327]:
models = ["Logistic Regression", "Decision Tree Classifier", "Random Forest",
               "AdaBoost Classifier", "kNN Classifier", "Gradient Boosting",
               "Bagging Classifier (using Random Forest as base)"]

training_scores = [log_reg.score(x_train_resampled,y_train_resampled), dt.score(x_train_resampled,y_train_resampled), rf_clf.score(x_train_resampled,y_train_resampled),
                    adab.score(x_train_resampled,y_train_resampled), knn.score(x_train_resampled,y_train_resampled), gb_clf.score(x_train_resampled,y_train_resampled), bagging_clf.score(x_train_resampled,y_train_resampled)]
testing_scores = [log_reg.score(x_test,y_test), dt.score(x_test,y_test), rf_clf.score(x_test,y_test),
                   adab.score(x_test,y_test), knn.score(x_test,y_test), gb_clf.score(x_test,y_test), bagging_clf.score(x_test,y_test)]
                  

Model_Performance = pd.DataFrame({"Models": models, "Training Scores": training_scores, "Testing Scores": testing_scores})

Model_Performance

Unnamed: 0,Models,Training Scores,Testing Scores
0,Logistic Regression,0.672264,0.69477
1,Decision Tree Classifier,1.0,0.977361
2,Random Forest,1.0,0.979703
3,AdaBoost Classifier,0.98402,0.969555
4,kNN Classifier,0.74052,0.532397
5,Gradient Boosting,0.995125,0.9758
6,Bagging Classifier (using Random Forest as base),0.998917,0.973458


## *ROC-AUC Scores of all Models.*

In [328]:
def compute_roc_auc(model, x_test, y_test):

    y_prob = model.predict_proba(x_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_prob)
    return auc_score

results = {'Model': [], 'ROC-AUC': []}


models = [log_reg, dt, rf_clf, adab, knn, gb_clf, bagging_clf]

for model in models:
    auc_score = compute_roc_auc(model, x_test, y_test)
    results['Model'].append(model.__class__.__name__)
    results['ROC-AUC'].append(auc_score)

roc_auc_scores = pd.DataFrame(results)

roc_auc_scores

Unnamed: 0,Model,ROC-AUC
0,LogisticRegression,0.755611
1,DecisionTreeClassifier,0.975879
2,RandomForestClassifier,0.998262
3,AdaBoostClassifier,0.995427
4,KNeighborsClassifier,0.533055
5,GradientBoostingClassifier,0.997539
6,BaggingClassifier,0.997718


## Conclusion:-
* **Rankings of the Algorithms based on their performances.**

            1. Random Forest:

            Accuracy: 0.978923
            Precision: 0.981550
            Recall: 0.985185
            F1 Score: 0.983364
            ROC-AUC: 0.998224
            Testing Score: 0.978923

            2. Bagging Classifier (using Random Forest as base):

            Accuracy: 0.975020
            Precision: 0.987469
            Recall: 0.972840
            F1 Score: 0.980100
            ROC-AUC: 0.997741
            Testing Score: 0.975020

            3. Gradient Boosting:

            Accuracy: 0.975800
            Precision: 0.981459
            Recall: 0.980247
            F1 Score: 0.980852
            ROC-AUC: 0.997539
            Testing Score: 0.975800

            4. Decision Tree Classifier:

            Accuracy: 0.977361
            Precision: 0.982695
            Recall: 0.981481
            F1 Score: 0.982088
            ROC-AUC: 0.975879
            Testing Score: 0.977361

            5. AdaBoost Classifier:

            Accuracy: 0.969555
            Precision: 0.975339
            Recall: 0.976543
            F1 Score: 0.975941
            ROC-AUC: 0.995427
            Testing Score: 0.969555

            6. Logistic Regression:

            Accuracy: 0.905543
            Precision: 0.951507
            Recall: 0.896296
            F1 Score: 0.923077
            ROC-AUC: 0.963634
            Testing Score: 0.905543

            7. kNN Classifier:

            Accuracy: 0.861827
            Precision: 0.938974
            Recall: 0.835802
            F1 Score: 0.884389
            ROC-AUC: 0.942893
            Testing Score: 0.861827

## *Saving the Best Models.*

**I will only save 4 best out 7 Models.** So, user will have 4 options to choose among the best 4 models.

In [331]:
rf = RandomForestClassifier(random_state=42)
rf.fit(x,y)
bg = BaggingClassifier(estimator=RandomForestClassifier(), random_state=42)
bg.fit(x,y)
gb = GradientBoostingClassifier(random_state=42)
gb.fit(x,y)
Dt = DecisionTreeClassifier(random_state=42)
Dt.fit(x,y)

In [332]:
pickle.dump(rf,open("D:\PYTHON1\Data Science Project 2\\rf.pkl", "wb"))
pickle.dump(bg,open("D:\PYTHON1\Data Science Project 2\\bg.pkl", "wb"))
pickle.dump(gb,open("D:\PYTHON1\Data Science Project 2\\gb.pkl", "wb"))
pickle.dump(Dt,open("D:\PYTHON1\Data Science Project 2\\Dt.pkl", "wb"))

In [333]:
Loaded_model1 = pickle.load(open("D:\PYTHON1\Data Science Project 2\\rf.pkl", 'rb'))
Loaded_model2 = pickle.load(open("D:\PYTHON1\Data Science Project 2\\bg.pkl", 'rb'))
Loaded_model3 = pickle.load(open("D:\PYTHON1\Data Science Project 2\\gb.pkl", 'rb'))
Loaded_model4 = pickle.load(open("D:\PYTHON1\Data Science Project 2\\Dt.pkl", 'rb'))

In [334]:
predictions1 = Loaded_model1.predict(x)[:15]
predictions2 = Loaded_model2.predict(x)[:15]
predictions3 = Loaded_model3.predict(x)[:15]
predictions4 = Loaded_model4.predict(x)[:15]

print(predictions1,predictions2,predictions3,predictions4)

[1 0 0 0 0 0 1 0 1 0 1 0 0 1 0] [1 0 0 0 0 0 1 0 1 0 1 0 0 1 0] [1 0 0 0 0 0 1 0 1 0 1 0 0 1 0] [1 0 0 0 0 0 1 0 1 0 1 0 0 1 0]


In [335]:
y.head(10) 

0    1
1    0
2    0
3    0
4    0
5    0
6    1
7    0
8    1
9    0
Name:  loan_status, dtype: int64

In [336]:
df.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,1,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,0,0,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,1,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,3,1,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,0,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0
