**Import Important Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier


## **Read and Explore Data**

In [2]:
df = pd.read_csv(r"C:\Users\PC\Desktop\task 3 models\Loan Prediction Dataset.csv")

In [5]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [7]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [9]:
# check for missing values
df.isna().sum()

Unnamed: 0,0
Loan_ID,0
Gender,13
Married,3
Dependents,15
Education,0
Self_Employed,32
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,22
Loan_Amount_Term,14


**Columns has Missing Values**

- Categorical:
  - Gender  
  - Married  
  - Dependents  
  - Self_Employed  

- Numerical:
  - LoanAmount  
  - Loan_Amount_Term  
  - Credit_History  


**Handling missing values**


In [48]:
# Handling missing values in categorical columns

categorical_cols = ['Gender', 'Married', 'Dependents','Self_Employed']
for i in categorical_cols:
    mode = df[i].mode()[0]
    df[i] = df[i].fillna(mode)

In [49]:
# Handling missing values in numerical columns
# All cols has outliers or skewed data -> handle missing values with median

imputer = SimpleImputer(strategy = 'median')
numerical_cols = ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']
df[numerical_cols] = imputer.fit_transform(df[numerical_cols])

In [50]:
df.isna().sum() # Now there are no missing values

Unnamed: 0,0
Loan_ID,0
Gender,0
Married,0
Dependents,0
Education,0
Self_Employed,0
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,0
Loan_Amount_Term,0


In [51]:
# drop unimportant cols
df = df.drop(['Loan_ID'], axis=1)

In [14]:
df['Dependents'].unique()   # cols has 3+ -> handle it with one-hot encoding

array(['0', '1', '2', '3+'], dtype=object)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    object 
 1   Married            614 non-null    object 
 2   Dependents         614 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      614 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         614 non-null    float64
 8   Loan_Amount_Term   614 non-null    float64
 9   Credit_History     614 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


  **Encode categorical variables**


In [52]:
encoded_cols = ['Gender', 'Married', 'Self_Employed', 'Education', 'Property_Area',  'Loan_Status','Dependents']
df= pd.get_dummies(df,columns=encoded_cols, drop_first=True)


In [53]:
df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Self_Employed_Yes,Education_Not Graduate,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y,Dependents_1,Dependents_2,Dependents_3+
0,5849,0.0,128.0,360.0,1.0,True,False,False,False,False,True,True,False,False,False
1,4583,1508.0,128.0,360.0,1.0,True,True,False,False,False,False,False,True,False,False
2,3000,0.0,66.0,360.0,1.0,True,True,True,False,False,True,True,False,False,False
3,2583,2358.0,120.0,360.0,1.0,True,True,False,True,False,True,True,False,False,False
4,6000,0.0,141.0,360.0,1.0,True,False,False,False,False,True,True,False,False,False


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ApplicantIncome          614 non-null    int64  
 1   CoapplicantIncome        614 non-null    float64
 2   LoanAmount               614 non-null    float64
 3   Loan_Amount_Term         614 non-null    float64
 4   Credit_History           614 non-null    float64
 5   Gender_Male              614 non-null    bool   
 6   Married_Yes              614 non-null    bool   
 7   Self_Employed_Yes        614 non-null    bool   
 8   Education_Not Graduate   614 non-null    bool   
 9   Property_Area_Semiurban  614 non-null    bool   
 10  Property_Area_Urban      614 non-null    bool   
 11  Loan_Status_Y            614 non-null    bool   
 12  Dependents_1             614 non-null    bool   
 13  Dependents_2             614 non-null    bool   
 14  Dependents_3+            6

**Split Data**

In [55]:
# Split Data
x = df.drop(['Loan_Status_Y'], axis=1)
y = df['Loan_Status_Y']

In [56]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)


In [41]:
# from imblearn.over_sampling import SMOTE

# smote = SMOTE()
# x_train, y_train = smote.fit_resample(x_train, y_train)

**Feature Scaling**

In [57]:
robust = RobustScaler()
x_train = robust.fit_transform(x_train)
x_test = robust.transform(x_test)


**Function to print oututs for modeling**

In [58]:
def print_score(clf, x_train,y_train, x_test, y_test, train):
    if train:
        pred = clf.predict(x_train)
        print("Train Results")
        print("==============================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred)*100: .2f}%")
        print("______________________________________________")

        clf_report = classification_report(y_train , pred)
        print(f"classification Report: \n{clf_report}")
        print("______________________________________________")

        print(f"Confusion Matrix:\n {confusion_matrix(y_train, pred)} ")
        print("____________________________________________________________________________________________")

    else:
        pred = clf.predict(x_test)
        print("Test Results")
        print("=============================================")
        print(f"Accuracy Score {accuracy_score(y_test, pred)*100: .2f}%")
        print("______________________________________________")

        clf_report = classification_report(y_test , pred)
        print(f"classification Report: \n{clf_report}")
        print("______________________________________________")

        print(f"Confusion Matrix:\n {confusion_matrix(y_test, pred)} ")
        print("______________________________________________")


## **Modeling**

### **Logistic Regression**

**Logistic Regreission without Grid search**





In [59]:
log_reg = LogisticRegression(max_iter=10000)
log_reg.fit(x_train, y_train)

print_score(log_reg, x_train, y_train, x_test , y_test, train=True)
print_score(log_reg, x_train, y_train, x_test , y_test, train=False)

Train Results
Accuracy Score:  82.05%
______________________________________________
classification Report: 
              precision    recall  f1-score   support

       False       0.92      0.43      0.59       127
        True       0.80      0.98      0.89       302

    accuracy                           0.82       429
   macro avg       0.86      0.71      0.74       429
weighted avg       0.84      0.82      0.80       429

______________________________________________
Confusion Matrix:
 [[ 55  72]
 [  5 297]] 
____________________________________________________________________________________________
Test Results
Accuracy Score  78.38%
______________________________________________
classification Report: 
              precision    recall  f1-score   support

       False       0.93      0.42      0.57        65
        True       0.76      0.98      0.86       120

    accuracy                           0.78       185
   macro avg       0.84      0.70      0.71       185
we

**Logistic Regreission with Grid search**

In [60]:
log_reg = LogisticRegression(max_iter=10000, random_state=42, class_weight='balanced')


param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

grid = GridSearchCV(estimator=log_reg,  param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(x_train, y_train)

best_params = grid.best_score_
print("Best Parameters:", best_params)

print_score(grid, x_train, y_train, x_test , y_test, train=True)
print_score(grid, x_train, y_train, x_test , y_test, train=False)

Best Parameters: 0.7342818057455541
Train Results
Accuracy Score:  82.05%
______________________________________________
classification Report: 
              precision    recall  f1-score   support

       False       0.92      0.43      0.59       127
        True       0.80      0.98      0.89       302

    accuracy                           0.82       429
   macro avg       0.86      0.71      0.74       429
weighted avg       0.84      0.82      0.80       429

______________________________________________
Confusion Matrix:
 [[ 55  72]
 [  5 297]] 
____________________________________________________________________________________________
Test Results
Accuracy Score  78.38%
______________________________________________
classification Report: 
              precision    recall  f1-score   support

       False       0.93      0.42      0.57        65
        True       0.76      0.98      0.86       120

    accuracy                           0.78       185
   macro avg       0

### **Decision Tree**

**Decision Tree without Grid search**

In [61]:
DT = DecisionTreeClassifier(random_state=42)
DT.fit(x_train, y_train)

In [62]:
print_score(DT, x_train, y_train, x_test , y_test, train=True)
print_score(DT, x_train, y_train, x_test , y_test, train=False)

Train Results
Accuracy Score:  100.00%
______________________________________________
classification Report: 
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       127
        True       1.00      1.00      1.00       302

    accuracy                           1.00       429
   macro avg       1.00      1.00      1.00       429
weighted avg       1.00      1.00      1.00       429

______________________________________________
Confusion Matrix:
 [[127   0]
 [  0 302]] 
____________________________________________________________________________________________
Test Results
Accuracy Score  69.73%
______________________________________________
classification Report: 
              precision    recall  f1-score   support

       False       0.58      0.49      0.53        65
        True       0.75      0.81      0.78       120

    accuracy                           0.70       185
   macro avg       0.66      0.65      0.65       185
w

**Applay Decision Tree with Grid Search**

In [63]:
params = {
    "criterion" : ("gini", "entropy"),
    "splitter" : ("best", "random"),
    "max_depth" : list(range(1,20)),
    "min_samples_split": [2,3,4],
    "min_samples_leaf" : list(range(1,20))
    }

DT = DecisionTreeClassifier(random_state=42, class_weight='balanced')
grid_search = GridSearchCV(DT , params, scoring="accuracy", n_jobs=-1, verbose=1, cv=3)

grid_search.fit(x_train, y_train)

best_params = grid_search.best_params_
print(f"best_params : {best_params}")

print_score(grid_search, x_train, y_train, x_test, y_test, train=True)
print_score(grid_search, x_train, y_train, x_test, y_test, train=False)

Fitting 3 folds for each of 4332 candidates, totalling 12996 fits
best_params : {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}
Train Results
Accuracy Score:  82.05%
______________________________________________
classification Report: 
              precision    recall  f1-score   support

       False       0.92      0.43      0.59       127
        True       0.80      0.98      0.89       302

    accuracy                           0.82       429
   macro avg       0.86      0.71      0.74       429
weighted avg       0.84      0.82      0.80       429

______________________________________________
Confusion Matrix:
 [[ 55  72]
 [  5 297]] 
____________________________________________________________________________________________
Test Results
Accuracy Score  78.38%
______________________________________________
classification Report: 
              precision    recall  f1-score   support

       False       0.93      0.42 

_____________________________

## **Random Forest**

**Random Forest without GridSearch**

In [29]:
random_forest = RandomForestClassifier(n_estimators=100, random_state = 42)


In [30]:
random_forest.fit(x_train, y_train)

In [31]:
print_score(random_forest, x_train, y_train, x_test, y_test, train=True)
print_score(random_forest, x_train, y_train, x_test, y_test, train=False)

Train Results
Accuracy Score:  100.00%
______________________________________________
classification Report: 
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       127
        True       1.00      1.00      1.00       302

    accuracy                           1.00       429
   macro avg       1.00      1.00      1.00       429
weighted avg       1.00      1.00      1.00       429

______________________________________________
Confusion Matrix:
 [[127   0]
 [  0 302]] 
____________________________________________________________________________________________
Test Results
Accuracy Score  75.68%
______________________________________________
classification Report: 
              precision    recall  f1-score   support

       False       0.78      0.43      0.55        65
        True       0.75      0.93      0.83       120

    accuracy                           0.76       185
   macro avg       0.76      0.68      0.69       185
w

**Random Forest with Grid search**

In [32]:
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]


In [33]:
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth, 'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf, 'bootstrap': bootstrap}

In [34]:
random_forest = RandomForestClassifier(random_state=42)

random_forest_cv = RandomizedSearchCV(estimator=random_forest, scoring='f1',param_distributions=random_grid, n_iter=100, cv=3,
                               verbose=2, random_state=42, n_jobs=-1)


In [35]:
random_forest_cv.fit(x_train, y_train)
rf_best_params = random_forest_cv.best_params_
print(f"Best paramters: {rf_best_params})")

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best paramters: {'n_estimators': 1800, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 90, 'bootstrap': True})


In [37]:
random_forest = RandomForestClassifier(**rf_best_params)
random_forest.fit(x_train, y_train)

In [38]:
print_score(random_forest_cv, x_train, y_train, x_test, y_test, train=True)
print_score(random_forest_cv, x_train, y_train, x_test, y_test, train=False)

Train Results
Accuracy Score:  82.75%
______________________________________________
classification Report: 
              precision    recall  f1-score   support

       False       0.92      0.46      0.61       127
        True       0.81      0.98      0.89       302

    accuracy                           0.83       429
   macro avg       0.87      0.72      0.75       429
weighted avg       0.84      0.83      0.81       429

______________________________________________
Confusion Matrix:
 [[ 58  69]
 [  5 297]] 
____________________________________________________________________________________________
Test Results
Accuracy Score  78.38%
______________________________________________
classification Report: 
              precision    recall  f1-score   support

       False       0.93      0.42      0.57        65
        True       0.76      0.98      0.86       120

    accuracy                           0.78       185
   macro avg       0.84      0.70      0.71       185
we

In [71]:
models = {
    "Logistic Regression": grid,
    "Decision Tree": grid_search,
    "Random Forest": random_forest
}

acc_train = {}
acc_test = {}

for name, model in models.items():
    acc_train[name] = accuracy_score(y_train, model.predict(x_train))
    acc_test[name] = accuracy_score(y_test, model.predict(x_test))

print("Final Test Accuracies With Grid Search:")
print("=================================")
for name  in models:
    print(f"{name}-> Train: {acc_train[name]*100:.2f}% | Test: {acc_test[name]*100:.2f}")


Final Test Accuracies With Grid Search:
Logistic Regression-> Train: 82.05% | Test: 78.38
Decision Tree-> Train: 82.05% | Test: 78.38
Random Forest-> Train: 82.75% | Test: 78.38
