# Loan Prediction Project

In [37]:
## Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [38]:
df = pd.read_csv("Loan_default.csv")
df.head()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [39]:
df.shape

(255347, 18)

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255347 entries, 0 to 255346
Data columns (total 18 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   LoanID          255347 non-null  object 
 1   Age             255347 non-null  int64  
 2   Income          255347 non-null  int64  
 3   LoanAmount      255347 non-null  int64  
 4   CreditScore     255347 non-null  int64  
 5   MonthsEmployed  255347 non-null  int64  
 6   NumCreditLines  255347 non-null  int64  
 7   InterestRate    255347 non-null  float64
 8   LoanTerm        255347 non-null  int64  
 9   DTIRatio        255347 non-null  float64
 10  Education       255347 non-null  object 
 11  EmploymentType  255347 non-null  object 
 12  MaritalStatus   255347 non-null  object 
 13  HasMortgage     255347 non-null  object 
 14  HasDependents   255347 non-null  object 
 15  LoanPurpose     255347 non-null  object 
 16  HasCoSigner     255347 non-null  object 
 17  Default   

In [41]:
##Checking null values
df.isnull().sum()

LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64

In [42]:
## Checking values of the target column
df['Default'].value_counts()

Default
0    225694
1     29653
Name: count, dtype: int64

In [43]:
(df['Default'].value_counts()/len(df)) * 100

Default
0    88.387175
1    11.612825
Name: count, dtype: float64

## Obervations:
1. The dataset is highly imbalanced 88.4% output is belongs to 0 (preson will not default) and 11.6% belongs to 1 (person will deafult). {Need to handle}

### To handle this problem I will follow the following approach.
1. Preform EDA and preprocessing so that data becomes ready to feed to the different models(that we would make in next few steps).
2. Create multiple classification models on the inbalanced dataset. (Goal: See how well your model handles the imbalance without any help.)
3. Create models with the hyperparameter class_weight = "balanced" on the same imbalanced dataset.
4. Apply SMOTE to make data balanced and create models using balanced data.
5. Pick the best preforming models of step 2, 3, 4 and compare there preformance, to find the best model.
6. Hyperparameter tune the best model to make it more better. Also reduce the irrelevant features using multicolinearity(if needed).

## Step1: EDA and Preprocessing.

In [44]:
df.head()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


#### Till now we find that
1. There a no null values
2. 8 Categorical columns which need to get handle

In [45]:
df.drop('LoanID', axis = 1, inplace = True)

In [46]:
cat_columns = df.select_dtypes(include='object')
cat_columns

Unnamed: 0,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
0,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes
1,Master's,Full-time,Married,No,No,Other,Yes
2,Master's,Unemployed,Divorced,Yes,Yes,Auto,No
3,High School,Full-time,Married,No,No,Business,No
4,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No
...,...,...,...,...,...,...,...
255342,Bachelor's,Full-time,Married,No,No,Other,No
255343,High School,Part-time,Divorced,No,No,Home,No
255344,High School,Self-employed,Married,Yes,Yes,Auto,Yes
255345,High School,Part-time,Single,Yes,Yes,Other,No


In [47]:
for x in cat_columns:
    print(df[x].value_counts())
    print()

Education
Bachelor's     64366
High School    63903
Master's       63541
PhD            63537
Name: count, dtype: int64

EmploymentType
Part-time        64161
Unemployed       63824
Self-employed    63706
Full-time        63656
Name: count, dtype: int64

MaritalStatus
Married     85302
Divorced    85033
Single      85012
Name: count, dtype: int64

HasMortgage
Yes    127677
No     127670
Name: count, dtype: int64

HasDependents
Yes    127742
No     127605
Name: count, dtype: int64

LoanPurpose
Business     51298
Home         51286
Education    51005
Other        50914
Auto         50844
Name: count, dtype: int64

HasCoSigner
Yes    127701
No     127646
Name: count, dtype: int64



In [48]:
cat_columns.columns

Index(['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage',
       'HasDependents', 'LoanPurpose', 'HasCoSigner'],
      dtype='object')

## Observations:
Columns ['HasMortgage', 'HasDependents', 'HasCoSigner'] contain values yes & no so we just simply map yes with 1 and no with 0.

In [49]:
## First of all I will split the dataset in training & testing to avoid dataleakage.
X = df.drop('Default', axis = 1)
y = df['Default']

In [50]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state = 42)

In [51]:
map_columns = ['HasMortgage', 'HasDependents', 'HasCoSigner']
for x in map_columns:
    X_train[x] = X_train[x].map({"Yes" : 1, "No" : 0})
    X_test[x] = X_test[x].map({"Yes" : 1, "No" : 0})

In [52]:
## Now we have to apply label encoding on the remaining categorical columns.
label_columns = ['Education', 'EmploymentType', 'MaritalStatus', 'LoanPurpose']
from sklearn.preprocessing import LabelEncoder
encoders = {}
for x in label_columns:
    le = LabelEncoder()
    X_train[x] = le.fit_transform(X_train[x])
    X_test[x] = le.transform(X_test[x])
    encoders[x] = le

In [53]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 178742 entries, 73275 to 121958
Data columns (total 16 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Age             178742 non-null  int64  
 1   Income          178742 non-null  int64  
 2   LoanAmount      178742 non-null  int64  
 3   CreditScore     178742 non-null  int64  
 4   MonthsEmployed  178742 non-null  int64  
 5   NumCreditLines  178742 non-null  int64  
 6   InterestRate    178742 non-null  float64
 7   LoanTerm        178742 non-null  int64  
 8   DTIRatio        178742 non-null  float64
 9   Education       178742 non-null  int64  
 10  EmploymentType  178742 non-null  int64  
 11  MaritalStatus   178742 non-null  int64  
 12  HasMortgage     178742 non-null  int64  
 13  HasDependents   178742 non-null  int64  
 14  LoanPurpose     178742 non-null  int64  
 15  HasCoSigner     178742 non-null  int64  
dtypes: float64(2), int64(14)
memory usage: 23.2 MB


In [54]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76605 entries, 51139 to 3198
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             76605 non-null  int64  
 1   Income          76605 non-null  int64  
 2   LoanAmount      76605 non-null  int64  
 3   CreditScore     76605 non-null  int64  
 4   MonthsEmployed  76605 non-null  int64  
 5   NumCreditLines  76605 non-null  int64  
 6   InterestRate    76605 non-null  float64
 7   LoanTerm        76605 non-null  int64  
 8   DTIRatio        76605 non-null  float64
 9   Education       76605 non-null  int64  
 10  EmploymentType  76605 non-null  int64  
 11  MaritalStatus   76605 non-null  int64  
 12  HasMortgage     76605 non-null  int64  
 13  HasDependents   76605 non-null  int64  
 14  LoanPurpose     76605 non-null  int64  
 15  HasCoSigner     76605 non-null  int64  
dtypes: float64(2), int64(14)
memory usage: 9.9 MB


In [55]:
## Till now we encoded the data into numeric 

In [56]:
## Standardize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Step2: Model Creation on the imbalanced dataset(X_train_scaled, X_test_scaled).


In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
models = {
    "LogisticRegression" : LogisticRegression(),
    #"SVC" : SVC(),
    "Naive-Bayes" : GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "DecisionTree" : DecisionTreeClassifier(),
    "RandomForest" : RandomForestClassifier(),
    "Adaboost" : AdaBoostClassifier()
}
models

{'LogisticRegression': LogisticRegression(),
 'Naive-Bayes': GaussianNB(),
 'KNN': KNeighborsClassifier(),
 'DecisionTree': DecisionTreeClassifier(),
 'RandomForest': RandomForestClassifier(),
 'Adaboost': AdaBoostClassifier()}

In [58]:
from sklearn.metrics import f1_score, classification_report

In [59]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)

    print(f"{list(models.keys())[i]}, score : ", f1_score(y_test, y_pred)*100)
    print(classification_report(y_pred, y_test))
    print()

LogisticRegression, score :  5.973970556859398
              precision    recall  f1-score   support

           0       1.00      0.89      0.94     76155
           1       0.03      0.62      0.06       450

    accuracy                           0.88     76605
   macro avg       0.51      0.75      0.50     76605
weighted avg       0.99      0.88      0.93     76605


Naive-Bayes, score :  3.1940494421352
              precision    recall  f1-score   support

           0       1.00      0.89      0.94     76387
           1       0.02      0.67      0.03       218

    accuracy                           0.88     76605
   macro avg       0.51      0.78      0.49     76605
weighted avg       1.00      0.88      0.94     76605


KNN, score :  11.299852289512554
              precision    recall  f1-score   support

           0       0.98      0.89      0.93     74697
           1       0.07      0.32      0.11      1908

    accuracy                           0.87     76605
   macro

### How to find out which model preform well?
#### As these model to their predictions on imbalanced dataset so simply check the precision, recall, f1-score value in the classification report for the minority class (which is 1).
### On this basis the Decision Tree model preform well on the imbalanced data.

## Create models with the hyperparameter class_weight = "balanced" on the same imbalanced dataset.
#### In an imbalanced dataset, the model can become biased toward the majority class, ignoring the minority class. class_weight='balanced' helps combat this by penalizing misclassification of the minority class more heavily during training.

In [60]:
models = {
    "LogisticRegression" : LogisticRegression(class_weight = "balanced"),
    "DecisionTree" : DecisionTreeClassifier(class_weight="balanced"),
    "RandomForest" : RandomForestClassifier(class_weight="balanced")
}
models

{'LogisticRegression': LogisticRegression(class_weight='balanced'),
 'DecisionTree': DecisionTreeClassifier(class_weight='balanced'),
 'RandomForest': RandomForestClassifier(class_weight='balanced')}

In [61]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)

    print(f"{list(models.keys())[i]}, score : ", f1_score(y_test, y_pred)*100)
    print(classification_report(y_pred, y_test))
    print()

LogisticRegression, score :  33.03348460030611
              precision    recall  f1-score   support

           0       0.67      0.94      0.78     48288
           1       0.69      0.22      0.33     28317

    accuracy                           0.67     76605
   macro avg       0.68      0.58      0.56     76605
weighted avg       0.68      0.67      0.62     76605


DecisionTree, score :  19.826800364630813
              precision    recall  f1-score   support

           0       0.90      0.89      0.90     67977
           1       0.19      0.20      0.20      8628

    accuracy                           0.82     76605
   macro avg       0.55      0.55      0.55     76605
weighted avg       0.82      0.82      0.82     76605


RandomForest, score :  5.05539421318705
              precision    recall  f1-score   support

           0       1.00      0.89      0.94     76232
           1       0.03      0.63      0.05       373

    accuracy                           0.88     766

### With class_weight = "balanced", logistic and decision tree both preform better. Thus we can consider them for further tunning.

## Step4: Apply SMOTE to make data balanced and create models using balanced data.

In [62]:
y_train.value_counts()

Default
0    158013
1     20729
Name: count, dtype: int64

In [63]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state = 42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

In [64]:
y_train_resampled.value_counts()

Default
1    158013
0    158013
Name: count, dtype: int64

In [65]:
## Model Training
models = {
    "LogisticRegression" : LogisticRegression(),
    "Naive-Bayes" : GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "DecisionTree" : DecisionTreeClassifier(),
    "RandomForest" : RandomForestClassifier(),
    "Adaboost" : AdaBoostClassifier()
}
models

{'LogisticRegression': LogisticRegression(),
 'Naive-Bayes': GaussianNB(),
 'KNN': KNeighborsClassifier(),
 'DecisionTree': DecisionTreeClassifier(),
 'RandomForest': RandomForestClassifier(),
 'Adaboost': AdaBoostClassifier()}

In [66]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train_resampled, y_train_resampled)

    y_pred = model.predict(X_test_scaled)

    print(f"{list(models.keys())[i]}, score : ", f1_score(y_test, y_pred)*100)
    print(classification_report(y_pred, y_test))
    print()

LogisticRegression, score :  33.21025136073958
              precision    recall  f1-score   support

           0       0.68      0.94      0.79     48968
           1       0.68      0.22      0.33     27637

    accuracy                           0.68     76605
   macro avg       0.68      0.58      0.56     76605
weighted avg       0.68      0.68      0.63     76605


Naive-Bayes, score :  32.87535449764586
              precision    recall  f1-score   support

           0       0.68      0.94      0.79     49210
           1       0.67      0.22      0.33     27395

    accuracy                           0.68     76605
   macro avg       0.68      0.58      0.56     76605
weighted avg       0.68      0.68      0.63     76605


KNN, score :  25.901330977285035
              precision    recall  f1-score   support

           0       0.68      0.91      0.78     50442
           1       0.51      0.17      0.26     26163

    accuracy                           0.66     76605
   mac

## Logistic Regression gives the best results so I am considiring it as final model.

In [72]:
final_model = LogisticRegression()
final_model.fit(X_train_resampled, y_train_resampled)
y_pred_train = final_model.predict(X_train_scaled)
y_pred_test = final_model.predict(X_test_scaled)

train_score = f1_score(y_train, y_pred_train)
print("Training accuracy : ", train_score*100)
test_score = f1_score(y_test, y_pred_test)
print("Test accuracy : ", test_score*100)

train_report = classification_report(y_train, y_pred_train)
print("Training report", train_report)
test_report = classification_report(y_test, y_pred_test)
print("Test report", test_report)

Training accuracy :  33.04200931454782
Test accuracy :  33.21025136073958
Training report               precision    recall  f1-score   support

           0       0.94      0.68      0.79    158013
           1       0.22      0.68      0.33     20729

    accuracy                           0.68    178742
   macro avg       0.58      0.68      0.56    178742
weighted avg       0.86      0.68      0.74    178742

Test report               precision    recall  f1-score   support

           0       0.94      0.68      0.79     67681
           1       0.22      0.68      0.33      8924

    accuracy                           0.68     76605
   macro avg       0.58      0.68      0.56     76605
weighted avg       0.86      0.68      0.74     76605



In [70]:
from sklearn.model_selection import GridSearchCV

params = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'saga']
}

grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid=params, scoring='f1', cv=5)
grid.fit(X_train_resampled, y_train_resampled)
print("Best Params:", grid.best_params_)

Best Params: {'C': 1, 'penalty': 'l2', 'solver': 'saga'}


In [75]:
final_model = LogisticRegression(C = 1, penalty='l2', solver='saga', class_weight = "balanced")
final_model.fit(X_train_scaled, y_train)
y_pred_train = final_model.predict(X_train_scaled)
y_pred_test = final_model.predict(X_test_scaled)

train_score = f1_score(y_train, y_pred_train)
print("Training F1-score : ", train_score*100)
test_score = f1_score(y_test, y_pred_test)
print("Test F1-score : ", test_score*100)

train_report = classification_report(y_train, y_pred_train)
print("Training report", train_report)
test_report = classification_report(y_test, y_pred_test)
print("Test report", test_report)

Training F1-score :  32.87036503417513
Test F1-score :  33.0343716433942
Training report               precision    recall  f1-score   support

           0       0.94      0.67      0.78    158013
           1       0.22      0.69      0.33     20729

    accuracy                           0.67    178742
   macro avg       0.58      0.68      0.56    178742
weighted avg       0.86      0.67      0.73    178742

Test report               precision    recall  f1-score   support

           0       0.94      0.67      0.78     67681
           1       0.22      0.69      0.33      8924

    accuracy                           0.67     76605
   macro avg       0.58      0.68      0.56     76605
weighted avg       0.86      0.67      0.73     76605

