<center><span style="color:#b30000;font-size:40px;"><strong>Modeling Phase </strong></span></center>
</center>

<span style="color:#2929a3;font-size:20px;">Import Libraries</span>

In [397]:
import pandas as pd
import numpy as np
import plotly.express as px
from imblearn.pipeline import pipeline as ImbPipe
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler, FunctionTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier

<span style="color:#2929a3;font-size:20px;">Read Dataset</span>

In [398]:
df = pd.read_pickle('../Data/New_Data')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Loan_Per_Month,Income_After_Loan,Income_Exceeds_Loan
0,LP001002,Male,No,0.0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,,,0
1,LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,128000.0,360.0,1.0,Rural,N,356.0,4227.0,1
2,LP001005,Male,Yes,0.0,Graduate,Yes,3000,0.0,66000.0,360.0,1.0,Urban,Y,183.0,2817.0,1
3,LP001006,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120000.0,360.0,1.0,Urban,Y,333.0,2250.0,1
4,LP001008,Male,No,0.0,Graduate,No,6000,0.0,141000.0,360.0,1.0,Urban,Y,392.0,5608.0,1


In [399]:
## Drop Unnecessary Columns
unnecessary_cols = ['Loan_ID','ApplicantIncome','LoanAmount','Loan_Amount_Term']
df.drop(unnecessary_cols, axis=1, inplace=True)

## Check Duplicates
df.duplicated().sum()

0

In [400]:
## Seperate Data to X and Y
X = df.drop('Loan_Status', axis=1)
Y = df['Loan_Status']

In [401]:
## Encode Target
Y = Y.map({'Y':1, 'N':0})

In [402]:
## Seperate X to Numeric Dataframe and Categorical Dataframe
Num_X = X.select_dtypes(include='number')
Cat_X = X.select_dtypes(include='object_')

In [403]:
## Create Get_Log Function
def Get_Log(v):
    return np.log(v.astype(float))

In [404]:
## Create Function transformer for obtaining Log 
log_transformer = FunctionTransformer(Get_Log)

## Create Column transformer
transformer = ColumnTransformer(transformers=[("log_transformer", log_transformer, ['Loan_Per_Month','Income_After_Loan'])], remainder='passthrough')

## Create Numerical Branch
Num_steps = []
Num_steps.append(("transformer", transformer))
Num_steps.append(("KNN_Imputer",KNNImputer()))
Num_steps.append(("Scaler",StandardScaler()))

Num_pipeline = Pipeline(steps=Num_steps)

In [405]:
## Create Categorical Branch
Cat_steps = []
Cat_steps.append(("Cat_Imputer",SimpleImputer(strategy='most_frequent')))
Cat_steps.append(("Encoder",OneHotEncoder(sparse_output=False, drop='first')))
Cat_pipeline = Pipeline(steps=Cat_steps)

In [406]:
## Column Transformer
Preprocessor = ColumnTransformer(transformers=[('Num_pipeline',Num_pipeline,Num_X.columns.tolist()),('Cat_pipeline',Cat_pipeline,Cat_X.columns.tolist())], remainder='passthrough')

In [407]:
## Models List
Models = []
Models.append(("Logistic Regression", LogisticRegression(max_iter=1000)))
Models.append(("KNN", KNeighborsClassifier(n_neighbors=5)))
Models.append(("SVM",SVC()))
Models.append(("Decision Tree", DecisionTreeClassifier()))
Models.append(("Random Forest",RandomForestClassifier()))
Models.append(("XGBoost", XGBClassifier()))

In [408]:
Y.value_counts()

Loan_Status
1    422
0    191
Name: count, dtype: int64

In [409]:
from imblearn.pipeline import Pipeline as ImbPipe

In [410]:
## Final Pipeline
for model in Models:
    imb_steps = []
    imb_steps.append(('Preprocessor',Preprocessor))
    imb_steps.append(("SMOTETomek",SMOTETomek(sampling_strategy={0:300,1:422}, random_state=1)))
    imb_steps.append(model)
    pipeline = ImbPipe(imb_steps)
    res = cross_validate(pipeline, X, Y, cv=5, scoring='precision', return_train_score=True, n_jobs=-1)
    print(f"Train Accuracy of {model[0]} is ",res['train_score'].mean())
    print(f"Test Accuracy of {model[0]} is ",res['test_score'].mean())
    print("*" * 70)

Train Accuracy of Logistic Regression is  0.8064034937428721
Test Accuracy of Logistic Regression is  0.8029480231423953
**********************************************************************
Train Accuracy of KNN is  0.8962004088522388
Test Accuracy of KNN is  0.7882787243665217
**********************************************************************
Train Accuracy of SVM is  0.8073347446730577
Test Accuracy of SVM is  0.7918902080238885
**********************************************************************
Train Accuracy of Decision Tree is  0.9772238433815449
Test Accuracy of Decision Tree is  0.7960828842554859
**********************************************************************
Train Accuracy of Random Forest is  0.9727751378232078
Test Accuracy of Random Forest is  0.7963171106956896
**********************************************************************
Train Accuracy of XGBoost is  0.9732999566089964
Test Accuracy of XGBoost is  0.7884512357078733
*******************************

<span style="color:#2929a3;font-size:20px;">Create Decision Tree Pipeline</span>

In [411]:
steps = []
steps.append(('Preprocessor',Preprocessor))
steps.append(("SMOTETomek",SMOTETomek(sampling_strategy={0:300,1:422}, random_state=1)))
steps.append(("Model",DecisionTreeClassifier()))
pipeline = ImbPipe(steps=steps)
pipeline.fit(X,Y)

  result = func(self.values, **kwargs)


<span style="color:#2929a3;font-size:20px;">Hyperparameter Tuning</span>

In [412]:
from sklearn.model_selection import GridSearchCV

In [413]:
param = [
    {'Model__max_depth':[8,9,10,11],
     'Model__min_samples_split':[6,7,8,9,10,11],
    'Model__criterion':['gini','entropy']}
]

In [414]:
grid = GridSearchCV(estimator=pipeline, param_grid=param, cv=5, scoring='precision' , return_train_score=True, n_jobs=-1)

In [415]:
grid.fit(X,Y)

  result = func(self.values, **kwargs)


In [416]:
grid.best_params_

{'Model__criterion': 'gini',
 'Model__max_depth': 10,
 'Model__min_samples_split': 11}

In [417]:
grid.cv_results_['mean_train_score'][grid.best_index_]

0.9190090006405794

In [418]:
grid.cv_results_['mean_test_score'][grid.best_index_]

0.8164306272353551

<span style="color:#2929a3;font-size:20px;">Final Model</span>

In [419]:
Final_Model = grid.best_estimator_

In [420]:
Final_Model.fit(X,Y)

  result = func(self.values, **kwargs)


<span style="color:#2929a3;font-size:20px;">Dump Final Model</span>

In [421]:
import joblib
joblib.dump(Final_Model, "../Data/Model.pkl")

['../Data/Model.pkl']

<span style="color:#2929a3;font-size:20px;">Dump Inputs Names</span>

In [422]:
joblib.dump(X.columns.tolist(), "../Data/Inputs.pkl")

['../Data/Inputs.pkl']

In [425]:
df[df['Loan_Status'] == 'Y']

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,CoapplicantIncome,Credit_History,Property_Area,Loan_Status,Loan_Per_Month,Income_After_Loan,Income_Exceeds_Loan
0,Male,No,0.0,Graduate,No,0.0,1.0,Urban,Y,,,0
2,Male,Yes,0.0,Graduate,Yes,0.0,1.0,Urban,Y,183.0,2817.0,1
3,Male,Yes,0.0,Not Graduate,No,2358.0,1.0,Urban,Y,333.0,2250.0,1
4,Male,No,0.0,Graduate,No,0.0,1.0,Urban,Y,392.0,5608.0,1
5,Male,Yes,2.0,Graduate,Yes,4196.0,1.0,Urban,Y,742.0,4675.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
607,Male,Yes,0.0,Graduate,No,1950.0,1.0,Rural,Y,300.0,2932.0,1
608,Female,No,0.0,Graduate,No,0.0,1.0,Rural,Y,197.0,2703.0,1
609,Male,Yes,3.0,Graduate,No,0.0,1.0,Rural,Y,222.0,3884.0,1
610,Male,Yes,1.0,Graduate,No,240.0,1.0,Urban,Y,703.0,7369.0,1
