# Loan Dataset Machine Learning Practice

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_encoded = pd.read_csv("datasets/loan_encoded.csv")
df_encoded.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,...,Loan_Amount_Term_60.0,Loan_Amount_Term_84.0,Loan_Amount_Term_120.0,Loan_Amount_Term_180.0,Loan_Amount_Term_240.0,Loan_Amount_Term_300.0,Loan_Amount_Term_360.0,Loan_Amount_Term_480.0,Loan_Status_N,Loan_Status_Y
0,0.497164,-0.874587,0.150494,False,True,True,False,True,False,False,...,False,False,False,False,False,False,True,False,False,True
1,-0.013767,0.054395,-0.179896,False,True,False,True,False,True,False,...,False,False,False,False,False,False,True,False,True,False
2,-0.652632,-0.874587,-1.292433,False,True,False,True,True,False,False,...,False,False,False,False,False,False,True,False,False,True
3,-0.820924,0.578025,-0.323449,False,True,False,True,True,False,False,...,False,False,False,False,False,False,True,False,False,True
4,0.558104,-0.874587,0.053377,False,True,True,False,True,False,False,...,False,False,False,False,False,False,True,False,False,True


### Revert Scaling Test

In [3]:
import joblib
scaler = joblib.load('scaler/scaler.pkl')

In [4]:
# These are 3 columns that we scaled in the previous project
scaled_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

df_original = df_encoded.copy()
df_original[scaled_cols] = scaler.inverse_transform(df_encoded[scaled_cols])
df_original.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,...,Loan_Amount_Term_60.0,Loan_Amount_Term_84.0,Loan_Amount_Term_120.0,Loan_Amount_Term_180.0,Loan_Amount_Term_240.0,Loan_Amount_Term_300.0,Loan_Amount_Term_360.0,Loan_Amount_Term_480.0,Loan_Status_N,Loan_Status_Y
0,5849.0,0.0,146.412162,False,True,True,False,True,False,False,...,False,False,False,False,False,False,True,False,False,True
1,4583.0,1508.0,128.0,False,True,False,True,False,True,False,...,False,False,False,False,False,False,True,False,True,False
2,3000.0,0.0,66.0,False,True,False,True,True,False,False,...,False,False,False,False,False,False,True,False,False,True
3,2583.0,2358.0,120.0,False,True,False,True,True,False,False,...,False,False,False,False,False,False,True,False,False,True
4,6000.0,0.0,141.0,False,True,True,False,True,False,False,...,False,False,False,False,False,False,True,False,False,True


### Based on the reference from 
https://github.com/MomorioUHT/Data-Cleaning-Preprocessing/blob/main/Loan-preprocessing.ipynb<br/>
the data after inverse scaling is correct!<br/>
(but we will still use the 'df_encoded' for the machine learning)

### Imports for essential libraries

In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV

### Train test splitting

In [6]:
X = df_encoded.drop(["Loan_Status_N", "Loan_Status_Y"], axis=1)
y = np.where(df_encoded["Loan_Status_Y"], 1, 0)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### See the amount of class 0s and 1s

In [8]:
pd.Series(y).value_counts()

1    422
0    192
Name: count, dtype: int64

In [9]:
pd.Series(y).value_counts(normalize=True)

1    0.687296
0    0.312704
Name: proportion, dtype: float64

## Attempt #1 DecisionTreeClassifier

In [11]:
from sklearn.tree import DecisionTreeClassifier

dtreeclassifier = DecisionTreeClassifier(random_state=42)

param_grid = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [3, 5, 7, 10, None],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 8],
    "max_features": ["sqrt", "log2", None],
}

grid_seach_dtc = GridSearchCV(
    estimator=dtreeclassifier,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=1, #Core
    verbose=1
)

grid_seach_dtc.fit(X_train, y_train)

print("Best parameters:", grid_seach_dtc.best_params_)
print("Best score:", grid_seach_dtc.best_score_)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits
Best parameters: {'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 20}
Best score: 0.7943104514533086
