# Import Required packages

In [1]:
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

# Read Data File

In [2]:
df = pd.read_csv("loan_data.csv")

In [3]:
df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,5849.0,0.0,0.0,360.0,Yes,1
1,4583.0,1508.0,128.0,360.0,Yes,0
2,3000.0,0.0,66.0,360.0,Yes,1
3,2583.0,2358.0,120.0,360.0,Yes,1
4,6000.0,0.0,141.0,360.0,Yes,1


# EDA

In [4]:
print ("Total number of rows in dataset = {}".format(df.shape[0]))
print ("Total number of columns in dataset = {}".format(df.shape[1]))

Total number of rows in dataset = 614
Total number of columns in dataset = 6


In [5]:
df.isnull().sum()

ApplicantIncome      2
CoapplicantIncome    2
LoanAmount           3
Loan_Amount_Term     2
Credit_History       0
Loan_Status          0
dtype: int64

In [6]:
df.nunique()

ApplicantIncome      503
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      11
Credit_History         2
Loan_Status            2
dtype: int64

In [7]:
df.Loan_Amount_Term.value_counts(dropna=False)

360.0    511
180.0     43
480.0     15
0.0       14
300.0     13
84.0       4
240.0      4
120.0      3
NaN        2
36.0       2
60.0       2
12.0       1
Name: Loan_Amount_Term, dtype: int64

In [8]:
df.Loan_Status.value_counts()

1    422
0    192
Name: Loan_Status, dtype: int64

In [9]:
df.Credit_History.value_counts()

Yes    475
No     139
Name: Credit_History, dtype: int64

# Feature Engineering/Selection

In [10]:
# Drop 2 rows with missing values of Loan_Amount_Term
df.dropna(subset=["Loan_Amount_Term"], inplace=True)

In [11]:
df.dtypes

ApplicantIncome      float64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History        object
Loan_Status            int64
dtype: object

In [12]:
df.Loan_Amount_Term = df.Loan_Amount_Term.astype(int)

In [13]:
df.dtypes

ApplicantIncome      float64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term       int32
Credit_History        object
Loan_Status            int64
dtype: object

In [14]:
target_col = "Loan_Status"
X = df.loc[:, df.columns != target_col]
y = df.loc[:, target_col]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42)

In [16]:
X_train.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
563,5800.0,0.0,132.0,360,Yes
289,9508.0,0.0,187.0,360,Yes
324,15000.0,0.0,300.0,360,Yes
132,2718.0,0.0,70.0,360,Yes
174,4344.0,736.0,87.0,360,Yes


# Train Catboost Model

In [17]:
model_cb = CatBoostClassifier(task_type='CPU', iterations=50, 
                              random_state = 2021, 
                              eval_metric="F1")

In [18]:
cat_features = ["Loan_Amount_Term", "Credit_History"]

In [19]:
model_cb.fit(X_train, y_train, cat_features= cat_features, plot=True, 
             eval_set=(X_test, y_test))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.09775
0:	learn: 0.8419580	test: 0.8444444	best: 0.8444444 (0)	total: 140ms	remaining: 6.86s
1:	learn: 0.8419580	test: 0.8444444	best: 0.8444444 (0)	total: 158ms	remaining: 3.8s
2:	learn: 0.8419580	test: 0.8444444	best: 0.8444444 (0)	total: 178ms	remaining: 2.79s
3:	learn: 0.8419580	test: 0.8444444	best: 0.8444444 (0)	total: 202ms	remaining: 2.32s
4:	learn: 0.8419580	test: 0.8444444	best: 0.8444444 (0)	total: 249ms	remaining: 2.24s
5:	learn: 0.8419580	test: 0.8444444	best: 0.8444444 (0)	total: 275ms	remaining: 2.02s
6:	learn: 0.8419580	test: 0.8444444	best: 0.8444444 (0)	total: 296ms	remaining: 1.82s
7:	learn: 0.8419580	test: 0.8444444	best: 0.8444444 (0)	total: 326ms	remaining: 1.71s
8:	learn: 0.8419580	test: 0.8444444	best: 0.8444444 (0)	total: 355ms	remaining: 1.62s
9:	learn: 0.8419580	test: 0.8444444	best: 0.8444444 (0)	total: 367ms	remaining: 1.47s
10:	learn: 0.8419580	test: 0.8444444	best: 0.8444444 (0)	total: 378ms	remaining: 1.34s
11:	learn: 0.8419580	test

<catboost.core.CatBoostClassifier at 0x15bf4c49a90>

# Predict & Evaluate 

In [20]:
from utils import predict_and_evaluate

In [21]:
res = predict_and_evaluate(model_cb, X_test, y_test)

Algorithm,Accuracy,False Positives,False Negatives,Precision,Recall,F1 Score
CatBoostClassifier,0.77,20,8,0.79,0.9,0.84
