# Implementasi Naive Bayes Studi Kasus Peminjaman Uang (Tugas 7)


**Preprocessing**

In [1]:
### Data Wrangling 
import pandas as pd
import numpy as np
from collections import OrderedDict

### Modelling 
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

### Remove unnecessary warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Additionally ignore UserWarning messages. append=True adds this filter at the
# end of the warnings filter list instead of the front.
import warnings
warnings.filterwarnings('ignore', category=UserWarning, append=True)

In [3]:
# Load the credit-score data set (a CSV file hosted on GitHub) into a DataFrame
dataset_url = "https://raw.githubusercontent.com/M-ILHAM-197/dataset/main/credit_score.csv"
df = pd.read_csv(dataset_url)
# Preview the first rows
df.head()

Unnamed: 0.1,Unnamed: 0,kode_kontrak,pendapatan_setahun_juta,kpr_aktif,durasi_pinjaman_bulan,jumlah_tanggungan,rata_rata_overdue,risk_rating
0,1,AGR-000001,295,YA,48,5,61 - 90 days,4
1,2,AGR-000011,271,YA,36,5,61 - 90 days,4
2,3,AGR-000030,159,TIDAK,12,0,0 - 30 days,1
3,4,AGR-000043,210,YA,12,3,46 - 60 days,3
4,5,AGR-000049,165,TIDAK,36,0,31 - 45 days,2


In [4]:
# Select the class (target) column: "risk_rating" is what the models predict
labels = df["risk_rating"]

In [5]:
# Build the feature frame: every column of df except the target column
X = df.drop(columns=["risk_rating"])

# Preview the result to check that the target column has been removed
X.head()

Unnamed: 0.1,Unnamed: 0,kode_kontrak,pendapatan_setahun_juta,kpr_aktif,durasi_pinjaman_bulan,jumlah_tanggungan,rata_rata_overdue
0,1,AGR-000001,295,YA,48,5,61 - 90 days
1,2,AGR-000011,271,YA,36,5,61 - 90 days
2,3,AGR-000030,159,TIDAK,12,0,0 - 30 days
3,4,AGR-000043,210,YA,12,3,46 - 60 days
4,5,AGR-000049,165,TIDAK,36,0,31 - 45 days




```
# Range Days
['> 90 days', '0 - 30 days', '31 - 45 days', '46 - 60 days', '61 - 90 days']
```



In [7]:
# One-hot encode the overdue bucket: one indicator column per distinct value of
# "rata_rata_overdue", each named "overdue_<value>".
# The dtype is pinned to uint8 (the default before pandas 2.0) so the indicators
# are 0/1 integers on every pandas version; pandas >= 2.0 would otherwise
# produce boolean columns.
split_overdue_X = pd.get_dummies(X["rata_rata_overdue"], prefix="overdue", dtype=np.uint8)
X = X.join(split_overdue_X)

# The original categorical column is now redundant, so remove it
X = X.drop(columns = "rata_rata_overdue")



```
# KPR values
['YA', 'TIDAK']  # yes / no
```



In [8]:
# One-hot encode the mortgage flag: "kpr_aktif" becomes one indicator column per
# distinct value, each named "KPR_<value>".
# The dtype is pinned to uint8 (the default before pandas 2.0) so the indicators
# are 0/1 integers on every pandas version; pandas >= 2.0 would otherwise
# produce boolean columns.
KPR_status = pd.get_dummies(X["kpr_aktif"], prefix="KPR", dtype=np.uint8)
X = X.join(KPR_status)

# remove the original "kpr_aktif" feature, which the indicator columns replace
X = X.drop(columns = "kpr_aktif")

In [9]:
# Display the feature frame after both categorical columns were one-hot encoded
X

Unnamed: 0.1,Unnamed: 0,kode_kontrak,pendapatan_setahun_juta,durasi_pinjaman_bulan,jumlah_tanggungan,overdue_0 - 30 days,overdue_31 - 45 days,overdue_46 - 60 days,overdue_61 - 90 days,overdue_> 90 days,KPR_TIDAK,KPR_YA
0,1,AGR-000001,295,48,5,0,0,0,1,0,0,1
1,2,AGR-000011,271,36,5,0,0,0,1,0,0,1
2,3,AGR-000030,159,12,0,1,0,0,0,0,1,0
3,4,AGR-000043,210,12,3,0,0,1,0,0,0,1
4,5,AGR-000049,165,36,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
895,896,AGR-010739,112,48,5,0,0,0,0,1,0,1
896,897,AGR-010744,120,48,2,0,0,1,0,0,0,1
897,898,AGR-010758,166,24,2,1,0,0,0,0,1,0
898,899,AGR-010775,196,48,0,0,1,0,0,0,1,0




```
# features to be normalized
['pendapatan_setahun_juta', 'durasi_pinjaman_bulan', 'jumlah_tanggungan']
```



In [10]:
# Numeric columns that will be rescaled
old_normalize_feature_labels = [
    'pendapatan_setahun_juta',
    'durasi_pinjaman_bulan',
    'jumlah_tanggungan',
]
# Names for the rescaled columns: the original name with a "norm_" prefix
new_normalized_feature_labels = ['norm_' + label for label in old_normalize_feature_labels]
# Raw values of those columns, taken from the originally loaded frame
normalize_feature = df[old_normalize_feature_labels]

In [11]:
# Display the raw (not yet scaled) numeric columns
normalize_feature

Unnamed: 0,pendapatan_setahun_juta,durasi_pinjaman_bulan,jumlah_tanggungan
0,295,48,5
1,271,36,5
2,159,12,0
3,210,12,3
4,165,36,0
...,...,...,...
895,112,48,5
896,120,48,2
897,166,24,2
898,196,48,0


In [12]:
# Min-max scaler with default settings
scaler = MinMaxScaler()

In [13]:
# Learn the per-column minimum and maximum used for scaling.
# NOTE(review): the scaler is fit on every row, before the later train/test
# split, so test rows influence the scaling — confirm this is acceptable.
scaler.fit(normalize_feature)

MinMaxScaler()



```
MinMaxScaler()
```



In [14]:
# Apply the fitted scaling; the result is a NumPy array, not a DataFrame
normalized_feature = scaler.transform(normalize_feature)

In [15]:
# Wrap the scaled array in a DataFrame whose columns carry the "norm_" names
normalized_feature_df = pd.DataFrame(normalized_feature, columns = new_normalized_feature_labels)

In [16]:
# Display the scaled numeric columns
normalized_feature_df

Unnamed: 0,norm_pendapatan_setahun_juta,norm_durasi_pinjaman_bulan,norm_jumlah_tanggungan
0,0.978261,1.000000,0.833333
1,0.873913,0.666667,0.833333
2,0.386957,0.000000,0.000000
3,0.608696,0.000000,0.500000
4,0.413043,0.666667,0.000000
...,...,...,...
895,0.182609,1.000000,0.833333
896,0.217391,1.000000,0.333333
897,0.417391,0.333333,0.333333
898,0.547826,1.000000,0.000000


In [17]:
# Remove the unscaled numeric columns from the feature frame
X = X.drop(columns = old_normalize_feature_labels)

In [18]:
# Attach the scaled columns in their place (joined on the row index)
X = X.join(normalized_feature_df)

In [19]:
# Append the target column again as the last column; from here on X holds the
# features together with "risk_rating"
X = X.join(labels)

In [20]:
# Display the frame with encoded features, scaled features and the target column
X

Unnamed: 0.1,Unnamed: 0,kode_kontrak,overdue_0 - 30 days,overdue_31 - 45 days,overdue_46 - 60 days,overdue_61 - 90 days,overdue_> 90 days,KPR_TIDAK,KPR_YA,norm_pendapatan_setahun_juta,norm_durasi_pinjaman_bulan,norm_jumlah_tanggungan,risk_rating
0,1,AGR-000001,0,0,0,1,0,0,1,0.978261,1.000000,0.833333,4
1,2,AGR-000011,0,0,0,1,0,0,1,0.873913,0.666667,0.833333,4
2,3,AGR-000030,1,0,0,0,0,1,0,0.386957,0.000000,0.000000,1
3,4,AGR-000043,0,0,1,0,0,0,1,0.608696,0.000000,0.500000,3
4,5,AGR-000049,0,1,0,0,0,1,0,0.413043,0.666667,0.000000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,896,AGR-010739,0,0,0,0,1,0,1,0.182609,1.000000,0.833333,5
896,897,AGR-010744,0,0,1,0,0,0,1,0.217391,1.000000,0.333333,3
897,898,AGR-010758,1,0,0,0,0,1,0,0.417391,0.333333,0.333333,1
898,899,AGR-010775,0,1,0,0,0,1,0,0.547826,1.000000,0.000000,2


In [21]:
# Drop the identifying columns so they are not used as features:
# "Unnamed: 0" (what looks like a running row number) and the contract code
subject_lables = ["Unnamed: 0",  "kode_kontrak"]
X = X.drop(columns = subject_lables)

In [22]:
# Fraction of the rows held out as test data (0.3 = 30 %)
percent_amount_of_test_data = 0.3

In [23]:
# Display the final frame: ten feature columns followed by "risk_rating"
X

Unnamed: 0,overdue_0 - 30 days,overdue_31 - 45 days,overdue_46 - 60 days,overdue_61 - 90 days,overdue_> 90 days,KPR_TIDAK,KPR_YA,norm_pendapatan_setahun_juta,norm_durasi_pinjaman_bulan,norm_jumlah_tanggungan,risk_rating
0,0,0,0,1,0,0,1,0.978261,1.000000,0.833333,4
1,0,0,0,1,0,0,1,0.873913,0.666667,0.833333,4
2,1,0,0,0,0,1,0,0.386957,0.000000,0.000000,1
3,0,0,1,0,0,0,1,0.608696,0.000000,0.500000,3
4,0,1,0,0,0,1,0,0.413043,0.666667,0.000000,2
...,...,...,...,...,...,...,...,...,...,...,...
895,0,0,0,0,1,0,1,0.182609,1.000000,0.833333,5
896,0,0,1,0,0,0,1,0.217391,1.000000,0.333333,3
897,1,0,0,0,0,1,0,0.417391,0.333333,0.333333,1
898,0,1,0,0,0,1,0,0.547826,1.000000,0.000000,2


**Hitung Data**



```
# data latih (nilai data)
X_train 

# data tes (nilai data)
X_test 

# data latih (kelas data)
y_train

# data tes (kelas data)
y_test
```



In [25]:
# Split the frame by column position into features and target

# values: the first ten columns, as a NumPy array
feature_part = X.iloc[:, :10]
matrices_X = feature_part.values

# classes: the column at position 10, as a NumPy array
class_part = X.iloc[:, 10]
matrices_Y = class_part.values

In [26]:
# Feature matrix (first ten columns) and class vector (last column) that are
# passed to the train/test split
X_1, Y_1 = X.iloc[:, :10].values, X.iloc[:, -1].values

In [27]:
# Hold out the configured fraction of rows as the test set; the fixed
# random_state makes the shuffle reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_1,
    Y_1,
    test_size=percent_amount_of_test_data,
    random_state=0,
)

In [28]:
# Display the feature matrix
X_1

array([[0.        , 0.        , 0.        , ..., 0.97826087, 1.        ,
        0.83333333],
       [0.        , 0.        , 0.        , ..., 0.87391304, 0.66666667,
        0.83333333],
       [1.        , 0.        , 0.        , ..., 0.38695652, 0.        ,
        0.        ],
       ...,
       [1.        , 0.        , 0.        , ..., 0.4173913 , 0.33333333,
        0.33333333],
       [0.        , 1.        , 0.        , ..., 0.54782609, 1.        ,
        0.        ],
       [1.        , 0.        , 0.        , ..., 0.5826087 , 0.33333333,
        0.33333333]])

In [29]:
# Display the class vector
Y_1

array([4, 4, 1, 3, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2,
       1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2,
       1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1,
       2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1,
       2, 2, 1, 2, 2, 4, 4, 1, 1, 1, 3, 5, 3, 3, 5, 3, 5, 3, 1, 3, 1, 3,
       5, 1, 3, 3, 5, 1, 3, 4, 2, 3, 2, 1, 4, 3, 3, 3, 1, 4, 1, 4, 3, 1,
       3, 5, 3, 5, 5, 5, 3, 5, 3, 1, 5, 5, 3, 3, 1, 5, 1, 1, 2, 3, 3, 4,
       1, 4, 1, 3, 3, 3, 3, 1, 2, 2, 3, 4, 4, 3, 2, 1, 3, 3, 3, 3, 4, 4,
       1, 3, 1, 2, 3, 3, 3, 5, 2, 3, 5, 3, 1, 5, 5, 1, 5, 5, 3, 2, 5, 3,
       5, 1, 2, 3, 3, 5, 5, 3, 1, 1, 1, 3, 1, 1, 5, 1, 3, 5, 3, 2, 3, 3,
       1, 1, 1, 3, 5, 3, 5, 2, 3, 3, 3, 1, 5, 3, 3, 3, 1, 1, 2, 1, 5, 3,
       3, 5, 4, 3, 4, 5, 3, 3, 5, 1, 1, 3, 3, 2, 3, 1, 2, 3, 1, 3, 2, 2,
       2, 4, 3, 2, 3, 1, 2, 5, 5, 4, 3, 1, 2, 2, 5, 4, 5, 2, 3, 3, 1, 3,
       5, 1, 2, 5, 3, 1, 5, 3, 1, 4, 2, 1, 3, 3, 3,

Membuat Model

In [30]:
### One insertion-ordered dictionary per metric, each mapping a model name to
### that model's score: accuracy, precision and recall respectively

model_accuracy, model_precision, model_recall = (OrderedDict() for _ in range(3))

**Implementasi Gaussian Naive Bayes**

In [31]:
### Fit Gaussian Naive Bayes on the training split, then predict the test split
naive_bayes_classifier = GaussianNB().fit(X_train, y_train)
Y_pred_nb = naive_bayes_classifier.predict(X_test)

### Confusion matrix of true classes against predicted classes
cm = confusion_matrix(y_test, Y_pred_nb)
print('Confusion matrix for Gaussian Naive Bayes\n',cm)

### Scores as percentages rounded to two decimals; precision and recall are
### averaged over the classes with average = 'weighted'
naive_bayes_accuracy = round(accuracy_score(y_test, Y_pred_nb) * 100, 2)
naive_bayes_precision = round(precision_score(y_test, Y_pred_nb, average = 'weighted') * 100, 2)
naive_bayes_recall = round(recall_score(y_test, Y_pred_nb, average = 'weighted') * 100, 2)

### Record the scores under this model's name
model_accuracy['Gaussian Naive Bayes'] = naive_bayes_accuracy
model_precision['Gaussian Naive Bayes'] = naive_bayes_precision
model_recall['Gaussian Naive Bayes'] = naive_bayes_recall

### Report the scores
print('The accuracy of this model is {} %.'.format(naive_bayes_accuracy))
print('The precision of this model is {} %.'.format(naive_bayes_precision))
print('The recall of this model is {} %.'.format(naive_bayes_recall))

Confusion matrix for Gaussian Naive Bayes
 [[69  0  0  0  0]
 [ 0 49  0  0  0]
 [ 0  0 84  0  0]
 [ 0  0  0 36  0]
 [ 0  0  0  0 32]]
The accuracy of this model is 100.0 %.
The precision of this model is 100.0 %.
The recall of this model is 100.0 %.


In [32]:
# Display the Naive Bayes predictions for the test split
Y_pred_nb

array([2, 1, 3, 2, 4, 4, 1, 3, 3, 5, 1, 2, 5, 2, 3, 3, 1, 3, 3, 4, 2, 3,
       3, 1, 5, 3, 3, 3, 3, 3, 2, 3, 5, 3, 1, 4, 4, 4, 4, 2, 4, 1, 1, 2,
       5, 3, 5, 2, 1, 1, 2, 5, 1, 1, 2, 5, 1, 3, 3, 3, 4, 2, 3, 5, 5, 3,
       3, 3, 4, 1, 5, 4, 2, 1, 1, 4, 3, 3, 3, 5, 1, 2, 3, 2, 4, 3, 1, 3,
       2, 1, 2, 3, 2, 2, 3, 1, 2, 5, 5, 1, 1, 1, 3, 1, 5, 4, 3, 5, 2, 2,
       3, 3, 1, 1, 2, 1, 4, 4, 2, 2, 5, 4, 3, 1, 4, 1, 3, 1, 1, 5, 1, 1,
       1, 3, 3, 2, 5, 3, 1, 4, 3, 5, 3, 5, 2, 3, 1, 2, 2, 3, 3, 3, 3, 4,
       5, 3, 3, 4, 3, 1, 2, 1, 1, 1, 2, 1, 3, 1, 4, 2, 1, 1, 4, 2, 2, 1,
       1, 1, 3, 3, 3, 4, 5, 4, 3, 1, 1, 3, 2, 2, 3, 3, 4, 2, 5, 3, 5, 1,
       3, 5, 2, 3, 2, 3, 3, 2, 3, 3, 3, 1, 1, 1, 3, 1, 1, 3, 2, 3, 2, 3,
       3, 1, 1, 1, 1, 5, 1, 3, 2, 1, 4, 1, 5, 4, 4, 3, 3, 3, 3, 3, 2, 3,
       1, 4, 4, 4, 3, 5, 3, 3, 1, 1, 4, 1, 5, 1, 5, 2, 1, 2, 2, 3, 4, 4,
       1, 2, 4, 2, 5, 3])

In [33]:
# Gaussian Naive Bayes fitted on the full data set (matrices_X / matrices_Y),
# not only on the training split.
# NOTE(review): clf is not used by the prediction cell that follows — confirm
# whether it is still needed.
clf = GaussianNB()
clf.fit(matrices_X, matrices_Y)
# Second model trained through partial_fit on the same full data; the third
# argument passes the distinct class labels found in matrices_Y
clf_pf = GaussianNB()
clf_pf.partial_fit(matrices_X, matrices_Y, np.unique(matrices_Y))

GaussianNB()

In [34]:
# Index used to pick the single result out of a one-row prediction array
FIRST_IDX = 0

In [35]:
# Predict the risk rating of one hand-written customer row.
# Feature order matches the first ten columns of X: the five overdue_* indicators,
# KPR_TIDAK, KPR_YA, then the normalized income, loan duration and dependents.
# NOTE(review): all five overdue_* indicators are 0 in this row, which a one-hot
# encoded row would not normally have — confirm this input is intended.
result_test_naive_bayes = clf_pf.predict([[0,	0,	0,	0,	0,	0,	1,	0.582609,	0.666667,	0]])[FIRST_IDX]
print(f"Customer Name : Dio has risk rating {result_test_naive_bayes} based on Gaussian Naive Bayes model")

Customer Name : Dio has risk rating 4 based on Gaussian Naive Bayes model


**Implementasi K-Nearest Neighbours**

Menentukan nilai K = 10

Data latih

In [36]:
# Number of neighbours (K) used by the K-Nearest Neighbours classifier
amount_of_neighbor = 10

In [37]:
# Create the K-Nearest Neighbours classifier with the configured K
knn = KNeighborsClassifier(n_neighbors = amount_of_neighbor)

# Fit the classifier on the training split
knn.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=10)

In [38]:
# Predict the class of every row in the test split
Y_pred_knn = knn.predict(X_test)

In [39]:
# Mean accuracy of the KNN model on the test split
knn.score(X_test, y_test)

0.9925925925925926

In [40]:
# Predict the risk rating of one hand-written customer row with the KNN model.
# Feature order matches the first ten columns of X: the five overdue_* indicators,
# KPR_TIDAK, KPR_YA, then the normalized income, loan duration and dependents.
result_test_knn = knn.predict([[0,	0,	0,	0,	0,	0,	1,	0.582609,	0.666667,	0]])
print(f"Customer Name : Dio has risk rating {result_test_knn[FIRST_IDX]} based on KNN model")

Customer Name : Dio has risk rating 3 based on KNN model


In [41]:
### Making the confusion matrix
cm = confusion_matrix(y_test, Y_pred_knn)

### Key under which this model's scores are stored. It is built from the
### classifier's own n_neighbors so the label always matches the fitted model;
### a hard-coded '1 - Nearest Neighbors' key mislabelled the K = 10 model.
knn_model_label = '{} - Nearest Neighbors'.format(knn.n_neighbors)

### Printing the accuracy, precision, and recall of the model
print('Confusion matrix for K - Nearest Neighbors\n',cm)

### Scores are percentages rounded to two decimals; precision and recall are
### averaged over the classes with average = 'weighted'.
### The nn1_ variable names are kept unchanged in case other cells read them.
nn1_accuracy = round(100 * accuracy_score(y_test, Y_pred_knn), 2)
model_accuracy[knn_model_label] = nn1_accuracy

nn1_precision = round(100 * precision_score(y_test, Y_pred_knn, average = 'weighted'), 2)
model_precision[knn_model_label] = nn1_precision

nn1_recall = round(100 * recall_score(y_test, Y_pred_knn, average = 'weighted'), 2)
model_recall[knn_model_label] = nn1_recall

print('The accuracy of this model is {} %.'.format(nn1_accuracy))
print('The precision of this model is {} %.'.format(nn1_precision))
print('The recall of this model is {} %.'.format(nn1_recall))

Confusion matrix for K - Nearest Neighbors
 [[69  0  0  0  0]
 [ 0 49  0  0  0]
 [ 0  1 83  0  0]
 [ 0  0  0 36  0]
 [ 0  1  0  0 31]]
The accuracy of this model is 99.26 %.
The precision of this model is 99.29 %.
The recall of this model is 99.26 %.


**Implementasi Decision Tree**

In [42]:
### Build an entropy-based decision tree (fixed random_state for repeatable
### results) and fit it on the training split
decision_tree_classifier = DecisionTreeClassifier(
    criterion = 'entropy',
    random_state = 27,
).fit(X_train, y_train)

### Predict the class of every row in the test split
Y_pred_dc = decision_tree_classifier.predict(X_test)

In [43]:
### Confusion matrix of true classes against predicted classes
cm = confusion_matrix(y_test, Y_pred_dc)
print('Confusion matrix for Decision Tree\n',cm)

### Scores as percentages rounded to two decimals; precision and recall are
### averaged over the classes with average = 'weighted'
decision_tree_accuracy = round(accuracy_score(y_test, Y_pred_dc) * 100, 2)
decision_tree_precision = round(precision_score(y_test, Y_pred_dc, average = 'weighted') * 100, 2)
decision_tree_recall = round(recall_score(y_test, Y_pred_dc, average = 'weighted') * 100, 2)

### Record the scores under this model's name
model_accuracy['Decision Tree'] = decision_tree_accuracy
model_precision['Decision Tree'] = decision_tree_precision
model_recall['Decision Tree'] = decision_tree_recall

### Report the scores
print('The accuracy of this model is {} %.'.format(decision_tree_accuracy))
print('The precision of this model is {} %.'.format(decision_tree_precision))
print('The recall of this model is {} %.'.format(decision_tree_recall))

Confusion matrix for Decision Tree
 [[69  0  0  0  0]
 [ 0 49  0  0  0]
 [ 0  0 84  0  0]
 [ 0  0  0 36  0]
 [ 0  0  0  0 32]]
The accuracy of this model is 100.0 %.
The precision of this model is 100.0 %.
The recall of this model is 100.0 %.
