# Credit Risk Modeling

In [110]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Pull the credit-score table directly from its public GitHub location.
dataset_url = "https://raw.githubusercontent.com/Nauvaldi/Dataset/main/credit_score.csv"
dataset = pd.read_csv(filepath_or_buffer=dataset_url)
# Bare last expression so the notebook renders the loaded frame.
dataset

Unnamed: 0.1,Unnamed: 0,kode_kontrak,pendapatan_setahun_juta,kpr_aktif,durasi_pinjaman_bulan,jumlah_tanggungan,rata_rata_overdue,risk_rating
0,1,AGR-000001,295,YA,48,5,61 - 90 days,4
1,2,AGR-000011,271,YA,36,5,61 - 90 days,4
2,3,AGR-000030,159,TIDAK,12,0,0 - 30 days,1
3,4,AGR-000043,210,YA,12,3,46 - 60 days,3
4,5,AGR-000049,165,TIDAK,36,0,31 - 45 days,2
...,...,...,...,...,...,...,...,...
895,896,AGR-010739,112,YA,48,5,> 90 days,5
896,897,AGR-010744,120,YA,48,2,46 - 60 days,3
897,898,AGR-010758,166,TIDAK,24,2,0 - 30 days,1
898,899,AGR-010775,196,TIDAK,48,0,31 - 45 days,2


In [4]:
# Set the two categorical columns aside; they are one-hot encoded separately.
data = dataset.drop(['kpr_aktif', 'rata_rata_overdue'], axis=1)

In [5]:
# Pre-processing: one-hot encode the two categorical columns with pandas.
# The dtype is pinned because pandas >= 2.0 emits boolean dummies by default,
# whereas the rest of this notebook displays and scales them as 0/1 integers.
data_kpr = dataset[['kpr_aktif']]
data_rata = dataset[['rata_rata_overdue']]
kpr = pd.get_dummies(data_kpr, dtype='uint8')
rata_rata = pd.get_dummies(data_rata, dtype='uint8')

In [6]:
# Place both one-hot encoded blocks next to each other, column-wise.
dataOlah = pd.concat(objs=[kpr, rata_rata], axis='columns')

In [7]:
# Re-attach the encoded columns to the remaining (non-categorical) columns.
dataHasil = pd.concat(objs=[data, dataOlah], axis='columns')
dataHasil

Unnamed: 0.1,Unnamed: 0,kode_kontrak,pendapatan_setahun_juta,durasi_pinjaman_bulan,jumlah_tanggungan,risk_rating,kpr_aktif_TIDAK,kpr_aktif_YA,rata_rata_overdue_0 - 30 days,rata_rata_overdue_31 - 45 days,rata_rata_overdue_46 - 60 days,rata_rata_overdue_61 - 90 days,rata_rata_overdue_> 90 days
0,1,AGR-000001,295,48,5,4,0,1,0,0,0,1,0
1,2,AGR-000011,271,36,5,4,0,1,0,0,0,1,0
2,3,AGR-000030,159,12,0,1,1,0,1,0,0,0,0
3,4,AGR-000043,210,12,3,3,0,1,0,0,1,0,0
4,5,AGR-000049,165,36,0,2,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,896,AGR-010739,112,48,5,5,0,1,0,0,0,0,1
896,897,AGR-010744,120,48,2,3,0,1,0,0,1,0,0
897,898,AGR-010758,166,24,2,1,1,0,1,0,0,0,0
898,899,AGR-010775,196,48,0,2,1,0,0,1,0,0,0


## Normalisasi Data

In [8]:
# Target column, kept aside before the features are scaled.
datakelas = dataHasil.loc[:, 'risk_rating']
# Feature frame: everything except the contract id and the target.
data = dataHasil.drop(['kode_kontrak', 'risk_rating'], axis=1)
data

Unnamed: 0.1,Unnamed: 0,pendapatan_setahun_juta,durasi_pinjaman_bulan,jumlah_tanggungan,kpr_aktif_TIDAK,kpr_aktif_YA,rata_rata_overdue_0 - 30 days,rata_rata_overdue_31 - 45 days,rata_rata_overdue_46 - 60 days,rata_rata_overdue_61 - 90 days,rata_rata_overdue_> 90 days
0,1,295,48,5,0,1,0,0,0,1,0
1,2,271,36,5,0,1,0,0,0,1,0
2,3,159,12,0,1,0,1,0,0,0,0
3,4,210,12,3,0,1,0,0,1,0,0
4,5,165,36,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
895,896,112,48,5,0,1,0,0,0,0,1
896,897,120,48,2,0,1,0,0,1,0,0
897,898,166,24,2,1,0,1,0,0,0,0
898,899,196,48,0,1,0,0,1,0,0,0


In [9]:
# Min-max scaling: fit the scaler on the feature frame, then transform that
# same frame with the fitted object.
scaler = MinMaxScaler()
model = scaler.fit(data)
scaled_data = model.transform(data)
# Print the scaled feature matrix.
print(scaled_data)

[[0.         0.97826087 1.         ... 0.         1.         0.        ]
 [0.00111235 0.87391304 0.66666667 ... 0.         1.         0.        ]
 [0.00222469 0.38695652 0.         ... 0.         0.         0.        ]
 ...
 [0.99777531 0.4173913  0.33333333 ... 0.         0.         0.        ]
 [0.99888765 0.54782609 1.         ... 0.         0.         0.        ]
 [1.         0.5826087  0.33333333 ... 0.         0.         0.        ]]


In [10]:
# Rebuild a labelled DataFrame from the scaled array, then discard the
# 'Unnamed: 0' column so it is not used as a feature.
namakolom = data.columns.values
label = ['Unnamed: 0']
dataMinMax = pd.DataFrame(scaled_data, columns=namakolom).drop(columns=label)

In [11]:
# Display the min-max scaled features ('Unnamed: 0' has already been dropped).
dataMinMax

Unnamed: 0,pendapatan_setahun_juta,durasi_pinjaman_bulan,jumlah_tanggungan,kpr_aktif_TIDAK,kpr_aktif_YA,rata_rata_overdue_0 - 30 days,rata_rata_overdue_31 - 45 days,rata_rata_overdue_46 - 60 days,rata_rata_overdue_61 - 90 days,rata_rata_overdue_> 90 days
0,0.978261,1.000000,0.833333,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.873913,0.666667,0.833333,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.386957,0.000000,0.000000,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.608696,0.000000,0.500000,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.413043,0.666667,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
895,0.182609,1.000000,0.833333,0.0,1.0,0.0,0.0,0.0,0.0,1.0
896,0.217391,1.000000,0.333333,0.0,1.0,0.0,0.0,1.0,0.0,0.0
897,0.417391,0.333333,0.333333,1.0,0.0,1.0,0.0,0.0,0.0,0.0
898,0.547826,1.000000,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
# Z-score normalisation with StandardScaler (alternative to min-max scaling).
# StandardScaler is imported in the notebook's top import cell.
scaler = StandardScaler()
model = scaler.fit(data)
data_mean = scaler.mean_  # per-column means learned during fit
scale_data = scaler.transform(data)
print(scale_data)

In [None]:
# Wrap the z-scored array in a DataFrame that reuses the feature column names.
dataZScale = pd.DataFrame(data=scale_data, columns=data.columns.values)
dataZScale

In [None]:
# Final modelling table: target first, followed by the min-max scaled features.
datakelas = dataHasil.loc[:, 'risk_rating']
data = pd.concat(objs=[datakelas, dataMinMax], axis='columns')
data

## Split Data menjadi X dan Y

X_train dan X_test

y_test dan y_train

In [20]:
# Share of rows held out for testing.
percent_test_data = 0.3
# Select columns by name rather than by position. The previous positional
# slices (iloc[:, 1:11] / iloc[:, 0]) silently pick the wrong columns whenever
# `data` is not the [risk_rating + 10 scaled features] frame, e.g. when the
# concat cell just above has not been run. A missing column now raises KeyError.
target_column = 'risk_rating'
X = data.drop(columns=[target_column]).values
Y = data[target_column].values

In [23]:
# Hold out `percent_test_data` of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=percent_test_data,
    random_state=0,
)

In [24]:
# Dimensions of the full feature matrix before splitting.
X.shape

(900, 10)

In [25]:
# Dimensions of the training portion of the feature matrix.
X_train.shape

(630, 10)

## Naive Bayes

In [63]:
# Feature scaling: standardise so all variables share a single scale.
# The scaler is fitted on the training split only and then reused for the test
# split. StandardScaler is imported in the notebook's top import cell.
# NOTE(review): this cell overwrites X_train/X_test, so it must run exactly once
# per split -- re-running it refits `sc` on already-standardised data.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [64]:
# Builds a GaussianNB with priors=None; the object is not assigned to a name,
# so this cell only displays the estimator's repr.
GaussianNB(priors=None)

GaussianNB()

In [65]:
# Train a Gaussian Naive Bayes classifier on the training split.
nvklasifikasi = GaussianNB().fit(X_train, y_train)

In [None]:
# Predict a class for every row of the test split and display the predictions.
y_pred = nvklasifikasi.predict(X_test)
y_pred

In [None]:
# Pair each actual label (left column) with its predicted label (right column).
y_compare = np.column_stack((y_test, y_pred))
# Show only the first five pairs.
y_compare[:5]

In [None]:
# Compute the class probabilities behind the test-set predictions.
nvklasifikasi.predict_proba(X_test)

In [None]:
# Accuracy as a percentage. Round to two decimals (not to a whole number) so the
# two-decimal format below reports real precision instead of always "xx.00".
akurasi = round(100 * accuracy_score(y_test, y_pred), 2)
print('Model accuracy score: {0:0.2f}'.format(akurasi))

In [70]:
# Accuracy (in percent) on the training and the test split.
# Rounded to two decimals so the "{:.2f}" output is not truncated to whole numbers.
# NOTE(review): a perfect score on both splits is suspicious. In the rows shown
# earlier every rata_rata_overdue bucket pairs with exactly one risk_rating
# value, so the one-hot overdue columns may leak the target -- confirm before
# trusting these numbers.
akurasi_training = round(100 * nvklasifikasi.score(X_train, y_train), 2)
akurasi_test = round(100 * nvklasifikasi.score(X_test, y_test), 2)
print('Training set score: {:.2f}'.format(akurasi_training))
print('Test set score: {:.2f}'.format(akurasi_test))

Training set score: 100.00
Test set score: 100.00


In [81]:
# Two Gaussian NB models trained on the complete X/Y arrays:
# `model` through a regular fit, `model_pf` through a single partial_fit call
# that is told the full set of class labels.
model = GaussianNB()
model.fit(X, Y)
model_pf = GaussianNB()
model_pf.partial_fit(X, Y, classes=np.unique(Y))

GaussianNB()

In [90]:
# Position used to pull the single prediction out of the arrays returned by
# predict() in the "custom value" cells below.
no_index = 0

In [None]:
# Score one hand-written feature vector with the partial_fit Naive Bayes model.
nb_prediction = model_pf.predict([[0, 0, 0, 0, 0, 1, 1, 0.8, 0.9, 0]])
result_test_naive_bayes = nb_prediction[no_index]
print(f"Customer : Budi Memiliki risk rating {result_test_naive_bayes} Pada metode Gaussian Naive Bayes model")

## KNN

In [93]:
# k-nearest-neighbours with K neighbours: train on the training split, then
# predict the test split.
K = 10
knn = KNeighborsClassifier(n_neighbors=K).fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [None]:
# KNN accuracy as a percentage. Round to two decimals (not to a whole number)
# so the two-decimal format below shows real precision.
skor_akurasi = round(100 * accuracy_score(y_test, y_pred), 2)
print("Model accuracy score : {0:0.2f}".format(skor_akurasi))

In [None]:
# Custom value to predict.
# `knn` was trained on features standardised by `sc`, so a new sample has to go
# through the same transform; feeding it unscaled puts it on a different scale
# from the training data and distorts the neighbour distances.
# NOTE(review): assumes the literal values below are on the min-max scale that
# `sc` was fitted on -- confirm.
budi_knn = sc.transform([[0, 0, 0, 0, 0.5, 1, 1, 0.7, 0.2, 0]])
result_test_knn = knn.predict(budi_knn)
print(f"Customer : Budi Memiliki risk rating {result_test_knn[no_index]} Pada metode KNN model")

## Decision Tree

In [96]:
# Decision tree trained on the training split. The seed is fixed (0, the same
# value used for the train/test split) so that the fitted tree is reproducible
# from one run of the notebook to the next.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)

DecisionTreeClassifier()

In [97]:
# Score the fitted decision tree on the held-out test split.
dt.score(X_test, y_test)

1.0

In [98]:
# Decision-tree predictions for the test split (replaces the earlier y_pred).
y_pred = dt.predict(X_test)

In [99]:
# Decision-tree accuracy as a percentage. Round to two decimals (not to a whole
# number) so the two-decimal format below shows real precision.
akurasi = round(100 * accuracy_score(y_test, y_pred), 2)
print('Model Accuracy Score: {0:0.2f}'.format(akurasi))

Model Accuracy Score: 100.00


In [100]:
# Custom value to predict.
# `dt` was trained on features standardised by `sc`, so the sample is passed
# through the same transform before prediction.
# NOTE(review): assumes the literal values below are on the min-max scale that
# `sc` was fitted on -- confirm.
# The result is stored under its own name instead of overwriting the KNN result.
budi_dt = sc.transform([[0, 0, 0, 0.7, 0.6, 1, 1, 0.3, 0.5, 0]])
result_test_dt = dt.predict(budi_dt)
print(f"Customer : Budi Memiliki risk rating {result_test_dt[no_index]} Pada metode Decision Tree model")

Customer : Budi Memiliki risk rating 1 Pada metode Decision Tree model


# Ensemble Learning

In [102]:
# Bagging ensemble of 10 decision trees, trained on the training split.
# The base estimator is passed positionally: the `base_estimator=` keyword was
# renamed to `estimator` in scikit-learn 1.2 and later removed, while the first
# positional argument works on both old and new releases.
model1 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=10, random_state=0).fit(X_train, y_train)
# NOTE: from here on `dt` holds the ensemble's test-set predictions (as a
# one-column DataFrame), no longer the DecisionTreeClassifier fitted earlier.
dt = model1.predict(X_test)
kolom = ['Decision Tree']
dt = pd.DataFrame(dt, columns=kolom)

In [103]:
# Dimensions of the test portion of the feature matrix.
X_test.shape

(270, 10)

In [104]:
# Bagging ensemble of 10 KNN classifiers (K neighbours each), trained on the
# training split. The base estimator is passed positionally: the
# `base_estimator=` keyword was renamed to `estimator` in scikit-learn 1.2 and
# later removed, while the first positional argument works on all releases.
model2 = BaggingClassifier(KNeighborsClassifier(n_neighbors=K), n_estimators=10, random_state=0).fit(X_train, y_train)
# NOTE: from here on `knn` holds the ensemble's test-set predictions (as a
# one-column DataFrame), no longer the KNeighborsClassifier fitted earlier.
knn = model2.predict(X_test)
kolom = ['KNN']
knn = pd.DataFrame(knn, columns=kolom)

In [106]:
# Bagging ensemble of 10 Gaussian Naive Bayes classifiers, trained on the
# training split. The base estimator is passed positionally: the
# `base_estimator=` keyword was renamed to `estimator` in scikit-learn 1.2 and
# later removed, while the first positional argument works on all releases.
model3 = BaggingClassifier(GaussianNB(), n_estimators=10, random_state=0).fit(X_train, y_train)
nb = model3.predict(X_test)
kolom = ['Naive Bayes']
# One-column DataFrame of the ensemble's test-set predictions.
naive_bayes = pd.DataFrame(nb, columns=kolom)

In [107]:
# Line up the three ensembles' test-set predictions, one column per model.
Hasil = pd.concat(objs=[dt, knn, naive_bayes], axis='columns')
Hasil

Unnamed: 0,Decision Tree,KNN,Naive Bayes
0,2,2,2
1,1,1,1
2,3,3,3
3,2,2,2
4,4,4,4
...,...,...,...
265,2,2,2
266,4,4,4
267,2,2,2
268,5,5,5


In [None]:
def _pct(y_hat):
    """Return the accuracy of `y_hat` against y_test as a whole-number percentage."""
    return round(100 * accuracy_score(y_test, y_hat))


# Whole-percent accuracy of each bagging ensemble on the test split.
bagging_accuracy1 = _pct(naive_bayes)
bagging_accuracy2 = _pct(dt)
bagging_accuracy3 = _pct(knn)
print('The accuracy of this model is Bagging Naive Bayes {} %.'.format(bagging_accuracy1))
print('The accuracy of this model is Bagging Decision Tree {} %.'.format(bagging_accuracy2))
print('The accuracy of this model is Bagging kNN {} %.'.format(bagging_accuracy3))