## Import libraries

In [177]:
import pandas as pd 
import sklearn


## Read data

In [178]:
# Load the credit dataset; the first CSV column is used as the row index.
data = pd.read_csv("german_credit_data.csv", index_col=0)

In [179]:
# Preview the frame exactly as loaded, before any preprocessing.
data

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,,1736,12,furniture/equipment,good
996,40,male,3,own,little,little,3857,30,car,good
997,38,male,2,own,little,,804,12,radio/TV,good
998,23,male,2,free,little,little,1845,45,radio/TV,bad


In [180]:
# Dtypes and non-null counts per column; reveals which columns contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 85.9+ KB


In [181]:
# Summary statistics for every column: include='all' covers the object
# (categorical) columns as well as the numeric ones.
data.describe(include='all')

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
count,1000.0,1000,1000.0,1000,817,606,1000.0,1000.0,1000,1000
unique,,2,,3,4,3,,,8,2
top,,male,,own,little,little,,,car,good
freq,,690,,713,603,274,,,337,700
mean,35.546,,1.904,,,,3271.258,20.903,,
std,11.375469,,0.653614,,,,2822.736876,12.058814,,
min,19.0,,0.0,,,,250.0,4.0,,
25%,27.0,,2.0,,,,1365.5,12.0,,
50%,33.0,,2.0,,,,2319.5,18.0,,
75%,42.0,,2.0,,,,3972.25,24.0,,


## Preprocessing

In [182]:
def preprocess_missing_value(data):
    """Fill missing account categories with each column's most frequent value.

    Mutates ``data`` in place and returns ``None``. The columns
    'Saving accounts' and 'Checking account' are each imputed with their own
    mode. A column without any non-null value has no mode and is left as is.

    Args:
        data: DataFrame containing the 'Saving accounts' and
            'Checking account' columns.
    """
    for column in ('Saving accounts', 'Checking account'):
        modes = data[column].mode()
        if modes.empty:
            # Entirely-missing column: there is nothing to impute with.
            continue
        # Assign the filled column back rather than calling
        # fillna(inplace=True) on the data[column] selection: that selection
        # may be a temporary copy, in which case the fill never reaches `data`.
        data[column] = data[column].fillna(modes.iloc[0])

def _encode_categories(column, codes, default, missing=None):
    """Translate a column of category labels into integer codes.

    Args:
        column: Series of raw labels.
        codes: Mapping from known label to its integer code.
        default: Code for any non-missing label that is absent from ``codes``.
        missing: Code for missing (NaN/None) entries. When ``None``, missing
            entries receive ``default`` like any other unknown label.

    Returns:
        A new Series of codes aligned with ``column``.
    """
    def to_code(label):
        if missing is not None and pd.isna(label):
            return missing
        return codes.get(label, default)

    return column.apply(to_code)


def preprocessing(data):
    """Encode the categorical columns of ``data`` as integers, in place.

    Encodings (anything not listed falls through to the last code):
        Sex:              'male' -> 1, otherwise 0
        Risk:             'good' -> 1, otherwise 0
        Housing:          'free' -> 0, 'own' -> 1, otherwise 2
        Saving accounts / Checking account:
                          missing -> 0, 'little' -> 1, 'moderate' -> 2,
                          'quite rich' -> 3, otherwise 4
        Purpose:          'car' -> 5, 'radio/TV' -> 4,
                          'furniture/equipment' -> 3, 'business' -> 2,
                          'education' -> 1, otherwise 0

    Missing account values are detected with ``pd.isna``; comparing against
    the string 'nan' never matches a real NaN, which would otherwise be
    encoded as 4.

    Not idempotent: the lookups expect the raw string labels, so calling this
    a second time on already-encoded data sends every value to its fallback
    code. Call it once per freshly loaded frame.

    Args:
        data: DataFrame holding the six columns listed above.

    Returns:
        None. ``data`` is modified in place.
    """
    account_codes = {'little': 1, 'moderate': 2, 'quite rich': 3}
    purpose_codes = {
        'car': 5,
        'radio/TV': 4,
        'furniture/equipment': 3,
        'business': 2,
        'education': 1,
    }

    data['Sex'] = _encode_categories(data['Sex'], {'male': 1}, default=0)
    data['Risk'] = _encode_categories(data['Risk'], {'good': 1}, default=0)
    data['Housing'] = _encode_categories(data['Housing'], {'free': 0, 'own': 1}, default=2)
    for account_column in ('Saving accounts', 'Checking account'):
        data[account_column] = _encode_categories(
            data[account_column], account_codes, default=4, missing=0
        )
    data['Purpose'] = _encode_categories(data['Purpose'], purpose_codes, default=0)


In [183]:
# Both helpers mutate `data` in place. Run this cell exactly once per load:
# preprocessing() compares values against the raw string labels, so a second
# run on the already-encoded integers sends every value down its fallback branch.
preprocess_missing_value(data)
preprocessing(data)

In [184]:
# Report the share of missing values in each column as a percentage.
for col in data.columns:
    missing_data = data[col].isna().sum()
    missing_percent = missing_data / len(data) * 100
    print(f'Missing data of {col} is {missing_percent}%')

Missing data of Age is 0.0"%
Missing data of Sex is 0.0"%
Missing data of Job is 0.0"%
Missing data of Housing is 0.0"%
Missing data of Saving accounts is 0.0"%
Missing data of Checking account is 0.0"%
Missing data of Credit amount is 0.0"%
Missing data of Duration is 0.0"%
Missing data of Purpose is 0.0"%
Missing data of Risk is 0.0"%


In [185]:
# Inspect the frame after imputation and encoding; all columns should now be numeric.
data

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,1,2,1,1,1,1169,6,4,1
1,22,0,2,1,1,2,5951,48,4,0
2,49,1,1,1,1,1,2096,12,1,1
3,45,1,2,0,1,1,7882,42,3,1
4,53,1,2,0,1,1,4870,24,5,0
...,...,...,...,...,...,...,...,...,...,...
995,31,0,1,1,1,1,1736,12,3,1
996,40,1,3,1,1,1,3857,30,5,1
997,38,1,2,1,1,1,804,12,4,1
998,23,1,2,0,1,1,1845,45,4,0


## Split data

In [186]:
# Separate features from the label by column name rather than by position,
# so the label cannot leak into X if the column order of `data` ever changes.
X = data.drop(columns=['Risk'])
y = data['Risk']

In [187]:
# Feature matrix: every column except the Risk label.
X

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,67,1,2,1,1,1,1169,6,4
1,22,0,2,1,1,2,5951,48,4
2,49,1,1,1,1,1,2096,12,1
3,45,1,2,0,1,1,7882,42,3
4,53,1,2,0,1,1,4870,24,5
...,...,...,...,...,...,...,...,...,...
995,31,0,1,1,1,1,1736,12,3
996,40,1,3,1,1,1,3857,30,5
997,38,1,2,1,1,1,804,12,4
998,23,1,2,0,1,1,1845,45,4


In [188]:
# Target vector: the encoded Risk column (1 = 'good', 0 = anything else).
y

0      1
1      0
2      1
3      1
4      0
      ..
995    1
996    1
997    1
998    0
999    1
Name: Risk, Length: 1000, dtype: int64

In [189]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the classes are imbalanced (700 of 1000 rows are 'good');
# consider stratify=y so both splits keep that ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [190]:
# Sanity check on the training split size (rows, features).
X_train.shape

(800, 9)

## Decision Tree Model

In [191]:
from sklearn.tree import DecisionTreeClassifier

# Baseline tree trained on the original training split.
# A fixed random_state is required for reproducibility: the classifier
# permutes features randomly at each split, so an unseeded tree (and the
# metrics reported below) can differ from one run to the next.
d_tree = DecisionTreeClassifier(random_state=42)
d_tree.fit(X_train, y_train)

## Predict

In [192]:
# Predict on the held-out test split with the baseline tree.
y_predict = d_tree.predict(X_test)

## Model Evaluation

In [193]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Evaluate the baseline tree on the test split.
# Confusion-matrix rows are the true classes and columns the predicted ones,
# ordered 0 then 1 (Risk was encoded as 1 = 'good', 0 otherwise).
print("Confusion Matrix: \n", confusion_matrix(y_test, y_predict), "\n")

print("Classification report according to Test prediction: \n", classification_report(y_test, y_predict))

print(f"Accuracy of Decision Tree: {100 * accuracy_score(y_test, y_predict):.2f} %")

Confussion Matrix: 
 [[ 22  37]
 [ 29 112]] 

Classification report according to Test prediction: 
               precision    recall  f1-score   support

           0       0.43      0.37      0.40        59
           1       0.75      0.79      0.77       141

    accuracy                           0.67       200
   macro avg       0.59      0.58      0.59       200
weighted avg       0.66      0.67      0.66       200

Accuracy of Decision Tree: 67.00 %


## With SMOTE

In [194]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split stays untouched so the
# evaluation still reflects the original class distribution.
# NOTE(review): SMOTE builds synthetic rows by interpolating between
# neighbours, so the integer-coded categorical columns may receive
# non-integer values. SMOTENC may be a better fit; confirm.
X_train_smote, y_train_smote = SMOTE(random_state=42).fit_resample(
    X_train,
    y_train,
)


In [195]:
# Training-set size after oversampling (rows, features).
X_train_smote.shape

(1118, 9)

In [196]:
# Same classifier as the baseline, trained on the SMOTE-resampled split.
# The fixed random_state keeps the fitted tree reproducible, so differences
# from the baseline come from the resampling rather than from tree randomness.
decision_tree_smote_model = DecisionTreeClassifier(random_state=42)
decision_tree_smote_model.fit(X_train_smote, y_train_smote)

In [197]:
# Predict on the same (non-resampled) test split with the SMOTE-trained tree.
y_predict_with_smote = decision_tree_smote_model.predict(X_test)

In [198]:
# Evaluate the SMOTE-trained tree on the same test split as the baseline.
# The accuracy label names the SMOTE model so the two result blocks can be
# told apart when read side by side.
print("Confusion Matrix: \n", confusion_matrix(y_test, y_predict_with_smote), "\n")

print("Classification report according to Test prediction: \n", classification_report(y_test, y_predict_with_smote))

print(f"Accuracy of Decision Tree with SMOTE: {100 * accuracy_score(y_test, y_predict_with_smote):.2f} %")

Confussion Matrix: 
 [[25 34]
 [42 99]] 

Classification report according to Test prediction: 
               precision    recall  f1-score   support

           0       0.37      0.42      0.40        59
           1       0.74      0.70      0.72       141

    accuracy                           0.62       200
   macro avg       0.56      0.56      0.56       200
weighted avg       0.63      0.62      0.63       200

Accuracy of Decision Tree: 62.00 %
