In [78]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline



In [79]:
# Load the German credit data. The column labels (20 attributes plus the class label
# `Predict`) are supplied through `names`.
# `sep=r'\s+'` splits on any run of whitespace; it is the equivalent of the
# `delim_whitespace=True` flag, which is deprecated in current pandas releases.
df = pd.read_table(
    'german.data.txt',
    sep=r'\s+',
    names=[
        "Checking_account_status", "Month", "Credit_history", "Credit_Purpose",
        "Credit_amount",
        "Savings", "Employment_period", "Installment_rate",
        "Sex_Marital", "other_debtors", "Residence_period",
        "Property", "Age", "OtherInstallment",
        "Housing", "ExistCredits", "Job",
        "Liability", "Phone", "Foreign", "Predict",
    ],
)

In [80]:
# Count how many rows fall into each class of the Predict label.
df["Predict"].value_counts()

1    700
2    300
Name: Predict, dtype: int64

In [81]:
# Class balance: 700 rows are labelled 1 (credit granted, i.e. good credit) and
# 300 rows are labelled 2 (credit not granted, i.e. bad credit).
# The data set is therefore imbalanced: good credit outnumbers bad credit by 400 rows.
#
# Correlation is only defined for numeric columns. Select them explicitly, because
# pandas >= 2.0 raises on the string-coded categorical columns instead of dropping them.
df.select_dtypes(include='number').corr(method='pearson')

Unnamed: 0,Month,Credit_amount,Installment_rate,Residence_period,Age,ExistCredits,Liability,Predict
Month,1.0,0.624984,0.074749,0.034067,-0.036136,-0.011284,-0.023834,0.214927
Credit_amount,0.624984,1.0,-0.271316,0.028926,0.032716,0.020795,0.017142,0.154739
Installment_rate,0.074749,-0.271316,1.0,0.049302,0.058266,0.021669,-0.071207,0.072404
Residence_period,0.034067,0.028926,0.049302,1.0,0.266419,0.089625,0.042643,0.002967
Age,-0.036136,0.032716,0.058266,0.266419,1.0,0.149254,0.118201,-0.091127
ExistCredits,-0.011284,0.020795,0.021669,0.089625,0.149254,1.0,0.109667,-0.045732
Liability,-0.023834,0.017142,-0.071207,0.042643,0.118201,0.109667,1.0,-0.003015
Predict,0.214927,0.154739,0.072404,0.002967,-0.091127,-0.045732,-0.003015,1.0


In [82]:
# Kendall rank correlation of the numeric columns only (the string-coded categorical
# columns cannot be correlated and make pandas >= 2.0 raise if left in).
df.select_dtypes(include='number').corr(method='kendall')

Unnamed: 0,Month,Credit_amount,Installment_rate,Residence_period,Age,ExistCredits,Liability,Predict
Month,1.0,0.465738,0.093522,0.034895,-0.025186,0.023775,-0.037041,0.176092
Credit_amount,0.465738,1.0,-0.238537,0.018146,0.017308,0.018866,0.034161,0.071145
Installment_rate,0.093522,-0.238537,1.0,0.04101,0.055433,0.019119,-0.067282,0.068345
Residence_period,0.034895,0.018146,0.04101,1.0,0.185289,0.061913,0.037468,0.002365
Age,-0.025186,0.017308,0.055433,0.185289,1.0,0.116335,0.158022,-0.09299
ExistCredits,0.023775,0.018866,0.019119,0.061913,0.116335,1.0,0.095509,-0.046599
Liability,-0.037041,0.034161,-0.067282,0.037468,0.158022,0.095509,1.0,-0.003015
Predict,0.176092,0.071145,0.068345,0.002365,-0.09299,-0.046599,-0.003015,1.0


In [83]:
# Spearman rank correlation of the numeric columns only (the string-coded categorical
# columns cannot be correlated and make pandas >= 2.0 raise if left in).
df.select_dtypes(include='number').corr(method='spearman')

Unnamed: 0,Month,Credit_amount,Installment_rate,Residence_period,Age,ExistCredits,Liability,Predict
Month,1.0,0.624709,0.118046,0.043671,-0.036316,0.02843,-0.043265,0.205685
Credit_amount,0.624709,1.0,-0.3131,0.023646,0.026298,0.023778,0.041815,0.087083
Installment_rate,0.118046,-0.3131,1.0,0.047507,0.072157,0.020997,-0.072543,0.07369
Residence_period,0.043671,0.023646,0.047507,1.0,0.234709,0.067193,0.040455,0.002553
Age,-0.036316,0.026298,0.072157,0.234709,1.0,0.141287,0.190651,-0.112191
ExistCredits,0.02843,0.023778,0.020997,0.067193,0.141287,1.0,0.096978,-0.047316
Liability,-0.043265,0.041815,-0.072543,0.040455,0.190651,0.096978,1.0,-0.003015
Predict,0.205685,0.087083,0.07369,0.002553,-0.112191,-0.047316,-0.003015,1.0


In [84]:
# None of the numeric pairs is strongly correlated under any of the three methods.
# The largest coefficient is between Month and Credit_amount (about 0.62 for Pearson and
# Spearman, about 0.47 for Kendall), which is a moderate correlation at most.
# Next, look at the summary statistics of the numeric columns.
df.describe()

Unnamed: 0,Month,Credit_amount,Installment_rate,Residence_period,Age,ExistCredits,Liability,Predict
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.903,3271.258,2.973,2.845,35.546,1.407,1.155,1.3
std,12.058814,2822.736876,1.118715,1.103718,11.375469,0.577654,0.362086,0.458487
min,4.0,250.0,1.0,1.0,19.0,1.0,1.0,1.0
25%,12.0,1365.5,2.0,2.0,27.0,1.0,1.0,1.0
50%,18.0,2319.5,3.0,3.0,33.0,1.0,1.0,1.0
75%,24.0,3972.25,4.0,4.0,42.0,2.0,1.0,2.0
max,72.0,18424.0,4.0,4.0,75.0,4.0,2.0,2.0


In [85]:
# Predict is a class label coded as 1/2, so its mean and standard deviation are not meaningful.
# describe() reports count = 1000 for every numeric column (Predict included), which matches
# the 1000 rows of the data set, so none of the numeric columns has missing values.
# Only the categorical (string-coded) columns still need to be checked for missing values.
df.head(5)

Unnamed: 0,Checking_account_status,Month,Credit_history,Credit_Purpose,Credit_amount,Savings,Employment_period,Installment_rate,Sex_Marital,other_debtors,...,Property,Age,OtherInstallment,Housing,ExistCredits,Job,Liability,Phone,Foreign,Predict
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [86]:
def check_missing_values(data_file, column_name):
    """Print whether one column of a DataFrame contains any missing values.

    Parameters
    ----------
    data_file : pandas.DataFrame
        Frame that holds the column to inspect.
    column_name : str
        Label of the column to check.

    Returns
    -------
    None
        The result is reported with ``print`` only.
    """
    # Series.any() already yields a boolean, so no comparison against True is needed.
    if data_file[column_name].isnull().any():
        print(column_name,' is missing values')
    else:
        print(column_name,' is not missing any values')
        
# Categorical (string-coded) columns to inspect; the Predict label is left out.
X = [
    "Checking_account_status",
    "Credit_history",
    "Credit_Purpose",
    "Savings",
    "Employment_period",
    "Sex_Marital",
    "other_debtors",
    "Property",
    "OtherInstallment",
    "Housing",
    "Job",
    "Phone",
    "Foreign",
]

# Report the missing-value status of every categorical column in turn.
for column_name in X:
    check_missing_values(df, column_name)

Checking_account_status  is not missing any values
Credit_history  is not missing any values
Credit_Purpose  is not missing any values
Savings  is not missing any values
Employment_period  is not missing any values
Sex_Marital  is not missing any values
other_debtors  is not missing any values
Property  is not missing any values
OtherInstallment  is not missing any values
Housing  is not missing any values
Job  is not missing any values
Phone  is not missing any values
Foreign  is not missing any values


In [178]:
# Now that we know the data has no missing values, let's build a binary classification model
# and train and test a random forest classifier on it.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report, accuracy_score, confusion_matrix

In [179]:
# One-hot encode the frame so that the classifier only sees numeric inputs: every
# string-coded categorical column is replaced by one indicator column per category,
# set for the category the row belongs to and unset for all others.
german_one_hot_encode = pd.get_dummies(df)
# The numeric columns are carried over as they are. The column count grows from 21 to 62,
# so the matrix is considerably wider than the original frame.
german_one_hot_encode.head(5)

Unnamed: 0,Month,Credit_amount,Installment_rate,Residence_period,Age,ExistCredits,Liability,Predict,Checking_account_status_A11,Checking_account_status_A12,...,Housing_A152,Housing_A153,Job_A171,Job_A172,Job_A173,Job_A174,Phone_A191,Phone_A192,Foreign_A201,Foreign_A202
0,6,1169,4,4,67,2,1,1,1,0,...,1,0,0,0,1,0,0,1,1,0
1,48,5951,2,2,22,1,1,2,0,1,...,1,0,0,0,1,0,1,0,1,0
2,12,2096,2,3,49,1,2,1,0,0,...,1,0,0,1,0,0,1,0,1,0
3,42,7882,2,4,45,1,2,1,1,0,...,0,1,0,0,1,0,1,0,1,0
4,24,4870,3,4,53,2,2,2,1,0,...,0,1,0,0,1,0,1,0,1,0


In [180]:
# Separate the class label (target) from the feature matrix (data).
target = german_one_hot_encode['Predict']
data = german_one_hot_encode.drop(labels=['Predict'], axis='columns')

In [181]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# random_state is the seed of the random shuffle applied before the split: it does not
# control "how random" the split is, it makes the same split come out on every run.
# The same seed is kept throughout so that the results are reproducible.
data_train, data_test, target_train, target_test = train_test_split(data, target,test_size = .3, random_state = 1)

In [182]:
# After splitting, only the training data is oversampled; the test data is left as it is
# because it has to reflect the true class distribution.
# Oversampling adds rows for the minority class (bad credit) so that the classifier is not
# biased towards the majority class. It also shifts the focus towards finding bad-credit
# customers, which matters more to the bank than confirming the good ones.
# First, check how the rows were divided between the training and the test split.
data_train.shape, data_test.shape, target_train.shape, target_test.shape

((700, 61), (300, 61), (700,), (300,))

In [183]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class only, with a fixed seed for reproducibility.
# `sampling_strategy` is the current name of the former `ratio` argument, and the former
# `kind='regular'` variant is simply what the SMOTE class implements; both old keywords
# were removed from imbalanced-learn and make the constructor fail on current releases.
sm = SMOTE(sampling_strategy='minority', random_state=1)

In [184]:
# Resample the training split only. `fit_resample` is the current name of the former
# `fit_sample` method, which no longer exists in current imbalanced-learn releases.
data_train_oversample, target_train_oversample = sm.fit_resample(data_train, target_train)
# Check the size of the oversampled training data.
data_train_oversample.shape, target_train_oversample.shape

((972, 61), (972,))

In [185]:
# Build the random forest and fit it on the oversampled training data.
# max_features="sqrt" is exactly what "auto" meant for classifiers; "auto" was deprecated
# in scikit-learn 1.1 and removed in 1.3, so the explicit value keeps this cell runnable
# on old and new releases alike without changing the model.
rf = RandomForestClassifier(n_estimators = 2000, n_jobs = -1, random_state = 12, max_features = "sqrt", min_samples_leaf = 100)
rf.fit(data_train_oversample,target_train_oversample)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=100, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=-1,
            oob_score=False, random_state=12, verbose=0, warm_start=False)

In [186]:
# Predict on the untouched test split and measure the share of correct predictions.
target_predict = rf.predict(data_test)
accuracy_score(target_test, target_predict)
# The accuracy on the test data is 76.0% in the recorded run.

0.76000000000000001

In [187]:
# Accuracy on the oversampled training data, for comparison with the test accuracy.
accuracy_score(target_train_oversample, rf.predict(data_train_oversample))
# As expected, it is higher than the accuracy on the test data.

0.80658436213991769

In [188]:
# Rows are the true classes and columns are the predicted classes.
# `labels=[1, 2]` pins the order to good credit (1) first and bad credit (2) second, so the
# hand-written captions below are guaranteed to line up with the matrix layout.
pd.DataFrame(
    confusion_matrix(target_test, target_predict, labels=[1, 2]),
    index=['Truely good customer', 'Truely bad customer'],
    columns=['Good predicted customer', 'Bad predicted customer'],
)

Unnamed: 0,Good predicted customer,Bad predicted customer
Truely good customer,179,35
Truely bad customer,37,49
