In [1]:
# importing libraries
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('german_credit_data.csv')

## Data Exploration

In [3]:
data.isnull().sum()

Unnamed: 0            0
Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     183
Checking account    394
Credit amount         0
Duration              0
Purpose               0
Risk                  0
dtype: int64

In [4]:
data['Saving accounts'].value_counts()

little        603
moderate      103
quite rich     63
rich           48
Name: Saving accounts, dtype: int64

In [5]:
data['Checking account'].value_counts()

little      274
moderate    269
rich         63
Name: Checking account, dtype: int64

From the Analysis(refer-- German_Credit_Data_Analysis.ipynb), its clear that Saving accounts and Checking account category "little" is in the heigher side of the default ratio, also "little" is the MODE value for both the features. In order to impute the missing value of both the features if we use the MODE value, in my opinion data will become a little bias towards "little". To eliminate that situation we will replace the missing value with new category "unknown". second reason to use new category is that, no of missing value for Checking account is 394 & for Saving accounts it is 183, if we compare these value to there respective features categories we found that both are in the higher side. So, its better to give them a new category instead of using the MODE value for them.

## Missing value treatment

In [6]:
data['Saving accounts'].fillna("unknown", inplace=True)

In [7]:
data['Checking account'].fillna("unknown", inplace=True)

In [8]:
data.isnull().sum()

Unnamed: 0          0
Age                 0
Sex                 0
Job                 0
Housing             0
Saving accounts     0
Checking account    0
Credit amount       0
Duration            0
Purpose             0
Risk                0
dtype: int64

In [9]:
# Target Variable
data['Risk'].value_counts()

good    700
bad     300
Name: Risk, dtype: int64

It's clear that Data is imblanced. we will create model with both balanced and imblanced data.

## Feature Engineering 

In [10]:
data['Age group'] = pd.qcut(data.Age, q=4)

In [11]:
# Categorical to Numerical Conversion

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
cat_var = ['Sex','Job','Housing','Saving accounts','Checking account', 'Purpose','Risk','Age group']

In [14]:
le = LabelEncoder()

In [15]:
for i in cat_var:
    data[i] = le.fit_transform(data[i])
data.dtypes

Unnamed: 0          int64
Age                 int64
Sex                 int64
Job                 int64
Housing             int64
Saving accounts     int64
Checking account    int64
Credit amount       int64
Duration            int64
Purpose             int64
Risk                int64
Age group           int64
dtype: object

In [16]:
# Creating separate data set for independent and target variable

In [17]:
# Independent Variable
X = data.loc[: , ['Sex','Job','Housing','Saving accounts','Checking account','Credit amount','Duration','Purpose','Age group']]
# Target Variable
Y = data.loc[: , ['Risk']]

In [18]:
# One Hot Encoding
X = pd.get_dummies(X, columns=['Sex','Housing','Purpose'])

## Feature Scaling

In [19]:
from sklearn.preprocessing import StandardScaler

In [20]:
scaler = StandardScaler()
scaler.fit_transform(X)

array([[ 0.14694918,  1.83316907, -1.25456565, ...,  1.60356745,
        -0.14998296, -0.11020775],
       [ 0.14694918, -0.69970702, -0.45902624, ...,  1.60356745,
        -0.14998296, -0.11020775],
       [-1.38377145, -0.69970702,  1.13205258, ..., -0.62360956,
        -0.14998296, -0.11020775],
       ..., 
       [ 0.14694918, -0.69970702,  1.13205258, ...,  1.60356745,
        -0.14998296, -0.11020775],
       [ 0.14694918, -0.69970702, -1.25456565, ...,  1.60356745,
        -0.14998296, -0.11020775],
       [ 0.14694918, -0.066488  , -0.45902624, ..., -0.62360956,
        -0.14998296, -0.11020775]])

In [21]:
# Creating Train - Test set
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(X, Y,test_size = .2,random_state=12)

## Feature Selection

In [22]:
# Feature ranking with recursive feature elimination and cross-validated selection of the best number of features
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

In [23]:
rf = RandomForestClassifier(criterion='entropy', n_estimators=120, min_samples_leaf=5, n_jobs=-1)

In [24]:
rfecv = RFECV(estimator=rf, step=1, cv=10, scoring='roc_auc', verbose=2)
rfecv.fit(X,Y.values.ravel())
print("Optimal number of features : %d" % rfecv.n_features_)

Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.


In [25]:
print (X.columns[rfecv.support_])

Index(['Job', 'Saving accounts', 'Checking account', 'Credit amount',
       'Duration', 'Age group'],
      dtype='object')


With the help of RFECV, now we know that out of 19(after one hot encoding), only 6 columns are important. Keeping that in mind we will create two models one with having all the features and one with the choosen important features. to see the difference.

In [28]:
col_name = ['Job', 'Saving accounts', 'Checking account', 'Credit amount',
       'Duration', 'Age group', 'Sex_0', 'Sex_1', 'Housing_0', 'Housing_1',
       'Housing_2', 'Purpose_0', 'Purpose_1', 'Purpose_2', 'Purpose_3',
       'Purpose_4', 'Purpose_5', 'Purpose_6', 'Purpose_7']

In [29]:
# Create a random forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)

# Train the classifier
clf.fit(X, Y.values.ravel())

# Print the name and gini importance of each feature
for feature in zip(col_name, clf.feature_importances_):
    print(feature)

('Job', 0.063090420812146886)
('Saving accounts', 0.072940218642288016)
('Checking account', 0.14426221755735702)
('Credit amount', 0.27203591247634601)
('Duration', 0.16392262141705413)
('Age group', 0.080647657948974077)
('Sex_0', 0.017790552380677144)
('Sex_1', 0.01813360638321003)
('Housing_0', 0.01268416346599552)
('Housing_1', 0.02139199710004561)
('Housing_2', 0.015676401468787217)
('Purpose_0', 0.018182992295784371)
('Purpose_1', 0.027183299757405539)
('Purpose_2', 0.0036470970806435398)
('Purpose_3', 0.012180996974014001)
('Purpose_4', 0.019820967279144654)
('Purpose_5', 0.023764590234038315)
('Purpose_6', 0.0092292266335041364)
('Purpose_7', 0.0034150600925838366)


In [30]:
# importing important librariesa for model building
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost 
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import recall_score,precision_score,confusion_matrix,accuracy_score,f1_score

In [31]:
# important variable as per RFECV
prediction_var = ['Job', 'Saving accounts', 'Checking account', 'Credit amount','Duration', 'Age group']

# Model building with Imbalanced Dataset

## Model 1a Logistic Regression with all the features

In [32]:
model_1a= LogisticRegression()
model_1a.fit(x_train, y_train.values.ravel())
scores_1a = cross_val_score(model_1a, x_train, y_train.values.ravel(), cv=5, scoring='f1')
y_pred_1a = model_1a.predict(x_val)

In [33]:
acc_1a = accuracy_score(y_val,y_pred_1a)
p_s_1a = precision_score(y_val,y_pred_1a)
r_s_1a = recall_score(y_val,y_pred_1a)
cm_1a = confusion_matrix(y_val,y_pred_1a)
f1_1a = f1_score(y_val, y_pred_1a)

In [34]:
print("Accuracy Score for model_1a :", acc_1a)
print("Precision Score for model_1a :", p_s_1a)
print("Recall Score for model_1a :", r_s_1a)
print("F1 Score for model_1a :", f1_1a)
print("Confusion matrix for model_1a :", cm_1a)

Accuracy Score for model_1a : 0.73
Precision Score for model_1a : 0.728915662651
Recall Score for model_1a : 0.930769230769
F1 Score for model_1a : 0.817567567568
Confusion matrix for model_1a : [[ 25  45]
 [  9 121]]


## Model 1b Logistic Regression with imp features

In [35]:
model_1b= LogisticRegression()
model_1b.fit(x_train[prediction_var], y_train.values.ravel())
scores_1b = cross_val_score(model_1b, x_train[prediction_var], y_train.values.ravel(), cv=5, scoring='f1')
y_pred_1b = model_1b.predict(x_val[prediction_var])

In [36]:
acc_1b = accuracy_score(y_val,y_pred_1b)
p_s_1b = precision_score(y_val,y_pred_1b)
r_s_1b = recall_score(y_val,y_pred_1b)
cm_1b = confusion_matrix(y_val,y_pred_1b)
f1_1b = f1_score(y_val, y_pred_1b)

In [37]:
print("Accuracy Score for model_1b :", acc_1b)
print("Precision Score for model_1b :", p_s_1b)
print("Recall Score for model_1b :", r_s_1b)
print("F1 Score for model_1b :", f1_1b)
print("Confusion matrix for model_1b :", cm_1b)

Accuracy Score for model_1b : 0.705
Precision Score for model_1b : 0.717791411043
Recall Score for model_1b : 0.9
F1 Score for model_1b : 0.798634812287
Confusion matrix for model_1b : [[ 24  46]
 [ 13 117]]


Model with only imp features shows some decresase value in every field. Let's go with other algorithms also then we have more clear picture with us.

# Hyperparameter Tuning for Random Forest

In [38]:
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [200,400,600,800,1000]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [5,10,15,20,30,50]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [5, 10, 15, 20, 30, 50, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000]}


In [39]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, 
                               verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   57.5s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  6.0min finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, 30, 50, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [40]:
rf_random.best_params_

{'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': True}

## Model 2a Random Forest with all the features

In [41]:
# Using the best parameter as per result

In [42]:
model_2a = RandomForestClassifier(n_estimators=600,bootstrap= True,min_samples_leaf=4,min_samples_split= 2,
                               max_features= 'sqrt',max_depth= None,random_state=42, n_jobs = -1)
model_2a.fit(x_train, y_train.values.ravel())
scores_2a = cross_val_score(model_2a, x_train, y_train.values.ravel(), cv=5, scoring='f1')
y_pred_2a = model_2a.predict(x_val)

In [43]:
acc_2a = accuracy_score(y_val,y_pred_2a)
p_s_2a = precision_score(y_val,y_pred_2a)
r_s_2a = recall_score(y_val,y_pred_2a)
cm_2a = confusion_matrix(y_val,y_pred_2a)
f1_2a = f1_score(y_val, y_pred_2a)

In [44]:
print("Accuracy Score for model_2a :", acc_2a)
print("Precision Score for model_2a :", p_s_2a)
print("Recall Score for model_2a :", r_s_2a)
print("F1 Score for model_2a :", f1_2a)
print("Confusion matrix for model_2a :", cm_2a)

Accuracy Score for model_2a : 0.715
Precision Score for model_2a : 0.71098265896
Recall Score for model_2a : 0.946153846154
F1 Score for model_2a : 0.811881188119
Confusion matrix for model_2a : [[ 20  50]
 [  7 123]]


## Model 2b Random Forest with important features

In [46]:
model_2b = RandomForestClassifier(n_estimators=600,bootstrap= True,min_samples_leaf=4,min_samples_split= 2,
                               max_features= 'sqrt',max_depth= None,random_state=42, n_jobs = -1)
model_2b.fit(x_train[prediction_var], y_train.values.ravel())
scores_2b = cross_val_score(model_2b, x_train[prediction_var], y_train.values.ravel(), cv=5, scoring='f1')
y_pred_2b = model_2b.predict(x_val[prediction_var])

In [47]:
acc_2b = accuracy_score(y_val,y_pred_2b)
p_s_2b = precision_score(y_val,y_pred_2b)
r_s_2b = recall_score(y_val,y_pred_2b)
cm_2b = confusion_matrix(y_val,y_pred_2b)
f1_2b = f1_score(y_val, y_pred_2b)

In [48]:
print("Accuracy Score for model_2b :", acc_2b)
print("Precision Score for model_2b :", p_s_2b)
print("Recall Score for model_2b :", r_s_2b)
print("F1 Score for model_2b :", f1_2b)
print("Confusion matrix for model_2b :", cm_2b)

Accuracy Score for model_2b : 0.745
Precision Score for model_2b : 0.742331288344
Recall Score for model_2b : 0.930769230769
F1 Score for model_2b : 0.825938566553
Confusion matrix for model_2b : [[ 28  42]
 [  9 121]]


There is a increase in every field for Random forest model with only important features

# Hyperparameter Tuning for XGB Classifier

In [49]:
# Number of trees in XGB Classifier
n_estimators = [200,400,600,800,1000]
# learning rate
learning_rate = [0.1,0.01,0.001,0.2,0.02,0.002]
# Maximum number of depth in tree
max_depth = [3,4,5,7,8,10]
# Minimum number of samples required to split a node
min_child_weight = [2, 5, 10]
# Minimum number of subsamples
subsample = [0.8,0.9,1]
# Minimum number of colsamples_bytrees
colsample_bytree = [0.3,0.4,0.6,0.8]
# regularization parameter
gamma = [0,1,5]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               'max_depth': max_depth,
               'min_child_weight': min_child_weight,
               'subsample': subsample,
              'colsample_bytree': colsample_bytree,
              'gamma': gamma}
pprint(random_grid)

{'colsample_bytree': [0.3, 0.4, 0.6, 0.8],
 'gamma': [0, 1, 5],
 'learning_rate': [0.1, 0.01, 0.001, 0.2, 0.02, 0.002],
 'max_depth': [3, 4, 5, 7, 8, 10],
 'min_child_weight': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000],
 'subsample': [0.8, 0.9, 1]}


In [50]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
xgb = XGBClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
xgb_random = RandomizedSearchCV(estimator = xgb, param_distributions = random_grid, n_iter = 100, cv = 3, 
                                verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
xgb_random.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   25.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.0min finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000], 'learning_rate': [0.1, 0.01, 0.001, 0.2, 0.02, 0.002], 'max_depth': [3, 4, 5, 7, 8, 10], 'min_child_weight': [2, 5, 10], 'subsample': [0.8, 0.9, 1], 'colsample_bytree': [0.3, 0.4, 0.6, 0.8], 'gamma': [0, 1, 5]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [51]:
xgb_random.best_params_

{'subsample': 1,
 'n_estimators': 200,
 'min_child_weight': 2,
 'max_depth': 10,
 'learning_rate': 0.01,
 'gamma': 1,
 'colsample_bytree': 0.6}

## Model 3a XGB Classifier with all the features

In [52]:
model_3a = XGBClassifier( learning_rate =0.01, n_estimators=200, max_depth=10,min_child_weight=2, gamma=1, 
                    subsample=1, colsample_bytree=0.6,seed=27)
model_3a.fit(x_train, y_train.values.ravel())
scores_3a = cross_val_score(model_3a, x_train, y_train.values.ravel(), cv=5, scoring='f1')
y_pred_3a = model_3a.predict(x_val)

In [53]:
acc_3a = accuracy_score(y_val,y_pred_3a)
p_s_3a = precision_score(y_val,y_pred_3a)
r_s_3a = recall_score(y_val,y_pred_3a)
cm_3a = confusion_matrix(y_val,y_pred_3a)
f1_3a = f1_score(y_val, y_pred_3a)

In [54]:
print("Accuracy Score for model_3a :", acc_3a)
print("Precision Score for model_3a :", p_s_3a)
print("Recall Score for model_3a :", r_s_3a)
print("F1 Score for model_3a :", f1_3a)
print("Confusion matrix for model_3a :", cm_3a)

Accuracy Score for model_3a : 0.72
Precision Score for model_3a : 0.717647058824
Recall Score for model_3a : 0.938461538462
F1 Score for model_3a : 0.813333333333
Confusion matrix for model_3a : [[ 22  48]
 [  8 122]]


## Model 3b XGB Classifier with only important features

In [55]:
model_3b = XGBClassifier( learning_rate =0.01, n_estimators=200, max_depth=10,min_child_weight=2, gamma=1, 
                    subsample=1, colsample_bytree=0.6,seed=27)
model_3b.fit(x_train[prediction_var], y_train.values.ravel())
scores_3b = cross_val_score(model_3b, x_train[prediction_var], y_train.values.ravel(), cv=5, scoring='f1')
y_pred_3b = model_3b.predict(x_val[prediction_var])

In [56]:
acc_3b = accuracy_score(y_val,y_pred_3b)
p_s_3b = precision_score(y_val,y_pred_3b)
r_s_3b = recall_score(y_val,y_pred_3b)
cm_3b = confusion_matrix(y_val,y_pred_3b)
f1_3b = f1_score(y_val, y_pred_3b)

In [57]:
print("Accuracy Score for model_3b :", acc_3b)
print("Precision Score for model_3b :", p_s_3b)
print("Recall Score for model_3b :", r_s_3b)
print("F1 Score for model_3b :", f1_3b)
print("Confusion matrix for model_3b:", cm_3b)

Accuracy Score for model_3b : 0.72
Precision Score for model_3b : 0.71511627907
Recall Score for model_3b : 0.946153846154
F1 Score for model_3b : 0.814569536424
Confusion matrix for model_3b: [[ 21  49]
 [  7 123]]


# Model building with Balanced Dataset

In [58]:
# library to correct the imbalnce dataset
from imblearn.combine import SMOTEENN

In [59]:
smt = SMOTEENN(ratio='auto')

In [60]:
X_smt, y_smt = smt.fit_sample(x_train, y_train.values.ravel())

In [61]:
X_smt = pd.DataFrame(data=X_smt)

In [62]:
new_cols=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18']

In [63]:
X_smt.rename(columns=dict(zip(X_smt.columns[:],x_train.columns[:])),inplace=True)

In [64]:
X_smt.head(2)

Unnamed: 0,Job,Saving accounts,Checking account,Credit amount,Duration,Age group,Sex_0,Sex_1,Housing_0,Housing_1,Housing_2,Purpose_0,Purpose_1,Purpose_2,Purpose_3,Purpose_4,Purpose_5,Purpose_6,Purpose_7
0,2.0,0.0,0.0,4308.0,48.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1193.0,24.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model 1c Logistic Regression with All the features

In [66]:
model_1c= LogisticRegression()
model_1c.fit(X_smt, y_smt.ravel())
scores_1c = cross_val_score(model_1c, X_smt, y_smt.ravel(), cv=5, scoring='f1')
y_pred_1c = model_1c.predict(x_val)

In [67]:
acc_1c = accuracy_score(y_val,y_pred_1c)
p_s_1c = precision_score(y_val,y_pred_1c)
r_s_1c = recall_score(y_val,y_pred_1c)
cm_1c = confusion_matrix(y_val,y_pred_1c)
f1_1c = f1_score(y_val, y_pred_1c)

In [68]:
print("Accuracy Score for model_1c :", acc_1c)
print("Precision Score for model_1c :", p_s_1c)
print("Recall Score for model_1c :", r_s_1c)
print("F1 Score for model_1c :", f1_1c)
print("Confusion matrix for model_1c :", cm_1c)

Accuracy Score for model_1c : 0.655
Precision Score for model_1c : 0.814432989691
Recall Score for model_1c : 0.607692307692
F1 Score for model_1c : 0.696035242291
Confusion matrix for model_1c : [[52 18]
 [51 79]]


## Model 1d Logistic Regression with important features

In [69]:
model_1d= LogisticRegression()
model_1d.fit(X_smt[prediction_var], y_smt.ravel())
scores_1d = cross_val_score(model_1d, X_smt[prediction_var], y_smt.ravel(), cv=5, scoring='f1')
y_pred_1d = model_1d.predict(x_val[prediction_var])

In [70]:
acc_1d = accuracy_score(y_val,y_pred_1d)
p_s_1d = precision_score(y_val,y_pred_1d)
r_s_1d = recall_score(y_val,y_pred_1d)
cm_1d = confusion_matrix(y_val,y_pred_1d)
f1_1d = f1_score(y_val, y_pred_1d)

In [71]:
print("Accuracy Score for model_1d :", acc_1d)
print("Precision Score for model_1d :", p_s_1d)
print("Recall Score for model_1d :", r_s_1d)
print("F1 Score for model_1d :", f1_1d)
print("Confusion matrix for model_1d :", cm_1d)

Accuracy Score for model_1d : 0.665
Precision Score for model_1d : 0.824742268041
Recall Score for model_1d : 0.615384615385
F1 Score for model_1d : 0.704845814978
Confusion matrix for model_1d : [[53 17]
 [50 80]]


## Model 2c Random Forest with all the features

In [74]:
model_2c= RandomForestClassifier(n_estimators=600,bootstrap= True,min_samples_leaf=4,min_samples_split= 2,
                               max_features= 'sqrt',max_depth= None,random_state=42, n_jobs = -1)
model_2c.fit(X_smt,y_smt.ravel())
scores_2c = cross_val_score(model_2c, X_smt, y_smt.ravel(), cv=5, scoring='f1')
y_pred_2c = model_2c.predict(x_val)

In [75]:
acc_2c = accuracy_score(y_val,y_pred_2c)
p_s_2c = precision_score(y_val,y_pred_2c)
r_s_2c = recall_score(y_val,y_pred_2c)
cm_2c = confusion_matrix(y_val,y_pred_2c)
f1_2c = f1_score(y_val, y_pred_2c)

In [76]:
print("Accuracy Score for model_2c :", acc_2c)
print("Precision Score for model_2c :", p_s_2c)
print("Recall Score for model_2c :", r_s_2c)
print("F1 Score for model_2c :", f1_2c)
print("Confusion matrix for model_2c :", cm_2c)

Accuracy Score for model_2c : 0.665
Precision Score for model_2c : 0.726618705036
Recall Score for model_2c : 0.776923076923
F1 Score for model_2c : 0.75092936803
Confusion matrix for model_2c : [[ 32  38]
 [ 29 101]]


## Model 2d Random Forest with important features

In [77]:
model_2d= RandomForestClassifier(n_estimators=600,bootstrap= True,min_samples_leaf=4,min_samples_split= 2,
                               max_features= 'sqrt',max_depth= None,random_state=42, n_jobs = -1)
model_2d.fit(X_smt[prediction_var],y_smt.ravel())
scores_2d = cross_val_score(model_2d, X_smt[prediction_var], y_smt.ravel(), cv=5, scoring='f1')
y_pred_2d = model_2d.predict(x_val[prediction_var])

In [78]:
acc_2d = accuracy_score(y_val,y_pred_2d)
p_s_2d = precision_score(y_val,y_pred_2d)
r_s_2d = recall_score(y_val,y_pred_2d)
cm_2d = confusion_matrix(y_val,y_pred_2d)
f1_2d = f1_score(y_val, y_pred_2d)

In [80]:
print("Accuracy Score for model_2d :", acc_2d)
print("Precision Score for model_2d :", p_s_2d)
print("Recall Score for model_2d :", r_s_2d)
print("F1 Score for model_2d :", f1_2d)
print("Confusion matrix for model_2d :", cm_2d)

Accuracy Score for model_2d : 0.715
Precision Score for model_2d : 0.806722689076
Recall Score for model_2d : 0.738461538462
F1 Score for model_2d : 0.771084337349
Confusion matrix for model_2d : [[47 23]
 [34 96]]


## Model 3c XGB with all the features

In [82]:
model_3c = XGBClassifier( learning_rate =0.01, n_estimators=200, max_depth=10,min_child_weight=2, gamma=1, 
                    subsample=1, colsample_bytree=0.6,seed=27)
model_3c.fit(X_smt, y_smt)
scores_3c = cross_val_score(model_3c, X_smt, y_smt, cv=5, scoring='f1')
y_pred_3c = model_3c.predict(x_val)

In [83]:
acc_3c = accuracy_score(y_val,y_pred_3c)
p_s_3c = precision_score(y_val,y_pred_3c)
r_s_3c = recall_score(y_val,y_pred_3c)
cm_3c = confusion_matrix(y_val,y_pred_3c)
f1_3c = f1_score(y_val, y_pred_3c)

In [84]:
print("Accuracy Score for model_3c :", acc_3c)
print("Precision Score for model_3c :", p_s_3c)
print("Recall Score for model_3c :", r_s_3c)
print("F1 Score for model_3c :", f1_3c)
print("Confusion matrix for model_3c :", cm_3c)

Accuracy Score for model_3c : 0.705
Precision Score for model_3c : 0.751773049645
Recall Score for model_3c : 0.815384615385
F1 Score for model_3c : 0.782287822878
Confusion matrix for model_3c : [[ 35  35]
 [ 24 106]]


## Model 3d XGB with important features

In [85]:
model_3d = XGBClassifier( learning_rate =0.01, n_estimators=200, max_depth=10,min_child_weight=2, gamma=1, 
                    subsample=1, colsample_bytree=0.6,seed=27)
model_3d.fit(X_smt[prediction_var], y_smt)
scores_3d = cross_val_score(model_3d, X_smt[prediction_var], y_smt, cv=5, scoring='f1')
y_pred_3d = model_3d.predict(x_val[prediction_var])

In [86]:
acc_3d = accuracy_score(y_val,y_pred_3d)
p_s_3d = precision_score(y_val,y_pred_3d)
r_s_3d = recall_score(y_val,y_pred_3d)
cm_3d = confusion_matrix(y_val,y_pred_3d)
f1_3d = f1_score(y_val, y_pred_3d)

In [87]:
print("Accuracy Score for model_3d :", acc_3d)
print("Precision Score for model_3d :", p_s_3d)
print("Recall Score for model_3d :", r_s_3d)
print("F1 Score for model_3d :", f1_3d)
print("Confusion matrix for model_3d :", cm_3d)

Accuracy Score for model_3d : 0.735
Precision Score for model_3d : 0.798449612403
Recall Score for model_3d : 0.792307692308
F1 Score for model_3d : 0.795366795367
Confusion matrix for model_3d : [[ 44  26]
 [ 27 103]]


# Model Building using PCA and Imblanced Data

In [88]:
from sklearn.decomposition import PCA

In [89]:
pca = PCA(n_components= 4)
x_train = pca.fit_transform(x_train)
x_val = pca.transform(x_val)

## Model 1e Logistic Regression+PCA

In [90]:
model_1e= LogisticRegression()
model_1e.fit(x_train, y_train.values.ravel())
scores_1e = cross_val_score(model_1e, x_train, y_train.values.ravel(), cv=5, scoring='f1')
y_pred_1e = model_1e.predict(x_val)

In [91]:
acc_1e = accuracy_score(y_val,y_pred_1e)
p_s_1e = precision_score(y_val,y_pred_1e)
r_s_1e = recall_score(y_val,y_pred_1e)
cm_1e = confusion_matrix(y_val,y_pred_1e)
f1_1e = f1_score(y_val, y_pred_1e)

In [92]:
print("Accuracy Score for model_1e :", acc_1e)
print("Precision Score for model_1e :", p_s_1e)
print("Recall Score for model_1e :", r_s_1e)
print("F1 Score for model_1e :", f1_1e)
print("Confusion matrix for model_1e :", cm_1e)

Accuracy Score for model_1e : 0.705
Precision Score for model_1e : 0.717791411043
Recall Score for model_1e : 0.9
F1 Score for model_1e : 0.798634812287
Confusion matrix for model_1e : [[ 24  46]
 [ 13 117]]


## Model 2e Random Forest+PCA

In [93]:
model_2e= RandomForestClassifier(n_estimators=600,bootstrap= True,min_samples_leaf=4,min_samples_split= 2,
                               max_features= 'sqrt',max_depth= None,random_state=42, n_jobs = -1)
model_2e.fit(x_train,y_train.values.ravel())
scores_2e = cross_val_score(model_2e, x_train, y_train.values.ravel(), cv=5, scoring='f1')
y_pred_2e = model_2e.predict(x_val)

In [94]:
acc_2e = accuracy_score(y_val,y_pred_2e)
p_s_2e = precision_score(y_val,y_pred_2e)
r_s_2e = recall_score(y_val,y_pred_2e)
cm_2e = confusion_matrix(y_val,y_pred_2e)
f1_2e = f1_score(y_val, y_pred_2e)

In [95]:
print("Accuracy Score for model_2e :", acc_2e)
print("Precision Score for model_2e :", p_s_2e)
print("Recall Score for model_2e :", r_s_2e)
print("F1 Score for model_2e :", f1_2e)
print("Confusion matrix for model_2e :", cm_2e)

Accuracy Score for model_2e : 0.74
Precision Score for model_2e : 0.740740740741
Recall Score for model_2e : 0.923076923077
F1 Score for model_2e : 0.821917808219
Confusion matrix for model_2e : [[ 28  42]
 [ 10 120]]


##  Model 3e XGB+PCA 

In [96]:
model_3e = XGBClassifier( learning_rate =0.01, n_estimators=200, max_depth=10,min_child_weight=2, gamma=1, 
                    subsample=1, colsample_bytree=0.6,seed=27)
model_3e.fit(x_train, y_train.values.ravel())
scores_3e = cross_val_score(model_3e, x_train, y_train.values.ravel(), cv=5, scoring='f1')
y_pred_3e = model_3e.predict(x_val)

In [97]:
acc_3e = accuracy_score(y_val,y_pred_3e)
p_s_3e = precision_score(y_val,y_pred_3e)
r_s_3e = recall_score(y_val,y_pred_3e)
cm_3e = confusion_matrix(y_val,y_pred_3e)
f1_3e = f1_score(y_val, y_pred_3e)

In [98]:
print("Accuracy Score for model_3e :", acc_3e)
print("Precision Score for model_3e :", p_s_3e)
print("Recall Score for model_3e :", r_s_3e)
print("F1 Score for model_3e :", f1_3e)
print("Confusion matrix for model_3e :", cm_3e)

Accuracy Score for model_3e : 0.755
Precision Score for model_3e : 0.748466257669
Recall Score for model_3e : 0.938461538462
F1 Score for model_3e : 0.832764505119
Confusion matrix for model_3e : [[ 29  41]
 [  8 122]]


# Model with PCA and Balanced Data

In [99]:
pca = PCA(n_components= 4)
X_smt = pca.fit_transform(X_smt)

## Model 1f Logistic Regression+PCA+SMOTEEN

In [100]:
model_1f= LogisticRegression()
model_1f.fit(X_smt, y_smt.ravel())
scores_1f = cross_val_score(model_1f, X_smt, y_smt.ravel(), cv=5, scoring='f1')
y_pred_1f = model_1f.predict(x_val)

In [101]:
acc_1f = accuracy_score(y_val,y_pred_1f)
p_s_1f = precision_score(y_val,y_pred_1f)
r_s_1f = recall_score(y_val,y_pred_1f)
cm_1f = confusion_matrix(y_val,y_pred_1f)
f1_1f = f1_score(y_val, y_pred_1f)

In [102]:
print("Accuracy Score for model_1f :", acc_1f)
print("Precision Score for model_1f :", p_s_1f)
print("Recall Score for model_1f :", r_s_1f)
print("F1 Score for model_1f :", f1_1f)
print("Confusion matrix for model_1f :", cm_1f)

Accuracy Score for model_1f : 0.585
Precision Score for model_1f : 0.78313253012
Recall Score for model_1f : 0.5
F1 Score for model_1f : 0.610328638498
Confusion matrix for model_1f : [[52 18]
 [65 65]]


## Model 2f Random Forest+PCA+SMOTEEN

In [103]:
model_2f= RandomForestClassifier(n_estimators=600,bootstrap= True,min_samples_leaf=4,min_samples_split= 2,
                               max_features= 'sqrt',max_depth= None,random_state=42, n_jobs = -1)
model_2f.fit(X_smt,y_smt.ravel())
scores_2f = cross_val_score(model_2f, X_smt, y_smt.ravel(), cv=5, scoring='f1')
y_pred_2f = model_2f.predict(x_val)

In [104]:
acc_2f = accuracy_score(y_val,y_pred_2f)
p_s_2f = precision_score(y_val,y_pred_2f)
r_s_2f = recall_score(y_val,y_pred_2f)
cm_2f = confusion_matrix(y_val,y_pred_2f)
f1_2f = f1_score(y_val, y_pred_2f)

In [105]:
print("Accuracy Score for model_2f :", acc_2f)
print("Precision Score for model_2f :", p_s_2f)
print("Recall Score for model_2f :", r_s_2f)
print("F1 Score for model_2f :", f1_2f)
print("Confusion matrix for model_2f :", cm_2f)

Accuracy Score for model_2f : 0.505
Precision Score for model_2f : 0.682352941176
Recall Score for model_2f : 0.446153846154
F1 Score for model_2f : 0.539534883721
Confusion matrix for model_2f : [[43 27]
 [72 58]]


## Model 3f XGB Classifier+PCA+SMOTEEN

In [106]:
model_3f = XGBClassifier( learning_rate =0.01, n_estimators=200, max_depth=10,min_child_weight=2, gamma=1, 
                    subsample=1, colsample_bytree=0.6,seed=27)
model_3f.fit(X_smt, y_smt.ravel())
scores_3f = cross_val_score(model_3f, X_smt, y_smt.ravel(), cv=5, scoring='f1')
y_pred_3f = model_3f.predict(x_val)

In [107]:
acc_3f = accuracy_score(y_val,y_pred_3f)
p_s_3f = precision_score(y_val,y_pred_3f)
r_s_3f = recall_score(y_val,y_pred_3f)
cm_3f = confusion_matrix(y_val,y_pred_3f)
f1_3f = f1_score(y_val, y_pred_3f)

In [108]:
print("Accuracy Score :", acc_3f)
print("Precision Score :", p_s_3f)
print("Recall Score :", r_s_3f)
print("F1 Score :", f1_3f)
print("Confusion matrix :", cm_3f)

Accuracy Score : 0.51
Precision Score : 0.7
Recall Score : 0.430769230769
F1 Score : 0.533333333333
Confusion matrix : [[46 24]
 [74 56]]


# Final Report

Out of 18 different model using 3 algorithms "Logistic Regression", "Random Forest Classifier" & "XGB Classifier", with the help of "PCA" and "SMOTEEN", finally we have the result of all the model.     
By doing the comparision its quite clear that MODEL 3e i.e (XGB Classifier with PCA) has the best figure compare to every other model. So our final and best to go model is MODEL 3e. 

Below is the output result of MODEL 3e

Accuracy Score for model_3e : 0.755        
Precision Score for model_3e : 0.748466257669     
Recall Score for model_3e : 0.938461538462      
F1 Score for model_3e : 0.832764505119      

# FINAL MODEL STRUCTURE

# THANK YOU