## Import Libraries

In [19]:
import numpy as np
import pandas as pd

from sklearn.ensemble import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.preprocessing import *
from imblearn.over_sampling import SMOTENC
# from xgboost import XGBClassifier

## Import Data

In [20]:
# Reading Processed Train Set Data File
raw_d1 = pd.read_csv('Data/Processed_Trainset.csv')

# Feature Headers List
headers = list(raw_d1.columns)

# Reading Test Data Set
raw_d2 = pd.read_csv('Data/Processed_Testset.csv')

id_col = list(raw_d2.Client_ID)
raw_d2.drop(['Client_ID'], axis=1, inplace=True)

## Correlation

In [21]:
# Generating Heat Map
correlation = raw_d1.corr()
correlation.style.background_gradient(cmap='coolwarm')
# Removed Highly Correlated features using this heatmap

Unnamed: 0,Balance_Limit_V1,AGE,g_F,g_M,es_Graduate,es_High School,es_Other,ms_Other,ms_Single,PAY_JULY,PAY_AUG,PAY_SEP,PAY_OCT,PAY_NOV,PAY_DEC,DUE_AMT_JULY,DUE_AMT_AUG,DUE_AMT_SEP,DUE_AMT_OCT,DUE_AMT_NOV,DUE_AMT_DEC,PAID_AMT_JULY,PAID_AMT_AUG,PAID_AMT_SEP,PAID_AMT_OCT,PAID_AMT_NOV,PAID_AMT_DEC,NEXT_MONTH_DEFAULT
Balance_Limit_V1,1.0,0.139847,-0.0222018,0.0222018,0.246245,-0.135102,-0.13554,-0.102112,0.102112,-0.257252,-0.281462,-0.274788,-0.250769,-0.237439,-0.224307,0.279922,0.272542,0.277842,0.286514,0.286916,0.281861,0.181979,0.165341,0.194308,0.191948,0.207909,0.210399,-0.148938
AGE,0.139847,1.0,0.0872118,-0.0872118,-0.0941585,0.200074,-0.0580198,-0.418161,0.418161,-0.0454425,-0.0522472,-0.0482252,-0.0452513,-0.0514852,-0.0468625,0.0544036,0.0523907,0.0528159,0.050275,0.0488162,0.0464542,0.0274405,0.0239328,0.0299418,0.0306161,0.0256983,0.0242356,0.00912637
g_F,-0.0222018,0.0872118,1.0,-1.0,0.0213745,0.0100856,-0.0279094,0.0315728,-0.0315728,0.0543111,0.0662101,0.0616221,0.0557791,0.0497633,0.0404514,0.0344339,0.0320495,0.0248567,0.0218686,0.0184492,0.0185867,-0.000145349,-0.00034225,0.00731015,0.00360936,0.00191015,0.00345906,0.0416493
g_M,0.0222018,-0.0872118,-1.0,1.0,-0.0213745,-0.0100856,0.0279094,-0.0315728,0.0315728,-0.0543111,-0.0662101,-0.0616221,-0.0557791,-0.0497633,-0.0404514,-0.0344339,-0.0320495,-0.0248567,-0.0218686,-0.0184492,-0.0185867,0.000145349,0.00034225,-0.00731015,-0.00360936,-0.00191015,-0.00345906,-0.0416493
es_Graduate,0.246245,-0.0941585,0.0213745,-0.0213745,1.0,-0.326787,-0.714632,0.155441,-0.155441,-0.140541,-0.168204,-0.158866,-0.150917,-0.139776,-0.125931,-0.0227736,-0.0207784,-0.0128351,-0.0050981,0.000475518,-0.00366277,0.0456449,0.0431433,0.0509866,0.0449068,0.0442107,0.0497585,-0.0488292
es_High School,-0.135102,0.200074,0.0100856,-0.0100856,-0.326787,1.0,-0.427564,-0.108466,0.108466,0.0593654,0.0637543,0.0613402,0.056686,0.0489785,0.0396604,-0.0219916,-0.0226696,-0.0221983,-0.03017,-0.0297953,-0.0300017,-0.0205142,-0.0162414,-0.0276194,-0.0194764,-0.0347389,-0.0364647,0.0309857
es_Other,-0.13554,-0.0580198,-0.0279094,0.0279094,-0.714632,-0.427564,1.0,-0.0683997,0.0683997,0.0904889,0.1137,0.106555,0.102396,0.0974451,0.0910984,0.0380597,0.0366531,0.0287065,0.0272062,0.0215977,0.0257088,-0.028476,-0.0292457,-0.0283265,-0.0285381,-0.016576,-0.0206051,0.0237715
ms_Other,-0.102112,-0.418161,0.0315728,-0.0315728,0.155441,-0.108466,-0.0683997,1.0,-1.0,0.0182007,0.0245761,0.0313893,0.0300597,0.031228,0.0265911,-0.0270562,-0.026237,-0.0292275,-0.02655,-0.0292209,-0.0246749,-0.0102448,-0.0125704,-0.00261996,-0.0153059,-0.000762231,-0.00508587,-0.027905
ms_Single,0.102112,0.418161,-0.0315728,0.0315728,-0.155441,0.108466,0.0683997,-1.0,1.0,-0.0182007,-0.0245761,-0.0313893,-0.0300597,-0.031228,-0.0265911,0.0270562,0.026237,0.0292275,0.02655,0.0292209,0.0246749,0.0102448,0.0125704,0.00261996,0.0153059,0.000762231,0.00508587,0.027905
PAY_JULY,-0.257252,-0.0454425,0.0543111,-0.0543111,-0.140541,0.0593654,0.0904889,0.0182007,-0.0182007,1.0,0.672879,0.573821,0.536305,0.507525,0.472829,0.185679,0.18849,0.177217,0.178472,0.17896,0.174119,-0.076644,-0.0706821,-0.0719125,-0.0661681,-0.0617827,-0.0573836,0.324362


## Formatting

In [22]:
# Convert Data into ndArrays
train_data = np.array(raw_d1)
test_data = np.array(raw_d2)

# Total Training Data
X_tot = train_data[:, :-1]
Y_tot = train_data[:, -1]

# Splitting Data into a Training Set and a Cross Validation Set
X_train, X_cross, Y_train, Y_cross = train_test_split(X_tot, Y_tot, test_size = 0.2, random_state = 42, shuffle = True)
X_train.shape, Y_train.shape

((19200, 27), (19200,))

## SMOTE

In [23]:
smote = SMOTENC(list(range(0,15)), sampling_strategy='auto', k_neighbors=10, n_jobs=1)
X_train,Y_train = smote.fit_resample(X_train, Y_train)

pd.DataFrame(np.concatenate((X_train,np.reshape(Y_train, (Y_train.shape[0], 1))), axis=1)
             ,columns=headers).to_csv('Data/SMOTED.csv', index=None)
X_train.shape, Y_train.shape

NameError: name 'SMOTENC' is not defined

## Normalizing Data

In [15]:
sc = RobustScaler()
sc.fit(X_tot)

X_tot = sc.fit_transform(X_tot)
X_train = sc.transform(X_train)
X_cross = sc.transform(X_cross)
test_data = sc.transform(test_data)

pd.DataFrame(X_tot, columns=headers[:-1])

Unnamed: 0,Balance_Limit_V1,AGE,g_F,g_M,es_Graduate,es_High School,es_Other,ms_Other,ms_Single,PAY_JULY,...,DUE_AMT_SEP,DUE_AMT_OCT,DUE_AMT_NOV,DUE_AMT_DEC,PAID_AMT_JULY,PAID_AMT_AUG,PAID_AMT_SEP,PAID_AMT_OCT,PAID_AMT_NOV,PAID_AMT_DEC
0,0.750,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.0,...,-0.301720,-0.038562,1.072397,-0.478748,-0.159548,0.141495,3.621697,18.380870,-0.393683,59.773070
1,0.750,-1.000000,1.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.145333,-0.004731,0.345448,0.763803,15.803981,4.331197,4.044524,8.924195,-0.286656,3.376111
2,-0.375,0.000000,1.0,-1.0,0.0,1.0,0.0,-1.0,1.0,4.0,...,-0.230986,-0.362867,-0.371302,-0.353373,-0.529317,-0.484313,-0.431568,-0.399792,-0.393683,-0.387346
3,-0.250,0.000000,1.0,-1.0,1.0,0.0,0.0,-1.0,1.0,2.0,...,0.364248,0.437727,0.518774,0.567878,-0.114578,-0.078851,-0.051517,0.133264,0.066352,0.050640
4,0.750,0.000000,1.0,-1.0,1.0,0.0,0.0,0.0,0.0,2.0,...,2.862625,3.218938,3.578009,3.716657,-0.529317,1.682666,1.231157,1.599169,1.469749,1.549382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23995,0.750,0.000000,1.0,-1.0,0.0,1.0,0.0,-1.0,1.0,0.0,...,3.247456,1.309661,0.268691,-0.022372,1.583109,4.331197,0.756812,0.412369,0.918593,-0.129115
23996,0.000,0.000000,1.0,-1.0,0.0,1.0,0.0,0.0,0.0,-1.0,...,-0.286144,-0.192224,-0.264966,-0.353373,-0.072835,0.364655,1.705707,-0.365409,-0.393683,-0.387346
23997,-0.375,0.000000,1.0,-1.0,0.0,0.0,1.0,0.0,0.0,4.0,...,-0.298980,0.033921,0.050387,0.047578,-0.529317,-0.484313,4.794139,0.719626,0.131228,0.413169
23998,-0.250,0.000000,1.0,-1.0,0.0,1.0,0.0,-1.0,1.0,1.0,...,0.969585,0.640112,-0.128412,0.660424,20.818612,0.336512,-0.151774,0.113534,13.507016,0.078475


## Classification Model 

In [16]:
rf = RandomForestClassifier(bootstrap=True, class_weight= None, max_depth= 8, max_features= X_train.shape[1], 
                            max_leaf_nodes= None, min_samples_leaf= 1, min_samples_split= 2, min_weight_fraction_leaf= 0.0, 
                            n_estimators= 200, n_jobs= 1, oob_score= False, random_state= 85, verbose= 0, warm_start= False)

rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=8, max_features=27,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
                       oob_score=False, random_state=85, verbose=0,
                       warm_start=False)

## Prediction and Evaluation

In [17]:
# Predicting Outcomes
predict_train = rf.predict(X_train)
predict_cross = rf.predict(X_cross)
predict_tot = rf.predict(X_tot)
predict_test = rf.predict(test_data)

# Computing Confusion Matrix on Cross Validation Set
confusion = confusion_matrix(Y_cross,predict_cross)

# Computing Accuracy Values
accuracy_train = accuracy_score(Y_train, predict_train)
accuracy_cross = accuracy_score(Y_cross, predict_cross)
accuracy_tot = accuracy_score(Y_tot, predict_tot)

# Computing AUC Values
auc_train = roc_auc_score(Y_train, predict_train)
auc_cross = roc_auc_score(Y_cross, predict_cross)
auc_tot = roc_auc_score(Y_tot, predict_tot)                

# Printing Accuracy Metrics
print("Train Accuracy            : {:.3f}%       AUC: {:.4f}".format(accuracy_train * 100, auc_train))
print("Cross Validation Accuracy : {:.3f}%       AUC: {:.4f} ".format(accuracy_cross * 100, auc_cross))
print("Total Set Accuracy        : {:.3f}%       AUC: {:.4f}".format(accuracy_tot * 100, auc_tot))
print("Confusion Matrix for Cross Validation Set :\n %s" % str(confusion))
print("Number of Ones Predicted for Test Data : %d\n" % np.sum(predict_test))

_, _, f_score, num_l = precision_recall_fscore_support(Y_cross, predict_cross, beta=1.0, pos_label=1)
print("Mean F1 Score = %f \n"%((f_score[0]*num_l[0] + f_score[1]*num_l[1])/(num_l[0] + num_l[1])))

print(classification_report(Y_cross, predict_cross, digits=4))

Train Accuracy            : 83.724%       AUC: 0.6791
Cross Validation Accuracy : 82.792%       AUC: 0.6623 
Total Set Accuracy        : 83.537%       AUC: 0.6757
Confusion Matrix for Cross Validation Set :
 [[3587  158]
 [ 668  387]]
Number of Ones Predicted for Test Data : 641

Mean F1 Score = 0.805976 

              precision    recall  f1-score   support

         0.0     0.8430    0.9578    0.8968      3745
         1.0     0.7101    0.3668    0.4838      1055

    accuracy                         0.8279      4800
   macro avg     0.7765    0.6623    0.6903      4800
weighted avg     0.8138    0.8279    0.8060      4800



## Upload

In [18]:
# Uploading Predictions to a CSV
df = pd.DataFrame(predict_test, dtype=int)
df.index = id_col
df.to_csv('Data/Submission.csv', header=['NEXT_MONTH_DEFAULT'], index_label='Client_ID')