## Imports

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
import xgboost


In [2]:
# load data
train_csv_file = 'DataFiles/CreditCard_train.csv'
test_csv_file = 'DataFiles/CreditCard_test.csv'

_train_data = pd.read_csv(train_csv_file, index_col=0, header=1)
_test_data = pd.read_csv(test_csv_file, index_col=0, header=1)

# create copy df 
df_train = _train_data.copy()
df_test = _test_data.copy()

df_train.describe()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
count,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,...,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0
mean,165495.986667,1.62825,1.847417,1.55725,35.380458,-0.003125,-0.1235,-0.15475,-0.211667,-0.252917,...,42368.188417,40000.682542,38563.710625,5542.912917,5815.336,4969.266,4743.480042,4783.486042,5189.399042,0.22375
std,129128.744855,0.483282,0.780007,0.52208,9.27105,1.123425,1.20058,1.204033,1.166549,1.136993,...,63070.680934,60345.012766,59155.759799,15068.576072,20797.03,16095.61434,14883.26999,15270.405279,17630.37199,0.416765
min,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,2340.0,1740.0,1234.75,1000.0,800.0,379.0,279.75,244.0,60.75,0.0
50%,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,...,18940.5,18107.5,17036.0,2100.0,2000.0,1702.5,1500.0,1500.0,1500.0,0.0
75%,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,0.0,...,52188.5,49746.5,48796.25,5000.0,5000.0,4347.25,4000.0,4005.0,4000.0,0.0
max,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,8.0,...,891586.0,927171.0,961664.0,505000.0,1684259.0,896040.0,497000.0,417990.0,528666.0,1.0


### feature selection

In [3]:
from sklearn.preprocessing import StandardScaler

X_train_unscaled = df_train.iloc[:,:-1]
y_train_unraveled = df_train.iloc[:,-1:]

X_test_unscaled = df_test.iloc[:,:-1]
y_test_unraveled = df_test.iloc[:,-1:]


# standardise data
X_train = X_train_unscaled
X_test = X_test_unscaled

# standardise test data
y_train = y_train_unraveled.values.ravel()
y_test = y_test_unraveled.values.ravel()

In [4]:
y_train
X_train

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20000,2,2,1,24,2,2,-1,-1,-2,...,689,0,0,0,0,689,0,0,0,0
2,120000,2,2,2,26,-1,2,0,0,0,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
3,90000,2,2,2,34,0,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
4,50000,2,2,1,37,0,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
5,50000,1,2,1,57,-1,0,-1,0,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23996,80000,1,2,1,25,1,2,2,0,0,...,80903,80215,63296,49854,3800,6,3636,2646,2000,1830
23997,20000,1,2,1,25,0,0,0,0,0,...,17562,17322,17119,17350,1552,2659,1419,606,500,1000
23998,10000,1,2,2,26,0,0,0,0,0,...,9825,17506,16608,9176,1300,2200,1300,320,1820,1000
23999,20000,1,5,2,26,0,0,0,0,0,...,19394,39950,0,0,3055,1467,1096,1000,0,0


In [5]:
type(X_train)

pandas.core.frame.DataFrame

In [6]:
X_train.describe()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
count,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,...,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0
mean,165495.986667,1.62825,1.847417,1.55725,35.380458,-0.003125,-0.1235,-0.15475,-0.211667,-0.252917,...,46367.06,42368.188417,40000.682542,38563.710625,5542.912917,5815.336,4969.266,4743.480042,4783.486042,5189.399042
std,129128.744855,0.483282,0.780007,0.52208,9.27105,1.123425,1.20058,1.204033,1.166549,1.136993,...,68193.9,63070.680934,60345.012766,59155.759799,15068.576072,20797.03,16095.61434,14883.26999,15270.405279,17630.37199
min,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-157264.0,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,2773.5,2340.0,1740.0,1234.75,1000.0,800.0,379.0,279.75,244.0,60.75
50%,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,...,20039.0,18940.5,18107.5,17036.0,2100.0,2000.0,1702.5,1500.0,1500.0,1500.0
75%,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,0.0,...,59298.0,52188.5,49746.5,48796.25,5000.0,5000.0,4347.25,4000.0,4005.0,4000.0
max,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,8.0,...,1664089.0,891586.0,927171.0,961664.0,505000.0,1684259.0,896040.0,497000.0,417990.0,528666.0


### scaling data

In [8]:
from sklearn import preprocessing 
 
X_train = preprocessing.scale(X_train) #removes columns and transforms data into ndarray
X_test = preprocessing.scale(X_test)

In [12]:
X_train

array([[-1.12677482,  0.76923536,  0.19562207, ..., -0.31871886,
        -0.31325859, -0.29435041],
       [-0.35233777,  0.76923536,  0.19562207, ..., -0.25152793,
        -0.31325859, -0.18090744],
       [-0.58466889,  0.76923536,  0.19562207, ..., -0.25152793,
        -0.24777108, -0.01074299],
       ...,
       [-1.20421852, -1.29999224,  0.19562207, ..., -0.29721776,
        -0.19407132, -0.23762892],
       [-1.12677482, -1.29999224,  4.04182335, ..., -0.25152793,
        -0.31325859, -0.29435041],
       [-0.50722518, -1.29999224, -1.08644502, ..., -0.28512339,
        -0.24777108, -0.23762892]])

### Gradient Boosting

In [15]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X_train,y_train)

y_train2 = y_train-tree_reg1.predict(X_train)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X_train,y_train2)

y_train3 = y_train2-tree_reg2.predict(X_train)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X_train,y_train3)

y_pred = sum(tree.predict(X_test) for tree in (tree_reg1, tree_reg2, tree_reg3))
print(accuracy_score(y_test, y_pred.round()))



0.8296666666666667


In [28]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=50, learning_rate=1)
gbrt.fit(X_train, y_train)
y_pred = gbrt.predict(X_test)
print("Train accuracy score: ", accuracy_score(y_train,gbrt.predict(X_train).round()))
print("Test accuracy score: ", accuracy_score(y_test,y_pred.round()))

Train accuracy score:  0.825125
Test accuracy score:  0.8283333333333334


In [None]:
y_pred

### xgboost

In [45]:
import xgboost


xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train,y_train)
y_pred = xgb_reg.predict(X_test)
print(accuracy_score(y_test,y_pred.round()))

y_train_pred = 



0.8211666666666667


In [42]:
X_train_unscaled.describe()

X_test_unscaled.describe()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,...,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,175437.666667,1.505667,1.876,1.530333,35.905667,-0.071,-0.174833,-0.212,-0.256667,-0.319333,...,49597.5435,46841.991167,41554.274667,40103.9595,6146.250833,6344.473,6251.3435,5156.464167,4862.994,5319.916667
std,131905.841315,0.50001,0.830104,0.521017,8.990675,1.12376,1.182723,1.16674,1.17885,1.116348,...,73739.999695,69041.251707,62562.774782,61111.043232,21523.044734,30401.58,22638.732479,18466.518562,15310.976004,18355.191015
min,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-46127.0,-65167.0,-61372.0,-209051.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,60000.0,1.0,1.0,1.0,29.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,2487.75,2290.75,1900.0,1320.0,1000.0,990.0,799.75,326.0,278.75,288.75
50%,150000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,...,20360.5,19657.0,18071.5,17203.5,2195.5,2230.0,2000.0,1634.5,1590.0,1697.0
75%,250000.0,2.0,2.0,2.0,42.0,0.0,0.0,0.0,0.0,0.0,...,65252.75,61357.75,52099.0,50620.5,5188.75,5042.25,5000.0,4209.5,4145.0,4300.0
max,780000.0,2.0,6.0,3.0,75.0,8.0,7.0,8.0,7.0,7.0,...,855086.0,706864.0,587067.0,514975.0,873552.0,1227082.0,889043.0,621000.0,426529.0,443001.0


In [46]:
X_train_unscaled.describe()


xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train_unscaled_data,y_train)
y_pred = xgb_reg.predict(X_test_unscaled_data)
print(accuracy_score(y_test,y_pred.round()))

0.8261666666666667


In [30]:
y_train

array([1, 1, 0, ..., 0, 0, 1])

In [31]:
X_train

array([[-1.12677482,  0.76923536,  0.19562207, ..., -0.31871886,
        -0.31325859, -0.29435041],
       [-0.35233777,  0.76923536,  0.19562207, ..., -0.25152793,
        -0.31325859, -0.18090744],
       [-0.58466889,  0.76923536,  0.19562207, ..., -0.25152793,
        -0.24777108, -0.01074299],
       ...,
       [-1.20421852, -1.29999224,  0.19562207, ..., -0.29721776,
        -0.19407132, -0.23762892],
       [-1.12677482, -1.29999224,  4.04182335, ..., -0.25152793,
        -0.31325859, -0.29435041],
       [-0.50722518, -1.29999224, -1.08644502, ..., -0.28512339,
        -0.24777108, -0.23762892]])

#### kernel PCA

In [7]:
from sklearn.decomposition import KernelPCA

#transformer = KernelPCA(kernel='poly', degree=3)
#X_train_transformed = transformer.fit_transform(X_train[0:4000])
#X_train_transformed.shape

## Testing some classifiers

In [10]:
log_clf = LogisticRegression(max_iter=10000)
rnd_clf = RandomForestClassifier()
svm_clf = SVC(gamma='auto',C=4)
#svm_kernel_clf = SVC(kernel='poly', degree=2)
#from sklearn.preprocessing import StandardScaler
#from sklearn.pipeline import make_pipeline
#svm_scaled_clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))

for clf in (log_clf, rnd_clf, svm_clf):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test,y_pred))

LogisticRegression 0.821
RandomForestClassifier 0.828
SVC 0.8315


In [None]:
X_train.describe()

## HORROR

 ## Horror END


end

### Voting classifier

In [11]:
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svm', svm_clf)],
    voting='hard')
voting_clf.fit(X_train,y_train)
y_pred = voting_clf.predict(X_test)
print(voting_clf.__class__.__name__, accuracy_score(y_test,y_pred))

VotingClassifier 0.829


### Adaboosting

In [13]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5)
ada_clf.fit(X_train, y_train)
y_pred = ada_clf.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.8303333333333334


In [14]:
y_train_pred = ada_clf.predict(X_train)
print('training score accuracy = ' + str(accuracy_score(y_train, y_train_pred)))

training score accuracy = 0.8171666666666667


### NN - incomplete