## Imports

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
import xgboost


In [2]:
# Load the pre-split training and test CSV files.
#   header=1    -> column names are read from the second line of each file
#                  (the first line is skipped).
#   index_col=0 -> the first column becomes the row index.
train_csv_file = 'DataFiles/CreditCard_train.csv'
test_csv_file = 'DataFiles/CreditCard_test.csv'

train_data = pd.read_csv(train_csv_file, index_col=0, header=1)
test_data = pd.read_csv(test_csv_file, index_col=0, header=1)

# A notebook cell only renders its *last* bare expression, so head()/describe()
# calls placed earlier in the cell are computed and silently discarded.
# display() (provided by IPython in notebook sessions) renders each of them.
display(train_data.head())
display(train_data.describe())
test_data.describe()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,...,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,175437.666667,1.505667,1.876,1.530333,35.905667,-0.071,-0.174833,-0.212,-0.256667,-0.319333,...,46841.991167,41554.274667,40103.9595,6146.250833,6344.473,6251.3435,5156.464167,4862.994,5319.916667,0.211
std,131905.841315,0.50001,0.830104,0.521017,8.990675,1.12376,1.182723,1.16674,1.17885,1.116348,...,69041.251707,62562.774782,61111.043232,21523.044734,30401.58,22638.732479,18466.518562,15310.976004,18355.191015,0.408052
min,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-65167.0,-61372.0,-209051.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,60000.0,1.0,1.0,1.0,29.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,2290.75,1900.0,1320.0,1000.0,990.0,799.75,326.0,278.75,288.75,0.0
50%,150000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,...,19657.0,18071.5,17203.5,2195.5,2230.0,2000.0,1634.5,1590.0,1697.0,0.0
75%,250000.0,2.0,2.0,2.0,42.0,0.0,0.0,0.0,0.0,0.0,...,61357.75,52099.0,50620.5,5188.75,5042.25,5000.0,4209.5,4145.0,4300.0,0.0
max,780000.0,2.0,6.0,3.0,75.0,8.0,7.0,8.0,7.0,7.0,...,706864.0,587067.0,514975.0,873552.0,1227082.0,889043.0,621000.0,426529.0,443001.0,1.0


### Feature / label split

In [3]:
# The last column of each frame is the label; every other column is a feature.
# The label is first kept as a one-column DataFrame (the *_unprocessed names)
# and then flattened to a 1-D array for the estimators.
X_train = train_data.iloc[:, :-1]
y_train_unprocessed = train_data.iloc[:, -1:]
y_train = y_train_unprocessed.to_numpy().ravel()

X_test = test_data.iloc[:, :-1]
y_test_unprocessed = test_data.iloc[:, -1:]
y_test = y_test_unprocessed.to_numpy().ravel()

#### kernel PCA

In [4]:
from sklearn.decomposition import KernelPCA

# Disabled experiment: degree-3 polynomial kernel PCA fitted on the first 4000
# rows of X_train. The lines are commented out, so X_train_transformed is never
# created and no cell visible here uses it.
# NOTE(review): the 4000-row cap is presumably there to limit the size of the
# kernel matrix -- confirm before re-enabling.
#transformer = KernelPCA(kernel='poly', degree=3)
#X_train_transformed = transformer.fit_transform(X_train[0:4000])
#X_train_transformed.shape

## Testing some classifiers

In [7]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Baseline classifiers, all trained on the same features and scored on the
# held-out test set.
log_clf = LogisticRegression(max_iter=10000)
# Random forests are stochastic; a fixed seed makes the reported accuracy
# reproducible across kernel restarts.
rnd_clf = RandomForestClassifier(random_state=42)
# SVC on the raw (unscaled) features ...
svm_clf = SVC(gamma='auto')
#svm_kernel_clf = SVC(kernel='poly', degree=2)
# ... and the same SVC preceded by standardisation, for comparison.
svm_scaled_clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))

# Each row is labelled with the estimator's class name, so the scaled SVC
# pipeline is printed as "Pipeline".
for clf in (log_clf, rnd_clf, svm_clf, svm_scaled_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.789
RandomForestClassifier 0.8256666666666667
SVC 0.791
Pipeline 0.8303333333333334


### Voting classifier

In [None]:
# Majority-vote ensemble of the individual classifiers defined earlier.
# Hard voting needs an odd number of voters on a binary problem: with only two
# estimators every disagreement is a tie, and VotingClassifier breaks ties in
# favour of the lowest class label, so the ensemble would predict 1 only when
# both models agree. The scaled SVC pipeline is added as a third voter.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_scaled_clf)],
    voting='hard')
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)
print(voting_clf.__class__.__name__, accuracy_score(y_test, y_pred))

### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost over 200 decision stumps (depth-1 trees) with a learning rate of 0.5.
# The explicit algorithm="SAMME.R" argument is omitted on purpose: that option
# is deprecated in recent scikit-learn releases and later rejected outright,
# while on older releases it is already the default. Leaving it out keeps the
# cell runnable across versions.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    learning_rate=0.5)
ada_clf.fit(X_train, y_train)

# Accuracy on the held-out test set.
y_pred = ada_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

In [None]:
# Accuracy of the fitted AdaBoost model on the data it was trained on, to be
# read against the test accuracy printed by the previous cell.
y_train_pred = ada_clf.predict(X_train)
print(accuracy_score(y_true=y_train, y_pred=y_train_pred))

### Gradient Boosting

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Hand-rolled gradient boosting with three depth-2 regression trees:
# each tree is fitted to the residual left by the trees before it.

# Stage 1: fit the label directly.
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X_train, y_train)

# Stage 2: fit what stage 1 got wrong.
y_train2 = y_train - tree_reg1.predict(X_train)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X_train, y_train2)

# Stage 3: fit what stages 1 and 2 together still get wrong.
y_train3 = y_train2 - tree_reg2.predict(X_train)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X_train, y_train3)

# The ensemble prediction is the sum of the three stage predictions; it is a
# continuous value, so it is rounded to the nearest integer before being scored
# as a class label.
y_pred = sum(tree.predict(X_test) for tree in (tree_reg1, tree_reg2, tree_reg3))
print(accuracy_score(y_test, y_pred.round()))


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Library gradient boosting: 20 depth-2 regression trees with learning rate 1.
# fit() returns the estimator itself, so construction and fitting are chained.
gbrt = GradientBoostingRegressor(
    learning_rate=1,
    n_estimators=20,
    max_depth=2,
).fit(X_train, y_train)

# The regressor outputs continuous values; they are rounded to the nearest
# integer before being compared with the labels.
y_pred = gbrt.predict(X_test)
print("Train accuracy score: ", accuracy_score(y_train, np.round(gbrt.predict(X_train))))
print("Test accuracy score: ", accuracy_score(y_test, np.round(y_pred)))

### xgboost

In [None]:
import xgboost

# XGBoost regressor with the library's default hyper-parameters, trained on the
# label column. fit() returns the estimator, so it is chained onto the
# constructor.
xgb_reg = xgboost.XGBRegressor().fit(X_train, y_train)

# Predictions are continuous; round them to the nearest integer before scoring
# them as class labels.
y_pred = xgb_reg.predict(X_test)
print(accuracy_score(y_test, np.round(y_pred)))



### NN - incomplete