In [11]:
# Imports
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

import pickle
import pandas as pd
import numpy as np
import statistics


In [12]:
# Read the transactions table from the CSV file in the working directory.
data = pd.read_csv('creditcard.csv')

# Text used to mark a round in the result logs: "Round ", an upward-pointing
# triangle (U+25B2), then a newline.
up_icon = '\u25b2'
string_seperater = ''.join(['Round ', up_icon, '\n'])
# Debug aid: print(string_seperater.encode('utf8'))

In [13]:
# Label column: Class.
# Predictor columns: Amount followed by V1, V2, ..., V28.
target = 'Class'
features = ['Amount']
features.extend('V%d' % number for number in range(1, 29))

# Unprocessed feature matrix X and label vector y, taken directly from `data`.
X = data.loc[:, features]
y = data.loc[:, target]

In [14]:
# Nested stratified cross-validation.
#
# Outer loop: StratifiedKFold splits (X, y) into 5 train/test folds while
#   maintaining the percentage of samples of each class in every fold.
# Inner loop: the outer-training part is split into 5 folds again; one model
#   is fitted per inner fold and scored on that fold's held-out part.
# Selection: the inner model with the highest held-out accuracy is evaluated
#   on the outer test fold.
#
# n_splits: how many folds the data is split into.
# The folds are built without shuffling, so they are already deterministic.
# random_state only matters together with shuffle=True; passing it with
# shuffle=False has no effect and is rejected by recent scikit-learn
# releases, so it is not passed to StratifiedKFold here.
#
# NOTE(review): the classes are heavily imbalanced (see the "support" column
# of the reports), so accuracy is dominated by class 0 - consider selecting
# the best model on recall/F1 of class 1 instead.

RANDOM_STATE = 1234  # seed for the estimator, so repeated runs give the same scores

models = []        # models fitted during the current outer round
model_scores = []  # held-out accuracy of each entry in `models`

skf = StratifiedKFold(n_splits=5)

# The context manager closes both files even if fitting/predicting raises.
# 'training_results' is overwritten on every run; 'testing_results' is opened
# in append mode, so results written by earlier runs are kept.
with open('training_results', 'w') as train_results_file, \
        open('testing_results', 'a') as test_results_file:

    outer_splits = skf.split(X, y)
    for i, (train_indices_out, test_indices_out) in enumerate(outer_splits, start=1):

        # Fresh containers for this round, so the best-model lookup below
        # only ever sees models trained on this round's data.
        models = []
        model_scores = []

        # train/test split - outer loop.
        # split() yields positional indices, hence .iloc rather than .loc.
        X_train_out = X.iloc[train_indices_out]
        y_train_out = y.iloc[train_indices_out]

        X_test_out = X.iloc[test_indices_out]
        y_test_out = y.iloc[test_indices_out]

        print("------ Round {} ------".format(i))
        print('Training Phase ... \n')

        inner_splits = skf.split(X_train_out, y_train_out)
        for j, (train_indices_in, test_indices_in) in enumerate(inner_splits, start=1):

            # Alternatives to swap in here: DecisionTreeClassifier(),
            # KNeighborsClassifier(), RandomForestClassifier(), SVC().
            model = AdaBoostClassifier(random_state=RANDOM_STATE)

            # train/test split - inner loop (positions relative to X_train_out).
            X_train_in = X_train_out.iloc[train_indices_in]
            y_train_in = y_train_out.iloc[train_indices_in]

            X_test_in = X_train_out.iloc[test_indices_in]
            y_test_in = y_train_out.iloc[test_indices_in]

            # Fit on the inner-training part and keep the fitted model.
            model.fit(X_train_in, y_train_in)
            models.append(model)

            # Score on the inner held-out part. The values labelled "Training"
            # below are therefore held-out scores, and "CM" is the text
            # classification report, not a confusion matrix.
            y_pred_in = model.predict(X_test_in)
            accuracy_in = accuracy_score(y_test_in, y_pred_in)
            cm_in = classification_report(y_test_in, y_pred_in)

            model_scores.append(accuracy_in)

            print('Training Model {}'.format(j))
            print('Training Accuracy: {}'.format(accuracy_in))
            print('Training CM: \n', cm_in)

            train_results_file.write(str(accuracy_in) + '\n')

        # Marks the end of this round in the training log.
        train_results_file.write('---------\n')

        # Index of the model with the highest held-out accuracy
        # (the first one wins on ties).
        highest_score_index = model_scores.index(max(model_scores))
        best_model = models[highest_score_index]

        # Evaluate this round's best model on the outer test fold.
        y_pred_out = best_model.predict(X_test_out)
        accuracy_out = accuracy_score(y_test_out, y_pred_out)
        cm_out = classification_report(y_test_out, y_pred_out)

        test_results_file.write('{}\n'.format(accuracy_out))
        print('Best Model is: {}'.format(highest_score_index + 1))
        print("Testing Accuracy round {}: {}".format(i, accuracy_out))
        print(cm_out)

    # Marks the end of this run in the testing log.
    test_results_file.write('---------\n')


------ Round 1 ------
Training Phase ... 

Training Model 1
Training Accuracy: 0.9995172262453369
Training CM: 
              precision    recall  f1-score   support

          0       1.00      1.00      1.00     45491
          1       0.94      0.77      0.85        79

avg / total       1.00      1.00      1.00     45570

Training Model 2
Training Accuracy: 0.9993416721527321
Training CM: 
              precision    recall  f1-score   support

          0       1.00      1.00      1.00     45491
          1       0.92      0.68      0.78        79

avg / total       1.00      1.00      1.00     45570

Training Model 3
Training Accuracy: 0.9818736421690184
Training CM: 
              precision    recall  f1-score   support

          0       1.00      0.98      0.99     45490
          1       0.08      0.87      0.14        79

avg / total       1.00      0.98      0.99     45569

Training Model 4
Training Accuracy: 0.9991660814606742
Training CM: 
              precision    recall

Best Model is: 2
Testing Accuracy round 5: 0.9989817594494479
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     56863
          1       0.79      0.55      0.65        98

avg / total       1.00      1.00      1.00     56961

