In [6]:
# Imports
import statistics

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [7]:
# Load the whole dataset from 'creditcard.csv' (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer='creditcard.csv')

In [8]:
# Target column: Class
target = 'Class'

# Feature columns: Amount first, then V1 ~ V28
features = ['Amount']
features.extend('V{}'.format(number) for number in range(1, 29))

# Raw (unscaled) feature frame X and target series y
X, y = data[features], data[target]

In [9]:
# Normalize the values of each feature: the features span very different ranges,
# so we bring them onto an equivalent scale to make their distributions comparable.
# Normalization procedure: (value - mean) / std
# Normalization has to be done after the split, separately on the train and test sets.
# The scaled data has zero mean and unit variance.
# from sklearn.preprocessing import MinMaxScaler
# min_max_scalar = MinMaxScaler()
def nomalization(X, mean=None, std=None):
    """Standardize every column of a DataFrame as (value - mean) / std.

    The input frame is left untouched; a new, scaled frame is returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric columns to scale.
    mean : pandas.Series, optional
        Per-column means, indexed by the column names of ``X``. Defaults to the
        column means of ``X`` itself. Pass the statistics of the training split
        here to scale a test split with the same transformation.
    std : pandas.Series, optional
        Per-column standard deviations, indexed by the column names of ``X``.
        Defaults to the (sample, ddof=1) standard deviations of ``X`` itself.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and columns as ``X``. When the statistics are
        taken from ``X``, every non-constant column has zero mean and unit
        variance.
    """
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()
    # A constant column has std 0 (and a single-row frame has std NaN); dividing
    # by that would fill the column with NaN, so such columns are only centred.
    std = std.replace(0, 1).fillna(1)
    # Vectorized and out-of-place: the caller's frame (possibly a slice of a
    # bigger frame) is never modified.
    return (X - mean) / std
print('trick')

trick


In [10]:
# Nested stratified cross-validation of a single model.
# 1. Define a model.
# 2. The stratified K-fold cross-validator maintains the percentage of samples of each class in every fold.
# 3. The splitter does not shuffle, so the folds are identical on every run; the model is given a
#    fixed random_state so that the fitted model is reproducible too (random_state=None is NOT).
# n_splits: number of folds the data is split into.

# We test on a series of models.
RANDOM_STATE = 42

model = DecisionTreeClassifier(random_state=RANDOM_STATE)
# model = KNeighborsClassifier()
# model = RandomForestClassifier()
# model = AdaBoostClassifier()
# model = SVC()

models = []        # every fitted inner-fold model, over all rounds
model_scores = []  # inner-fold accuracy of the model at the same position in `models`

skf = StratifiedKFold(n_splits=5)
for i, (train_indices_out, test_indices_out) in enumerate(skf.split(X, y), start=1):
    # train/test split - outer loop.
    # split() yields positional indices, so rows are selected with .iloc
    # (label-based .loc only works while the frame has a default 0..n-1 index).
    X_train_out = X.iloc[train_indices_out]
    y_train_out = y.iloc[train_indices_out]

    X_test_out = X.iloc[test_indices_out]
    y_test_out = y.iloc[test_indices_out]

    print("Round {}".format(i))

    # Position in the shared lists where this round's entries begin, so the
    # statistics and the model selection below only look at the current round.
    round_start = len(model_scores)

    for train_indices_in, test_indices_in in skf.split(X_train_out, y_train_out):

        # train/test split - inner loop.
        X_train_in = X_train_out.iloc[train_indices_in]
        y_train_in = y_train_out.iloc[train_indices_in]

        X_test_in = X_train_out.iloc[test_indices_in]
        y_test_in = y_train_out.iloc[test_indices_in]

        # Fit a fresh, unfitted copy with the same hyper-parameters. Re-fitting one
        # shared estimator would make every entry of `models` the same object,
        # i.e. always the most recently fitted model.
        model = clone(model)
        model.fit(X_train_in, y_train_in)

        # Add to the model list.
        models.append(model)

        # Predict
        y_pred = model.predict(X_test_in)

        # Show results
        accuracy = accuracy_score(y_test_in, y_pred)
        print(accuracy)
        model_scores.append(accuracy)
        report = classification_report(y_test_in, y_pred)
        print(report)

    # Calculate the mean and std_dev of this round's accuracy scores.
    round_scores = model_scores[round_start:]
    mean = statistics.mean(round_scores)
    std = statistics.stdev(round_scores)

    print("mean and std: {}, {}".format(mean, std))

    # Retrieve the index (into `models` / `model_scores`) of this round's best model.
    highest_score_index = round_start + round_scores.index(max(round_scores))

    # Retrieve that model.
    best_model = models[highest_score_index]

    print("highest score: {}".format(model_scores[highest_score_index]))

    # Score the selected model on the outer fold, which none of this round's
    # inner models were trained on.
    outer_accuracy = accuracy_score(y_test_out, best_model.predict(X_test_out))
    print("outer test accuracy: {}".format(outer_accuracy))



Round 1
0.999210006583
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     45491
          1       0.75      0.82      0.78        79

avg / total       1.00      1.00      1.00     45570

0.99916611806
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     45491
          1       0.78      0.72      0.75        79

avg / total       1.00      1.00      1.00     45570

0.977199411881
             precision    recall  f1-score   support

          0       1.00      0.98      0.99     45490
          1       0.06      0.84      0.11        79

avg / total       1.00      0.98      0.99     45569

0.998858848315
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     45490
          1       0.65      0.72      0.68        78

avg / total       1.00      1.00      1.00     45568

0.999188026685
             precision    recall  f1-score   support

    