In [1]:
# Location of the credit-card CSV that load_creditcard() reads; the leading
# './' makes it relative to the kernel's current working directory.
data_path = "./creditcard.csv"

In [3]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression, RidgeCV, LassoCV
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.gaussian_process.kernels import RBF
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

# Shared seed: passed as random_state to the train_test_split / DataFrame.sample
# calls and to DecisionTreeClassifier in tree() further down in this notebook.
rstate = 1

def load_creditcard():
    """Read the CSV at the module-level ``data_path`` into a DataFrame.

    Returns:
        The pandas DataFrame produced by ``pd.read_csv(data_path)``.
    """
    return pd.read_csv(data_path)

# Load the dataset once; every later cell derives its splits from `data`.
data = load_creditcard()
# Print the frame's text representation as a quick look at the loaded columns.
print(data)

            Time         V1         V2        V3        V4        V5  \
0            0.0  -1.359807  -0.072781  2.536347  1.378155 -0.338321   
1            0.0   1.191857   0.266151  0.166480  0.448154  0.060018   
2            1.0  -1.358354  -1.340163  1.773209  0.379780 -0.503198   
3            1.0  -0.966272  -0.185226  1.792993 -0.863291 -0.010309   
4            2.0  -1.158233   0.877737  1.548718  0.403034 -0.407193   
...          ...        ...        ...       ...       ...       ...   
284802  172786.0 -11.881118  10.071785 -9.834783 -2.066656 -5.364473   
284803  172787.0  -0.732789  -0.055080  2.035030 -0.738589  0.868229   
284804  172788.0   1.919565  -0.301254 -3.249640 -0.557828  2.630515   
284805  172788.0  -0.240440   0.530483  0.702510  0.689799 -0.377961   
284806  172792.0  -0.533413  -0.189733  0.703337 -0.506271 -0.012546   

              V6        V7        V8        V9  ...       V21       V22  \
0       0.462388  0.239599  0.098698  0.363787  ... -0.01830

In [4]:
## SELECTING HALF OF THE DATA
# Features are every column except the 'Class' label and the 'Time' column.
x = data.loc[:, ~data.columns.isin(['Class', 'Time'])]
y = data['Class']

# First split: keep a random 50% of the rows and set the other half aside.
x_half, x_to_ignore, y_half, y_to_ignore = train_test_split(
    x, y, test_size=0.5, random_state=rstate
)

# Second split: divide the kept half into train and test parts using
# train_test_split's default proportions.
x_half_train, x_half_test, y_half_train, y_half_test = train_test_split(
    x_half, y_half, random_state=rstate
)

In [12]:
## UNDERSAMPLING

# Split the full dataset by label: Class == 1 rows are malicious, Class == 0 rows are safe.
data_malicious = data.loc[data['Class'] == 1]
data_safe = data.loc[data['Class'] == 0]
# These counts come from the complete `data` frame (not the 50% subset built
# in the half-dataset cell), so the label says "full data".
print(f"full data malicious ratio: {data_malicious.shape[0]}:{data_safe.shape[0]}")

# Draw as many safe rows as there are malicious rows so both classes are the same size.
data_safe_undersampled = data_safe.sample(n=data_malicious.shape[0], random_state=rstate)
data_undersampled = pd.concat([data_malicious, data_safe_undersampled], axis=0)
print(f"undersampled malicious ratio: {data_malicious.shape[0]}:{data_safe_undersampled.shape[0]}")

# Features are every column except the 'Class' label and the 'Time' column.
x_us = data_undersampled.loc[:, ~data_undersampled.columns.isin(['Class', 'Time'])]
y_us = data_undersampled['Class']
x_us_train, x_us_test, y_us_train, y_us_test = train_test_split(x_us, y_us, random_state=rstate)


half data malicious ratio: 492:284315
undersampled malicious ratio: 492:492


In [14]:
## SMALL DATASET WITH SAME RATIO
# The small dataset has as many rows as the undersampled one, of which
# `malicious_number` are malicious and the remainder are safe.
small_ds_size = len(data_undersampled)
malicious_number = 20

mal_entries = data.loc[data['Class'] == 1].sample(n=malicious_number, random_state=rstate)
safe_entries = data.loc[data['Class'] == 0].sample(
    n=small_ds_size - malicious_number, random_state=rstate
)
data_small = pd.concat([mal_entries, safe_entries], axis=0)

# Count the rows per class in the assembled frame for the sanity-check printout.
small_mal_count = int((data_small['Class'] == 1).sum())
small_safe_count = int((data_small['Class'] == 0).sum())
print(f"small dataset malicious ratio = {small_mal_count}:{small_safe_count}")

# Features are every column except the 'Class' label and the 'Time' column.
x_small = data_small.loc[:, ~data_small.columns.isin(['Class', 'Time'])]
y_small = data_small['Class']
x_small_train, x_small_test, y_small_train, y_small_test = train_test_split(
    x_small, y_small, random_state=rstate
)

small dataset malicious ratio = 20:964


In [7]:
def tree():
    """Fit a single decision tree and return its score on the test split.

    Reads ``x_train``, ``y_train``, ``x_test``, ``y_test`` and ``rstate`` from
    the notebook's global scope.
    NOTE(review): the four split variables are not assigned in the cells
    visible here, so they must already exist in the kernel before calling.

    Returns:
        The value of ``DecisionTreeClassifier.score(x_test, y_test)``.
    """
    baseline = DecisionTreeClassifier(random_state=rstate).fit(x_train, y_train)
    return baseline.score(x_test, y_test)

def adaBoost(estimators, tree_depth):
    """Fit AdaBoost over decision trees and report its ROC AUC on the test split.

    Reads ``x_train``, ``y_train``, ``x_test``, ``y_test`` and ``rstate`` from
    the notebook's global scope.
    NOTE(review): the four split variables are not assigned in the cells
    visible here, so they must already exist in the kernel before calling.

    Args:
        estimators: Passed to AdaBoostClassifier as ``n_estimators``.
        tree_depth: Passed as ``max_depth`` to each base DecisionTreeClassifier.

    Returns:
        The ROC AUC from ``roc_auc_score(y_test, prediction)``, computed on the
        hard class predictions (not on probabilities).
    """
    # Seed the ensemble with the shared notebook seed, as tree() does, so
    # repeated runs give the same model.
    clf2 = AdaBoostClassifier(
        n_estimators=estimators,
        base_estimator=DecisionTreeClassifier(max_depth=tree_depth),
        random_state=rstate,
    )
    clf2.fit(x_train, y_train)

    prediction = clf2.predict(x_test)
    # confusion_matrix expects (y_true, y_pred); in that order ravel() on a
    # binary problem yields tn, fp, fn, tp.
    print(confusion_matrix(y_test, prediction).ravel())
    roc = roc_auc_score(y_test, prediction)
    print(roc)
    return roc

def stacking():
    """Fit a two-learner stacking ensemble and report its ROC AUC on the test split.

    The base learners are a decision tree and a k-nearest-neighbours
    classifier; a logistic regression combines them. Reads ``x_train``,
    ``y_train``, ``x_test``, ``y_test`` and ``rstate`` from the notebook's
    global scope.
    NOTE(review): the four split variables are not assigned in the cells
    visible here, so they must already exist in the kernel before calling.

    Returns:
        The ROC AUC from ``roc_auc_score(y_test, prediction)``, computed on the
        hard class predictions (not on probabilities).
    """
    # The first learner is registered under the key 'ridge' although it is a
    # decision tree; the key is kept unchanged because it names the estimator's
    # parameters. The tree is seeded with the shared notebook seed, as in tree().
    estimators = [
        ('ridge', DecisionTreeClassifier(random_state=rstate)),
        ("kn", KNeighborsClassifier()),
    ]
    clf = StackingClassifier(estimators=estimators, stack_method="auto", final_estimator=LogisticRegression())
    # Progress markers printed immediately before and after the fit.
    print("hohihioho")
    clf.fit(x_train, y_train)
    print("hohihioho2")
    prediction = clf.predict(x_test)
    # confusion_matrix expects (y_true, y_pred); in that order ravel() on a
    # binary problem yields tn, fp, fn, tp.
    print(confusion_matrix(y_test, prediction).ravel())
    roc = roc_auc_score(y_test, prediction)
    print(roc)
    return roc

def plot_accuracy_n_estimators():
    """Plot the ROC AUC returned by adaBoost() for 1 to 20 estimators.

    Each point comes from ``adaBoost(n, None)``, i.e. the base trees are built
    with ``max_depth=None``. The figure is shown with ``plt.show()``; nothing
    is returned.
    """
    estimator_counts = list(range(1, 21))
    auc_values = [adaBoost(count, None) for count in estimator_counts]

    plt.plot(estimator_counts, auc_values, label = "AdaBoost")
    plt.xlabel("Number of Estimators for AdaBoost")
    plt.ylabel("ROC AUC")
    plt.legend()
    plt.show()

def plot_ROC_AUC_Stacking():
    """Plot the ROC AUC returned by a single stacking() run.

    The x value is fixed at 2, the number of base learners stacking() uses, so
    the plot contains exactly one point. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    learner_counts = [2]  # Number of Learners
    auc_values = [stacking()]

    plt.plot(learner_counts, auc_values, label = "Stacking")
    plt.xlabel("Number of learners for Stacking")
    plt.ylabel("ROC AUC")
    plt.legend()
    plt.show()

# def plot_accuracy_depth_estimators():
#     depths = [x for x in range(1,101)]
#     scores = []
#     for d in depths:
#         scores.append(adaBoost(50, d))

#     plt.plot(depths, scores, label = "AdaBoost")
#     plt.plot(depths, [tree()]*len(scores), label = "Decision Tree")
#     plt.xlabel("Allowed tree depth of estimators")
#     plt.ylabel("Accuracy Score")
#     plt.legend()
#     plt.show()

# plot_accuracy_depth_estimators()
# plot_accuracy_n_estimators()
# plot_ROC_AUC_Stacking()

In [8]:
# Display names, paired by position with `classifiers` below (runClassifier
# zips the two lists), so both lists must stay in the same order.
names = [
    "KNeighbours",
    "Linear SVM",
    "Decision Tree",
    "Random Forest",
    "Neural Net MLP",
    "AdaBoost",
    "Naive Bayes",
    "Gradiant Boost",
    "XGBoost"
]

# Models compared by runClassifier. Estimators that draw random numbers during
# fitting are seeded with the shared notebook seed `rstate` so repeated runs
# produce the same metrics.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    DecisionTreeClassifier(max_depth=5, random_state=rstate),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=rstate),
    MLPClassifier(alpha=1, max_iter=1000, random_state=rstate),
    AdaBoostClassifier(random_state=rstate),
    GaussianNB(),
    GradientBoostingClassifier(n_estimators=10, max_depth=5, random_state=rstate),
    # 'binary:logistic' is the classification objective; the previous
    # 'reg:squarederror' is a regression loss and made the classifier threshold
    # raw regression outputs instead of class probabilities.
    xgb.XGBClassifier(objective='binary:logistic', colsample_bytree=1, learning_rate=0.3, max_depth=6, alpha=0)
]


def runClassifier(x_train, y_train, x_test, y_test):
    """Fit every model in ``classifiers`` and print its metrics on the test split.

    For each (name, classifier) pair taken from the module-level ``names`` and
    ``classifiers`` lists, the classifier is fitted on the training data, and
    the confusion matrix, ROC AUC, and per-class precision, recall and F1 are
    printed. After the loop, a LaTeX table body with one row per classifier is
    printed.

    Args:
        x_train: Training features passed to ``clf.fit``.
        y_train: Training labels passed to ``clf.fit``.
        x_test: Test features passed to ``clf.predict``.
        y_test: True test labels the predictions are scored against.

    Returns:
        None; all results are written to stdout.
    """
    latex = ""
    for name, clf in zip(names, classifiers):
        clf.fit(x_train, y_train)
        prediction = clf.predict(x_test)
        # average=None returns one score per class rather than a single aggregate.
        precision = precision_score(y_test, prediction, average=None)
        recall = recall_score(y_test, prediction, average=None)
        fmeasure = f1_score(y_test, prediction, average=None)

        print(f"==={name}===")
        # confusion_matrix expects (y_true, y_pred); in that order ravel() on a
        # binary problem yields tn, fp, fn, tp.
        print(confusion_matrix(y_test, prediction).ravel())
        # ROC AUC is computed on the hard class predictions, not on probabilities.
        roc = roc_auc_score(y_test, prediction)
        print(roc)
        print("Precision: ", precision )
        print("Recall: ", recall)
        print("F-measure", fmeasure )
        # Index 1 picks the second entry of each per-class array, which is the
        # score for label 1 when the labels are 0 and 1.
        string = f"{name} & {roc:.3f} & {precision[1]:.3f} & {recall[1]:.3f} & {fmeasure[1]:.3f}"
        # Backslashes are escaped explicitly: this emits a literal "\hline" and a
        # LaTeX row terminator "\\" without relying on invalid escape sequences.
        latex += f"\t \\hline \n \t {string} \\\\ \n"
    latex += "\t \\hline"
    print(latex)

In [None]:
## ==== CODE THAT GIVES RESULTS =====
# Each entry pairs the banner printed before a run with the
# (x_train, y_train, x_test, y_test) split handed to runClassifier.
experiments = [
    ("=== HALF DATASET ===", (x_half_train, y_half_train, x_half_test, y_half_test)),
    ("=== UNDERSAMPING ===", (x_us_train, y_us_train, x_us_test, y_us_test)),
    ("=== SMALL DATASET ===", (x_small_train, y_small_train, x_small_test, y_small_test)),
]

for banner, split in experiments:
    print(banner)
    runClassifier(*split)
