# In [1]:


import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report, f1_score
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn import svm
import itertools
from sklearn.naive_bayes import GaussianNB

def Kfold_for_TrainModel(X_train_data, y_train_data):
    """Select the LogisticRegression ``C`` value with the best mean recall.

    Runs 5-fold cross-validation (no shuffling) for each candidate ``C`` in
    ``[0.01, 0.1, 1, 10, 100]`` using an L1-penalised logistic regression,
    printing the recall and F1 score of every fold.

    Args:
        X_train_data: Feature DataFrame (rows are selected with ``.iloc``).
        y_train_data: Labels aligned with ``X_train_data``; a one-column
            DataFrame, a Series, or an array (flattened internally).

    Returns:
        The candidate ``C`` with the highest mean recall across folds.
    """
    # model_selection.KFold takes the number of splits, not the sample count;
    # the train/test indices are produced by split().
    fold = KFold(n_splits=5, shuffle=False)
    c_params = [0.01, 0.1, 1, 10, 100]
    y_values = np.asarray(y_train_data).ravel()

    mean_recalls = []
    for c_param in c_params:
        print('-------------------------------------------')
        print('C parameter:', c_param)
        print('-------------------------------------------')
        print('')
        recall_list = []
        for iteration, (train_idx, test_idx) in enumerate(fold.split(X_train_data), start=1):
            # The default lbfgs solver rejects the L1 penalty; liblinear supports it.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear', max_iter=10)
            lr.fit(X_train_data.iloc[train_idx, :], y_values[train_idx])
            # Predict on the DataFrame slice so the feature names match the fit.
            y_undersample_pred = lr.predict(X_train_data.iloc[test_idx, :])
            recall = recall_score(y_values[test_idx], y_undersample_pred)
            f1_score_value = f1_score(y_values[test_idx], y_undersample_pred)
            recall_list.append(recall)
            print('Iteration ', iteration, " Recall: ", recall, " F1 score: ", f1_score_value)
        print('')
        print('Average recall: ', np.mean(recall_list))
        print('')
        mean_recalls.append(np.mean(recall_list))

    result_tables = pd.DataFrame({'C_parameter': c_params, 'Mean recall score': mean_recalls})
    result_tables['Mean recall score'] = result_tables['Mean recall score'].astype('float64')
    best_c_param = result_tables.loc[result_tables['Mean recall score'].idxmax(), 'C_parameter']
    print('*********************************************************************************')
    print('Best model corresponds to C parameter = ', best_c_param)
    print('*********************************************************************************')
    return best_c_param

def showData(data):
    """Print the shape of ``data`` followed by its first rows (``head()``)."""
    print(data.shape, data.head(), sep='\n')

def dataPrepare():
    """Load every traffic CSV, label it, and return one combined DataFrame.

    ``benign_traffic.csv`` rows get ``Class`` 0; every ``mirae*`` and
    ``gafgyt*`` file gets ``Class`` 1. All files are read from the current
    working directory. The combined frame is re-indexed from 0 and summarised
    with ``showData`` before being returned.
    """
    # (file name, class label) in the order the frames are concatenated.
    labelled_sources = (
        ("benign_traffic.csv", 0),
        ("miraeudp.csv", 1),
        ("miraeack.csv", 1),
        ("miraescan.csv", 1),
        ("miraesyn.csv", 1),
        ("miraeudpplain.csv", 1),
        ("gafgytcombo.csv", 1),
        ("gafgytjunk.csv", 1),
        ("gafgytscan.csv", 1),
        ("gafgyttcp.csv", 1),
        ("gafgytudp.csv", 1),
    )
    frames = [
        pd.read_csv(csv_name).assign(Class=class_label)
        for csv_name, class_label in labelled_sources
    ]
    result = pd.concat(frames, ignore_index=True)

    showData(result)
    return result

def dataPreprocessing(data, random_state=None):
    """Build full and class-balanced (under-sampled) train/test splits.

    Positive rows (``Class == 1``) are randomly sampled without replacement
    down to the number of negative rows (``Class == 0``), then combined with
    all negative rows. Both the full data and the under-sampled data are
    split 70/30 with ``train_test_split(random_state=0)``.

    Args:
        data: DataFrame containing a ``Class`` column with values 0 and 1.
        random_state: Optional seed for the positive-row sampling. When
            ``None`` (the default) the global NumPy random state is used.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, X_train_under_sample,
        X_test_under_sample, y_train_under_sample, y_test_under_sample)``.
        The ``y`` parts are one-column DataFrames.
    """
    print('------')
    count_class = data['Class'].value_counts(sort=True).sort_index()
    print(count_class)
    print('------')

    is_feature = data.columns != 'Class'
    X = data.loc[:, is_feature]
    y = data.loc[:, ~is_feature]

    is_positive = (data['Class'] == 1).to_numpy()
    is_negative = (data['Class'] == 0).to_numpy()
    positive_sample_index = np.array(data.index[is_positive])
    negative_sample_index = np.array(data.index[is_negative])

    positive_sample_count = len(positive_sample_index)
    print("Positive sample count: ", positive_sample_count)
    print("Negative sample indices in the dataset (printing the first 5): ", negative_sample_index[:5])

    # Sampling without replacement cannot draw more rows than exist, so cap the
    # draw at the number of positives when negatives outnumber them.
    sample_size = min(len(negative_sample_index), positive_sample_count)
    sampler = np.random if random_state is None else np.random.RandomState(random_state)
    random_positive_sample_index = sampler.choice(positive_sample_index, sample_size, replace=False)
    print("Positive sample indices in the dataset (printing the first 5): ", random_positive_sample_index[:5])

    under_sample_index = np.concatenate([random_positive_sample_index, negative_sample_index])
    # These are index labels, so select with .loc (.iloc is only equivalent
    # for a default 0..n-1 index).
    under_sample_data = data.loc[under_sample_index, :]
    X_under_sample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
    y_under_sample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

    print('After under-sampling, the proportion of positive samples in the new dataset: ',
          len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))
    print('After under-sampling, the proportion of negative samples in the new dataset: ',
          len(under_sample_data[under_sample_data.Class == 0]) / len(under_sample_data))
    print('After under-sampling, the number of samples in the new dataset: ', len(under_sample_data))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    X_train_under_sample, X_test_under_sample, y_train_under_sample, y_test_under_sample = train_test_split(
        X_under_sample,
        y_under_sample,
        test_size=0.3,
        random_state=0)
    print('Training set sample count: ', len(X_train_under_sample))
    print('Testing set sample count: ', len(X_test_under_sample))
    return X_train, X_test, y_train, y_test, X_train_under_sample, X_test_under_sample, y_train_under_sample, y_test_under_sample

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is 0."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def plot_confusion_matrix(confusion_matrix, classes):
    """Plot a confusion matrix and print precision, recall and accuracy.

    Args:
        confusion_matrix: 2x2 array-like indexed ``[true, predicted]``; the
            printed metrics treat index 1 as the positive class.
        classes: Tick labels for both axes, in matrix order.

    The figure is displayed with ``plt.show()``. A metric whose denominator
    is zero is printed as ``nan``.
    """
    # Accept nested lists as well as ndarrays (.max() and [i, j] need an array).
    confusion_matrix = np.asarray(confusion_matrix)

    plt.figure()
    plt.imshow(confusion_matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text for contrast.
    thresh = confusion_matrix.max() / 2.
    for i, j in itertools.product(range(confusion_matrix.shape[0]), range(confusion_matrix.shape[1])):
        plt.text(j, i, confusion_matrix[i, j],
                 horizontalalignment="center",
                 color="white" if confusion_matrix[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out after the axis labels exist so they are included in the fit.
    plt.tight_layout()
    plt.show()

    true_negative = confusion_matrix[0, 0]
    false_positive = confusion_matrix[0, 1]
    false_negative = confusion_matrix[1, 0]
    true_positive = confusion_matrix[1, 1]
    print('Precision: ', _safe_ratio(true_positive, true_positive + false_positive))
    print('Recall: ', _safe_ratio(true_positive, true_positive + false_negative))
    print('Accuracy: ',
          _safe_ratio(true_negative + true_positive,
                      true_negative + false_positive + true_positive + false_negative))
    print('*********************************************************************************')

# Data preparation: load and label every capture file
result = dataPrepare()
# Data preprocessing: full and under-sampled train/test splits
X_train, X_test, y_train, y_test, X_train_under_sample, X_test_under_sample, y_train_under_sample, y_test_under_sample = dataPreprocessing(result)

# Model
# The logistic-regression C search is skipped here; best_c_param is fixed by
# hand and is not used by the random forest below.
# best_c_param = Kfold_for_TrainModel(X_train_under_sample, y_train_under_sample)
best_c_param = 10
# Evaluation
# NOTE: `lr` holds a random forest despite its name.
lr = RandomForestClassifier()
lr.fit(X_train_under_sample, y_train_under_sample.values.ravel())
# Predict on the DataFrame (not .values) so the feature names match the fit
y_undersample_pred = lr.predict(X_test_under_sample)
# Build the confusion matrix
conf_matrix = confusion_matrix(y_test_under_sample, y_undersample_pred)
np.set_printoptions(precision=2)
class_names = [0, 1]
plot_confusion_matrix(conf_matrix, classes=class_names)



# Model: Gaussian Naive Bayes trained on the under-sampled training split
gnb = GaussianNB()
gnb.fit(X_train_under_sample, y_train_under_sample.values.ravel())

# Evaluation
# Predict on the DataFrame (not .values) so the feature names match the fit
y_undersample_pred_nb = gnb.predict(X_test_under_sample)

# Build the confusion matrix for Naive Bayes
conf_matrix_nb = confusion_matrix(y_test_under_sample, y_undersample_pred_nb)
np.set_printoptions(precision=2)
class_names = [0, 1]
plot_confusion_matrix(conf_matrix_nb, classes=class_names)

# Print precision, recall, and accuracy for Naive Bayes
# (same formulas as plot_confusion_matrix, labelled with the model name)
print('Precision (Naive Bayes): ', conf_matrix_nb[1, 1] / (conf_matrix_nb[1, 1] + conf_matrix_nb[0, 1]))
print('Recall (Naive Bayes): ', conf_matrix_nb[1, 1] / (conf_matrix_nb[1, 1] + conf_matrix_nb[1, 0]))
print('Accuracy (Naive Bayes): ',
      (conf_matrix_nb[0, 0] + conf_matrix_nb[1, 1]) / (
                  conf_matrix_nb[0, 0] + conf_matrix_nb[0, 1] + conf_matrix_nb[1, 1] + conf_matrix_nb[1, 0]))
print('*********************************************************************************')


# IsolationForest: fitted without labels on the under-sampled training
# features, then scored against the training labels.
rng = np.random.RandomState(42)
clf = IsolationForest(max_samples="auto", random_state=rng)
clf.fit(X_train_under_sample)
pred_y = clf.predict(X_train_under_sample)
print(pred_y)
# IsolationForest returns -1 for anomalies and 1 for normal values; remap to
# the dataset's labels (anomaly -> 1, normal -> 0) in one vectorised step.
pred_y = np.where(pred_y == -1, 1, 0)
print(pred_y)
# Build the confusion matrix
conf_matrix = confusion_matrix(y_train_under_sample, pred_y)
np.set_printoptions(precision=2)
class_names = [0, 1]
plot_confusion_matrix(conf_matrix, classes=class_names)
# End

# One-Class SVM: fitted without labels on the under-sampled training
# features, then scored against the training labels.
clf = svm.OneClassSVM(nu=0.02, kernel="rbf", gamma=0.1)
clf.fit(X_train_under_sample)
pred_y = clf.predict(X_train_under_sample)
print(pred_y)
# One-Class SVM returns -1 for anomalies and 1 for normal values; remap to
# the dataset's labels (anomaly -> 1, normal -> 0) in one vectorised step.
pred_y = np.where(pred_y == -1, 1, 0)
print(pred_y)
# Build the confusion matrix
conf_matrix = confusion_matrix(y_train_under_sample, pred_y)
np.set_printoptions(precision=2)
class_names = [0, 1]
plot_confusion_matrix(conf_matrix, classes=class_names)
# End


# Recorded cell output:
# FileNotFoundError: [Errno 2] No such file or directory: 'benign_traffic.csv'