In [1]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import joblib
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import pandas as pd
import time
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [2]:
features, labels = [], []
training_files = ["dataset/individual/tcp_dataset_individual_attack_1.csv",
              "dataset/individual/tcp_dataset_individual_attack_2.csv",
              "dataset/individual/old/dataset_tcp_attack_1.csv",
              "dataset/individual/old/dataset_tcp_attack_2.csv",
              "dataset/individual/tcp_dataset_individual_normal_1.csv",
              "dataset/individual/tcp_dataset_individual_normal_2.csv",
              "dataset/individual/old/dataset_tcp_normal_1.csv",
              "dataset/individual/old/dataset_tcp_normal_2.csv",
              "dataset/individual/old/dataset_tcp_normal_3.csv"]

In [3]:
for fname in training_files:
    meal = open(fname, "rt")
    for line in meal:
        data_list = line.rsplit(",")
        if(len(data_list) == 14):
            data_list[(len(data_list)-1)]=data_list[(len(data_list)-1)].replace('\n', '')
            features.append(data_list[:(len(data_list)-1)])
            labels.append(data_list[(len(data_list)-1)])
        elif(len(data_list) == 15):
            data_list.pop(3)
            data_list[(len(data_list)-1)]=data_list[(len(data_list)-1)].replace('\n', '')
            features.append(data_list[:(len(data_list)-1)])
            labels.append(data_list[(len(data_list)-1)])
    meal.close()

In [4]:
print(f"Size of feature dataset : {len(features)}")
print("Features first and last entries:\n\t", end = "")
print(features)
print("Labels first and last entries:\n\t", end = "")
print(labels)

Size of feature dataset : 933
Features first and last entries:
	[['0.8755519298449245', '0.0', '28.0', '654477784.9398003', '607147907.414416', '0.0', '0.0', '0.0', '0.0', '19799.543905240946', '0.0', '0.0', '97'], ['1.6350172610206875', '0.0', '56.00223209837265', '601403641.6735896', '606759430.7742991', '0.0', '0.0', '0.0', '0.0', '18454.14166013035', '0.0', '0.0', '194'], ['1.6603023925601164', '0.0', '58.60034129593445', '600890934.6640853', '626257127.3073899', '0.0', '0.0', '0.0', '0.0', '18234.942528203028', '0.0', '0.0', '203'], ['1.5654420110090757', '0.0', '57.73430522661548', '619134671.9512614', '605168756.9246663', '0.0', '0.0', '0.0', '0.0', '19358.233631083185', '0.0', '0.0', '200'], ['1.5178251319498257', '0.0', '83.32038098543886', '636206633.2874775', '631438307.4765562', '0.0', '0.0', '0.06867951064544758', '35.16390945046916', '18706.4977475877', '0.0', '0.0', '211'], ['1.6395917946462064', '0.0', '17968.105925687098', '618273120.9997284', '633796324.4404662', '0.0

In [5]:
timestamp_std = []
ip_src_std = []
port_src_std = []
seq_std = []
ack_std = []
for data in features:
    timestamp_std.append(float(data[0]))
    ip_src_std.append(float(data[1]))
    port_src_std.append(float(data[2]))
    seq_std.append(float(data[3]))
    ack_std.append(float(data[4]))
a = np.array(timestamp_std)
b = np.array(ip_src_std)
c = np.array(port_src_std)
d = np.array(seq_std)
e = np.array(ack_std)
print(np.std(a))
print(np.std(b))
print(np.std(c))
print(np.std(d))
print(np.std(e))

0.4910641044212609
20.396029661233744
6837.072307797152
355674713.1445296
402820875.5982184


In [6]:
data_counter = Counter(labels)
print(data_counter)

Counter({'0': 615, '1': 318})


In [7]:
over = SMOTE(sampling_strategy=1)
under = RandomUnderSampler()
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
features, labels = pipeline.fit_resample(features, labels)

  X = check_array(


In [8]:
data_counter = Counter(labels)
print(data_counter)

Counter({'0': 615, '1': 615})


In [9]:
print(f"Size of feature dataset : {len(features)}")
print("Features first and last entries:\n\t", end = "")
print(features)
print("Labels first and last entries:\n\t", end = "")
print(labels)

Size of feature dataset : 1230
Features first and last entries:
	[[0.25228296534028055, 0.0, 11.120501587408526, 1158156393.5051055, 1448103990.3984392, 0.6420453428086074, 0.0, 31.176486866440438, 20462.59791735888, 17778.38733153263, 0.0, 49.68980165877994, 60.0], [0.05388340303818221, 0.0, 587.6843025861529, 60544649.416113526, 1222854790.7640784, 0.628539361054709, 0.0, 30.710114150082724, 20031.1566038731, 11963.909952771488, 0.0, 48.461770601955344, 9.0], [0.045325412133046486, 0.0, 1378.0129438685935, 819806255.1840686, 1102537760.404277, 0.6998542122237652, 0.0, 32.93283207638585, 22303.8871783666, 18051.155196229614, 0.0, 53.14400920410234, 7.0], [0.12796453604807792, 0.0, 3757.3708893320604, 500623755.89911485, 793252035.307875, 0.43588989435406733, 0.0, 22.597344976788758, 13890.380897855177, 21378.28799337075, 0.0, 37.21756977557778, 20.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], [1.9588967558467751, 0.0, 4977.363157335418, 1050396448.7626569, 135

In [10]:
timestamp_std = []
ip_src_std = []
port_src_std = []
seq_std = []
ack_std = []
for data in features:
    timestamp_std.append(data[0])
    ip_src_std.append(data[1])
    port_src_std.append(data[2])
    seq_std.append(data[3])
    ack_std.append(data[4])
a = np.array(timestamp_std)
b = np.array(ip_src_std)
c = np.array(port_src_std)
d = np.array(seq_std)
e = np.array(ack_std)
print(np.std(a))
print(np.std(b))
print(np.std(c))
print(np.std(d))
print(np.std(e))

0.44441762500113907
22.084918398122156
7212.950652536937
331236221.34865373
398318088.90076524


In [11]:
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.20, stratify=labels, random_state = 0)

In [12]:
sc = StandardScaler()
X_train = sc.fit_transform(features_train)
X_test = sc.transform(features_test)

In [13]:
features_train

[[1.7067831430888252,
  0.0,
  138.14265180230296,
  1219463742.7981462,
  1299566935.932573,
  0.872958567687868,
  0.0,
  34.59057912101698,
  25063.75120462174,
  20256.00879289045,
  0.0,
  58.627935385155354,
  272.0],
 [1.5299663091670404,
  2.0586853220437766,
  7024.8144015469525,
  826324589.885335,
  1434147115.983254,
  0.8489401821637669,
  0.0,
  32.21586571521427,
  24159.382008695095,
  18670.73614719986,
  0.0,
  57.14087324398418,
  46.0],
 [1.410776149207322,
  0.0,
  3892.7009123144776,
  1273459982.2539532,
  1391204274.2170875,
  0.8510341313630148,
  0.0,
  34.61801521295154,
  24994.517185830246,
  18975.07670392432,
  0.0,
  58.891985148048725,
  843.0],
 [1.4592311077450673,
  0.0,
  2178.598699430939,
  1231793360.2533007,
  1409501782.2299013,
  0.9483033722073847,
  0.0,
  19.904105146918685,
  30221.48016887714,
  18661.91308672898,
  0.0,
  3.793213488829539,
  334.0],
 [1.4525607018531068,
  0.0,
  761.4567756365177,
  1167289708.1568053,
  1433641265.276