In [11]:
# First things first, some general imports:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [12]:
data_path = '../data/CICIDS2017/CSVs/TrafficLabelling/'
# One of the CSVs has an encoding issue, so use unicode_escape encoding
dfs = [pd.read_csv(os.path.join(data_path, k), encoding='unicode_escape') for k in os.listdir(data_path) if k.endswith('.csv')]
df = pd.concat(dfs)
df.head()

Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,192.168.10.3-192.168.10.50-389-33898-6,192.168.10.50,33898.0,192.168.10.3,389.0,6.0,6/7/2017 8:59,113095465.0,48.0,24.0,...,32.0,203985.5,575837.3,1629110.0,379.0,13800000.0,4277541.0,16500000.0,6737603.0,BENIGN
1,192.168.10.3-192.168.10.50-389-33904-6,192.168.10.50,33904.0,192.168.10.3,389.0,6.0,6/7/2017 8:59,113473706.0,68.0,40.0,...,32.0,178326.875,503426.9,1424245.0,325.0,13800000.0,4229413.0,16500000.0,6945512.0,BENIGN
2,8.0.6.4-8.6.0.1-0-0-0,8.6.0.1,0.0,8.0.6.4,0.0,0.0,6/7/2017 8:59,119945515.0,150.0,0.0,...,0.0,6909777.333,11700000.0,20400000.0,6.0,24400000.0,24300000.0,60100000.0,5702188.0,BENIGN
3,192.168.10.14-65.55.44.109-59135-443-6,192.168.10.14,59135.0,65.55.44.109,443.0,6.0,6/7/2017 8:59,60261928.0,9.0,7.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
4,192.168.10.3-192.168.10.14-53-59555-17,192.168.10.14,59555.0,192.168.10.3,53.0,17.0,6/7/2017 8:59,269.0,2.0,2.0,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN


In [None]:
# There are some null rows, lets drop them
df = df.dropna()
df.size

In [None]:
# Get counts of unique label values
# Note: All column titles (except for Flow ID) have a weird leading space, so we have to account for this when indexing
df[' Label'].value_counts()

In [None]:
X = df.drop([' Label'], axis=1)

# We need to find a way to account for string-valued columns -- I suspect we are losing valuable information without it
# For now, we drop non-numeric columns so that it plays nicely with the rest of the code
X = X._get_numeric_data()

# Account for data that is too large to fit in a float
X = X[np.isfinite(X).all(1)]
y = y[np.isfinite(X).all(1)]

y = df[' Label']
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = 0.8, test_size = 0.2, random_state=42)

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
# Use Synthetic Minority Over-Sampling (SMOTE) to account for class imbalances
# ~Line not currently applicable~ In particular, use SMOTE-NC because we have both numerical and categorical data (string columns like IP addresses)
from imblearn.over_sampling import SMOTE
# Label values are encoded very strangely -- I don't know why, but I had to copy and paste for the program to recognize them
smote=SMOTE(n_jobs=-1, sampling_strategy={'Bot':5000, 'Web Attack  Brute Force':5000, 'Web Attack  XSS':5000, 'Infiltration':5000, 'Web Attack  Sql Injection':5000, 'Heartbleed':5000})

X_train, y_train = smote.fit_resample(X_train, y_train)
# Print resampled counts
y_train[' Label'].value_counts

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(15), # 15 classes
    SVC(kernel="linear", C=0.025, random_state=42),
    SVC(gamma=2, C=1, random_state=42),
    GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    RandomForestClassifier(
        max_depth=5, n_estimators=64, max_features=1, random_state=42
    ),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    AdaBoostClassifier(algorithm="SAMME", random_state=42),
    QuadraticDiscriminantAnalysis(),
]

In [None]:
for model in (classifiers):
    model.fit(X_train, y_train)

In [None]:
preds = {i:[] for i in range(len(names))}
for i in range(len(names)):
    model = names[i]
    y_pred = list(model.predict(X_test))
    preds[i] = y_pred

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
for k,v in preds.items():
    y_pred = v
    print(f"##### {names[k]} (34 classes) #####")
    print('accuracy_score: ', accuracy_score(y_pred, y_test))
    print('recall_score: ', recall_score(y_pred, y_test, average='macro'))
    print('precision_score: ', precision_score(y_pred, y_test, average='macro'))
    print('f1_score: ', f1_score(y_pred, y_test, average='macro'))
    print()
    print()
    print()