# 5.1. Flow Classification Task - Hosts

We use the **CTU-13 dataset, malware capture 43**. This notebook attempts to classify hosts (the dataset is grouped by source IP).

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# (e.g. the local helper modules) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

import numpy as np
import pandas as pd

# Make the parent of the working directory importable so the helper modules resolve.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
from global_helper import *
from helper import *
from itertools import product

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

## Load data

In [3]:
%%time

# Read the labeled netflow capture and preprocess its columns (takes ~1.5min).
# NOTE(review): no IP address is passed to load_data here, so presumably the
# whole capture is loaded rather than one host's traffic — confirm in load_data.
df = load_data("../data/capture20110811.pcap.netflow.labeled")

Wall time: 1min 29s


In [4]:
df.head(5)

Unnamed: 0,duration,protocol,flags,tos,packets,bytes,flows,label,src_ip,src_port,dest_ip,dest_port
2011-08-11 10:10:00.003,0.967,UDP,INT,0,2,135,1,Background,89.31.8.11,23929,147.32.84.229,13363
2011-08-11 10:10:00.003,0.967,UDP,INT,0,2,276,1,Background,147.32.84.229,13363,89.31.8.11,23929
2011-08-11 10:10:00.006,0.0,UDP,INT,0,1,62,1,Background,208.88.186.6,34042,147.32.84.229,13363
2011-08-11 10:10:00.008,0.0,UDP,INT,0,1,78,1,Background,92.118.218.77,55246,147.32.84.229,13363
2011-08-11 10:10:00.009,0.0,UDP,INT,0,1,72,1,Background,182.185.139.181,10223,147.32.84.229,13363


## Data preprocessing
In this section, we remove background data, drop irrelevant columns, convert string categories to numbers, and correct the class imbalance.

In [None]:
# Build the train/test feature and label splits from the loaded flows.
# NOTE(review): preprocess_df_hosts is not defined in this notebook (presumably
# star-imported from a helper module); the cleaning steps described in this
# section are assumed to happen inside it — confirm.
X_train, X_test, y_train, y_test = preprocess_df_hosts(df)

In [31]:
# Tally the class counts over both splits. Labels are summed to count botnet
# entries, so botnet is presumably encoded as 1 — confirm in preprocess_df_hosts.
num_total = sum(len(labels) for labels in (y_train, y_test))
num_botnet = sum(np.sum(labels) for labels in (y_train, y_test))
num_legit = num_total - num_botnet

imbalance_summary = "Class imbalance (botnet vs legitimate): %d / %d" % (num_botnet, num_legit)
display(imbalance_summary)

'Class imbalance (botnet vs legitimate): 266 / 310'

## Classification
In this section, we experiment with multiple classifiers.

Results are also saved in `results_flowclassification_smote_hosts.txt`.

In [36]:
# Candidate models, keyed by a human-readable label.
# All estimator classes are imported once, in the imports cell at the top of
# the notebook, rather than being re-imported here.

# Fixed seed for every estimator below that accepts one, so that re-running
# the notebook reproduces the same scores.
CLASSIFIER_RANDOM_STATE = 42

classifiers = {
    "2-Nearest Neighbors": KNeighborsClassifier(2),
    "3-Nearest Neighbors": KNeighborsClassifier(3),
    "4-Nearest Neighbors": KNeighborsClassifier(4),
    "5-Nearest Neighbors": KNeighborsClassifier(5),
    "SVM (Linear)": LinearSVC(random_state=CLASSIFIER_RANDOM_STATE),
    "SVM (RBF)": SVC(),
    "Decision Tree": DecisionTreeClassifier(
        max_depth=5, random_state=CLASSIFIER_RANDOM_STATE
    ),
    "Random Forest": RandomForestClassifier(
        max_depth=5,
        n_estimators=10,
        max_features=1,
        random_state=CLASSIFIER_RANDOM_STATE,
    ),
    "AdaBoost": AdaBoostClassifier(random_state=CLASSIFIER_RANDOM_STATE),
    # Currently disabled:
    # "Naive Bayes": GaussianNB()
}

### Evaluate classifiers

In [38]:
# Evaluate every classifier on the test split: once on the original training
# data (ratio 0, no oversampling) and once per SMOTE sampling ratio.

# Fixed seed so the synthetic samples SMOTE generates are the same on every run.
SMOTE_RANDOM_STATE = 42

for smote_ratio in [0, .9, 1]:

    print("\n\n*****\nSMOTE: %.3f\n*****\n\n" % smote_ratio, flush=True)

    if smote_ratio > 0:
        # Only the training split is resampled; X_test / y_test are left as-is.
        oversampler = SMOTE(
            sampling_strategy=smote_ratio, random_state=SMOTE_RANDOM_STATE
        )
        X, y = oversampler.fit_resample(X_train, y_train)
    else:
        X, y = X_train, y_train

    display("Botnet samples: %d / %d" % (np.sum(y), len(y)))

    # The same estimator objects are refit for each ratio.
    for clf in classifiers.values():

        print("\n\nRunning: %s\n" % clf, flush=True)

        # Fit classifier
        clf.fit(X, y)

        # Predict labels for the test set
        y_pred = clf.predict(X_test)

        # Print performance
        print(classification_report(y_test, y_pred, target_names=CLASS_LABEL_LIST), flush=True)
        print_cm(confusion_matrix(y_test, y_pred), CLASS_LABEL_LIST)



*****
SMOTE: 0.000
*****




'Botnet samples: 204 / 432'



Running: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=2, p=2,
           weights='uniform')

              precision    recall  f1-score   support

  LEGITIMATE       0.90      0.99      0.94        82
      Botnet       0.98      0.85      0.91        62

   micro avg       0.93      0.93      0.93       144
   macro avg       0.94      0.92      0.93       144
weighted avg       0.94      0.93      0.93       144

               LEGITIMATE     Botnet 
    LEGITIMATE         81          1 
        Botnet          9         53 


Running: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

              precision    recall  f1-score   support

  LEGITIMATE       0.94      0.99      0.96        82
      Botnet       0.98      0.92      0.95        62

   micro avg       0.96      0.96    



'Botnet samples: 205 / 433'



Running: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=2, p=2,
           weights='uniform')

              precision    recall  f1-score   support

  LEGITIMATE       0.90      0.99      0.94        82
      Botnet       0.98      0.85      0.91        62

   micro avg       0.93      0.93      0.93       144
   macro avg       0.94      0.92      0.93       144
weighted avg       0.94      0.93      0.93       144

               LEGITIMATE     Botnet 
    LEGITIMATE         81          1 
        Botnet          9         53 


Running: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

              precision    recall  f1-score   support

  LEGITIMATE       0.94      0.99      0.96        82
      Botnet       0.98      0.92      0.95        62

   micro avg       0.96      0.96    



'Botnet samples: 228 / 456'



Running: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=2, p=2,
           weights='uniform')

              precision    recall  f1-score   support

  LEGITIMATE       0.90      0.99      0.94        82
      Botnet       0.98      0.85      0.91        62

   micro avg       0.93      0.93      0.93       144
   macro avg       0.94      0.92      0.93       144
weighted avg       0.94      0.93      0.93       144

               LEGITIMATE     Botnet 
    LEGITIMATE         81          1 
        Botnet          9         53 


Running: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

              precision    recall  f1-score   support

  LEGITIMATE       0.94      0.99      0.96        82
      Botnet       0.98      0.92      0.95        62

   micro avg       0.96      0.96    

