# Synthetic Data Experimentation

All testing done in this notebook is for proof of concept. We still need to decide exactly what our definition of "Darknet" is.
<br>
The following sections experiment with undersampling, oversampling, and synthetic generation. Furthermore, since Linear Discriminant Analysis runs much faster than Random Forest, the bulk of the SMOTE classifiers are tested on LD instead of RF classifiers

In [30]:
import numpy as np
import pandas as pd

# Read csv files into dataframes
X=pd.read_csv("../Data/original/cleaned_X.csv")
y=pd.read_csv("../Data/original/cleaned_y.csv")

# This notebook focuses on classification of darknet vs non-darknet
y=y["Label_dark"]

In [42]:
print(f"X dimensions: {X.shape}, y dimensions {y.shape}")

X dimensions: (117024, 64), y dimensions (117024,)


# Baseline Random Forest Test

In [49]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier= RandomForestClassifier(n_estimators=10)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("AUC:",metrics.roc_auc_score(y_test, y_pred))
print("F1:",metrics.f1_score(y_test, y_pred))

Accuracy: 0.9791070284127323
AUC: 0.964880926718256
F1: 0.949363156259708


# Perform random undersampling

In [27]:
from imblearn.under_sampling import RandomUnderSampler

under = RandomUnderSampler()

X_und,y_und=under.fit_resample(X,y)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X_und, y_und, test_size=0.2)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("AUC:",metrics.roc_auc_score(y_test, y_pred))
print("F1:",metrics.f1_score(y_test, y_pred))

Accuracy: 0.9703319502074689
AUC: 0.9703462102487002
F1: 0.9703381041277743


In [40]:
print(f"X dimensions: {X_und.shape}, y dimensions {y_und.shape}")

X dimensions: (48196, 64), y dimensions (48196,)


In [19]:
X_und.to_csv("../Data/synthetic/RandomUndersampling/using_darknet_labelX.csv")
y_und.to_csv("../Data/synthetic/RandomUndersampling/using_darknet_labely.csv")

# Perform random oversampling

In [33]:
from imblearn.over_sampling import RandomOverSampler

over = RandomOverSampler()

X_over,y_over=over.fit_resample(X,y)

In [35]:
rf_classifier = RandomForestClassifier(n_estimators=10)

X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("AUC:",metrics.roc_auc_score(y_test, y_pred))
print("F1:",metrics.f1_score(y_test, y_pred))

Accuracy: 0.9910683059374243
AUC: 0.9911047536052024
F1: 0.9910723889426697


In [41]:
print(f"X dimensions: {X_over.shape}, y dimensions {y_over.shape}")

X dimensions: (185852, 64), y dimensions (185852,)


In [22]:
X_over.to_csv("../Data/synthetic/RandomOversampling/using_darknet_labelX.csv")
y_over.to_csv("../Data/synthetic/RandomOversampling/using_darknet_labely.csv")

# Direct undersampling
### Using Edited Nearest Neighbors, Instance Hardness Threshold, and TomekLinks
### This section can be removed because it is included in the n_sampling later. Keeping it for the moment to show different ways to do things

In [24]:
from imblearn.under_sampling import EditedNearestNeighbours,InstanceHardnessThreshold,TomekLinks 

under_samp_models=[EditedNearestNeighbours(),InstanceHardnessThreshold(),TomekLinks()]
under_samp_names=["EditedNearestNeighbours","InstanceHardnessThreshold", "TomekLinks"]

for i,under_samp_model in enumerate(under_samp_models):
    X_und_dir,y_und_dir=under_samp_model.fit_resample(X,y)
    X_und_dir.to_csv(f"../Data/synthetic/DirectedUndersampling/{under_samp_names[i]}X.csv")
    y_und_dir.to_csv(f"../Data/synthetic/DirectedUndersampling/{under_samp_names[i]}y.csv")

# Testing a single SMOTE variant with Random Forest

In [4]:
# Convert the data and target datasets to numpy arrays
X = X.to_numpy()
y = y.to_numpy()

In [5]:
# The smote_variants package wants specific labels for the data
dataset= {'data': X, 'target': y, 'name': 'darknet'}

In [6]:
import os.path
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import smote_variants as sv
import sklearn.datasets as datasets
import smote_variants as sv 
from sklearn.model_selection import train_test_split
from sklearn import metrics


cache_path= "../Cache/SmoteCache"

if not os.path.exists(cache_path):
    os.makedirs(cache_path)

In [7]:
rf_classifier= RandomForestClassifier(n_estimators=10)

# samp_obj and cl_obj contain the oversampling and classifier objects which give the
# best performance together
# Performs RepeatedStratifiedKFold with 5 splits and 3 repeats 
samp_obj, cl_obj= sv.model_selection(dataset= dataset,
                                        samplers= sv.get_n_quickest_oversamplers(1),
                                        classifiers= [rf_classifier],
                                        cache_path= cache_path,
                                        n_jobs= 5,
                                        max_samp_par_comb= 35)

2022-02-12 13:45:11,504:INFO:dataset: darknet, samplings_available: False, evaluations_available: False
2022-02-12 13:45:11,504:INFO:doing the folding
2022-02-12 13:45:11,505:INFO:Folding reading from file folding_darknet.pickle
2022-02-12 13:45:12,068:INFO:do the samplings
2022-02-12 13:45:12,068:INFO:create sampling objects, random_state: 
2022-02-12 13:45:12,069:INFO:samplers: [<class 'smote_variants._smote_variants.SPY'>]
2022-02-12 13:45:12,069:INFO:[{'n_neighbors': 3, 'threshold': 0.3}, {'n_neighbors': 3, 'threshold': 0.5}, {'n_neighbors': 3, 'threshold': 0.7}, {'n_neighbors': 5, 'threshold': 0.3}, {'n_neighbors': 5, 'threshold': 0.5}, {'n_neighbors': 5, 'threshold': 0.7}, {'n_neighbors': 7, 'threshold': 0.3}, {'n_neighbors': 7, 'threshold': 0.5}, {'n_neighbors': 7, 'threshold': 0.7}]
2022-02-12 13:45:12,070:INFO:random_indices: [5 7 8 6 0 1 2 4 3]
2022-02-12 13:45:12,070:INFO:[{'n_neighbors': 5, 'threshold': 0.7}, {'n_neighbors': 7, 'threshold': 0.5}, {'n_neighbors': 7, 'thresho

["('darknet', OrderedDict([('n_neighbors', 5), ('random_state', None), ('threshold', 0.3)]), 'RandomForestClassifier', OrderedDict([('bootstrap', True), ('ccp_alpha', 0.0), ('class_weight', None), ('criterion', 'gini'), ('max_depth', None), ('max_features', 'auto'), ('max_leaf_nodes', None), ('max_samples', None), ('min_impurity_decrease', 0.0), ('min_impurity_split', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('min_weight_fraction_leaf', 0.0), ('n_estimators', 10), ('n_jobs', None), ('oob_score', False), ('random_state', None), ('verbose', 0), ('warm_start', False)]))"]
["('darknet', OrderedDict([('n_neighbors', 5), ('random_state', None), ('threshold', 0.5)]), 'RandomForestClassifier', OrderedDict([('bootstrap', True), ('ccp_alpha', 0.0), ('class_weight', None), ('criterion', 'gini'), ('max_depth', None), ('max_features', 'auto'), ('max_leaf_nodes', None), ('max_samples', None), ('min_impurity_decrease', 0.0), ('min_impurity_split', None), ('min_samples_leaf', 1), ('mi

2022-02-12 14:10:39,352:INFO:concatenating the results
2022-02-12 14:10:39,400:INFO:aggregating the results


In [8]:
print(f"The best outcome is the {samp_obj} variant with the {cl_obj} classifier")

The best outcome is the ('SPY', "{'n_neighbors': 5, 'threshold': 0.3, 'n_jobs': 1}") variant with the RandomForestClassifier(n_estimators=10) classifier


In [13]:
# training the best techniques using the entire dataset
X_samp, y_samp= samp_obj.sample(dataset['data'], dataset['target'])

2022-02-12 14:13:04,589:INFO:SPY: Running sampling via ('SPY', "{'n_neighbors': 5, 'threshold': 0.3, 'n_jobs': 1}")


In [24]:
X_samp_df = pd.DataFrame(X_samp, columns = ['Src Port', 'Dst Port', 'Protocol', 'Flow Duration', 'Total Fwd Packet',
       'Total Bwd packets', 'Total Length of Fwd Packet',
       'Total Length of Bwd Packet', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count',
       'Down/Up Ratio', 'Average Packet Size', 'Fwd Segment Size Avg',
       'Bwd Segment Size Avg', 'Bwd Packet/Bulk Avg', 'Bwd Bulk Rate Avg',
       'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Bytes',
       'FWD Init Win Bytes', 'Bwd Init Win Bytes', 'Fwd Act Data Pkts',
       'Fwd Seg Size Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min'])
y_samp_df = pd.DataFrame(y_samp, columns = ['Darknet'])

In [51]:
X_samp_df.to_csv("../Data/synthetic/SMOTE/best_versionX.csv")
y_samp_df.to_csv("../Data/synthetic/SMOTE/best_versiony.csv")

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X_samp_df, y_samp_df, test_size=0.2)
cl_obj.fit(X_samp, y_samp)

RandomForestClassifier(n_estimators=10)

In [48]:
y_pred = cl_obj.predict(X_test)

In [53]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("AUC:",metrics.roc_auc_score(y_test, y_pred))
print("F1:",metrics.f1_score(y_test, y_pred))


Accuracy: 0.9979918820764794
AUC: 0.9959099117525652
F1: 0.9951330640985814


# Baseline Linear Discriminant Analysis Test

In [53]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

ld_classifier = LinearDiscriminantAnalysis()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
ld_classifier.fit(X_train, y_train)

y_pred = ld_classifier.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("AUC:",metrics.roc_auc_score(y_test, y_pred))
print("F1:",metrics.f1_score(y_test, y_pred))

Accuracy: 0.8810937833796197
AUC: 0.7981089733345222
F1: 0.6936034349884399


# Testing 10 of the 85 SMOTE Variants with Linear Discriminant Analysis

In [2]:
# Convert the data and target datasets to numpy arrays
X = X.to_numpy()
y = y.to_numpy()

In [3]:
# The smote_variants package wants specific labels for the data
dataset= {'data': X, 'target': y, 'name': 'darknet'}

In [4]:
import os.path
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import smote_variants as sv
import sklearn.datasets as datasets
import smote_variants as sv 
from sklearn.model_selection import train_test_split
from sklearn import metrics


cache_path= "/mnt/sda1"

if not os.path.exists(cache_path):
    os.makedirs(cache_path)

In [5]:
ld_classifier= LinearDiscriminantAnalysis()

# samp_obj and cl_obj contain the oversampling and classifier objects which give the
# best performance together
# Performs RepeatedStratifiedKFold with 5 splits and 3 repeats 
samp_obj, cl_obj= sv.model_selection(dataset= dataset,
                                        samplers= sv.get_n_quickest_oversamplers(10),
                                        classifiers= [ld_classifier],
                                        cache_path= cache_path,
                                        n_jobs= 5,
                                        max_samp_par_comb= 25)

2022-02-12 19:10:06,223:INFO:dataset: darknet, samplings_available: False, evaluations_available: False
2022-02-12 19:10:06,223:INFO:doing the folding
2022-02-12 19:10:06,224:INFO:Folding doing folding folding_darknet.pickle
2022-02-12 19:10:06,555:INFO:Folding dumping to file folding_darknet.pickle
2022-02-12 19:10:07,654:INFO:do the samplings
2022-02-12 19:10:07,654:INFO:create sampling objects, random_state: 
2022-02-12 19:10:07,655:INFO:samplers: [<class 'smote_variants._smote_variants.SPY'>, <class 'smote_variants._smote_variants.OUPS'>, <class 'smote_variants._smote_variants.SMOTE_D'>, <class 'smote_variants._smote_variants.NT_SMOTE'>, <class 'smote_variants._smote_variants.Gazzah'>, <class 'smote_variants._smote_variants.ROSE'>, <class 'smote_variants._smote_variants.NDO_sampling'>, <class 'smote_variants._smote_variants.SMOTE'>, <class 'smote_variants._smote_variants.Borderline_SMOTE1'>, <class 'smote_variants._smote_variants.Borderline_SMOTE2'>]
2022-02-12 19:10:07,655:INFO:[{

2022-02-12 19:10:07,678:INFO:{'k': 7, 'proportion': 0.1, 'random_state': None}
2022-02-12 19:10:07,679:INFO:Sampling sampler parameter string OrderedDict([('k', 7), ('proportion', 0.1), ('random_state', None)])
2022-02-12 19:10:07,679:INFO:{'k': 7, 'proportion': 0.25, 'random_state': None}
2022-02-12 19:10:07,679:INFO:Sampling sampler parameter string OrderedDict([('k', 7), ('proportion', 0.25), ('random_state', None)])
2022-02-12 19:10:07,679:INFO:{'k': 3, 'proportion': 0.5, 'random_state': None}
2022-02-12 19:10:07,680:INFO:Sampling sampler parameter string OrderedDict([('k', 3), ('proportion', 0.5), ('random_state', None)])
2022-02-12 19:10:07,680:INFO:{'k': 7, 'proportion': 0.5, 'random_state': None}
2022-02-12 19:10:07,680:INFO:Sampling sampler parameter string OrderedDict([('k', 7), ('proportion', 0.5), ('random_state', None)])
2022-02-12 19:10:07,681:INFO:{'k': 5, 'proportion': 2.0, 'random_state': None}
2022-02-12 19:10:07,681:INFO:Sampling sampler parameter string OrderedDict(

2022-02-12 19:10:07,698:INFO:{'n_components': 5, 'proportion': 0.25, 'random_state': None}
2022-02-12 19:10:07,698:INFO:Sampling sampler parameter string OrderedDict([('n_components', 5), ('proportion', 0.25), ('random_state', None)])
2022-02-12 19:10:07,699:INFO:{'n_components': 2, 'proportion': 0.25, 'random_state': None}
2022-02-12 19:10:07,699:INFO:Sampling sampler parameter string OrderedDict([('n_components', 2), ('proportion', 0.25), ('random_state', None)])
2022-02-12 19:10:07,699:INFO:{'n_components': 2, 'proportion': 1.0, 'random_state': None}
2022-02-12 19:10:07,699:INFO:Sampling sampler parameter string OrderedDict([('n_components', 2), ('proportion', 1.0), ('random_state', None)])
2022-02-12 19:10:07,700:INFO:{'n_components': 5, 'proportion': 0.75, 'random_state': None}
2022-02-12 19:10:07,700:INFO:Sampling sampler parameter string OrderedDict([('n_components', 5), ('proportion', 0.75), ('random_state', None)])
2022-02-12 19:10:07,700:INFO:{'n_components': 3, 'proportion':

2022-02-12 19:10:07,724:INFO:{'T': 0.5, 'n_neighbors': 3, 'proportion': 1.5, 'random_state': None}
2022-02-12 19:10:07,724:INFO:Sampling sampler parameter string OrderedDict([('T', 0.5), ('n_neighbors', 3), ('proportion', 1.5), ('random_state', None)])
2022-02-12 19:10:07,725:INFO:{'T': 0.5, 'n_neighbors': 7, 'proportion': 0.75, 'random_state': None}
2022-02-12 19:10:07,725:INFO:Sampling sampler parameter string OrderedDict([('T', 0.5), ('n_neighbors', 7), ('proportion', 0.75), ('random_state', None)])
2022-02-12 19:10:07,726:INFO:{'T': 0.5, 'n_neighbors': 5, 'proportion': 2.0, 'random_state': None}
2022-02-12 19:10:07,726:INFO:Sampling sampler parameter string OrderedDict([('T', 0.5), ('n_neighbors', 5), ('proportion', 2.0), ('random_state', None)])
2022-02-12 19:10:07,727:INFO:{'T': 0.5, 'n_neighbors': 7, 'proportion': 0.25, 'random_state': None}
2022-02-12 19:10:07,727:INFO:Sampling sampler parameter string OrderedDict([('T', 0.5), ('n_neighbors', 7), ('proportion', 0.25), ('random_

2022-02-12 19:10:07,742:INFO:{'n_neighbors': 7, 'proportion': 1.5, 'random_state': None}
2022-02-12 19:10:07,742:INFO:Sampling sampler parameter string OrderedDict([('n_neighbors', 7), ('proportion', 1.5), ('random_state', None)])
2022-02-12 19:10:07,743:INFO:{'n_neighbors': 5, 'proportion': 1.5, 'random_state': None}
2022-02-12 19:10:07,743:INFO:Sampling sampler parameter string OrderedDict([('n_neighbors', 5), ('proportion', 1.5), ('random_state', None)])
2022-02-12 19:10:07,743:INFO:{'n_neighbors': 7, 'proportion': 2.0, 'random_state': None}
2022-02-12 19:10:07,743:INFO:Sampling sampler parameter string OrderedDict([('n_neighbors', 7), ('proportion', 2.0), ('random_state', None)])
2022-02-12 19:10:07,744:INFO:{'n_neighbors': 5, 'proportion': 0.5, 'random_state': None}
2022-02-12 19:10:07,744:INFO:Sampling sampler parameter string OrderedDict([('n_neighbors', 5), ('proportion', 0.5), ('random_state', None)])
2022-02-12 19:10:07,745:INFO:{'n_neighbors': 5, 'proportion': 1.0, 'random_s

2022-02-12 19:10:07,764:INFO:{'k_neighbors': 3, 'n_neighbors': 7, 'proportion': 2.0, 'random_state': None}
2022-02-12 19:10:07,764:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 3), ('n_neighbors', 7), ('proportion', 2.0), ('random_state', None)])
2022-02-12 19:10:07,764:INFO:{'k_neighbors': 3, 'n_neighbors': 5, 'proportion': 2.0, 'random_state': None}
2022-02-12 19:10:07,765:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 3), ('n_neighbors', 5), ('proportion', 2.0), ('random_state', None)])
2022-02-12 19:10:07,765:INFO:{'k_neighbors': 5, 'n_neighbors': 5, 'proportion': 1.5, 'random_state': None}
2022-02-12 19:10:07,765:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 5), ('n_neighbors', 5), ('proportion', 1.5), ('random_state', None)])
2022-02-12 19:10:07,766:INFO:{'k_neighbors': 3, 'n_neighbors': 7, 'proportion': 0.75, 'random_state': None}
2022-02-12 19:10:07,767:INFO:Sampling sampler parameter string OrderedDict([('k_neigh

2022-02-12 19:10:07,783:INFO:random_indices: [23 55 24 25 48 49 28 17 47 27 37 56 61  9 14 35 20  5  3 18  4 54 57 51
 50]
2022-02-12 19:10:07,783:INFO:[{'k_neighbors': 5, 'n_neighbors': 3, 'proportion': 0.5}, {'k_neighbors': 7, 'n_neighbors': 5, 'proportion': 2.0}, {'k_neighbors': 5, 'n_neighbors': 3, 'proportion': 0.75}, {'k_neighbors': 5, 'n_neighbors': 3, 'proportion': 1.0}, {'k_neighbors': 7, 'n_neighbors': 3, 'proportion': 2.0}, {'k_neighbors': 7, 'n_neighbors': 5, 'proportion': 0.1}, {'k_neighbors': 5, 'n_neighbors': 5, 'proportion': 0.1}, {'k_neighbors': 3, 'n_neighbors': 7, 'proportion': 0.75}, {'k_neighbors': 7, 'n_neighbors': 3, 'proportion': 1.5}, {'k_neighbors': 5, 'n_neighbors': 3, 'proportion': 2.0}, {'k_neighbors': 5, 'n_neighbors': 7, 'proportion': 0.5}, {'k_neighbors': 7, 'n_neighbors': 7, 'proportion': 0.1}, {'k_neighbors': 7, 'n_neighbors': 7, 'proportion': 1.5}, {'k_neighbors': 3, 'n_neighbors': 5, 'proportion': 0.5}, {'k_neighbors': 3, 'n_neighbors': 7, 'proportio

2022-02-12 19:10:07,809:INFO:executing 168 sampling in parallel
2022-02-12 20:59:51,717:INFO:do the evaluations
2022-02-12 20:59:51,777:INFO:create classifier jobs
2022-02-12 20:59:51,796:INFO:{'n_neighbors': 3, 'threshold': 0.3, 'random_state': None}
2022-02-12 20:59:51,798:INFO:Sampling sampler parameter string OrderedDict([('n_neighbors', 3), ('random_state', None), ('threshold', 0.3)])
2022-02-12 20:59:51,804:INFO:{'n_neighbors': 5, 'threshold': 0.7, 'random_state': None}
2022-02-12 20:59:51,806:INFO:Sampling sampler parameter string OrderedDict([('n_neighbors', 5), ('random_state', None), ('threshold', 0.7)])
2022-02-12 20:59:51,808:INFO:{'n_neighbors': 5, 'threshold': 0.5, 'random_state': None}
2022-02-12 20:59:51,812:INFO:Sampling sampler parameter string OrderedDict([('n_neighbors', 5), ('random_state', None), ('threshold', 0.5)])
2022-02-12 20:59:51,815:INFO:{'n_neighbors': 5, 'threshold': 0.3, 'random_state': None}
2022-02-12 20:59:51,816:INFO:Sampling sampler parameter strin

2022-02-12 20:59:51,872:INFO:Sampling sampler parameter string OrderedDict([('k', 7), ('proportion', 2.0), ('random_state', None)])
2022-02-12 20:59:51,875:INFO:{'k': 5, 'proportion': 2.0, 'random_state': None}
2022-02-12 20:59:51,876:INFO:Sampling sampler parameter string OrderedDict([('k', 5), ('proportion', 2.0), ('random_state', None)])
2022-02-12 20:59:51,877:INFO:{'proportion': 2.0, 'random_state': None}
2022-02-12 20:59:51,878:INFO:Sampling sampler parameter string OrderedDict([('proportion', 2.0), ('random_state', None)])
2022-02-12 20:59:51,878:INFO:{'k_neighbors': 7, 'n_neighbors': 5, 'proportion': 1.5, 'random_state': None}
2022-02-12 20:59:51,879:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 7), ('n_neighbors', 5), ('proportion', 1.5), ('random_state', None)])
2022-02-12 20:59:51,879:INFO:{'k_neighbors': 3, 'n_neighbors': 3, 'proportion': 1.5, 'random_state': None}
2022-02-12 20:59:51,880:INFO:Sampling sampler parameter string OrderedDict([('k_neighbor

2022-02-12 20:59:51,915:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 5), ('n_neighbors', 7), ('proportion', 1.0), ('random_state', None)])
2022-02-12 20:59:51,916:INFO:{'n_neighbors': 7, 'proportion': 1.0, 'random_state': None}
2022-02-12 20:59:51,916:INFO:Sampling sampler parameter string OrderedDict([('n_neighbors', 7), ('proportion', 1.0), ('random_state', None)])
2022-02-12 20:59:51,917:INFO:{'n_neighbors': 5, 'proportion': 1.0, 'random_state': None}
2022-02-12 20:59:51,917:INFO:Sampling sampler parameter string OrderedDict([('n_neighbors', 5), ('proportion', 1.0), ('random_state', None)])
2022-02-12 20:59:51,918:INFO:{'n_neighbors': 3, 'proportion': 1.0, 'random_state': None}
2022-02-12 20:59:51,918:INFO:Sampling sampler parameter string OrderedDict([('n_neighbors', 3), ('proportion', 1.0), ('random_state', None)])
2022-02-12 20:59:51,919:INFO:{'T': 0.5, 'n_neighbors': 5, 'proportion': 1.0, 'random_state': None}
2022-02-12 20:59:51,919:INFO:Sampling sampler 

2022-02-12 20:59:51,953:INFO:{'proportion': 0.75, 'random_state': None}
2022-02-12 20:59:51,953:INFO:Sampling sampler parameter string OrderedDict([('proportion', 0.75), ('random_state', None)])
2022-02-12 20:59:51,954:INFO:{'k_neighbors': 7, 'n_neighbors': 5, 'proportion': 0.5, 'random_state': None}
2022-02-12 20:59:51,954:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 7), ('n_neighbors', 5), ('proportion', 0.5), ('random_state', None)])
2022-02-12 20:59:51,955:INFO:{'k_neighbors': 3, 'n_neighbors': 5, 'proportion': 0.5, 'random_state': None}
2022-02-12 20:59:51,955:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 3), ('n_neighbors', 5), ('proportion', 0.5), ('random_state', None)])
2022-02-12 20:59:51,956:INFO:{'k_neighbors': 5, 'n_neighbors': 7, 'proportion': 0.5, 'random_state': None}
2022-02-12 20:59:51,957:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 5), ('n_neighbors', 7), ('proportion', 0.5), ('random_state', None)]

2022-02-12 20:59:51,994:INFO:Sampling sampler parameter string OrderedDict([('n_components', 5), ('proportion', 0.25), ('random_state', None)])
2022-02-12 20:59:51,995:INFO:{'n_components': 3, 'proportion': 0.25, 'random_state': None}
2022-02-12 20:59:51,997:INFO:Sampling sampler parameter string OrderedDict([('n_components', 3), ('proportion', 0.25), ('random_state', None)])
2022-02-12 20:59:51,998:INFO:{'proportion': 0.25, 'random_state': None}
2022-02-12 20:59:51,999:INFO:Sampling sampler parameter string OrderedDict([('proportion', 0.25), ('random_state', None)])
2022-02-12 20:59:51,999:INFO:{'k': 5, 'proportion': 0.25, 'random_state': None}
2022-02-12 20:59:52,000:INFO:Sampling sampler parameter string OrderedDict([('k', 5), ('proportion', 0.25), ('random_state', None)])
2022-02-12 20:59:52,001:INFO:{'k': 3, 'proportion': 0.25, 'random_state': None}
2022-02-12 20:59:52,001:INFO:Sampling sampler parameter string OrderedDict([('k', 3), ('proportion', 0.25), ('random_state', None)])


["('darknet', OrderedDict([('n_neighbors', 3), ('random_state', None), ('threshold', 0.3)]), 'LinearDiscriminantAnalysis', OrderedDict([('covariance_estimator', None), ('n_components', None), ('priors', None), ('shrinkage', None), ('solver', 'svd'), ('store_covariance', False), ('tol', 0.0001)]))"]
["('darknet', OrderedDict([('n_neighbors', 5), ('random_state', None), ('threshold', 0.7)]), 'LinearDiscriminantAnalysis', OrderedDict([('covariance_estimator', None), ('n_components', None), ('priors', None), ('shrinkage', None), ('solver', 'svd'), ('store_covariance', False), ('tol', 0.0001)]))"]
["('darknet', OrderedDict([('n_neighbors', 5), ('random_state', None), ('threshold', 0.5)]), 'LinearDiscriminantAnalysis', OrderedDict([('covariance_estimator', None), ('n_components', None), ('priors', None), ('shrinkage', None), ('solver', 'svd'), ('store_covariance', False), ('tol', 0.0001)]))"]
["('darknet', OrderedDict([('n_neighbors', 5), ('random_state', None), ('threshold', 0.3)]), 'Linear

2022-02-12 20:59:52,006:INFO:{'k_neighbors': 5, 'n_neighbors': 7, 'proportion': 0.1, 'random_state': None}
2022-02-12 20:59:52,007:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 5), ('n_neighbors', 7), ('proportion', 0.1), ('random_state', None)])
2022-02-12 20:59:52,007:INFO:{'k_neighbors': 3, 'n_neighbors': 7, 'proportion': 0.1, 'random_state': None}
2022-02-12 20:59:52,008:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 3), ('n_neighbors', 7), ('proportion', 0.1), ('random_state', None)])
2022-02-12 20:59:52,008:INFO:{'k_neighbors': 7, 'n_neighbors': 7, 'proportion': 0.1, 'random_state': None}
2022-02-12 20:59:52,009:INFO:Sampling sampler parameter string OrderedDict([('k_neighbors', 7), ('n_neighbors', 7), ('proportion', 0.1), ('random_state', None)])
2022-02-12 20:59:52,009:INFO:{'k_neighbors': 5, 'n_neighbors': 5, 'proportion': 0.1, 'random_state': None}
2022-02-12 20:59:52,009:INFO:Sampling sampler parameter string OrderedDict([('k_neighb

["('darknet', OrderedDict([('proportion', 0.25), ('random_state', None)]), 'LinearDiscriminantAnalysis', OrderedDict([('covariance_estimator', None), ('n_components', None), ('priors', None), ('shrinkage', None), ('solver', 'svd'), ('store_covariance', False), ('tol', 0.0001)]))"]
["('darknet', OrderedDict([('k_neighbors', 5), ('n_neighbors', 7), ('proportion', 0.1), ('random_state', None)]), 'LinearDiscriminantAnalysis', OrderedDict([('covariance_estimator', None), ('n_components', None), ('priors', None), ('shrinkage', None), ('solver', 'svd'), ('store_covariance', False), ('tol', 0.0001)]))"]
["('darknet', OrderedDict([('k_neighbors', 3), ('n_neighbors', 7), ('proportion', 0.1), ('random_state', None)]), 'LinearDiscriminantAnalysis', OrderedDict([('covariance_estimator', None), ('n_components', None), ('priors', None), ('shrinkage', None), ('solver', 'svd'), ('store_covariance', False), ('tol', 0.0001)]))"]
["('darknet', OrderedDict([('k_neighbors', 7), ('n_neighbors', 7), ('proport

2022-02-12 21:39:58,952:INFO:concatenating the results
2022-02-12 21:39:59,661:INFO:aggregating the results


In [6]:
print(f"The best outcome is the {samp_obj} variant with the {cl_obj} classifier")

The best outcome is the ('OUPS', "{'proportion': 2.0, 'n_jobs': 1, 'random_state': None}") variant with the LinearDiscriminantAnalysis() classifier


In [7]:
# training the best techniques using the entire dataset
X_samp, y_samp= samp_obj.sample(dataset['data'], dataset['target'])

2022-02-12 21:40:00,005:INFO:OUPS: Running sampling via ('OUPS', "{'proportion': 2.0, 'n_jobs': 1, 'random_state': None}")
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [8]:
X_samp_df = pd.DataFrame(X_samp, columns = ['Src Port', 'Dst Port', 'Protocol', 'Flow Duration', 'Total Fwd Packet',
       'Total Bwd packets', 'Total Length of Fwd Packet',
       'Total Length of Bwd Packet', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count',
       'Down/Up Ratio', 'Average Packet Size', 'Fwd Segment Size Avg',
       'Bwd Segment Size Avg', 'Bwd Packet/Bulk Avg', 'Bwd Bulk Rate Avg',
       'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Bytes',
       'FWD Init Win Bytes', 'Bwd Init Win Bytes', 'Fwd Act Data Pkts',
       'Fwd Seg Size Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min'])
y_samp_df = pd.DataFrame(y_samp, columns = ['Darknet'])

In [9]:
X_samp_df.to_csv("../Data/synthetic/SMOTE/best_versionX.csv")
y_samp_df.to_csv("../Data/synthetic/SMOTE/best_versiony.csv")

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_samp_df, y_samp_df, test_size=0.2)
cl_obj.fit(X_samp, y_samp)

LinearDiscriminantAnalysis()

In [11]:
y_pred=cl_obj.predict(X_test)

In [12]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("AUC:",metrics.roc_auc_score(y_test, y_pred))
print("F1:",metrics.f1_score(y_test, y_pred))

Accuracy: 0.8764347287707768
AUC: 0.8715735344766745
F1: 0.8917311744945439


# Custom Testing (Deprecated, only keeping for reference)

In [None]:
variants = [sv.kmeans_SMOTE(), sv.SMOTE(), sv.SMOTE_TomekLinks(), sv.SMOTE_ENN(), sv.Borderline_SMOTE1(), sv.Borderline_SMOTE2(), sv.ADASYN(), sv.AHC(), sv.LLE_SMOTE(), sv.distance_SMOTE(), sv.SMMO(), sv.polynom_fit_SMOTE(), sv.Stefanowski(), sv.ADOMS(), sv.Safe_Level_SMOTE(), sv.MSMOTE(), sv.DE_oversampling(), sv.SMOBD(), sv.SUNDO(), sv.MSYN(), sv.SVM_balance(), sv.TRIM_SMOTE(), sv.SMOTE_RSB(), sv.ProWSyn(), sv.SL_graph_SMOTE(), sv.NRSBoundary_SMOTE(), sv.LVQ_SMOTE(), sv.SOI_CJ(), sv.ROSE(), sv.SMOTE_OUT(), sv.SMOTE_Cosine(), sv.Selected_SMOTE(), sv.LN_SMOTE(), sv.MWMOTE(), sv.PDFOS(), sv.IPADE_ID(), sv.RWO_sampling(), sv.NEATER(), sv.DEAGO(), sv.Gazzah(), sv.MCT(), sv.ADG(), sv.SMOTE_IPF(), sv.KernelADASYN(), sv.MOT2LD(), sv.V_SYNTH(), sv.OUPS(), sv.SMOTE_D(), sv.SMOTE_PSO(), sv.CURE_SMOTE(), sv.SOMO(), sv.ISOMAP_Hybrid(), sv.CE_SMOTE(), sv.Edge_Det_SMOTE(), sv.CBSO(), sv.E_SMOTE(), sv.DBSMOTE(), sv.ASMOBD(), sv.Assembled_SMOTE(), sv.SDSMOTE(), sv.DSMOTE(), sv.G_SMOTE(), sv.NT_SMOTE(), sv.Lee(), sv.SPY(), sv.SMOTE_PSOBAT(), sv.MDO(), sv.Random_SMOTE(), sv.ISMOTE(), sv.VIS_RST(), sv.GASMOTE(), sv.A_SUWO(), sv.SMOTE_FRST_2T(), sv.AND_SMOTE(), sv.NRAS(), sv.AMSCO(), sv.SSO(), sv.NDO_sampling(), sv.DSRBF(), sv.Gaussian_SMOTE(), sv.kmeans_SMOTE(), sv.Supervised_SMOTE(), sv.SN_SMOTE(), sv.CCR(), sv.ANS(), sv.cluster_SMOTE(), sv.NoSMOTE()]
succesful = []
failed = []
progress = 0

for variant in variants:
    oversampler= variant
    try:
        X_over_samp, y_over_samp = oversampler.sample(X, y)
        succesful.append(oversampler)
    except:
        failed.append(oversampler)
    progress += 1
    if progress % 5 == 0:
        print(f"{progress} samples out of {len(variants)} completed. {len(succesful)} passed and {len(failed)} failed")

2022-02-12 13:20:27,880:INFO:kmeans_SMOTE: Running sampling via ('kmeans_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_clusters': 10, 'irt': 2.0, 'n_jobs': 1, 'random_state': None}")
2022-02-12 13:20:30,184:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")
2022-02-12 13:20:38,385:INFO:SMOTE_TomekLinks: Running sampling via ('SMOTE_TomekLinks', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")
2022-02-12 13:20:38,386:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': <module 'numpy.random' from '/home/drake/miniconda3/envs/gpu2/lib/python3.6/site-packages/numpy/random/__init__.py'>}")
2022-02-12 13:20:46,565:INFO:TomekLinkRemoval: Running noise removal via TomekLinkRemoval
2022-02-12 13:27:05,209:INFO:SMOTE_ENN: Running sampling via ('SMOTE_ENN', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")
2022-02-1

5 samples out of 87 completed. 4 passed and 1 failed


2022-02-12 13:31:13,724:INFO:ADASYN: Running sampling via ('ADASYN', "{'n_neighbors': 5, 'd_th': 0.9, 'beta': 1.0, 'n_jobs': 1, 'random_state': None}")
2022-02-12 13:31:31,186:INFO:AHC: Running sampling via ('AHC', "{'strategy': 'min', 'n_jobs': 1, 'random_state': None}")
