In [None]:
# imports 1

import numpy as np
import pandas as pd


In [None]:
# imports 2

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
# imports 3

import matplotlib.pyplot as plt
import seaborn as sns
sns.color_palette()
import statsmodels.api as sm

In [None]:
# imports 4

from xgboost import XGBClassifier

import joblib

### <span style = "color:green"> Read and prepare the UNR-IDD data </span>

In [None]:
# Show floats with two decimals whenever pandas renders a frame or series.
pd.set_option("display.float_format", "{:.2f}".format)

In [None]:
# have an idea of the size of the data
%%sh
wc -l ../data/raw/UNR-IDD.csv

In [None]:
# Load the raw UNR-IDD CSV into a pandas DataFrame.
# index_col=None: no CSV column is used as the index.
rawdata_path = "../data/raw/UNR-IDD.csv"
raw_df = pd.read_csv(filepath_or_buffer=rawdata_path, index_col=None)

In [None]:
# Write a copy of the raw frame (without the index column) for later use.
raw_df.to_csv(path_or_buf="../data/raw/raw_df.csv", index=False)

In [None]:
# Preview the first five rows of the raw data.
raw_df.head(n=5)

In [None]:
# (number of rows, number of columns) of the raw data
raw_df.shape

In [None]:
# List all column names to get an idea of the features and the target columns.

raw_df.columns

In [None]:
# Split the column names into numerical and categorical groups by dtype.
# select_dtypes is the public pandas API for this; bool is listed explicitly
# so that the result matches the private DataFrame._get_numeric_data() helper
# used previously, which also kept boolean columns.
num_cols = list(raw_df.select_dtypes(include=["number", "bool"]).columns)
# Every column that is not numerical is treated as categorical.
categ_cols = [column for column in raw_df.columns if column not in num_cols]
print("categorical", categ_cols, "\nnumerical", num_cols)

In [None]:
# Extract the two target columns: the binary label and the multi-class label.
labels_binlab, labels_labcat = raw_df["Binary Label"], raw_df["Label"]

In [None]:
# Encode the string labels as integers with an explicit mapping.
# Series.map is used instead of Series.replace: replacing strings with numbers
# relies on implicit object -> int downcasting, which recent pandas versions
# deprecate, so the resulting dtype is not guaranteed.
# The mapping is applied to the raw columns so that re-running this cell always
# gives the same result.
binlab_codes = {"Attack": 1, "Normal": 0}
labcat_codes = {"Blackhole": 1, "Diversion": 2, "Normal": 3, "Overflow": 4, "PortScan": 5, "TCP-SYN": 0}

labels_binlab = raw_df["Binary Label"].map(binlab_codes)
labels_labcat = raw_df["Label"].map(labcat_codes)

# map() yields NaN for any value without a code: fail loudly instead of
# silently passing an unexpected label on to the models.
assert labels_binlab.notna().all(), "unmapped value in 'Binary Label'"
assert labels_labcat.notna().all(), "unmapped value in 'Label'"

labels_binlab = labels_binlab.astype(int)
labels_labcat = labels_labcat.astype(int)

In [None]:
# Lookup table from the integer class code back to the readable class name
# (type of attack, or normal state).
# Intended use: predict() returns an array, so take its single element first, e.g.
#   query_answer = results_dict[int(model.predict(single_data_sample)[0])]
# and then send query_answer to text / email, voice message or a red/green dashboard flag.

results_dict = dict(zip(
    [1, 2, 3, 4, 5, 0],
    ['Blackhole', 'Diversion', 'Normal', 'Overflow', 'PortScan', 'TCP-SYN'],
))
results_dict

In [None]:
# Persist both encoded label series (index column omitted).
labels_binlab.to_csv(path_or_buf="../data/processed/labels_binlab.csv", index=False)
labels_labcat.to_csv(path_or_buf="../data/processed/labels_labcat.csv", index=False)

In [None]:
# Build the reduced dataset: only the informative features plus both label columns.
#
# Columns deliberately left out, and why:
#   'Switch ID', 'Port Number'       - no general info, data from the setup used to model the data
#   'Packets Rx Dropped', 'Packets Tx Dropped',
#   'Packets Rx Errors', 'Packets Tx Errors'             - empty features
#   'Delta Port alive Duration (S)'  - only one value for the whole set
#   'Delta Packets Rx Dropped', ' Delta Packets Tx Dropped',
#   'Delta Packets Rx Errors', 'Delta Packets Tx Errors' - empty features
#   'Connection Point'               - not general, associated with the test setup for data generation
#   'is_valid'                       - info from the data generation setup
#   'Table ID'                       - empty feature
#   'Active Flow Entries', 'Max Size' - unknown feature source, not replicable in real data?

red_new_columns = [
    'Received Packets',
    'Received Bytes',
    'Sent Bytes',
    'Sent Packets',
    'Port alive Duration (S)',
    'Delta Received Packets',
    'Delta Received Bytes',
    'Delta Sent Bytes',
    'Delta Sent Packets',
    'Total Load/Rate',
    'Total Load/Latest',
    'Unknown Load/Rate',
    'Unknown Load/Latest',
    'Latest bytes counter',
    'Packets Looked Up',
    'Packets Matched',
    'Label',
    'Binary Label',
]
red_new_df = raw_df[red_new_columns]

In [None]:
# Persist the reduced dataset (index column omitted).
red_new_df.to_csv(path_or_buf="../data/processed/red_new_df.csv", index=False)

In [None]:
# Feature matrix: the reduced dataset without the two label columns.
red_new_features = red_new_df.drop(columns=['Label', "Binary Label"])
red_new_features.to_csv(path_or_buf="../data/processed/red_new_features.csv", index=False)

### <span style = "color:green"> Select an appropriate ML classification model for the task and the data. </span>

In [None]:
# Split features and multi-class labels into train (80%) and test (20%) sets
# with a fixed seed.
(
    X_red_new_train_cat,
    X_red_new_test_cat,
    y_red_new_train_cat,
    y_red_new_test_cat,
) = train_test_split(red_new_features, labels_labcat, test_size=0.2, random_state=0)

# Save each split to a CSV file named after its variable.
sets = [X_red_new_train_cat, X_red_new_test_cat, y_red_new_train_cat, y_red_new_test_cat]
names = ["X_red_new_train_cat", "X_red_new_test_cat", "y_red_new_train_cat", "y_red_new_test_cat"]
for split, split_name in zip(sets, names):
    split.to_csv("../data/processed/" + split_name + ".csv", index=False)

### <span style = "color:green"> Run simple logistic regression model on train and test it. </span>

In [None]:
# Baseline: multi-class logistic regression on the raw (NOT standardized) features.
# The single-step pipeline keeps the interface identical to the scaled variant.
lgr_no_pipeline_cat = Pipeline(
    [
        (
            "logistic_regression",
            LogisticRegression(max_iter=4000, random_state=0, class_weight="balanced"),
        ),
    ]
)

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
y_red_new_pred_cat = lgr_no_pipeline_cat.fit(X_red_new_train_cat, y_red_new_train_cat).predict(X_red_new_test_cat)
print(classification_report(y_true=y_red_new_test_cat, y_pred=y_red_new_pred_cat))

In [None]:
# Multi-class logistic regression WITH standardization: features are scaled
# by StandardScaler before they reach the classifier.
lgr_yes_pipeline_cat = Pipeline(
    [
        ("numeric", StandardScaler()),
        (
            "logistic_regression",
            LogisticRegression(max_iter=600, random_state=0, class_weight="balanced"),
        ),
    ]
)

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
y_yes_red_new_pred_cat = lgr_yes_pipeline_cat.fit(X_red_new_train_cat, y_red_new_train_cat).predict(X_red_new_test_cat)
print(classification_report(y_true=y_red_new_test_cat, y_pred=y_yes_red_new_pred_cat))

### <span style = "color:green"> Perform TPOT analysis and select a better, fine-tuned model. </span>

In [None]:
!pip install TPOT

In [None]:
# check the names of the data files used in the analysis, do not use binary labels: use multi-labels

%%time
from sklearn.metrics import make_scorer


from tpot import TPOTClassifier
tpot = TPOTClassifier(generations=5, 
                      population_size=8,
                      scoring=None,
                      verbosity=2,
                      random_state=42)
tpot.fit(X_train_bin, y_train_bin)
print(f"Tpop score on test data: {tpot.score(X_test_bin, y_test_bin):.2f}")
tpot.export('tpot_network_analytics.py')

In [None]:
cat tpot_network_analytics.py

In [None]:
# Train the XGBoost classifier recommended by TPOT.
# MODEL PARAMETERS TAKEN FROM ANOTHER TPOT RUN.
# The random state is fixed to 42 directly in the constructor.
model_jg_01 = XGBClassifier(
    learning_rate=1.0,
    max_depth=5,
    min_child_weight=8,
    n_estimators=100,
    n_jobs=1,
    subsample=0.8500000000000001,
    verbosity=0,
    random_state=42,
)

model_jg_01.fit(X_red_new_train_cat, y_red_new_train_cat)

In [None]:
# Evaluate the trained XGBoost model on the held-out multi-class test split.
# TODO: review data names.
results_new_red = model_jg_01.predict(X_red_new_test_cat)
print(classification_report(y_true=y_red_new_test_cat, y_pred=results_new_red))

In [None]:
# Save the final model to disk with joblib.
joblib.dump(value=model_jg_01, filename="model_jg_01.joblib")

In [None]:
# Get one sample data point from the test set by entering the sample id;
# a valid id is in range(len(test_data)).
# def get_test_sample(sample_id):

sam = 345  # positional row index into the test split
sample1 = X_red_new_test_cat.iloc[[sam]]  # list index selects one row without collapsing it to a scalar row
tag1 = y_red_new_test_cat.iloc[[sam]]  # label at the same position
type(sample1)