# Data Preparation

## Import and Settings

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
import dask
import dask.dataframe as dd
import dask.array as da

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder

import itertools
from sklearn.metrics import confusion_matrix
from joblib import dump, load
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

In [6]:
# Open 'prepared_ds.csv' as a dask DataFrame bound to the module-level name df.
df = dd.read_csv('prepared_ds.csv')

In [76]:
from joblib import dump, load

class Classifier:
    """Run a joblib-persisted model on raw flow records.

    The class loads one of the estimators listed in ``models`` and exposes
    ``predict``, which first pushes the input through ``pipeline`` (cleaning,
    port bucketing, scaling + PCA, one-hot encoding) and then calls the
    estimator's own ``predict``.

    The fitted scaler, PCA and one-hot encoder are read from the ``models/``
    directory every time ``pipeline`` runs.
    """

    # Estimator currently in use; set per instance by set_current_model().
    current_ml_model = None

    # Model name -> path of the persisted estimator.
    # NOTE(review): 'NN' and 'kNN' still point at the placeholder
    # 'models/path' and will fail to load until real files are provided.
    models = {
        'NN': 'models/path',
        'kNN': 'models/path',
        'RFC_Ada': 'models/RandomForestClassifier_AdaBoost.joblib',
        'RFC': 'models/RandomForestClassifier.joblib'
    }

    # Path of the blacklist file (not read anywhere in this class yet).
    blacklist = 'models/blacklist.txt'

    # Persisted preprocessing artefacts used by pipeline().
    _SCALER_PATH = 'models/std_scaler.joblib'
    _PCA_PATH = 'models/pca.joblib'
    _ENCODER_PATH = 'models/one_hot_encoder.transformer'

    # Number of leading principal components kept in the prepared frame.
    _MAX_PCA_COMPONENTS = 24

    # Columns removed in the cleaning stage.
    _STAGE1_DROPPED = [
        'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg',
        'Bwd Byts/b Avg', 'Bwd Pkts/b Avg', 'Fwd Blk Rate Avg',
        'Bwd Blk Rate Avg',
    ]
    # Columns removed after the port columns have been bucketed.
    _STAGE2_DROPPED = [
        'Fwd PSH Flags', 'Bwd PSH Flags', 'FIN Flag Cnt', 'URG Flag Cnt',
        'Src Port', 'Dst Port', 'Flow ID', 'Timestamp',
    ]
    # Columns treated as categorical (excluded from scaling / PCA).
    _CATEGORY_VARS = [
        'Protocol', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt',
        'ACK Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt',
        'dst_port_transformed', 'src_port_transformed',
    ]
    # Address columns; excluded from both the continuous and encoded sets.
    _OBJECT_VARS = ['Dst IP', 'Src IP']
    # Categorical columns that go through the one-hot encoder.
    _ONE_HOT_VARS = ["Protocol", "dst_port_transformed", "src_port_transformed"]

    def __init__(self, model_name='RFC'):
        """Load the estimator registered under *model_name* (default 'RFC')."""
        self.set_current_model(model_name)

    def set_current_model(self, model_name):
        """Load the estimator registered under *model_name* and make it current.

        Raises:
            KeyError: if *model_name* is not a key of ``models``.
        """
        self.current_ml_model = self.load_obj(self.models[model_name])

    def read_model(self, filename):
        """Placeholder; not implemented yet (returns None)."""
        pass

    def predict(self, X, blacklist = True, whitelist = True):
        """Preprocess *X* and return the current model's predictions.

        Args:
            X: raw flow records, in the form accepted by ``pipeline``.
            blacklist: accepted for interface compatibility; not used yet.
            whitelist: accepted for interface compatibility; not used yet.

        Returns:
            Whatever ``current_ml_model.predict`` returns for the prepared
            features. Because ``pipeline`` drops rows whose 'Flow Byts/s' is
            infinite or missing, the result can contain fewer entries than
            *X* has rows.
        """
        X_transformed = self.pipeline(X)
        return self.current_ml_model.predict(X_transformed)

    def load_obj(self, filename):
        """Return the object stored in the joblib file *filename*."""
        return load(filename)

    def encode_ports(self, x):
        """Bucket a port number into 'System', 'User' or 'Dynamic'.

        Ports below 1024 are 'System', ports from 1024 up to 49151 are
        'User', and everything else is 'Dynamic'.
        """
        if x < 1024:
            return 'System'
        if x < 49152:
            return 'User'
        return 'Dynamic'

    def is_malicious(self, ip):
        """Placeholder; not implemented yet (returns None)."""
        pass

    def is_benign(self, ip):
        """Placeholder; not implemented yet (returns None)."""
        pass

    def pipeline(self, df):
        """Turn raw flow records into the feature frame the model expects.

        Args:
            df: pandas DataFrame of flow records containing the columns
                referenced below ('Flow Byts/s', 'Src Port', 'Dst Port',
                'Protocol', the flag counters, ...).

        Returns:
            A DataFrame with a fresh 0..n-1 index holding, in order, the
            one-hot encoded columns, the remaining categorical columns and
            the first ``_MAX_PCA_COMPONENTS`` principal components
            (named 'PC-0', 'PC-1', ...).
        """
        # A leftover CSV index column may or may not be present; drop it
        # only if it exists instead of swallowing every exception.
        df = df.drop(columns=["Unnamed: 0"], errors="ignore")

        # Stage 1 - Cleaning: discard rows with an infinite or missing
        # 'Flow Byts/s', then remove the stage-1 columns.
        flow_rate = df['Flow Byts/s']
        usable = ~flow_rate.isin([np.inf, -np.inf]) & flow_rate.notna()
        df_cleaned = df[usable].drop(self._STAGE1_DROPPED, axis=1)

        # Stage 2 - Exploring: replace raw ports by their range bucket.
        df_cleaned = df_cleaned.assign(
            dst_port_transformed=df_cleaned['Dst Port'].apply(self.encode_ports),
            src_port_transformed=df_cleaned['Src Port'].apply(self.encode_ports),
        )
        df_explored = df_cleaned.drop(self._STAGE2_DROPPED, axis=1)

        # Stage 3 - Preparation: apply the persisted transformers.
        scaler = self.load_obj(self._SCALER_PATH)
        pca = self.load_obj(self._PCA_PATH)
        encoder = self.load_obj(self._ENCODER_PATH)

        category_vars = self._CATEGORY_VARS
        to_be_encoded = self._ONE_HOT_VARS
        columns = df_explored.columns
        # Everything that is neither categorical nor an address column is
        # treated as continuous and goes through scaler + PCA.
        continuous_vars = columns[
            ~columns.isin(category_vars) & ~columns.isin(self._OBJECT_VARS)
        ]

        X = scaler.transform(df_explored[continuous_vars].values)
        principal_components = pca.transform(X)
        col_names = [f'PC-{i}' for i in range(principal_components.shape[1])]
        df_pca = pd.DataFrame(data=principal_components, columns=col_names)

        df_encoded = pd.DataFrame(encoder.transform(df_explored[to_be_encoded]))
        # get_feature_names was removed from newer scikit-learn releases in
        # favour of get_feature_names_out; use whichever the encoder offers.
        if hasattr(encoder, "get_feature_names_out"):
            df_encoded.columns = encoder.get_feature_names_out(to_be_encoded)
        else:
            df_encoded.columns = encoder.get_feature_names(to_be_encoded)

        to_be_not_encoded = [i for i in category_vars if i not in to_be_encoded]

        # Rows were filtered above, so the original index has gaps; reset it
        # so the column-wise concat lines up with the freshly built frames.
        passthrough = df_explored[to_be_not_encoded].reset_index(drop=True)
        df_categories = pd.concat([df_encoded, passthrough], axis=1)
        df_prepared = pd.concat(
            [df_categories, df_pca.iloc[:, :self._MAX_PCA_COMPONENTS]], axis=1
        )

        return df_prepared

    def save_obj(self, obj, file):
        """Placeholder; not implemented yet (returns None)."""
        pass
 

In [7]:
import dask
import dask.dataframe as dd
import dask.array as da
# Rebind df to the dataset in 'unbalaced_20_80_dataset.csv'; this replaces the
# df opened from 'prepared_ds.csv' earlier in the notebook.
# NOTE(review): 'unbalaced' looks like a misspelling of 'unbalanced' — confirm
# it matches the file name on disk before changing it.
df = dd.read_csv('unbalaced_20_80_dataset.csv')

In [None]:
# Draw a reproducible 0.1% sample (fixed random_state) and materialise it.
df_test = df.sample(frac=0.001, random_state=4).compute()
labels = df_test["Label"]
# Feature frame: everything except the target and the leftover index column.
df_X = df_test.drop(["Label", "Unnamed: 0"], axis=1)
# df_ddos has to exist before it is used on the next line; without this the
# cell raises NameError when the notebook is run top to bottom.
df_ddos = df_test.loc[df_test['Label'] == 'ddos']
df_ddos_X = df_ddos.drop(["Label", "Unnamed: 0"], axis=1)

In [77]:
# Build a classifier with the default model name ('RFC').
c = Classifier()
# Run the preprocessing pipeline on its own to inspect the prepared features.
df_result = c.pipeline(df_X)
# predict() runs the same pipeline again internally before calling the model.
predictions = c.predict(df_X)

array(['ddos', 'ddos', 'ddos', ..., 'Benign', 'Benign', 'Benign'],
      dtype=object)

In [46]:
# Keep only the sampled rows whose Label is 'ddos'.
df_ddos = df_test.loc[df_test['Label'] == 'ddos']

In [26]:
# Compare the number of predictions with the number of labels. They can differ
# because Classifier.pipeline drops rows whose 'Flow Byts/s' is infinite or NaN.
len(predictions), labels.shape

(7567, (7608,))

In [None]:
for index, row in df_test.iterrows():
    # Skip benign rows. '==' compares; the previous single '=' was a
    # syntax error that prevented the cell from running at all.
    if row["Label"] == "Benign":
        continue
     