In [1]:
import os
import sys
import re
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle

from sklearn.ensemble import IsolationForest

from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier

from sklearn.decomposition import PCA

from sklearn.model_selection import StratifiedKFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.model_selection import GridSearchCV

data_dir = "./datos"

def get_data(data_dir: str) -> pd.DataFrame:
    
    data = pd.DataFrame()

    for path in os.listdir(data_dir):
        dir_content = os.path.join(data_dir, path)
        if os.path.isfile(dir_content):
            data = data.append(pd.read_csv(dir_content))

    return data

def process_data(data: pd.DataFrame):
    
    subnet1 = "192\.168\.8\.\d{1,3}"
    subnet2 = "192\.168\.3\.\d{1,3}"
    subnet3 = "200\.175\.2\.\d{1,3}"
    subnet4 = "192\.168\.20\.\d{1,3}"
    subnet5 = "172\.17\.\d{1,3}\.\d{1,3}"
    
    data["Label"].replace({"DDoS ": "DDoS"}, inplace=True)
    data.drop("Flow ID", inplace=True, axis=1)
    
    data["Src 192.168.8.0/24"] = data["Src IP"].str.match(subnet1)
    data["Src 192.168.3.0/24"] = data["Src IP"].str.match(subnet2)
    data["Src 200.175.2.0/24"] = data["Src IP"].str.match(subnet3)
    data["Src 192.168.20.0/24"] = data["Src IP"].str.match(subnet4)
    data["Src 172.17.0.0/16"] = data["Src IP"].str.match(subnet5)
    data["Src exterior ip"] = ~data["Src IP"].str.match("(" + subnet1 + "|" + subnet2 + "|" + subnet3 + "|" + subnet4 + "|" + subnet5 + ")")
    
    data["Src 192.168.8.0/24"] = data["Src 192.168.8.0/24"].astype(int)
    data["Src 192.168.3.0/24"] = data["Src 192.168.3.0/24"].astype(int)
    data["Src 200.175.2.0/24"] = data["Src 200.175.2.0/24"].astype(int)
    data["Src 192.168.20.0/24"] = data["Src 192.168.20.0/24"].astype(int)
    data["Src 172.17.0.0/16"] = data["Src 172.17.0.0/16"].astype(int)
    data["Src exterior ip"] = data["Src exterior ip"].astype(int)
    
    data["Dst 192.168.8.0/24"] = data["Dst IP"].str.match(subnet1)
    data["Dst 192.168.3.0/24"] = data["Dst IP"].str.match(subnet2)
    data["Dst 200.175.2.0/24"] = data["Dst IP"].str.match(subnet3)
    data["Dst 192.168.20.0/24"] = data["Dst IP"].str.match(subnet4)
    data["Dst 172.17.0.0/16"] = data["Dst IP"].str.match(subnet5)
    data["Dst exterior ip"] = ~data["Dst IP"].str.match("(" + subnet1 + "|" + subnet2 + "|" + subnet3 + "|" + subnet4 + "|" + subnet5 + ")")
    
    data["Dst 192.168.8.0/24"] = data["Dst 192.168.8.0/24"].astype(int)
    data["Dst 192.168.3.0/24"] = data["Dst 192.168.3.0/24"].astype(int)
    data["Dst 200.175.2.0/24"] = data["Dst 200.175.2.0/24"].astype(int)
    data["Dst 192.168.20.0/24"] = data["Dst 192.168.20.0/24"].astype(int)
    data["Dst 172.17.0.0/16"] = data["Dst 172.17.0.0/16"].astype(int)
    data["Dst exterior ip"] = data["Dst exterior ip"].astype(int)
    
    data[["Day", "Hour"]] = data["Timestamp"].str.split(" ", 1, expand=True)
    data[["Hour","Minute","PM"]] = data["Hour"].str.split(":", 2, expand=True)
    data[["Day","PM"]] = data["Day"].str.split("/", 1, expand=True)
    data["PM"] = data["Timestamp"].str.match(".*PM$")
    
    data["PM"] = data["PM"].astype(int)
    data["PM"] = 12 * data["PM"]
    data["Hour"] = data["Hour"].astype(int) + data["PM"]
    data["Minute"] = data["Minute"].astype(int)
    data["Day"] = data["Day"].astype(int)
    
    data["Hour sin"] = np.sin(data["Hour"]*(2.*np.pi/24))
    data["Hour cos"] = np.cos(data["Hour"]*(2.*np.pi/24))
    data["Minute sin"] = np.sin(data["Minute"]*(2.*np.pi/60))
    data["Minute cos"] = np.cos(data["Minute"]*(2.*np.pi/60))
    data["Day sin"] = np.sin((data["Day"]-1)*(2.*np.pi/31))
    data["Day cos"] = np.cos((data["Day"]-1)*(2.*np.pi/31))
    
    data["Attack"] = ~data["Label"].str.match("Normal")
    data["Attack"] = data["Attack"].astype(int)
    
    data.drop("Hour", inplace=True, axis=1)
    data.drop("Minute", inplace=True, axis=1)
    data.drop("Day", inplace=True, axis=1)
    data.drop("PM", inplace=True, axis=1)
    
    data.drop("Src IP", inplace=True, axis=1)
    data.drop("Dst IP", inplace=True, axis=1)
    data.drop("Timestamp", inplace=True, axis=1)
    data.drop("Label", inplace=True, axis=1)
    
    #columnas irrelevantes
    data.drop("Fwd PSH Flags", inplace=True, axis=1)
    data.drop("Fwd URG Flags", inplace=True, axis=1)
    data.drop("CWE Flag Count", inplace=True, axis=1)
    data.drop("ECE Flag Cnt", inplace=True, axis=1)
    data.drop("Fwd Byts/b Avg", inplace=True, axis=1)
    data.drop("Fwd Pkts/b Avg", inplace=True, axis=1)
    data.drop("Fwd Blk Rate Avg", inplace=True, axis=1)
    data.drop("Bwd Byts/b Avg", inplace=True, axis=1)
    data.drop("Bwd Pkts/b Avg", inplace=True, axis=1)
    data.drop("Bwd Blk Rate Avg", inplace=True, axis=1)
    data.drop("Init Fwd Win Byts", inplace=True, axis=1)
    data.drop("Fwd Seg Size Min", inplace=True, axis=1)
    
    data = shuffle(data)
    data.reset_index(drop=True, inplace=True)    
    
    return data

def graphs(data: pd.DataFrame):
    
    column = input("Inserta una columna valida: ")
    width = 0.35
    
    if column not in data.columns:
        raise Exception("Error.\n")
    
    attack = data[data["Attack"] == 1].reset_index()
    normal = data[data["Attack"] == 0].reset_index()
        
    plt.scatter(attack.index, attack[column], alpha=0.5)
    plt.title("Gráfica base")
    plt.legend(["Ataque"])
    plt.show()
    
    plt.scatter(normal.index, normal[column], alpha=0.5)
    plt.title("Gráfica base")
    plt.legend(["Normal"])
    plt.show()
    
    plt.scatter(normal.index, normal[column], alpha=0.5)
    plt.scatter(attack.index, attack[column], alpha=0.5)
    plt.title("Gráfica de contraste")
    plt.legend(["Normal","Ataque"])
    plt.show()
    
    fig, ax = plt.subplots()
    ax.bar(1 + width/2, normal[column].mean(), width, label="Normal")
    ax.bar(1 - width/2, attack[column].mean(), width, label="Ataque")
    ax.set_title("Gráfica de medias")
    ax.set_xticks([])
    ax.legend()
    plt.show()
    
    fig, ax = plt.subplots()
    ax.bar(1 + width/2, normal[column].max(), width, label="Normal")
    ax.bar(1 - width/2, attack[column].max(), width, label="Ataque")
    ax.set_title("Gráfica de máximos")
    ax.set_xticks([])
    ax.legend()
    plt.show()
    
    fig, ax = plt.subplots()
    ax.bar(1 + width/2, normal[column].min(), width, label="Normal")
    ax.bar(1 - width/2, attack[column].min(), width, label="Ataque")
    ax.set_title("Gráfica de mínimos")
    ax.set_xticks([])
    ax.legend()
    plt.show()
    
    return

def mapping(data: pd.DataFrame):
    
    # añadir a  heatmap para ver todos los nombres de columnas -> xticklabels=data.corr().columns, yticklabels=data.corr().columns 
    sns.heatmap(data.corr(), cmap=sns.diverging_palette(220, 10, as_cmap=True))
    plt.title("Mapa de calor")
    plt.show()
    
    #cor_target = abs(data.corr()["Attack"])
    #relevant_features = cor_target[cor_target>0]
    #print(relevant_features)
    
    return
    
def isof(data: pd.DataFrame):
    
    y = np.array(data["Attack"])
    x = data.drop("Attack", axis = 1)
    
    isof = IsolationForest(bootstrap=True, contamination=0.0001, n_jobs=-1)
    y_isof = isof.fit_predict(x, y)
    
    # Descomentar esto para filas descartadas por isof
    #x.drop(x.index[np.asarray(np.where(y_isof == -1)).tolist()[0]]).to_csv("x.csv")
    #np.savetxt("y.txt", np.delete(y, np.asarray(np.where(y_isof != -1)).tolist()[0]))
    
    x_isof = x.drop(x.index[np.asarray(np.where(y_isof == -1)).tolist()[0]])
    y_isof = np.delete(y, np.asarray(np.where(y_isof == -1)).tolist()[0])
    
    print("Ctcas:", x_isof.shape, x.shape)
    
    return x_isof,y_isof 

def rfe(x_isof: pd.DataFrame, y_isof: np.array, seed: int):
    
    rfe = RFECV(DecisionTreeClassifier(random_state=seed), step=1, n_jobs=-1)
    rfe.fit(x_isof, y_isof)
    
    x_isof_rfe = rfe.transform(x_isof)
    
    print("Ctcas:", x_isof_rfe.shape, x_isof.shape)
    
    # Descomentar esto para ctcas. concretas descartadas por rfe
    #ctcas = []
    #for i in range(len(x_isof.columns)): 
    #    if rfe.get_support()[i]:
    #        ctcas.append(x_isof.columns[i])
    #print("Ctcas (concretas): {}".format(ctcas))
    
    return x_isof_rfe,y_isof

def pca(x_isof: pd.DataFrame, y_isof: np.array):
    
    pca = PCA(n_components=20)
    pca.fit(x_isof, y_isof)
    
    x_isof_pca = pca.transform(x_isof)
    
    print("Ctcas:", x_isof_pca.shape, x_isof.shape)
    
    return x_isof_pca,y_isof

def rf(x_isof_rfe: pd.DataFrame, y_isof: np.array, seed: int):
    
    metrics = {"accuracy": [], "kappa": [], "f1": [], "precision": [], "recall": []}
    for train, test in StratifiedKFold().split(x_isof_rfe, y_isof):
        
        x_train, x_test = x_isof_rfe[train, :], x_isof_rfe[test, :]
        y_train, y_test = y_isof[train], y_isof[test]
    
        rf = RandomForestClassifier(random_state=seed)
        grid = {"n_estimators": [100, 50, 10], "max_depth": [None, 5], "max_features": ["sqrt"]}
        scorer = {"accuracy": "accuracy", "kappa": make_scorer(cohen_kappa_score),"f1": "f1","precision": "precision", "recall": "recall"}
        gscv = GridSearchCV(rf, grid, scoring=scorer, refit="accuracy", n_jobs=-1)
        
        result = gscv.fit(x_train, y_train)
        
        print(">>> Metrics (Mean It. - CV) <<<")
        print("Accuracy: ", result.cv_results_["mean_test_accuracy"])
        print("Kappa: ", result.cv_results_["mean_test_kappa"])
        print("F1: ", result.cv_results_["mean_test_f1"])
        print("Precision: ", result.cv_results_["mean_test_precision"])
        print("Recall: ", result.cv_results_["mean_test_recall"])
        print("Fit time: ", result.cv_results_["mean_fit_time"])
        print("Score time: ", result.cv_results_["mean_score_time"])
                
        print("Best score: ", result.best_score_)
        print("Best params: ", result.best_params_)
        
        best_model = result.best_estimator_
        predictions = best_model.predict(x_test)
        
        metrics["accuracy"].append(accuracy_score(y_test, predictions))
        metrics["kappa"].append(cohen_kappa_score(y_test, predictions))
        metrics["f1"].append(f1_score(y_test, predictions))
        metrics["precision"].append(precision_score(y_test, predictions))
        metrics["recall"].append(recall_score(y_test, predictions))
        
    print(">>> Metrics <<<")
    print("Accuracy: ", metrics["accuracy"])
    print("Kappa: ", metrics["kappa"])
    print("F1: ", metrics["f1"])
    print("Precision: ", metrics["precision"])
    print("Recall: ", metrics["recall"])
        
    return 

def svm(x_isof_rfe: pd.DataFrame, y_isof: np.array, seed: int):
    
    metrics = {"accuracy": [], "kappa": [], "f1": [], "precision": [], "recall": []}
    for train, test in StratifiedKFold().split(x_isof_rfe, y_isof):
        
        x_train, x_test = x_isof_rfe[train, :], x_isof_rfe[test, :]
        y_train, y_test = y_isof[train], y_isof[test]
    
        svm = LinearSVC(random_state=seed)
        grid = {"tol": [1e-3, 1e-4], "C": [100, 10, 1], "dual": [False]}
        scorer = {"accuracy": "accuracy", "kappa": make_scorer(cohen_kappa_score),"f1": "f1","precision": "precision", "recall": "recall"}
        gscv = GridSearchCV(svm, grid, scoring=scorer, refit="accuracy", n_jobs=-1)
        
        result = gscv.fit(x_train, y_train)       
        
        print(">>> Metrics (Mean It. - CV) <<<")
        print("Accuracy: ", result.cv_results_["mean_test_accuracy"])
        print("Kappa: ", result.cv_results_["mean_test_kappa"])
        print("F1: ", result.cv_results_["mean_test_f1"])
        print("Precision: ", result.cv_results_["mean_test_precision"])
        print("Recall: ", result.cv_results_["mean_test_recall"])
        print("Fit time: ", result.cv_results_["mean_fit_time"])
        print("Score time: ", result.cv_results_["mean_score_time"])
                
        print("Best score: ", result.best_score_)
        print("Best params: ", result.best_params_)
        
        best_model = result.best_estimator_
        predictions = best_model.predict(x_test)
        
        metrics["accuracy"].append(accuracy_score(y_test, predictions))
        metrics["kappa"].append(cohen_kappa_score(y_test, predictions))
        metrics["f1"].append(f1_score(y_test, predictions))
        metrics["precision"].append(precision_score(y_test, predictions))
        metrics["recall"].append(recall_score(y_test, predictions))
        
    print(">>> Metrics <<<")
    print("Accuracy: ", metrics["accuracy"])
    print("Kappa: ", metrics["kappa"])
    print("F1: ", metrics["f1"])
    print("Precision: ", metrics["precision"])
    print("Recall: ", metrics["recall"])
        
    return 

def nb(x_isof_rfe: pd.DataFrame, y_isof: np.array):
    
    metrics = {"accuracy": [], "kappa": [], "f1": [], "precision": [], "recall": []}
    b_columns = []
    
    for i in range(len(x_isof_rfe[0])):
            if len(x_isof_rfe[:, i][~np.isin(x_isof_rfe[:, i], [0, 1])]) == 0:
                b_columns.append(i)
    
    for train, test in StratifiedKFold().split(x_isof_rfe, y_isof):
        
        x_train, x_test = x_isof_rfe[train, :], x_isof_rfe[test, :]
        y_train, y_test = y_isof[train], y_isof[test]
        
        b_train = []
        b_test = []
        
        g_train = []
        g_test = []
        
        for i in range(len(x_isof_rfe[0])):
            if i in b_columns:
                b_train.append(x_train[:, i])
                b_test.append(x_test[:, i])
            else:
                g_train.append(x_train[:, i])
                g_test.append(x_test[:, i])
        
        b_train = np.vstack(b_train).transpose()
        b_test = np.vstack(b_test).transpose()
        
        g_train = np.vstack(g_train).transpose()
        g_test = np.vstack(g_test).transpose()
        
        bnb = BernoulliNB()
        grid = {"alpha": [1, 0.5, 0.1, 0.01, 0.001, 0], "binarize": [None]}
        scorer = {"accuracy": "accuracy", "kappa": make_scorer(cohen_kappa_score),"f1": "f1","precision": "precision", "recall": "recall"}
        gscv = GridSearchCV(bnb, grid, scoring=scorer, refit="accuracy", n_jobs=-1)
        
        b_result = gscv.fit(b_train, y_train)
        
        print(">>> Metrics Bernoulli/Binary (Mean It. - CV) <<<")
        print("Accuracy: ", b_result.cv_results_["mean_test_accuracy"])
        print("Kappa: ", b_result.cv_results_["mean_test_kappa"])
        print("F1: ", b_result.cv_results_["mean_test_f1"])
        print("Precision: ", b_result.cv_results_["mean_test_precision"])
        print("Recall: ", b_result.cv_results_["mean_test_recall"])
        print("Fit time: ", b_result.cv_results_["mean_fit_time"])
        print("Score time: ", b_result.cv_results_["mean_score_time"])    
        
        print("Best score: ", b_result.best_score_)
        print("Best params: ", b_result.best_params_)
        
        best_b_model = b_result.best_estimator_
        b_proba = best_b_model.predict_log_proba(b_test) 
        
        gnb = GaussianNB()
        grid = {"var_smoothing": [1e-8, 5e-9, 1e-9, 5e-10, 1e-10, 1e-11]}
        scorer = {"accuracy": "accuracy", "kappa": make_scorer(cohen_kappa_score),"f1": "f1","precision": "precision", "recall": "recall"}
        gscv = GridSearchCV(gnb, grid, scoring=scorer, refit="accuracy", n_jobs=-1)
        
        g_result = gscv.fit(g_train, y_train)       
        
        print(">>> Metrics Gaussian (Mean It. - CV) <<<")
        print("Accuracy: ", g_result.cv_results_["mean_test_accuracy"])
        print("Kappa: ", g_result.cv_results_["mean_test_kappa"])
        print("F1: ", g_result.cv_results_["mean_test_f1"])
        print("Precision: ", g_result.cv_results_["mean_test_precision"])
        print("Recall: ", g_result.cv_results_["mean_test_recall"])
        print("Fit time: ", g_result.cv_results_["mean_fit_time"])
        print("Score time: ", g_result.cv_results_["mean_score_time"])    
        
        print("Best score: ", g_result.best_score_)
        print("Best params: ", g_result.best_params_)
        
        best_g_model = g_result.best_estimator_
        g_proba = best_g_model.predict_log_proba(g_test) 
        
        pos = np.exp(b_proba + g_proba)
        proba = pos / pos.sum(axis=1)[:,None]
        predictions = []
        
        for a, b in proba:
            if a > b:
                predictions.append(0)
            else:
                predictions.append(1)
                
        metrics["accuracy"].append(accuracy_score(y_test, predictions))
        metrics["kappa"].append(cohen_kappa_score(y_test, predictions))
        metrics["f1"].append(f1_score(y_test, predictions))
        metrics["precision"].append(precision_score(y_test, predictions))
        metrics["recall"].append(recall_score(y_test, predictions))
        
    print(">>> Metrics <<<")
    print("Accuracy: ", metrics["accuracy"])
    print("Kappa: ", metrics["kappa"])
    print("F1: ", metrics["f1"])
    print("Precision: ", metrics["precision"])
    print("Recall: ", metrics["recall"])
        
    return 

seed = 1999
np.random.seed(seed)
data = process_data(get_data(data_dir))
#print(*data.columns, sep=" - ")
#graphs(data)
#mapping(data)

t0 = time.time()
x, y = isof(data)
t1 = time.time()
print(f"ISOF: {t1-t0}")

t0 = time.time()
x, y = rfe(x, y, seed)
t1 = time.time()
print(f"RFE: {t1-t0}")

#t0 = time.time()
#x, y = pca(x, y) 
#t1 = time.time()
#print(f"PCA: {t1-t0}")

print("---RF---")
t0 = time.time()
rf(x, y, seed)
t1 = time.time()
print(f"RF: {t1-t0}")

print("---L.SVM---")
t0 = time.time()
svm(x, y, seed)
t1 = time.time()
print(f"L-SVC: {t1-t0}")

print("---NB---")
t0 = time.time()
nb(x, y)
t1 = time.time()
print(f"NB: {t1-t0}")

Ctcas: (343854, 85) (343889, 85)
ISOF: 34.30203628540039
Ctcas: (343854, 26) (343854, 85)
RFE: 636.3799707889557
---RF---
>>> Metrics (Mean It. - CV) <<<
Accuracy:  [0.99998909 0.99998546 0.99997819 0.99969827 0.99973826 0.99957831]
Kappa:  [0.99996578 0.99995437 0.99993155 0.99905259 0.9991782  0.99867607]
F1:  [0.99999319 0.99999092 0.99998639 0.99981172 0.99983667 0.99973685]
Precision:  [0.99999092 0.99999092 0.99998639 0.99962805 0.99968247 0.99957812]
Recall:  [0.99999546 0.99999092 0.99998639 0.99999546 0.99999092 0.99989563]
Fit time:  [59.90131235 30.41358047  6.26455798 33.81116557 16.76346722  3.17216015]
Score time:  [0.78974462 0.49209108 0.22892618 0.59554605 0.3497745  0.1806725 ]
Best score:  0.99998909427995
Best params:  {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 100}
>>> Metrics (Mean It. - CV) <<<
Accuracy:  [0.99999273 0.99999273 0.99998909 0.99973826 0.99968373 0.99968373]
Kappa:  [0.99997718 0.99997718 0.99996578 0.99917824 0.99900691 0.99900694]

>>> Metrics Gaussian (Mean It. - CV) <<<
Accuracy:  [0.88525645 0.88704864 0.89031311 0.8916109  0.88711044 0.88706318]
Kappa:  [0.55205758 0.56121002 0.57708475 0.58332272 0.57102595 0.57131309]
F1:  [0.93274178 0.93371108 0.93550476 0.93622002 0.93337423 0.9333276 ]
Precision:  [0.87924846 0.8811191  0.88430128 0.88556954 0.88522526 0.88540434]
Recall:  [0.9931661  0.99298005 0.99300274 0.99301635 0.98706278 0.98673606]
Fit time:  [0.3352407  0.32798328 0.32535601 0.28694644 0.27882981 0.21953917]
Score time:  [0.19375973 0.2007206  0.19885626 0.20427527 0.20199299 0.12871046]
Best score:  0.8916108971581798
Best params:  {'var_smoothing': 5e-10}
>>> Metrics Bernoulli/Binary (Mean It. - CV) <<<
Accuracy:  [0.96947466 0.96947466 0.96947466 0.96947466 0.96947466 0.96947466]
Kappa:  [0.89834358 0.89834358 0.89834358 0.89834358 0.89834358 0.89834358]
F1:  [0.98130434 0.98130434 0.98130434 0.98130434 0.98130434 0.98130434]
Precision:  [0.96329932 0.96329932 0.96329932 0.96329932 0.9632993