In [6]:
import os
import re
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score

from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle

from sklearn.tree import export_graphviz
from graphviz import Source
import pydot

from sklearn.ensemble import IsolationForest

from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier

from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.model_selection import GridSearchCV

# Directory holding the input CSV files; it is passed to get_data() by the
# driver code at the bottom of this cell.
data_dir = "./datos"

def get_data(data_dir: str) -> pd.DataFrame:
    """Read every regular file in *data_dir* as CSV and stack the rows.

    Sub-directories are skipped (there is no recursion). Each file keeps
    the row index ``read_csv`` gave it, so the combined index is not
    unique when more than one file is loaded.

    Args:
        data_dir: Path of the directory that contains the CSV files.

    Returns:
        One frame with the rows of all files, or an empty frame when the
        directory holds no regular files.
    """
    frames = []

    # Sorted so the row order does not depend on the order in which the
    # operating system happens to list the directory.
    for entry in sorted(os.listdir(data_dir)):
        file_path = os.path.join(data_dir, entry)
        if os.path.isfile(file_path):
            frames.append(pd.read_csv(file_path))

    # pd.concat raises on an empty list, so handle that case explicitly.
    if not frames:
        return pd.DataFrame()

    # A single concat replaces the per-file DataFrame.append, which was
    # removed in pandas 2.0 and re-copied the accumulated rows every time.
    return pd.concat(frames)

def process_data(data: pd.DataFrame) -> pd.DataFrame:
    """Turn the raw flow records into a purely numeric, shuffled frame.

    Steps, in order:

    * normalise the ``"DDoS "`` label (trailing space) to ``"DDoS"`` and
      drop ``Flow ID``;
    * replace ``Src IP`` / ``Dst IP`` by 0/1 indicator columns, one per
      known subnet plus an ``exterior ip`` column for everything else;
    * replace ``Timestamp`` by sine/cosine encodings of the hour, the
      minute and the first date component;
    * derive ``Attack`` (1 unless ``Label`` starts with ``"Normal"``) and
      drop ``Label``;
    * drop a fixed list of flag / bulk-rate columns;
    * shuffle the rows and reset the index to 0..n-1.

    The frame passed in is modified in place by every step before the
    shuffle; the shuffled result is returned as a new object.

    Args:
        data: Raw records. Must contain ``Label``, ``Flow ID``, ``Src IP``,
            ``Dst IP``, ``Timestamp`` and the columns listed in the final
            drop. NOTE(review): the parsing assumes timestamps shaped like
            ``D/M/Y H:M:S [AM|PM]`` -- confirm against the CSV files.

    Returns:
        The processed, shuffled frame.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Raw strings: "\." and "\d" are invalid escapes in a normal literal.
    subnets = {
        "192.168.8.0/24": r"192\.168\.8\.\d{1,3}",
        "192.168.3.0/24": r"192\.168\.3\.\d{1,3}",
        "200.175.2.0/24": r"200\.175\.2\.\d{1,3}",
        "192.168.20.0/24": r"192\.168\.20\.\d{1,3}",
        "172.17.0.0/16": r"172\.17\.\d{1,3}\.\d{1,3}",
    }
    any_subnet = "(" + "|".join(subnets.values()) + ")"

    # Assign the result back instead of replacing in place on the selected
    # column, which is not guaranteed to write through to the frame.
    data["Label"] = data["Label"].replace({"DDoS ": "DDoS"})
    data.drop("Flow ID", inplace=True, axis=1)

    # One indicator per subnet, then a catch-all for addresses that match
    # none of them; source columns first, destination columns second.
    for side in ("Src", "Dst"):
        addresses = data[side + " IP"]
        for cidr, pattern in subnets.items():
            data[f"{side} {cidr}"] = addresses.str.match(pattern).astype(int)
        data[f"{side} exterior ip"] = (~addresses.str.match(any_subnet)).astype(int)

    # Split "date clock" -> date / clock, keep the first "/" component of
    # the date and the first two ":" components of the clock. ``n`` has to
    # be passed by keyword on current pandas.
    timestamp = data["Timestamp"]
    date_and_clock = timestamp.str.split(" ", n=1, expand=True)
    day = date_and_clock[0].str.split("/", n=1, expand=True)[0].astype(int)
    clock = date_and_clock[1].str.split(":", n=2, expand=True)
    hour = clock[0].astype(int)
    minute = clock[1].astype(int)

    # 12-hour clock to 24-hour clock: 12 AM is hour 0 and 12 PM is hour 12,
    # hence the modulo before the PM offset is added. Timestamps without an
    # AM/PM suffix are taken to be 24-hour values already and left alone.
    is_pm = timestamp.str.match(r".*PM$")
    is_am = timestamp.str.match(r".*AM$")
    hour = hour.where(~(is_am | is_pm), hour % 12 + 12 * is_pm.astype(int))

    # Cyclic encodings so that e.g. 23h and 0h end up close together.
    data["Hour sin"] = np.sin(hour * (2.0 * np.pi / 24))
    data["Hour cos"] = np.cos(hour * (2.0 * np.pi / 24))
    data["Minute sin"] = np.sin(minute * (2.0 * np.pi / 60))
    data["Minute cos"] = np.cos(minute * (2.0 * np.pi / 60))
    data["Day sin"] = np.sin((day - 1) * (2.0 * np.pi / 31))
    data["Day cos"] = np.cos((day - 1) * (2.0 * np.pi / 31))

    data["Attack"] = (~data["Label"].str.match("Normal")).astype(int)

    data.drop(
        columns=[
            # Raw columns that the derived features above replace.
            "Src IP",
            "Dst IP",
            "Timestamp",
            "Label",
            # Columns marked as irrelevant by the original author.
            "Fwd PSH Flags",
            "Fwd URG Flags",
            "CWE Flag Count",
            "ECE Flag Cnt",
            "Fwd Byts/b Avg",
            "Fwd Pkts/b Avg",
            "Fwd Blk Rate Avg",
            "Bwd Byts/b Avg",
            "Bwd Pkts/b Avg",
            "Bwd Blk Rate Avg",
            "Init Fwd Win Byts",
            "Fwd Seg Size Min",
        ],
        inplace=True,
    )

    data = shuffle(data)
    data.reset_index(drop=True, inplace=True)

    return data

def graphs(data: pd.DataFrame):
    """Interactively plot one column of *data*, split by the ``Attack`` flag.

    The column name is read from standard input. Six figures are shown one
    after another: a scatter plot of the attack rows, one of the normal
    rows, both overlaid, and then bar charts comparing the mean, the
    maximum and the minimum of the column for the two classes.

    Args:
        data: Frame containing an ``Attack`` column (1 = attack,
            0 = normal) and the column to plot.

    Raises:
        Exception: If the name typed in is not a column of *data*.
    """
    bar_width = 0.35
    chosen = input("Inserta una columna valida: ")

    if chosen not in data.columns:
        raise Exception("Error.\n")

    # reset_index() gives each class its own 0..n-1 positions, which are
    # used as the x axis of the scatter plots.
    normal_rows = data[data["Attack"] == 0].reset_index()
    attack_rows = data[data["Attack"] == 1].reset_index()

    def show_scatter(title, labelled_rows):
        # One scatter series per (legend label, rows) pair, in order.
        for _, rows in labelled_rows:
            plt.scatter(rows.index, rows[chosen], alpha=0.5)
        plt.title(title)
        plt.legend([label for label, _ in labelled_rows])
        plt.show()

    show_scatter("Gráfica base", [("Ataque", attack_rows)])
    show_scatter("Gráfica base", [("Normal", normal_rows)])
    show_scatter(
        "Gráfica de contraste",
        [("Normal", normal_rows), ("Ataque", attack_rows)],
    )

    # Same two-bar layout for each summary statistic.
    summaries = (
        ("Gráfica de medias", "mean"),
        ("Gráfica de máximos", "max"),
        ("Gráfica de mínimos", "min"),
    )
    for title, statistic in summaries:
        fig, ax = plt.subplots()
        normal_value = getattr(normal_rows[chosen], statistic)()
        attack_value = getattr(attack_rows[chosen], statistic)()
        ax.bar(1 + bar_width / 2, normal_value, bar_width, label="Normal")
        ax.bar(1 - bar_width / 2, attack_value, bar_width, label="Ataque")
        ax.set_title(title)
        ax.set_xticks([])
        ax.legend()
        plt.show()

def mapping(data: pd.DataFrame):
    """Display a heat map of the pairwise column correlations of *data*.

    Args:
        data: Frame whose ``corr()`` matrix is plotted.
    """
    correlations = data.corr()
    palette = sns.diverging_palette(220, 10, as_cmap=True)

    # To label every column on both axes, also pass
    # xticklabels=correlations.columns, yticklabels=correlations.columns.
    sns.heatmap(correlations, cmap=palette)
    plt.title("Mapa de calor")
    plt.show()

    # For a per-feature view of the relation with the target, inspect
    # abs(correlations["Attack"]).
    
def isof(data: pd.DataFrame, contamination: float = 0.0001):
    """Remove the rows that an Isolation Forest flags as outliers.

    Args:
        data: Frame with an ``Attack`` target column; all other columns
            are used as features.
        contamination: Value passed to ``IsolationForest(contamination=...)``.
            The default is the value this function previously hard-coded.

    Returns:
        A tuple ``(x_isof, y_isof)``: the feature frame and the target
        array, both restricted to the rows that were not flagged.
    """
    y = np.array(data["Attack"])
    x = data.drop("Attack", axis=1)

    detector = IsolationForest(bootstrap=True, contamination=contamination, n_jobs=-1)
    # Outliers are the rows predicted as -1. The detector is unsupervised,
    # so the target is not passed to it.
    inlier_mask = detector.fit_predict(x) != -1

    # One positional mask for both objects keeps features and targets
    # aligned whatever the index of ``data`` looks like (dropping by index
    # label would remove extra rows if labels were duplicated).
    x_isof = x.loc[inlier_mask]
    y_isof = y[inlier_mask]

    print("Ctcas:", x_isof.shape, x.shape)

    return x_isof, y_isof

def rfe(x_isof: pd.DataFrame, y_isof: np.ndarray):
    """Select features by cross-validated recursive feature elimination.

    A decision tree is used as the ranking estimator and one feature is
    removed per step.

    Args:
        x_isof: Feature matrix.
        y_isof: Target values aligned with the rows of *x_isof*.

    Returns:
        A tuple ``(x_isof_rfe, y_isof)``: the matrix reduced to the selected
        features, and the target unchanged.
    """
    selector = RFECV(DecisionTreeClassifier(), step=1, n_jobs=-1)
    x_isof_rfe = selector.fit(x_isof, y_isof).transform(x_isof)

    # Shape after selection, then shape before.
    print("Ctcas:", x_isof_rfe.shape, x_isof.shape)

    return x_isof_rfe, y_isof

def pca(x_isof: pd.DataFrame, y_isof: np.ndarray):
    """Project the features onto 20 principal components.

    Args:
        x_isof: Feature matrix.
        y_isof: Target values; handed to ``PCA.fit`` and returned as is.

    Returns:
        A tuple ``(x_isof_pca, y_isof)``: the projected matrix and the
        target unchanged.
    """
    reducer = PCA(n_components=20)
    # Fit and transform stay separate calls on purpose.
    x_isof_pca = reducer.fit(x_isof, y_isof).transform(x_isof)

    # Shape after projection, then shape before.
    print("Ctcas:", x_isof_pca.shape, x_isof.shape)

    return x_isof_pca, y_isof

def rf(x_isof_rfe: pd.DataFrame, y_isof: np.ndarray):
    """Train and evaluate a random forest on the selected features.

    Two evaluations are printed:

    * cross-validation scores over the whole data set, obtained through a
      ``GridSearchCV`` wrapper around the forest;
    * accuracy, Cohen's kappa and F1 of a forest fitted on an 80 % split
      and scored on the remaining 20 %.

    Args:
        x_isof_rfe: Feature matrix.
        y_isof: Target values aligned with the rows of *x_isof_rfe*.

    Returns:
        None. Results are only printed.
    """
    x_train, x_test, y_train, y_test = train_test_split(x_isof_rfe, y_isof, test_size=0.2)

    forest = RandomForestClassifier(n_estimators=100, max_features="sqrt")
    forest.fit(x_train, y_train)

    # The grid is an empty placeholder: no hyper-parameter candidates are
    # listed yet.
    grid = {}
    search = GridSearchCV(forest, grid, refit=True, verbose=4, n_jobs=-1)

    scores = cross_val_score(search, x_isof_rfe, y_isof, n_jobs=-1)
    print("Cross-validation scores: {}".format(scores))
    print("Average cross-validation score: {}".format(scores.mean()))

    # scikit-learn metrics take (y_true, y_pred), in that order.
    predictions = forest.predict(x_test).astype(np.int32)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("Kappa:", cohen_kappa_score(y_test, predictions))
    print("F1:", f1_score(y_test, predictions))

    return
    
def svm(x_isof_rfe: pd.DataFrame, y_isof: np.ndarray):
    """Train and evaluate a linear support vector classifier.

    Two evaluations are printed:

    * cross-validation scores over the whole data set, obtained through a
      ``GridSearchCV`` wrapper around the classifier;
    * accuracy, Cohen's kappa and F1 of a classifier fitted on an 80 %
      split and scored on the remaining 20 %.

    Args:
        x_isof_rfe: Feature matrix.
        y_isof: Target values aligned with the rows of *x_isof_rfe*.

    Returns:
        None. Results are only printed.
    """
    x_train, x_test, y_train, y_test = train_test_split(x_isof_rfe, y_isof, test_size=0.2)

    classifier = LinearSVC(dual=False)
    classifier.fit(x_train, y_train)

    # The grid is an empty placeholder: no hyper-parameter candidates are
    # listed yet.
    grid = {}
    search = GridSearchCV(classifier, grid, refit=True, verbose=4, n_jobs=-1)

    scores = cross_val_score(search, x_isof_rfe, y_isof, n_jobs=-1)
    print("Cross-validation scores: {}".format(scores))
    print("Average cross-validation score: {}".format(scores.mean()))

    # scikit-learn metrics take (y_true, y_pred), in that order.
    predictions = classifier.predict(x_test).astype(np.int32)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("Kappa:", cohen_kappa_score(y_test, predictions))
    print("F1:", f1_score(y_test, predictions))

    return
    
# Seed NumPy's global random generator; presumably this is meant to make the
# random steps below repeatable (TODO confirm each step draws from it).
np.random.seed(1999)
# Load every CSV in data_dir and build the numeric, shuffled feature frame.
data = process_data(get_data(data_dir))
# Optional exploration helpers, left disabled for the timed run:
#print(*data.columns, sep=" - ")
#graphs(data)
#mapping(data)

# Stage 1: outlier removal with an Isolation Forest (wall-clock timed).
t0 = time.time()
x, y = isof(data)
t1 = time.time()
print(f"ISOF: {t1-t0}")

# Stage 2: feature selection; x is rebound to the reduced feature matrix.
t0 = time.time()
x, y = rfe(x, y)
t1 = time.time()
print(f"RFE: {t1-t0}")

# Alternative stage 2 (PCA projection), currently disabled:
#t0 = time.time()
#x, y = pca(x, y)
#t1 = time.time()
#print(f"PCA: {t1-t0}")

# Stage 3a: random forest training and evaluation.
t0 = time.time()
rf(x_isof_rfe=x, y_isof=y)
t1 = time.time()
print(f"RF: {t1-t0}")

# Stage 3b: linear SVC training and evaluation on the same features.
t0 = time.time()
svm(x_isof_rfe=x, y_isof=y)
t1 = time.time()
print(f"L-SVC: {t1-t0}")

Ctcas: (343854, 85) (343889, 85)
ISOF: 33.692408323287964
Ctcas: (343854, 22) (343854, 85)
RFE: 671.8800809383392
Cross-validation scores: [0.99998546 0.99997092 1.         0.99998546 1.        ]
Average cross-validation score: 0.9999883671896583
Accuracy: 0.999985458987073
Kappa: 0.9999548667691287
F1: 0.9999908911215761
RF: 227.03534865379333
Cross-validation scores: [0.9629495  0.94595105 0.94529671 0.94592197 0.94571761]
Average cross-validation score: 0.9491673691457428
Accuracy: 0.9453839554463364
Kappa: 0.8369950305194112
F1: 0.9653230422660044
L-SVC: 230.1864275932312
