In [2]:
import os
import re
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.tree import export_graphviz
from sklearn.utils import shuffle
from sklearn.pipeline import make_pipeline
import pydot

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score

from sklearn.tree import DecisionTreeClassifier

from graphviz import Source

data_dir = "./datos"

def get_data(data_dir: str) -> pd.DataFrame:
    
    data = pd.DataFrame()

    for path in os.listdir(data_dir):
        dir_content = os.path.join(data_dir, path)
        if os.path.isfile(dir_content):
            data = data.append(pd.read_csv(dir_content))

    return data

def process_data(data: pd.DataFrame):
    
    subnet1 = "192\.168\.8\.\d{1,3}"
    subnet2 = "192\.168\.3\.\d{1,3}"
    subnet3 = "200\.175\.2\.\d{1,3}"
    subnet4 = "192\.168\.20\.\d{1,3}"
    subnet5 = "172\.17\.\d{1,3}\.\d{1,3}"
    
    data["Label"].replace({"DDoS ": "DDoS"}, inplace=True)
    data.drop("Flow ID", inplace=True, axis=1)
    
    data["Src 192.168.8.0/24"] = data["Src IP"].str.match(subnet1)
    data["Src 192.168.3.0/24"] = data["Src IP"].str.match(subnet2)
    data["Src 200.175.2.0/24"] = data["Src IP"].str.match(subnet3)
    data["Src 192.168.20.0/24"] = data["Src IP"].str.match(subnet4)
    data["Src 172.17.0.0/16"] = data["Src IP"].str.match(subnet5)
    data["Src exterior ip"] = ~data["Src IP"].str.match("(" + subnet1 + "|" + subnet2 + "|" + subnet3 + "|" + subnet4 + "|" + subnet5 + ")")
    
    data["Src 192.168.8.0/24"] = data["Src 192.168.8.0/24"].astype(int)
    data["Src 192.168.3.0/24"] = data["Src 192.168.3.0/24"].astype(int)
    data["Src 200.175.2.0/24"] = data["Src 200.175.2.0/24"].astype(int)
    data["Src 192.168.20.0/24"] = data["Src 192.168.20.0/24"].astype(int)
    data["Src 172.17.0.0/16"] = data["Src 172.17.0.0/16"].astype(int)
    data["Src exterior ip"] = data["Src exterior ip"].astype(int)
    
    data["Dst 192.168.8.0/24"] = data["Dst IP"].str.match(subnet1)
    data["Dst 192.168.3.0/24"] = data["Dst IP"].str.match(subnet2)
    data["Dst 200.175.2.0/24"] = data["Dst IP"].str.match(subnet3)
    data["Dst 192.168.20.0/24"] = data["Dst IP"].str.match(subnet4)
    data["Dst 172.17.0.0/16"] = data["Dst IP"].str.match(subnet5)
    data["Dst exterior ip"] = ~data["Dst IP"].str.match("(" + subnet1 + "|" + subnet2 + "|" + subnet3 + "|" + subnet4 + "|" + subnet5 + ")")
    
    data["Dst 192.168.8.0/24"] = data["Dst 192.168.8.0/24"].astype(int)
    data["Dst 192.168.3.0/24"] = data["Dst 192.168.3.0/24"].astype(int)
    data["Dst 200.175.2.0/24"] = data["Dst 200.175.2.0/24"].astype(int)
    data["Dst 192.168.20.0/24"] = data["Dst 192.168.20.0/24"].astype(int)
    data["Dst 172.17.0.0/16"] = data["Dst 172.17.0.0/16"].astype(int)
    data["Dst exterior ip"] = data["Dst exterior ip"].astype(int)
    
    data[["Day", "Hour"]] = data["Timestamp"].str.split(" ", 1, expand=True)
    data[["Hour","Minute","PM"]] = data["Hour"].str.split(":", 2, expand=True)
    data[["Day","PM"]] = data["Day"].str.split("/", 1, expand=True)
    data["PM"] = data["Timestamp"].str.match(".*PM$")
    
    data["PM"] = data["PM"].astype(int)
    data["PM"] = 12 * data["PM"]
    data["Hour"] = data["Hour"].astype(int) + data["PM"]
    data["Minute"] = data["Minute"].astype(int)
    data["Day"] = data["Day"].astype(int)
    
    data["Hour sin"] = np.sin(data["Hour"]*(2.*np.pi/24))
    data["Hour cos"] = np.cos(data["Hour"]*(2.*np.pi/24))
    data["Minute sin"] = np.sin(data["Minute"]*(2.*np.pi/60))
    data["Minute cos"] = np.cos(data["Minute"]*(2.*np.pi/60))
    data["Day sin"] = np.sin((data["Day"]-1)*(2.*np.pi/31))
    data["Day cos"] = np.cos((data["Day"]-1)*(2.*np.pi/31))
    
    data["Attack"] = ~data["Label"].str.match("Normal")
    data["Attack"] = data["Attack"].astype(int)
    
    data.drop("Hour", inplace=True, axis=1)
    data.drop("Minute", inplace=True, axis=1)
    data.drop("Day", inplace=True, axis=1)
    data.drop("PM", inplace=True, axis=1)
    
    data.drop("Src IP", inplace=True, axis=1)
    data.drop("Dst IP", inplace=True, axis=1)
    data.drop("Timestamp", inplace=True, axis=1)
    data.drop("Label", inplace=True, axis=1)
    
    #columnas irrelevantes
    data.drop("Fwd PSH Flags", inplace=True, axis=1)
    data.drop("Fwd URG Flags", inplace=True, axis=1)
    data.drop("CWE Flag Count", inplace=True, axis=1)
    data.drop("ECE Flag Cnt", inplace=True, axis=1)
    data.drop("Fwd Byts/b Avg", inplace=True, axis=1)
    data.drop("Fwd Pkts/b Avg", inplace=True, axis=1)
    data.drop("Fwd Blk Rate Avg", inplace=True, axis=1)
    data.drop("Bwd Byts/b Avg", inplace=True, axis=1)
    data.drop("Bwd Pkts/b Avg", inplace=True, axis=1)
    data.drop("Bwd Blk Rate Avg", inplace=True, axis=1)
    data.drop("Init Fwd Win Byts", inplace=True, axis=1)
    data.drop("Fwd Seg Size Min", inplace=True, axis=1)
    
    data = shuffle(data)
    data.reset_index(drop=True, inplace=True)    
    
    return data

def graphs(data: pd.DataFrame):
    
    column = input("Inserta una columna valida: ")
    width = 0.35
    
    if column not in data.columns:
        raise Exception("Error.\n")
    
    attack = data[data["Attack"] == 1].reset_index()
    normal = data[data["Attack"] == 0].reset_index()
        
    plt.scatter(attack.index, attack[column], alpha=0.5)
    plt.title("Gráfica base")
    plt.legend(["Ataque"])
    plt.show()
    
    plt.scatter(normal.index, normal[column], alpha=0.5)
    plt.title("Gráfica base")
    plt.legend(["Normal"])
    plt.show()
    
    plt.scatter(normal.index, normal[column], alpha=0.5)
    plt.scatter(attack.index, attack[column], alpha=0.5)
    plt.title("Gráfica de contraste")
    plt.legend(["Normal","Ataque"])
    plt.show()
    
    fig, ax = plt.subplots()
    ax.bar(1 + width/2, normal[column].mean(), width, label="Normal")
    ax.bar(1 - width/2, attack[column].mean(), width, label="Ataque")
    ax.set_title("Gráfica de medias")
    ax.set_xticks([])
    ax.legend()
    plt.show()
    
    fig, ax = plt.subplots()
    ax.bar(1 + width/2, normal[column].max(), width, label="Normal")
    ax.bar(1 - width/2, attack[column].max(), width, label="Ataque")
    ax.set_title("Gráfica de máximos")
    ax.set_xticks([])
    ax.legend()
    plt.show()
    
    fig, ax = plt.subplots()
    ax.bar(1 + width/2, normal[column].min(), width, label="Normal")
    ax.bar(1 - width/2, attack[column].min(), width, label="Ataque")
    ax.set_title("Gráfica de mínimos")
    ax.set_xticks([])
    ax.legend()
    plt.show()
    
    return

def mapping(data: pd.DataFrame):
    
    # añadir a  heatmap para ver todos los nombres de columnas -> xticklabels=data.corr().columns, yticklabels=data.corr().columns 
    sns.heatmap(data.corr(), cmap=sns.diverging_palette(220, 10, as_cmap=True))
    plt.title("Mapa de calor")
    plt.show()
    
    #cor_target = abs(data.corr()["Attack"])
    #relevant_features = cor_target[cor_target>0]
    #print(relevant_features)
    
    return

def random_forest(data: pd.DataFrame):
    
    labels = np.array(data["Attack"])
    features = data.drop("Attack", axis = 1)
    feature_list = list(features.columns)
    features = np.array(features)
    
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state = 42)
    print(train_features.shape, train_labels.shape, test_features.shape, test_labels.shape)
    
    rf = RandomForestClassifier(n_estimators = 100, max_features="sqrt", random_state = 42)
    rf.fit(train_features, train_labels);
    
    predictions = rf.predict(test_features)
    print("Accuracy:", accuracy_score(predictions.astype(np.int32), test_labels))
    
    '''importances = list(rf.feature_importances_)# List of tuples with variable and importance
    feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]# Sort the feature importances by most important first
    feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)# Print out the feature and importances 
    [print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];'''
    
    '''rf_small = RandomForestRegressor(n_estimators=10, max_depth = 3)
    rf_small.fit(train_features, train_labels)# Extract the small tree
    tree_small = rf_small.estimators_[5]# Save the tree as a png image
    export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names = feature_list, rounded = True, precision = 1)
    (graph, ) = pydot.graph_from_dot_file('small_tree.dot')
    graph.write_png('small_tree.png')'''
    
    graph = Source(export_graphviz(rf.estimators_[5], out_file=None, feature_names=feature_list))
    graph.format = 'png'
    graph.render('dt0', view=False);
    
    print("Kappa:", str(cohen_kappa_score(predictions.astype(np.int32), test_labels)))

    return 
    
def random_forest_isof(data: pd.DataFrame):
    
    y = np.array(data["Attack"])
    X = data.drop("Attack", axis = 1)
    
    iforest = IsolationForest(bootstrap=True,
                          contamination=0.0001, 
                          max_features=10, 
                          max_samples=10, 
                          n_estimators=100, 
                          n_jobs=-1,
                         random_state=1)
    y_pred = iforest.fit_predict(X)
    
    X.drop(X.index[np.asarray(np.where(y_pred != -1)).tolist()[0]]).to_csv("x.csv")
    np.savetxt("y.txt", np.delete(y, np.asarray(np.where(y_pred != -1)).tolist()[0]))
    
    X_isof = X.drop(X.index[np.asarray(np.where(y_pred == -1)).tolist()[0]])
    y_isof = np.delete(y, np.asarray(np.where(y_pred == -1)).tolist()[0])

    X_train, X_test, y_train, y_test = train_test_split(X_isof, y_isof, test_size=0.2, random_state=42)
    
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    
    rf = RandomForestClassifier(n_estimators = 100, max_features="sqrt", random_state = 42)
    rfe = RFECV(rf, cv=5)
    rfe.fit(X_train, y_train);
    
    predictions = rfe.predict(X_test)
    
    print("Accuracy:", accuracy_score(predictions.astype(np.int32), y_test))
    print("Kappa:", str(cohen_kappa_score(predictions.astype(np.int32), y_test)))
    
    graph = Source(export_graphviz(rf.estimators_[0], out_file=None, feature_names=list(X.columns)))
    graph.format = 'png'
    graph.render('dt1', view=False);
    
    return

def linear_svm_isof(data: pd.DataFrame):
    
    y = np.array(data["Attack"])
    X = data.drop("Attack", axis = 1)
    
    iforest = IsolationForest(bootstrap=True,
                          contamination=0.0001, 
                          max_features=10, 
                          max_samples=10, 
                          n_estimators=100, 
                          n_jobs=-1,
                         random_state=1)
    y_pred = iforest.fit_predict(X)
    
    X_isof = X.drop(X.index[np.asarray(np.where(y_pred == -1)).tolist()[0]])
    y_isof = np.delete(y, np.asarray(np.where(y_pred == -1)).tolist()[0])

    X_train, X_test, y_train, y_test = train_test_split(X_isof, y_isof, test_size=0.2, random_state=42)
    
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    
    lsvm = LinearSVC(random_state=42)
    #lsvm.n_iter_ = 20
    lsvm.fit(X_train, y_train);
    
    predictions = lsvm.predict(X_test)
    
    print("Accuracy:", accuracy_score(predictions.astype(np.int32), y_test))
    print("Kappa:", str(cohen_kappa_score(predictions.astype(np.int32), y_test)))
    
    return

def rf(data: pd.DataFrame):
    
    
    y = np.array(data["Attack"])
    X = data.drop("Attack", axis = 1)
    
    #isof
    isof = IsolationForest(bootstrap=True, contamination=0.0001, n_jobs=-1)
    y_isof = isof.fit_predict(X, y)
    
    X_isof = X.drop(X.index[np.asarray(np.where(y_isof == -1)).tolist()[0]])
    y_isof = np.delete(y, np.asarray(np.where(y_isof == -1)).tolist()[0])
    #end isof
    
    #rfe
    rfe = RFECV(DecisionTreeClassifier(), n_jobs=-1)
    rfe.fit(X_isof, y_isof)
    
    X_isof_rfe = rfe.transform(X_isof)
    #end rfe
    
    X_train, X_test, y_train, y_test = train_test_split(X_isof_rfe, y_isof, test_size=0.2)
    
    rf = RandomForestClassifier(n_estimators = 100, max_features="sqrt")
    rf.fit(X_train, y_train)
    
    scores = cross_val_score(rf, X_train, y_train, n_jobs=-1)
    print("Cross-validation scores: {}".format(scores))
    print("Average cross-validation score: {}".format(scores.mean()))
    
    predictions = rf.predict(X_test)
    print("Accuracy:", accuracy_score(predictions.astype(np.int32), y_test))
    print("Kappa:", cohen_kappa_score(predictions.astype(np.int32), y_test))

    return 
    
def svm(data: pd.DataFrame):
    
    
    y = np.array(data["Attack"])
    X = data.drop("Attack", axis = 1)
    
    #isof
    isof = IsolationForest(bootstrap=True, contamination=0.0001, n_jobs=-1)
    y_isof = isof.fit_predict(X, y)
    
    X_isof = X.drop(X.index[np.asarray(np.where(y_isof == -1)).tolist()[0]])
    y_isof = np.delete(y, np.asarray(np.where(y_isof == -1)).tolist()[0])
    #end isof
    
    #rfe
    rfe = RFECV(DecisionTreeClassifier(), n_jobs=-1)
    rfe.fit(X_isof, y_isof)
    
    X_isof_rfe = rfe.transform(X_isof)
    print("Ctcas:", X_isof_rfe.shape, X_isof.shape)
    #end rfe
    
    #x val externa en vez de split, añadir f1 f1_score
    X_train, X_test, y_train, y_test = train_test_split(X_isof_rfe, y_isof, test_size=0.2)
    
    svm = LinearSVC(dual=False)
    svm.fit(X_train, y_train)
    
    scores = cross_val_score(svm, X_train, y_train, n_jobs=-1)
    print("Cross-validation scores: {}".format(scores))
    print("Average cross-validation score: {}".format(scores.mean()))
    
    predictions = svm.predict(X_test)
    print("Accuracy:", accuracy_score(predictions.astype(np.int32), y_test))
    print("Kappa:", cohen_kappa_score(predictions.astype(np.int32), y_test))

    return 
    
data = process_data(get_data(data_dir))
#print(*data.columns, sep=" - ")
#graphs(data)
#mapping(data)
'''random_forest(data)
random_forest_isof(data)
linear_svm_isof(data)'''
t0 = time.time()
rf(data)
t1 = time.time()
print(f"RF: {t1-t0}")
t1 = time.time()
svm(data)
t2 = time.time()
print(f"SVM: {t2-t1}")

Cross-validation scores: [1.         1.         0.99997092 1.         0.99997092]
Average cross-validation score: 0.9999883671050807
Accuracy: 0.999985458987073
Kappa: 0.9999546243666732
RF: 856.0297689437866
Ctcas: (343854, 21) (343854, 85)
Cross-validation scores: [0.77637376 0.78073607 0.83174594 0.77512324 0.81705686]
Average cross-validation score: 0.7962071717929249
Accuracy: 0.8042634249901848
Kappa: 0.4589747070463098
SVM: 780.5863149166107
