# Detecção em duas fases
Modelos do scikit-learn, parâmetros para criação de cenários, pré-processamento dos dados, aprendizado dos modelos e predições.  
Gera arquivo para análise e gráficos para uso em outro caderno.

In [1]:
import os
import numpy as np
import pandas as pd
from itertools import product
from math import ceil
from time import time
from timer import timer
# from tqdm.notebook import tnrange,tqdm
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,recall_score,f1_score,roc_auc_score,confusion_matrix

from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,ExtraTreesClassifier,BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier # cuidado com versões > 1.2.2: bug c_contiguous
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

from sklearnex import patch_sklearn
patch_sklearn()

# --- Execution parameters ---------------------------------------------------
# Name of the results file to generate
table_name = "table4.csv"
# Directory holding the derived datasets
path_to_datasets = "datasets"
# Base seed for every randomised step
random_state = 2023
# Number of cores to use where the estimator supports it
n_jobs = 14
# Number of iterations per scenario
n_iter = 50

# --- Scenario construction parameters ---------------------------------------
# Test fractions: floats, 0.0 < value <= 1.0
test_sizes = [0.5, 0.35, 0.1]
# Attack fractions: floats, 0.0 < value < 1.0
hits = [0.05, 0.1, 0.25, 0.5]
# Sample sizes: int, or float between 0 and 1; automatic values are appended later
samples_sizes = [5000]

# --- Sampling by population-mean estimation ---------------------------------
# (https://pessoal.dainf.ct.utfpr.edu.br/maurofonseca/doku.php?id=cursos:sadrc:exdimesionamento)
# Error margin (Student's t)
E = 0.01
# Table value for infinite samples, 90% two-tailed
GL = 1.645

# --- Model parameters -------------------------------------------------------
# Dimensionality threshold at which the Logistic Regression solver is switched
lr_d = 18
# Values of "k" for the KNN models
knns = [2, 5, 10]

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Função para executar o aprendizado e predição

In [2]:
def run(X,Y,models,n_iter:int=1,n_samples:int=1,random_state:int=0,test_size:float=0.5,hit:float=0.1):
    """Train and evaluate every model on ``n_iter`` freshly sampled scenarios.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. If it contains a ``train`` column, rows with
        ``train == 1`` feed the training set and rows with ``train == 0``
        feed the test set; otherwise a stratified ``train_test_split`` is used.
    Y : pandas.Series
        Labels; must be named ``Attack`` (1 = attack, 0 = normal) because the
        column is addressed by that name after concatenation with ``X``.
    models : dict
        Mapping ``name -> estimator``.
    n_iter : int
        Number of iterations; iteration ``i`` samples with seed ``random_state + i``.
    n_samples : int
        Total number of rows drawn per iteration.
    random_state : int
        Base seed.
    test_size : float
        Fraction of the sample used for testing.
    hit : float
        Fraction of the sample drawn from the attack rows.

    Returns
    -------
    list
        One entry per (model, iteration):
        ``[name, iteration, train_time, test_time, fp, accuracy, recall, f1,
        roc_auc, Y_test, Y_pred]``.
    """
    import warnings
    # NOTE: this changes the process-wide warning filter, not only this call.
    warnings.simplefilter("ignore")

    # Independent of model and iteration, so build it only once.
    XY = pd.concat([X,Y],axis=1)
    has_split_tag = 'train' in XY.columns

    resultados = []
    for name,clf in models.items():
        for i in range(n_iter):
            seed = random_state+i
            if has_split_tag:
                attacks = XY[XY.Attack == 1].sample(n=int(n_samples*hit),random_state=seed)
                # Seeded so that every model (and every call with the same
                # arguments) is evaluated on exactly the same attack rows.
                attacks_train = attacks.sample(frac=(1-test_size),random_state=seed)
                attacks_test = attacks.drop(index=attacks_train.index)
                # NOTE(review): rows tagged train == 1 are not filtered by
                # Attack here, unlike the test rows below - presumably the
                # training portion contains no attacks; confirm.
                normal_train = XY[XY.train == 1].sample(
                    n=int(n_samples*(1-test_size))-attacks_train.shape[0],random_state=seed)
                normal_test = XY[(XY.train == 0) & (XY.Attack == 0)].sample(
                    n=int(n_samples*test_size)-attacks_test.shape[0],random_state=seed)
                X_train = pd.concat([attacks_train,normal_train],axis=0)
                X_test = pd.concat([attacks_test,normal_test],axis=0)
                Y_train = X_train['Attack']
                Y_test = X_test['Attack']
                X_train = X_train.drop(columns=['train','Attack'])
                X_test = X_test.drop(columns=['train','Attack'])
            else:
                attacks = XY[XY.Attack == 1].sample(n=int(n_samples*hit),random_state=seed)
                normal = XY[XY.Attack == 0].sample(n=n_samples-attacks.shape[0],random_state=seed)
                sampled = pd.concat([normal,attacks],axis=0).reset_index(drop=True)
                Y_ = sampled['Attack']
                X_ = sampled.drop(columns=['Attack'])
                # The split itself uses the base seed; the sampled rows change with i.
                X_train,X_test,Y_train,Y_test = train_test_split(
                    X_,Y_,random_state=random_state,stratify=Y_,test_size=test_size)

            is_novelty_detector = isinstance(clf,OneClassSVM) or (
                isinstance(clf,LocalOutlierFactor) and clf.novelty)
            if is_novelty_detector:
                # Novelty detection: fit on normal rows only, then predict.
                X_train = X_train[Y_train == 0]
                start = time()
                clf.fit(X_train)
                lap = time()
                Y_pred = clf.predict(X_test)
                stop = time()
                # Negative predictions become label 1 (attack), the rest 0.
                Y_pred = np.where(Y_pred < 0,1,0)
            elif isinstance(clf,(IsolationForest,LocalOutlierFactor)):
                # Outlier detection: fitted and scored directly on the test rows,
                # so no separate training time is recorded.
                start = time()
                Y_pred = clf.fit_predict(X_test)
                stop = time()
                lap = start
                Y_pred = np.where(Y_pred < 0,1,0)
            else:
                start = time()
                clf.fit(X_train,Y_train)
                lap = time()
                Y_pred = clf.predict(X_test)
                stop = time()

            accuracy = accuracy_score(y_true=Y_test,y_pred=Y_pred)
            recall = recall_score(y_true=Y_test,y_pred=Y_pred,zero_division=0)
            f1 = f1_score(y_true=Y_test,y_pred=Y_pred,zero_division=0)
            roc_auc = roc_auc_score(y_true=Y_test,y_score=Y_pred)
            train_time = lap - start
            test_time = stop - lap
            cm = confusion_matrix(y_true=Y_test,y_pred=Y_pred)
            # Column sum of the second class minus its diagonal: false positives.
            fp = (cm.sum(axis=0) - np.diag(cm))[1]
            resultados.append([name,i,train_time,test_time,fp,accuracy,recall,f1,
                               roc_auc,Y_test,Y_pred])
    return resultados

Separação dos modelos segundo tarefa e pacote

In [3]:
# Supervised scikit-learn estimators, keyed by the label used in the results.
models_sup_sk = {
    "Perceptron": Perceptron(n_jobs=n_jobs, random_state=random_state),
    "LogisticRegression": LogisticRegression(max_iter=400, n_jobs=n_jobs, random_state=random_state),
    "SVC-Linear": SVC(kernel='linear', random_state=random_state),
    "SVC-RBF": SVC(kernel='rbf', random_state=random_state),
    "SGDClassifier": SGDClassifier(n_jobs=n_jobs, random_state=random_state),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=random_state),
    "RandomForestClassifier": RandomForestClassifier(n_jobs=n_jobs, random_state=random_state),
    "AdaBoostClassifier": AdaBoostClassifier(random_state=random_state),
    "MLPClassifier": MLPClassifier(random_state=random_state),
    "GaussianProcessClassifier": GaussianProcessClassifier(n_jobs=n_jobs, random_state=random_state),
    "GaussianNB": GaussianNB(),
    "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
    "ExtraTrees": ExtraTreesClassifier(n_jobs=n_jobs, random_state=random_state),
    "Bagging-DTree": BaggingClassifier(DecisionTreeClassifier(), n_jobs=n_jobs, random_state=random_state),
    "Bagging-RF": BaggingClassifier(RandomForestClassifier(), n_jobs=n_jobs, random_state=random_state),
    "Bagging-ETree": BaggingClassifier(ExtraTreesClassifier(), n_jobs=n_jobs, random_state=random_state),
}

# Unsupervised scikit-learn estimators. Each detector appears twice: once
# under its plain name and once under a "-C" name with the same constructor
# arguments.
models_unsup_sk = {
    "OneClassSVM-Linear": OneClassSVM(kernel='linear'),
    "OneClassSVM-RBF": OneClassSVM(kernel='rbf'),
    "LOF-Novelty": LocalOutlierFactor(novelty=True, n_jobs=n_jobs),
    "LOF": LocalOutlierFactor(n_jobs=n_jobs),
    "IsolationForest": IsolationForest(n_jobs=n_jobs, random_state=random_state),
    "OneClassSVM-Linear-C": OneClassSVM(kernel='linear'),
    "OneClassSVM-RBF-C": OneClassSVM(kernel='rbf'),
    "LOF-Novelty-C": LocalOutlierFactor(novelty=True, n_jobs=n_jobs),
    "LOF-C": LocalOutlierFactor(n_jobs=n_jobs),
    "IsolationForest-C": IsolationForest(n_jobs=n_jobs, random_state=random_state),
}

# One KNN classifier per configured "k", appended after the other supervised models.
for knn in knns:
    models_sup_sk[f"KNN-{knn}"] = KNeighborsClassifier(n_neighbors=knn, n_jobs=n_jobs)

Obtenção de lista dos nomes dos datasets para carga em memória conforme demanda.

In [4]:
# Carga da lista de datasets do diretório
dataset_list = [os.path.splitext(_)[0] for _ in os.listdir(path_to_datasets) 
                if os.path.splitext(_)[1] == '.csv']

Criação de variável para armazenar quantidade de amostras segundo estimativa populacional

In [5]:
# Sample size per dataset from the population-mean estimate
# n = (GL * s / E)^2, where s is the largest per-column standard deviation
# of the min-max scaled features.
auto_sample = {}
for dataset in dataset_list:
    features = pd.read_csv(f"{path_to_datasets}/{dataset}.csv")
    # Neither the split tag nor the label takes part in the estimate.
    if 'train' in features.columns:
        features = features.drop(columns=['train'])
    features = features.drop(columns=['Attack'])
    scaled = pd.DataFrame(MinMaxScaler().fit_transform(features))
    largest_std = max(scaled.std())
    auto_sample[dataset] = ceil((GL * largest_std / E)**2)
    # Release the tables before loading the next dataset.
    del features, scaled

## Criação da tabela de resultados da detecção

In [7]:
# Rebuild the sentinel entries so this cell can be re-executed safely.
# Only the sentinels are removed; every user-supplied size is kept.
samples_sizes[:] = [size for size in samples_sizes if size not in (0, -1)]
samples_sizes.append(0)   # sentinel: estimated sample size of each dataset
samples_sizes.append(-1)  # sentinel: twice the estimated sample size
# Materialised so that tqdm can display the total number of scenarios.
experiment_params = list(product(dataset_list, samples_sizes, test_sizes, hits))
table = {column: [] for column in ("HAI",
                                   "Files",
                                   "Selector",
                                   "N_Samples",
                                   "Test_Size",
                                   "Contamination",
                                   "Classifier_1s",
                                   "Classifier_2s",
                                   "Accuracy",
                                   "Recall",
                                   "F1",
                                   "AUC",
                                   "FP",
                                   "FN",
                                   "FPR",
                                   "FNR",
                                   "Train_Time",
                                   "Test_Time",
                                   "Iteration")}
# Column layout of the rows returned by run().
result_columns = ['Classifier',
                  'iteration',
                  'train_time',
                  'test_time',
                  'FP',
                  'Accuracy', 'Recall', 'F1',
                  'ROC AUC',
                  'Y_true', 'Y_pred']
done = set()  # scenarios already evaluated, after clamping of size and hit
tic = time()

for dataset, sample_size, test_size, hit_ in tqdm(experiment_params):
    # Load the dataset
    df = pd.read_csv(path_to_datasets+"/"+dataset+".csv")
    Y = df['Attack']
    tag = None
    if 'train' in df:
        tag = df['train']
        df = df.drop(columns=['train'])
    X = df.drop(columns=['Attack'])
    del df

    # Normalisation (the split tag is re-attached unscaled afterwards)
    scaler = MinMaxScaler()
    X = pd.DataFrame(scaler.fit_transform(X))
    if tag is not None:
        X['train'] = tag

    # Sample size
    if sample_size == 0:
        n_samples = auto_sample[dataset]
    elif sample_size == -1:
        n_samples = 2 * auto_sample[dataset]
    elif isinstance(sample_size, float):
        n_samples = int(len(Y) * sample_size)
    elif isinstance(sample_size, int):
        n_samples = sample_size
    else:
        raise NotImplementedError
    # Keep the size within [2, number of available rows].
    n_samples = min(max(n_samples, 2), len(Y))

    # Cap the attack fraction by the number of attacks actually available
    n_attacks = Y.sum()
    hit = n_attacks / n_samples if int(hit_ * n_samples) > n_attacks else hit_

    # Different parameter combinations may collapse into the same scenario
    # after clamping; evaluate each one only once.
    todo = (dataset, n_samples, test_size, hit)
    if todo in done:
        continue

    # NOTE(review): X.shape[1] includes the 'train' tag column when present.
    solver = "sag" if X.shape[1] > lr_d else "lbfgs"
    models_sup_sk["LogisticRegression"].set_params(solver=solver)
    resultado_sup = run(X, Y, models_sup_sk,
                        n_iter=n_iter,
                        n_samples=n_samples,
                        random_state=random_state,
                        test_size=test_size,
                        hit=hit)

    # The "-C" detectors receive the scenario's attack fraction.
    for model_name, detector in models_unsup_sk.items():
        if "-C" not in model_name:
            continue
        if isinstance(detector, OneClassSVM):
            detector.set_params(nu=hit)
        elif isinstance(detector, (LocalOutlierFactor, IsolationForest)):
            detector.set_params(contamination=hit)
        else:
            raise NotImplementedError

    resultado_unsup = run(X, Y, models_unsup_sk,
                          n_iter=n_iter,
                          n_samples=n_samples,
                          random_state=random_state,
                          test_size=test_size,
                          hit=hit)

    # Composition of the two stages
    data_1s = pd.DataFrame(resultado_sup, columns=result_columns)
    data_2s = pd.DataFrame(resultado_unsup, columns=result_columns)
    n_test = data_1s.loc[0, 'Y_true'].size
    n_hit = data_1s.loc[0, 'Y_true'].sum()
    second_stage_names = data_2s['Classifier'].unique()
    chain = []
    for clf_1s in data_1s['Classifier'].unique():
        for n_it in range(n_iter):
            first = data_1s.query(f"Classifier == '{clf_1s}' and iteration == {n_it}")
            Y_pred_1 = first['Y_pred'].iloc[0]
            Y_test = first['Y_true'].iloc[0]
            ttr1 = first['train_time'].iloc[0]
            tte1 = first['test_time'].iloc[0]
            fp0 = first['FP'].iloc[0]
            cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred_1)
            # Row sum of the second class minus its diagonal: false negatives.
            fn0 = (cm.sum(axis=1) - np.diag(cm))[1]
            # First stage on its own
            chain.append([clf_1s, n_it, "1step_only",
                          ttr1, tte1, fp0, fn0,
                          first['Accuracy'].iloc[0],
                          first['Recall'].iloc[0],
                          first['F1'].iloc[0],
                          first['ROC AUC'].iloc[0],
                          Y_test, Y_pred_1])
            for od in second_stage_names:
                second = data_2s.query(f"Classifier == '{od}' and iteration == {n_it}")
                Y_pred_2 = second['Y_pred'].iloc[0]
                Y_test = second['Y_true'].iloc[0]
                # Element-wise OR of both stages. This assumes both run()
                # calls produced the same test rows in the same order.
                Y_pred_3 = Y_pred_1 + Y_pred_2
                Y_pred_3[Y_pred_3 == 2] = 1
                ttr2 = second['train_time'].iloc[0]
                tte2 = second['test_time'].iloc[0]
                cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred_3)
                fp = (cm.sum(axis=0) - np.diag(cm))[1]
                fn = (cm.sum(axis=1) - np.diag(cm))[1]
                chain.append([clf_1s, n_it, od,
                              ttr1+ttr2, tte1+tte2, fp, fn,
                              accuracy_score(Y_test, Y_pred_3),
                              recall_score(Y_test, Y_pred_3, zero_division=0),
                              f1_score(Y_test, Y_pred_3, zero_division=0),
                              roc_auc_score(Y_test, Y_pred_3),
                              Y_test, Y_pred_3])

    # The dataset name carries the table fields, separated by '_'.
    name = dataset.split('_')
    files = '_'.join([name[1], name[2]]) if 'tr' in name[1] else name[1]
    for link in chain:
        (clf_first, n_it, clf_second,
         train_time, test_time, fp, fn,
         score_acc, score_recall, score_f1, score_rocauc,
         _, _) = link
        table["HAI"].append(name[0])
        table['Files'].append(files)
        table['Selector'].append(name[-1])
        table['N_Samples'].append(n_samples)
        table['Test_Size'].append(n_test)
        table['Contamination'].append(n_hit)
        table['Classifier_1s'].append(clf_first)
        table['Classifier_2s'].append(clf_second)
        table['Accuracy'].append(score_acc)
        table['Recall'].append(score_recall)
        table['F1'].append(score_f1)
        table['AUC'].append(score_rocauc)
        table['FP'].append(fp)
        table['FN'].append(fn)
        table['FPR'].append(fp/(n_test-n_hit))
        table['FNR'].append(fn/n_hit)
        table['Train_Time'].append(train_time)
        table['Test_Time'].append(test_time)
        table['Iteration'].append(n_it)

    done.add(todo)
toc = time()
timer(toc-tic)

360it [114:25:25, 1144.24s/it]

Done in 4d 18h 25m 25s





Salvando tabela

In [8]:
# Turn the accumulated result columns into a table and write it as CSV,
# leaving out the index column.
dftable = pd.DataFrame(data=table)
dftable.to_csv(path_or_buf=table_name, index=False)