In [None]:
pip install brminer

In [None]:
pip install scikit-posthocs

In [None]:
pip install Orange

In [None]:
pip install Orange3-Associate

In [5]:
import numpy as np
import math
import os
import random
import pandas as pd
import tensorflow as tf
import sys
import math
import csv
import zipfile
import Orange
import matplotlib.pyplot as plt

# Post hoc tests
import scikit_posthocs as sp
from scipy.stats import friedmanchisquare

# Classifiers
import brminer
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.mixture import GaussianMixture

# Others
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from zipfile import ZipFile

# Sklearn
from sklearn import metrics
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# BRMiner and distance metrics
from sklearn.base import BaseEstimator
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_distances
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

  import pandas.util.testing as tm


In [None]:
# Colab-only cell: calls google.colab's upload helper and keeps its return value in `uploaded`.
# NOTE(review): presumably used to upload 'Unsupervised_Anomaly_Detection.zip', which later
# cells open from the working directory -- confirm.
from google.colab import files

uploaded = files.upload()

In [None]:
class BRM_dissimilarities(BaseEstimator):
    """One-class scorer built from an ensemble of random "center" samples.

    ``fit`` min-max scales the training rows and draws ``classifier_count``
    samples (with replacement) from them.  ``score_samples`` rates a row by its
    dissimilarity to the nearest center of every sample, turns that into a
    similarity with ``exp(-(d / max_d)**2 / sd)``, averages over the ensemble
    and returns ``-1 + 2 * similarity`` (higher = closer to the training data).

    Parameters
    ----------
    classifier_count : int
        Number of center samples drawn in ``fit``.
    bootstrap_sample_percent : float
        Size of each sample as a percentage of the number of training rows
        (used unless ``use_bootstrap_sample_count`` is true).
    use_bootstrap_sample_count : bool
        When true, ``bootstrap_sample_count`` gives the sample size instead.
    bootstrap_sample_count : int
        Absolute sample size, see ``use_bootstrap_sample_count``.
    use_past_even_queue : bool
        When true, every score is blended with the similarities of up to
        ``max_event_count`` previously scored rows (stateful scoring).
    max_event_count : int
        Maximum length of the past-events queue.
    alpha : float
        Weight of the past-events term in the blend; ``1 - alpha`` weights the
        current similarity.
    user_threshold : number
        Percentage used by ``predict``: the decision threshold is the training
        score found at ``(100 - user_threshold)`` percent of the ascending
        sorted training scores.
    dissimilarity_meassure : callable
        Pairwise function ``f(A, B)`` returning a dissimilarity matrix
        (the misspelled name is kept because it is part of the public API).
    """

    def __init__(self, classifier_count=100, bootstrap_sample_percent=100, use_bootstrap_sample_count=False,
                 bootstrap_sample_count=0, use_past_even_queue=False, max_event_count=3, alpha=0.5,
                 user_threshold=95, dissimilarity_meassure = euclidean_distances):
        self.classifier_count = classifier_count
        self.bootstrap_sample_percent = bootstrap_sample_percent
        self.use_bootstrap_sample_count = use_bootstrap_sample_count
        self.bootstrap_sample_count = bootstrap_sample_count
        self.use_past_even_queue = use_past_even_queue
        self.max_event_count = max_event_count
        self.alpha = alpha
        self.user_threshold = user_threshold
        self.dissimilarity_meassure = dissimilarity_meassure

    def _evaluate(self, current_similarity):
        """Map one ensemble similarity to a score, optionally smoothing it
        with the queue of previously seen similarities."""
        if current_similarity < 0:
            current_similarity = 0

        if not self.use_past_even_queue:
            return -1 + 2 * current_similarity

        # Blend the running sum of past similarities with the current one.
        result_similarity = (self.alpha * self._similarity_sum / self.max_event_count
                             + (1 - self.alpha) * current_similarity)
        if result_similarity < 0:
            result_similarity = 0

        # Slide the window: add the new value, evict the oldest once full.
        self._similarity_sum += current_similarity
        if len(self._past_events) == self.max_event_count:
            self._similarity_sum -= self._past_events.pop(0)
        self._past_events.append(current_similarity)

        if self._similarity_sum < 0:
            self._similarity_sum = 0

        return -1 + 2 * result_similarity

    def score_samples(self, X):
        """Return a list with one score per row of ``X`` (higher = more similar
        to the training data).  ``X`` is scaled with the scaler fitted in
        ``fit``; rows are processed in batches of 100, in order."""
        X_test = self._scaler.transform(np.array(X))

        batch_size = 100
        result = []
        for start in range(0, len(X_test), batch_size):
            batch = X_test[start:start + batch_size]

            # One similarity vector per ensemble member: distance to the
            # nearest center, normalised, squared and scaled by that member's sd.
            member_similarities = [
                np.exp(-np.power(np.amin(self.dissimilarity_meassure(batch, centers), axis=1)
                                 / self._max_dissimilarity, 2) / sd)
                for centers, sd in zip(self._centers, self._sd)
            ]
            current_similarity = np.average(member_similarities, axis=0)

            # _evaluate is stateful when the past-events queue is on, so the
            # rows must be fed to it strictly in order.
            result.extend(map(self._evaluate, current_similarity))

        return result

    def predict(self, X):
        """Label every row of ``X`` as ``1`` (score above the threshold) or
        ``-1`` (score at or below it).

        The threshold is derived lazily, on the first call after ``fit``, from
        the scores of the training data.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or its column count differs from training.
        """
        if (len(X.shape) < 2):
            raise ValueError('Reshape your data')

        if (X.shape[1] != self.n_features_in_):
            raise ValueError('Reshape your data')

        if not self._is_threshold_Computed:
            # Scoring the training set must not leak into the past-events
            # queue used for the rows the caller actually asked about.
            queue_state = (self._similarity_sum, list(self._past_events))
            train_scores = sorted(self.score_samples(self._X_train))
            self._similarity_sum, self._past_events = queue_state

            # int() keeps float thresholds usable as an index; the clamp keeps
            # user_threshold values of 0 or above 100 inside the list.
            cut = int((100 - self.user_threshold) * len(train_scores) // 100)
            cut = min(max(cut, 0), len(train_scores) - 1)
            self._inner_threshold = train_scores[cut]
            self._is_threshold_Computed = True

        y_pred_classif = self.score_samples(X)
        return [-1 if s <= self._inner_threshold else 1 for s in y_pred_classif]

    def fit(self, X, y = None):
        """Fit the scaler and draw the random center samples.

        ``y`` is only validated together with ``X``; it is not used for
        training.  Returns ``self``.

        Raises
        ------
        ValueError
            If the validated training data has no feature columns.
        """
        # Check that X and y have correct shape
        if y is not None:
            X_train, _ = check_X_y(X, y)
        else:
            X_train = check_array(X)

        # Reset all state that scoring / predicting relies on.
        self._similarity_sum = 0
        self._past_events = []
        self._is_threshold_Computed = False

        self.n_features_in_ = X_train.shape[1]

        if self.n_features_in_ < 1:
            raise ValueError('Unable to instantiate the train dataset - Empty vector')

        # Kept unscaled: predict() scores it through score_samples(), which
        # applies the scaler itself.
        self._X_train = X_train

        self._scaler = MinMaxScaler()
        list_instances = self._scaler.fit_transform(X_train).tolist()

        # Normalisation constant for dissimilarities (sqrt of the feature count).
        self._max_dissimilarity = math.sqrt(self.n_features_in_)
        sampleSize = int(self.bootstrap_sample_count) if (self.use_bootstrap_sample_count) else int(0.01 * self.bootstrap_sample_percent * len(list_instances))

        # Collect in plain lists and convert once instead of re-allocating the
        # arrays with np.insert on every iteration.
        all_centers = []
        all_sd = []
        for _ in range(0, self.classifier_count):
            centers = random.choices(list_instances, k=sampleSize)
            all_centers.append(centers)
            all_sd.append(2*(np.mean(self.dissimilarity_meassure(centers, centers))/self._max_dissimilarity)**2)

        self._centers = np.array(all_centers, dtype=float).reshape(len(all_centers), sampleSize, self.n_features_in_)
        self._sd = np.array(all_sd, dtype=float)

        return self

In [None]:
# Function to split target from data 
def splitdataset(train, test):
    """Split train/test frames into feature matrices (raw, min-max scaled and
    standardised) and label vectors.

    The last column of both frames is treated as the label.  Training rows
    whose label equals ``' positive'`` (leading space included) are dropped
    before anything else happens.  Object-typed feature columns are one-hot
    encoded on train and test together; both scalers are fitted on the
    training features only.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, X_train_minmax, X_test_minmax,
        X_train_std, X_test_std)``
    """
    label_position = len(train.columns) - 1
    train = train.drop(train[train.iloc[:, label_position] == ' positive'].index)
    n_train_rows = len(train)

    # Encode train and test together so both end up with the same columns.
    all_data = pd.concat([train, test], ignore_index=True, sort=False, axis=0)
    features = all_data.iloc[:, :-1]
    nominal_features = features.select_dtypes(include=['object'])
    numeric_features = features.select_dtypes(exclude=['object'])

    if nominal_features.shape[1] > 0:
        # The default encoder already returns a sparse matrix; not passing the
        # `sparse=` keyword keeps this working on scikit-learn releases where
        # that keyword was renamed.  The encoder is only built when there is
        # something to encode.
        encoder = OneHotEncoder()
        encoded_nominals = pd.DataFrame(encoder.fit_transform(nominal_features).toarray())
        coded_features = pd.concat([encoded_nominals, numeric_features], ignore_index=True, sort=False, axis=1)
    else:
        coded_features = features

    # Seperating the target variable
    X_train = coded_features[:n_train_rows]
    y_train = train.values[:, -1]

    X_test = coded_features[n_train_rows:]
    y_test = test.values[:, -1]

    mm_scaler = MinMaxScaler()
    X_train_minmax = pd.DataFrame(mm_scaler.fit_transform(X_train[X_train.columns]), index=X_train.index, columns=X_train.columns)
    X_test_minmax = pd.DataFrame(mm_scaler.transform(X_test[X_test.columns]), index=X_test.index, columns=X_test.columns)

    std_scaler = StandardScaler()
    X_train_std = pd.DataFrame(std_scaler.fit_transform(X_train[X_train.columns]), index=X_train.index, columns=X_train.columns)
    X_test_std = pd.DataFrame(std_scaler.transform(X_test[X_test.columns]), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test, X_train_minmax, X_test_minmax, X_train_std, X_test_std

# Function to make predictions
def prediction(X_test, clf_object):
    """Return whatever ``clf_object.score_samples`` yields for ``X_test``."""
    return clf_object.score_samples(X_test)

def result_of_Class(y_test, y_pred, saveFile):
    """Write ``y_pred`` to ``saveFile`` as text with four decimals per value.

    ``y_test`` is accepted but not used.
    """
    score_format = '%.4f'
    np.savetxt(fname=saveFile, X=y_pred, fmt=score_format)

In [None]:
# convert Arff files into CSV considering the column names
def convert_arff_to_csv(arff_file, file_name):
    """Convert the ARFF file ``arff_file`` into ``file_name + ".csv"``.

    The CSV header is built from the names following each ``@attribute``
    keyword; every line after the ``@data`` keyword is copied unchanged.
    Keywords are matched case-insensitively (``@ATTRIBUTE`` / ``@DATA`` are
    accepted).  Nothing is written for the data section if the file has no
    ``@data`` line.
    """
    # Read through a context manager so the input handle is always closed.
    with open(arff_file) as arff:
        arff_lines = arff.readlines()

    column_names = []
    csv_lines = []
    in_data_section = False
    for line in arff_lines:
        if in_data_section:
            csv_lines.append(line)
            continue

        tokens = line.split()
        lowered_tokens = [token.lower() for token in tokens]
        if "@attribute" in lowered_tokens:
            # The attribute name is the token right after the keyword.
            column_names.append(tokens[lowered_tokens.index("@attribute") + 1])
        elif "@data" in line.lower():
            in_data_section = True
            csv_lines.append(",".join(column_names) + '\n')

    with open(file_name+".csv", "w") as outFile:
        outFile.writelines(csv_lines)


# Give the paths of every file and folder in the ZIP file
def read_zip_files(zip_file_name):
    """Extract ``zip_file_name`` into the current working directory and
    return the list of member paths stored in the archive."""
    with ZipFile(zip_file_name, mode='r') as archive:
        archive.extractall()
        # Member names exactly as recorded in the archive (files and folders).
        member_paths = archive.namelist()
    return member_paths


# Extract the names of all the files given the path of every file in the ZIP
def extract_file_names(list_Of_File_Names):
    """Collect the file names found at the third path level and convert each
    of those files from ARFF to CSV.

    For every path in ``list_Of_File_Names`` that has a non-empty third
    ``/``-separated component, that component is appended to the result and
    ``convert_arff_to_csv(path, component)`` is called.  Shorter paths and
    directory entries (trailing ``/``) are ignored.

    Returns the list of collected names, in input order.
    """
    list_of_files = []
    for member_path in list_Of_File_Names:
        names = member_path.split("/")
        if len(names) > 2 and names[2] != '':
            list_of_files.append(names[2])
            # Use the path from the argument itself; the previous code read a
            # global list that is not defined at module level.
            convert_arff_to_csv(member_path, names[2])
    return list_of_files


# Function importing Dataset
def importdata(trainFile, testFile):
    """Read ``trainFile + ".csv"`` and ``testFile + ".csv"`` with pandas and
    return the two frames as ``(train, test)``."""
    csv_paths = (trainFile + ".csv", testFile + ".csv")
    train, test = (pd.read_csv(path) for path in csv_paths)
    return train, test



# Put all the x_tests into a variable
def x_test_into_a_variable(test,train):
    """Run ``splitdataset(train, test)`` and regroup its output.

    Note the parameter order: ``test`` comes first, ``train`` second.

    Returns ``(y_train, y_test, x_tests)`` where ``x_tests`` holds three
    ``[train_features, test_features]`` pairs: raw, min-max scaled and
    standardised, in that order.
    """
    (raw_train, raw_test, y_train, y_test,
     minmax_train, minmax_test, std_train, std_test) = splitdataset(train, test)
    feature_pairs = ((raw_train, raw_test), (minmax_train, minmax_test), (std_train, std_test))
    x_tests = [list(pair) for pair in feature_pairs]
    return y_train, y_test, x_tests


# classify data
def classify_data(classifier, auc_array, x_train, y_train, x_test, y_test):
    """Fit ``classifier``, score ``x_test`` and record the resulting AUC.

    The AUC of the scores against ``y_test`` is appended to ``auc_array``
    (mutated in place and also returned); an AUC below 0.5 is stored as
    ``1 - auc``.
    """
    classifier.fit(x_train, y_train)
    test_scores = classifier.score_samples(x_test)
    auc = roc_auc_score(y_test, test_scores)
    if auc < 0.5:
        # Mirror values below chance level.
        auc = 1 - auc
    auc_array.append(auc)
    return auc_array


# Obtain AUC Score according to classifier
def classifier_auc(classifier, train, test, auc_array, normalize):
    """Evaluate ``classifier`` on one train/test pair with the requested
    feature normalisation and return the updated ``auc_array``.

    Parameters
    ----------
    normalize : str
        ``'No'`` (raw features), ``'MinMax'`` or ``'Std'``.

    Raises
    ------
    ValueError
        If ``normalize`` is none of the three accepted values.  The check
        happens before any data is processed; returning ``None`` here (as a
        printed message would) silently breaks the caller's AUC accumulator.
    """
    if normalize not in ('No', 'MinMax', 'Std'):
        raise ValueError("Error: No normalization was indicated "
                         "(expected 'No', 'MinMax' or 'Std', got %r)" % (normalize,))

    X_train, X_test, y_train, y_test, X_train_minmax, X_test_minmax, X_train_std, X_test_std = splitdataset(train, test)
    feature_sets = {
        'No': (X_train, X_test),
        'MinMax': (X_train_minmax, X_test_minmax),
        'Std': (X_train_std, X_test_std),
    }
    chosen_train, chosen_test = feature_sets[normalize]
    return classify_data(classifier, auc_array, chosen_train, y_train, chosen_test, y_test)


# Obtain dataset with AUC results
def obtain_dataframe_with_auc_results(normalize, list_of_files, dis_measure):
    """Build a frame of AUC values with one column per classifier.

    ``list_of_files`` is consumed two entries at a time as
    ``(train_name, test_name)``; each pair is loaded with ``importdata`` and
    evaluated with ``classifier_auc`` under the ``normalize`` mode.
    ``dis_measure`` is handed to the BRM classifier as its dissimilarity.
    The same estimator object is reused (re-fitted) for every pair.
    """
    estimators = (
        ('BRM', BRM_dissimilarities(dissimilarity_meassure = dis_measure)),
        ('GMM', GaussianMixture()),
        ('ISOF', IsolationForest()),
        ('ocSVM', OneClassSVM()),
    )
    auc_results = pd.DataFrame(columns = ['BRM', 'GMM','ISOF', 'ocSVM'])
    for column_name, estimator in estimators:
        column_scores = []
        train_pos = 0
        while train_pos < len(list_of_files):
            train, test = importdata(list_of_files[train_pos], list_of_files[train_pos + 1])
            column_scores = classifier_auc(estimator, train, test, column_scores, normalize)
            train_pos += 2
        auc_results[column_name] = column_scores
    return auc_results

In [None]:
# create boxplot    
def box_plot_creation(list_of_datasets, datasets_names):
    """Draw three side-by-side box plots, one panel per entry of
    ``list_of_datasets`` (the first three entries are used).

    Every panel gets one box per column of its dataset; the tick labels are
    the four classifier names, the panel title comes from ``datasets_names``
    and the x-label is the panel letter ``a)``, ``b)`` or ``c)``.
    """
    figure, axes = plt.subplots(nrows=1, ncols=3, figsize=(20,6))
    axs = [axes[0], axes[1], axes[2]]

    bp = []
    for position, ax in enumerate(axs):
        frame = pd.DataFrame(list_of_datasets[position])
        # Hand matplotlib one 1-D array per column explicitly.  How a 2-D
        # DataFrame (or its transpose) is split into boxes differs between
        # matplotlib versions; a list of arrays always yields one box each.
        per_column_values = [frame.iloc[:, col].to_numpy() for col in range(frame.shape[1])]
        bp.append(ax.boxplot(per_column_values, vert=True, patch_artist=True))

    colors = ['tab:blue', 'tab:orange', 'tab:green','tab:red']
    for bplot in bp:
        for patch, color in zip(bplot['boxes'], colors):
            patch.set_facecolor(color)
            patch.set(linewidth=1.5)
        for whisker in bplot['whiskers']:
            whisker.set(color ='k',
                        linewidth = 2,
                        linestyle =":")
        for median in bplot['medians']:
            median.set(color ='k',
                      linewidth = 1.5)

    x_names = ['a)','b)','c)']
    for norm, ax, x_n in zip(datasets_names, axs, x_names):
        ax.set_xticklabels(['BRM', 'GMM', 'ISOF', 'ocSVM'], fontsize=16)
        ax.set_title(norm, fontsize = 16)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel(x_n, fontsize=20)


# Make the Friedman Test
def friedman_test(list_of_datasets):
    """Run ``friedmanchisquare`` on the BRM, ISOF, ocSVM and GMM columns of
    every frame and return the results as a list (one entry per frame)."""
    column_order = ('BRM', 'ISOF', 'ocSVM', 'GMM')
    return [
        friedmanchisquare(*[df[column] for column in column_order])
        for df in list_of_datasets
    ]


# Dunn and Conover post hoc tests
def post_hoc_tests(dataframe):
    """Melt ``dataframe`` into long form (``criteria`` / ``score`` columns) and
    return ``(dunn, conn)``: the result of ``sp.posthoc_dunn`` with Bonferroni
    adjustment and the result of ``sp.posthoc_conover``."""
    long_scores = pd.melt(dataframe, var_name='criteria', value_name='score')
    column_roles = {'val_col': 'score', 'group_col': 'criteria'}
    dunn = sp.posthoc_dunn(long_scores, p_adjust='bonferroni', **column_roles)
    conn = sp.posthoc_conover(long_scores, **column_roles)
    return dunn, conn


# Make the post hoc tests
def plot_post_hoc_tests(list_of_datasets):
    """For every frame: open a new figure, run ``post_hoc_tests``, display both
    result tables and draw the significance plot of the Conover result."""
    for scores in list_of_datasets:
        plt.figure()
        dunn, conn = post_hoc_tests(scores)
        # Styling options forwarded to sp.sign_plot; rebuilt for every frame.
        heatmap_args = dict(
            linewidths=1,
            linecolor='0',
            clip_on=False,
            square=True,
            cbar_ax_bbox=[0.80, 0.35, 0.04, 0.3],
        )
        display(dunn, conn)
        sp.sign_plot(conn, **heatmap_args)


# Critical Difference Diagram
def cd_diagram(dataframes):
    """Print the critical difference and draw a CD diagram for every frame.

    Each frame is expected to hold one row per dataset and one column per
    classifier, with higher values meaning better performance.  Classifiers
    are ranked within every row (rank 1 = highest value) and the ranks are
    averaged per column; those average ranks and the number of rows are what
    ``Orange.evaluation.compute_CD`` / ``graph_ranks`` are called with.
    The first column is used as the control method (``cdmethod=0``).
    """
    for df in dataframes:
        # Average rank of every classifier over all datasets.  The CD functions
        # take average ranks; post hoc p-values must not be passed in their place.
        average_ranks = df.rank(axis=1, ascending=False).mean(axis=0)
        classifier_names = list(average_ranks.index)
        rank_values = [float(rank) for rank in average_ranks]

        # Number of datasets is taken from the frame instead of being fixed.
        # alpha is passed as a string, as in the original call.
        cd = Orange.evaluation.compute_CD(rank_values, len(df), alpha='0.05', test='bonferroni-dunn')
        print('cd=',cd)
        Orange.evaluation.graph_ranks(rank_values, classifier_names, cd=cd, width=5, textspace=1.5, cdmethod=0)
        plt.show()

In [None]:
def _run_auc_comparison(zip_file, experiments):
    """Shared pipeline of both experiments.

    Extracts ``zip_file``, converts its ARFF files, computes one AUC frame per
    ``(title, normalize, dissimilarity)`` entry of ``experiments`` and then
    produces the box plots, the Friedman test output, the post hoc plots and
    the critical difference diagrams for those frames.
    """
    member_paths = read_zip_files(zip_file)
    list_of_files = extract_file_names(member_paths)

    dataframes = [
        obtain_dataframe_with_auc_results(normalize, list_of_files, dissimilarity)
        for _, normalize, dissimilarity in experiments
    ]
    datasets_names = [title for title, _, _ in experiments]

    box_plot_creation(dataframes, datasets_names)
    fried = friedman_test(dataframes)
    display(fried)
    plot_post_hoc_tests(dataframes)
    cd_diagram(dataframes)


def normalizing_datasets(zip_file='Unsupervised_Anomaly_Detection.zip'):
    """Compare the classifiers without normalisation, with min-max scaling and
    with standardisation (Euclidean dissimilarity throughout).

    ``zip_file`` is the archive holding the datasets; it defaults to the file
    name the notebook has always used.
    """
    _run_auc_comparison(zip_file, [
        ('Without Normalization', 'No', euclidean_distances),
        ('Min-Max Normalization', 'MinMax', euclidean_distances),
        ('Std Normalizing', 'Std', euclidean_distances),
    ])

def different_dissimilarity_measures(zip_file='Unsupervised_Anomaly_Detection.zip'):
    """Compare the classifiers on raw features while the BRM dissimilarity is
    Euclidean, Manhattan or cosine.

    ``zip_file`` is the archive holding the datasets; it defaults to the file
    name the notebook has always used.
    """
    _run_auc_comparison(zip_file, [
        ('Euclidean Distance', 'No', euclidean_distances),
        ('Manhattan Distance', 'No', manhattan_distances),
        ('Cosine Distance', 'No', cosine_distances),
    ])

In [None]:
# Run the normalisation experiment: no scaling vs. min-max vs. standardisation (Euclidean dissimilarity).
normalizing_datasets()

In [None]:
# Run the dissimilarity experiment: Euclidean vs. Manhattan vs. cosine on unnormalised features.
different_dissimilarity_measures()