# Caderno para criação dos gráficos de distribuição de métricas e ganhos de composição

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from timer import timer
from tqdm import tqdm

# Threshold for the first-stage metric: the "limiar" plots only keep first-step
# classifiers whose baseline median reaches this value.
threshold = 0.9 
tables = ['table2.csv','table3.csv'] # Input files generated by the Detector.ipynb notebook

Função para gráfico de distribuição de escores

In [2]:
def plote(data:dict,metric:str,title:str,show:bool=True,path="",save=True,file:str=None,figsize:tuple=(10,6)):
    """Draw one box plot per entry of ``data`` and print each median above its box.

    Args:
        data: Mapping of tick label -> sequence of values; one box is drawn per entry.
        metric: Metric name; capitalized for the y-axis label and used in the
            default file name.
        title: Figure title; also used in the default file name.
        show: When true, call ``plt.show()`` before closing the figure.
        path: Prefix concatenated (plain string concatenation) in front of the
            file name.
        save: When true, write the figure to disk.
        file: Output file name; when ``None`` the name is
            ``title + '_' + metric + '.png'``.
        figsize: Figure size forwarded to ``plt.subplots``.
    """
    n_series = len(data)
    positions = np.arange(1, n_series + 1)

    fig, ax = plt.subplots(figsize=figsize)
    box_artists = ax.boxplot(list(data.values()))
    ax.set_xticks(positions, labels=data.keys())
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
    ax.set_ylabel(metric.capitalize())
    ax.set_title(title, y=1.075)
    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)

    # The median of each box is read back from the line artist boxplot() drew for it.
    median_values = np.array(
        [box_artists['medians'][k].get_ydata()[0] for k in range(n_series)],
        dtype=float,
    )
    # x is in data coordinates, y in axes coordinates: 1.01 sits just above the plot area.
    for x_pos, value in zip(positions, median_values):
        ax.text(x_pos, 1.01, str(round(value, 3)), rotation=30,
                transform=ax.get_xaxis_transform(),
                horizontalalignment='center')
    ax.text(ax.get_xlim()[0], 1.01, "Medians:",
            transform=ax.get_xaxis_transform(), horizontalalignment='right')

    if save:
        target = file if file is not None else title + '_' + metric + '.png'
        fig.savefig(path + target, bbox_inches='tight')
    if show:
        plt.show()
    plt.close(fig)

Função para gráfico de barras da melhoria em valores absolutos

In [3]:
# Absolute improvement
def melhoria_absoluta(df_var,metric:str,title:str,file:str,path:str):
    """Bar chart of the largest absolute median gain obtained by composition.

    ``df_var`` holds one column per first-step classifier and one row per
    second-step option; every cell is a list of metric values. The first row is
    taken as the baseline. For each column the bar height is the largest
    positive difference between a row median and the baseline median; columns
    without any positive gain get no bar. The figure is written to
    ``path + "melhoria_" + file`` only when at least one bar was drawn.
    """
    def cell_median(row_label, col_label):
        # Each cell stores a list of scores; reduce it to its median.
        return pd.Series(df_var.loc[row_label, col_label]).median()

    fig, ax = plt.subplots(1)
    drew_any = False
    for clf in df_var.columns:
        reference = cell_median(df_var.index[0], clf)
        best_gain = 0
        for row_label in df_var.index[1:]:
            gain = cell_median(row_label, clf) - reference
            if gain > best_gain:
                best_gain = gain
        if best_gain == 0:
            continue
        plt.bar(clf, best_gain, fill=False)
        drew_any = True

    if drew_any:
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
        ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
        plt.title("Melhoria pela composição\n" + title)
        plt.ylabel("Acréscimo na mediana - " + metric)
        fig.savefig(path + "melhoria_" + file, bbox_inches='tight')
    plt.close(fig)

Função para gráfico de barras da melhoria percentual

In [4]:
# Percentage improvement
def melhoria_percentual(df_var,data,metric,title,file,path):
    """Bar chart of the best relative median gain per first-step classifier.

    Args:
        df_var: One column per first-step classifier, one row per second-step
            option; every cell is a list of metric values. The first row is
            the baseline.
        data: Rows with at least the columns ``Classifier_1s``,
            ``Classifier_2s``, ``FP`` and ``FN``.
        metric: Metric name used in the y-axis label.
        title: Text appended to the figure title.
        file: File name suffix; the figure goes to ``path + "melhoriapp_" + file``.
        path: Prefix concatenated in front of the file name.

    For every classifier the second step with the highest gain over the
    baseline median is selected; ties are broken in favour of the lower FP
    median. The gain is a percentage of the baseline median, or the raw
    difference (marked with ``*``) when the baseline median is 0. Classifiers
    without a positive gain are skipped, and nothing is saved when no bar was
    drawn.

    A table above the plot lists, per bar: the gain, the FP change in percent,
    the FP medians (baseline→selected), the FN reduction in percent and the FN
    medians. Bars are cyan when the gain or the FP change has no percentage
    (zero or missing baseline), green when the gain is at least as large as
    the FP increase, and red otherwise.

    FP/FN medians are computed once with a single groupby instead of one
    string-built ``query`` per classifier pair, which also makes classifier
    names containing quotes safe. Missing medians are rendered as ``-``
    instead of raising on ``int(nan)``.
    """
    nan = float('nan')
    pair_groups = data.groupby(['Classifier_1s', 'Classifier_2s'])
    fp_median = pair_groups['FP'].median().to_dict()   # keys: (first step, second step)
    fn_median = pair_groups['FN'].median().to_dict()

    def list_median(row_label, col_label):
        # Each df_var cell stores a list of scores; reduce it to its median.
        return pd.Series(df_var.loc[row_label, col_label]).median()

    def relative_change(base, delta):
        # Percentage of `delta` over `base`; None when it cannot be computed.
        if base == 0.0 or pd.isna(base) or pd.isna(delta):
            return None
        return delta * 100 / base

    def pct_text(value):
        if value is None:
            return '-'
        if value >= 1000:
            return '%5.2fk' % (value / 1000)
        return '%1.2f' % value

    def count_text(value):
        return '-' if pd.isna(value) else str(int(value))

    fig, ax = plt.subplots(1, figsize=(10, 6))
    cell_text = [[], [], [], [], []]
    hasplot = False
    for clf in df_var.columns:
        m0 = list_median(df_var.index[0], clf)
        absolute = m0 == 0.0   # a zero baseline has no percentage: report the raw gain
        best_gain, best_step, best_fp = 0, "1step_only", 9999999
        for step in df_var.index[1:]:
            gain = list_median(step, clf) - m0
            if not absolute:
                gain = gain * 100 / m0
            step_fp = fp_median.get((clf, step), nan)
            # Higher gain wins; on equal gain prefer the step with fewer false positives.
            if gain > best_gain or (gain == best_gain and best_fp > step_fp):
                best_gain, best_step, best_fp = gain, step, step_fp
        if best_gain == 0:
            continue

        fp0 = fp_median.get((clf, "1step_only"), nan)
        fn0 = fn_median.get((clf, "1step_only"), nan)
        fp1 = fp_median.get((clf, best_step), nan)
        fn1 = fn_median.get((clf, best_step), nan)
        fp_change = relative_change(fp0, fp1 - fp0)   # growth of false positives
        fn_change = relative_change(fn0, fn0 - fn1)   # reduction of false negatives

        cell_text[0].append(('*' if absolute else '') + '%1.2f' % best_gain)
        cell_text[1].append(pct_text(fp_change))
        cell_text[2].append(count_text(fp0) + "→" + count_text(fp1))
        cell_text[3].append(pct_text(fn_change))
        cell_text[4].append(count_text(fn0) + "→" + count_text(fn1))

        if absolute or fp_change is None:
            color = 'c'
        elif best_gain >= fp_change:
            color = 'g'
        else:
            color = 'r'
        ax.bar(clf, best_gain, fill=True, color=color)
        hasplot = True

    if hasplot:
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
        ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
        ax.set_ylabel("% " + metric)
        # The title is lifted to leave room for the table placed on top of the axes.
        ax.set_title("Melhoria percentual\n" + title, y=1.32)
        table = ax.table(cellText=cell_text,
                         rowLabels=["Metric Gain (%)", "%FP", "FP", "%FN", "FN"],
                         loc='top')
        table.scale(1, 1.8)
        table.auto_set_font_size(False)
        table.set_fontsize(6.45)
        for cell in table.get_celld().values():
            cell.set_text_props(va='center_baseline')
        # Near-zero x margin; presumably keeps the bars under their table columns.
        ax.margins(x=0.0051)
        fig.savefig(path + "melhoriapp_" + file, bbox_inches='tight')
    plt.close(fig)

Função para gráfico de barras da melhoria absoluta com restrição do limiar

In [5]:
# Improvement restricted by the threshold
def melhoria_limiar(df_var,data_m1s,threshold,metric,title,file,path):
    """Bar chart of the largest absolute median gain, for classifiers above a threshold.

    Args:
        df_var: One column per first-step classifier, one row per second-step
            option; every cell is a list of metric values. The first row is
            the baseline.
        data_m1s: Baseline scores with one column per first-step classifier;
            a classifier is kept when its column median is >= ``threshold``.
        threshold: Minimum baseline median required for a classifier.
        metric: Metric name used in the y-axis label.
        title: Text appended to the figure title.
        file: File name suffix; the figure goes to ``path + "melhoria_al_" + file``.
        path: Prefix concatenated in front of the file name.

    Classifiers of ``df_var`` that have no column in ``data_m1s`` are treated
    as below the threshold (the mask is aligned to ``df_var.columns`` first;
    passing the unaligned mask to ``.loc`` raised an IndexingError). Nothing
    is saved when no classifier shows a positive gain.
    """
    def cell_median(row_label, col_label):
        # Each cell stores a list of scores; reduce it to its median.
        return pd.Series(df_var.loc[row_label, col_label]).median()

    above = (data_m1s.median() >= threshold).reindex(df_var.columns, fill_value=False)
    selected = df_var.columns[above.to_numpy(dtype=bool)]

    fig, ax = plt.subplots(1)
    hasplot = False
    for clf in selected:
        reference = cell_median(df_var.index[0], clf)
        best_gain = 0
        for row_label in df_var.index[1:]:
            gain = cell_median(row_label, clf) - reference
            if gain > best_gain:
                best_gain = gain
        if best_gain != 0:
            ax.bar(clf, best_gain, fill=False)
            hasplot = True

    if hasplot:
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
        ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
        ax.set_title("Melhoria pela composição - limiar\n" + title)
        ax.set_ylabel("Acréscimo na mediana - " + metric)
        fig.savefig(path + "melhoria_al_" + file, bbox_inches='tight')
    plt.close(fig)

Função para gráfico de barras da melhoria percentual com restrição de limiar

In [6]:
# Percentage improvement restricted by the threshold
def melhoria_percentual_limiar(df_var,data_m1s,data,metric,title,file,path,limiar=None):
    """Bar chart of the best relative median gain, for classifiers above a threshold.

    Args:
        df_var: One column per first-step classifier, one row per second-step
            option; every cell is a list of metric values. The first row is
            the baseline.
        data_m1s: Baseline scores with one column per first-step classifier;
            a classifier is kept when its column median is >= the threshold.
        data: Rows with at least the columns ``Classifier_1s``,
            ``Classifier_2s``, ``FP`` and ``FN``.
        metric: Metric name used in the y-axis label.
        title: Text appended to the figure title.
        file: File name suffix; the figure goes to ``path + "melhoria_pl_" + file``.
        path: Prefix concatenated in front of the file name.
        limiar: Minimum baseline median. When ``None`` the notebook-level
            ``threshold`` is used, which is what this function always did
            before the parameter existed.

    Returns immediately, without creating a figure, when no classifier reaches
    the threshold. Classifiers of ``df_var`` that have no column in
    ``data_m1s`` are treated as below the threshold.

    For every kept classifier the second step with the highest gain over the
    baseline median is selected; ties are broken in favour of the lower FP
    median. The gain is a percentage of the baseline median, or the raw
    difference (marked with ``*``) when the baseline median is 0. A table
    above the plot lists the gain, the FP change in percent, the FP medians
    (baseline→selected), the FN reduction in percent and the FN medians. Bars
    are cyan when the gain or the FP change has no percentage, green when the
    gain is at least as large as the FP increase, and red otherwise.

    FP/FN medians are computed once with a single groupby instead of one
    string-built ``query`` per classifier pair; missing medians are rendered
    as ``-`` instead of raising on ``int(nan)``.
    """
    if limiar is None:
        limiar = threshold
    nan = float('nan')

    above = (data_m1s.median() >= limiar).reindex(df_var.columns, fill_value=False)
    selected = df_var.columns[above.to_numpy(dtype=bool)]
    if selected.size == 0:
        return
    # Figure width grows with the number of bars, never below 6 inches
    # (presumably calibrated so that 19 classifiers give an 11-inch figure).
    width = max(6, round(selected.size * 11 / 19, 1))

    pair_groups = data.groupby(['Classifier_1s', 'Classifier_2s'])
    fp_median = pair_groups['FP'].median().to_dict()   # keys: (first step, second step)
    fn_median = pair_groups['FN'].median().to_dict()

    def list_median(row_label, col_label):
        # Each df_var cell stores a list of scores; reduce it to its median.
        return pd.Series(df_var.loc[row_label, col_label]).median()

    def relative_change(base, delta):
        # Percentage of `delta` over `base`; None when it cannot be computed.
        if base == 0.0 or pd.isna(base) or pd.isna(delta):
            return None
        return delta * 100 / base

    def pct_text(value):
        if value is None:
            return '-'
        if value >= 1000:
            return '%5.2fk' % (value / 1000)
        return '%1.2f' % value

    def count_text(value):
        return '-' if pd.isna(value) else str(int(value))

    fig, ax = plt.subplots(1, figsize=(width, 6))
    cell_text = [[], [], [], [], []]
    hasplot = False
    for clf in selected:
        m0 = list_median(df_var.index[0], clf)
        absolute = m0 == 0.0   # a zero baseline has no percentage: report the raw gain
        best_gain, best_step, best_fp = 0, "1step_only", 9999999
        for step in df_var.index[1:]:
            gain = list_median(step, clf) - m0
            if not absolute:
                gain = gain * 100 / m0
            step_fp = fp_median.get((clf, step), nan)
            # Higher gain wins; on equal gain prefer the step with fewer false positives.
            if gain > best_gain or (gain == best_gain and best_fp > step_fp):
                best_gain, best_step, best_fp = gain, step, step_fp
        if best_gain == 0:
            continue

        fp0 = fp_median.get((clf, "1step_only"), nan)
        fn0 = fn_median.get((clf, "1step_only"), nan)
        fp1 = fp_median.get((clf, best_step), nan)
        fn1 = fn_median.get((clf, best_step), nan)
        fp_change = relative_change(fp0, fp1 - fp0)   # growth of false positives
        fn_change = relative_change(fn0, fn0 - fn1)   # reduction of false negatives

        cell_text[0].append(('*' if absolute else '') + '%1.2f' % best_gain)
        cell_text[1].append(pct_text(fp_change))
        cell_text[2].append(count_text(fp0) + "→" + count_text(fp1))
        cell_text[3].append(pct_text(fn_change))
        cell_text[4].append(count_text(fn0) + "→" + count_text(fn1))

        if absolute or fp_change is None:
            color = 'c'
        elif best_gain >= fp_change:
            color = 'g'
        else:
            color = 'r'
        ax.bar(clf, best_gain, fill=True, color=color)
        hasplot = True

    if hasplot:
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
        ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
        ax.set_ylabel("% " + metric)
        # The title is lifted to leave room for the table placed on top of the axes.
        ax.set_title("Melhoria percentual - limiar\n" + title, y=1.32)
        # Every drawn bar appended one entry to each row, so the table is never empty here.
        table = ax.table(cellText=cell_text,
                         rowLabels=["Metric Gain (%)", "%FP", "FP", "%FN", "FN"],
                         loc='top')
        table.scale(1, 1.8)
        table.auto_set_font_size(False)
        table.set_fontsize(6.45)
        for cell in table.get_celld().values():
            cell.set_text_props(va='center_baseline')
        # Near-zero x margin; presumably keeps the bars under their table columns.
        ax.margins(x=0.0051)
        fig.savefig(path + "melhoria_pl_" + file, bbox_inches='tight')
    plt.close(fig)

## Carga dos arquivos e batch para geração dos gráficos

In [None]:
# Batch driver: for every input table and every distinct experiment
# configuration found in it, write the box plots and the four improvement
# charts for each metric under composition/<metric>/<dataset>/<n_samples>/.
tic = time()
metrics = ['Accuracy','Recall','F1','AUC']
for table in tables:
    df = pd.read_csv(table)
    # One row per distinct combination of the six configuration columns; after
    # reset_index() the grouping keys are the first six columns of `aux`.
    aux = df.groupby(['HAI','Files','Selector','N_Samples','Test_Size','Contamination']).count().reset_index()
    params = []
    for i in range(aux.shape[0]):
        params.append((aux.iloc[i,0],
                       aux.iloc[i,1],
                       aux.iloc[i,2],
                       aux.iloc[i,3],
                       aux.iloc[i,4],
                       aux.iloc[i,5]))
    del aux

    for param in tqdm(params):
        version = param[0]
        files = param[1]
        fselector = param[2]
        n_samples = str(param[3])
        test_size = str(param[4])
        contamination = str(param[5])
        dataset = '_'.join([version,files,fselector])
        title = dataset+'\n'+n_samples+' samples, test:'+test_size+' hit:'+contamination
        # Rows of this configuration only. The first three values are quoted as
        # strings, the last three are inserted unquoted.
        # NOTE(review): assumes N_Samples, Test_Size and Contamination are numeric columns - confirm.
        data = df.query("HAI == '"+version+
                        "' and Files == '"+files+
                        "' and Selector == '"+fselector+
                        "' and N_Samples == "+n_samples+
                        " and Test_Size == "+test_size+
                        " and Contamination == "+contamination)
        for metric in metrics:
            # Baseline (first step only) scores: one row per iteration, one column per first-step classifier.
            data_m1s = data.query("Classifier_2s == '1step_only'").pivot(index='Iteration',columns='Classifier_1s',values=metric)
            path = "composition/" + metric + "/" + dataset + "/" + n_samples + "/"
            # variations[first step][second step] -> list of metric values.
            variations = {}
            for s1clf in data['Classifier_1s'].unique():
                aux = {}
                # "1step_only" is inserted first so that it is the first key;
                # the improvement functions read the first row of df_var as the baseline.
                dataux = data.query("Classifier_1s == '"+s1clf+"' and Classifier_2s == '1step_only'")
                aux["1step_only"] = dataux[metric].to_list()
                for s2clf in data['Classifier_2s'].unique():
                    dataux = data.query("Classifier_1s == '"+s1clf+"' and Classifier_2s == '"+s2clf+"'")
                    aux[s2clf] = dataux[metric].to_list()
                variations[s1clf] = aux

                # '.png' is joined like any other part, so file names end in "_.png".
                file = '_'.join([dataset,n_samples,test_size,contamination,s1clf,metric,'.png'])
                # path[:-1] drops the trailing "/" before creating the directory.
                os.makedirs(path[:-1],exist_ok=True)
                plote(aux,metric=metric,title=s1clf+" "+title,show=False,path=path,file=file,figsize=(8,6))

            df_var = pd.DataFrame(variations)
            file = '_'.join([dataset,n_samples,test_size,contamination,metric,'.png'])
            # Absolute improvement
            melhoria_absoluta(df_var,metric,title,file,path)

            # Percentage improvement
            melhoria_percentual(df_var,data,metric,title,file,path)

            # Improvement restricted by the threshold
            melhoria_limiar(df_var,data_m1s,threshold,metric,title,file,path)

            # Percentage improvement restricted by the threshold
            melhoria_percentual_limiar(df_var,data_m1s,data,metric,title,file,path)
toc = time()
# Elapsed wall-clock seconds handed to timer(); presumably it reports them - confirm.
timer(toc-tic)

 84%|███████████████████████████████████████████████████████████████████▋             | 178/213 [1:20:13<15:06, 25.90s/it]