In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import os

# Notebook-wide display configuration.
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Large, high-DPI figures for the plots produced further down.
plt.rcParams['figure.dpi']=256
plt.rcParams['figure.figsize']=(13,6)

# Inject CSS forcing `.container` elements to 80% width
# (presumably the classic Notebook page container -- TODO confirm).
display(HTML("<style>.container { width:80% !important; }</style>"))

class PDF(object):
    """Display wrapper that embeds a PDF referenced by path or URL.

    Implements the rich-display hooks `_repr_html_` (an <iframe>) and
    `_repr_latex_` (an \\includegraphics command).
    """

    def __init__(self, pdf, size=(200,200)):
        """Store the PDF location and the iframe size.

        pdf  -- path or URL of the PDF file.
        size -- (width, height) pair used for the iframe attributes.
        """
        self.pdf = pdf
        self.size = size

    def _repr_html_(self):
        """Return an <iframe> element whose `src` points at the PDF.

        Attribute values are quoted and the path is HTML-escaped so that
        locations containing spaces, quotes or `&` still yield valid markup.
        """
        from html import escape  # local import keeps the cell self-contained
        return '<iframe src="{0}" width="{1[0]}" height="{1[1]}"></iframe>'.format(
            escape(str(self.pdf), quote=True), self.size)

    def _repr_latex_(self):
        """Return a LaTeX \\includegraphics command spanning the text width."""
        return r'\includegraphics[width=1.0\textwidth]{{{0}}}'.format(self.pdf)


In [9]:
#fpath = 'results/data_classified_v6/25-11-2020.csv'
# fpath = 'results/data_classified_v6/18-11-2020.csv'
#fpath = 'results/data_classified_v6/tmp.csv'


def extract_query_strategy(row):
    """Derive the query-strategy label from a row's 'classifier name'.

    Names containing 'alldata' map to 'alldata'. Otherwise the label is the
    text after the last '[' with every ']' removed and surrounding
    whitespace stripped.
    """
    full_name = row['classifier name']
    if 'alldata' not in full_name:
        bracket_tail = full_name.split('[')[-1]
        return bracket_tail.replace(']', '').strip()
    return 'alldata'

def base_classifier(row):
    """Return the first known base-classifier tag found in 'classifier name'.

    Tags are tried in the fixed order knn, QDA, DT, RF, NB using plain
    substring matching; None is returned when none of them occurs.
    """
    full_name = row['classifier name']
    known_tags = ('knn', 'QDA', 'DT', 'RF', 'NB')
    return next((tag for tag in known_tags if tag in full_name), None)

def space_col(row):
    """Label the feature space implied by a row's 'classifier name'.

    'triplet' is checked before 'convnet'; names containing neither marker
    are labelled 'hand-crafted'.
    """
    full_name = row['classifier name']
    for marker in ('triplet', 'convnet'):
        if marker in full_name:
            return marker
    return 'hand-crafted'

# Single-entry cache: (path of the last file loaded, its processed DataFrame).
CACHED_DF=("",None)
def loadDataResults(fpath):
    """Load a results CSV and add the derived columns used by the plots.

    Parameters
    ----------
    fpath : str
        Path of the CSV file; it must provide the 'classifier name' and
        'train size' columns.

    Returns
    -------
    pandas.DataFrame
        The file contents plus 'query strategy', 'base_classifier' and
        'space' columns, with 'classifier name' reduced to the text before
        the first '[' (and any '(alldata)' tag removed) and 'train size'
        shifted by 8. Calling again with the same path returns the same
        cached object, so callers should not mutate it.
    """
    global CACHED_DF
    if(fpath==CACHED_DF[0]):
        return CACHED_DF[1]
    df = pd.read_csv(fpath)

    # Derived columns are computed first: they parse the raw classifier
    # name, which is simplified further down.
    df['query strategy'] = df.apply(extract_query_strategy, axis=1)
    df['base_classifier']=df.apply(base_classifier, axis=1)
    df['space'] = df.apply(space_col, axis=1)

    df['classifier name'] = df['classifier name'].str.split('[').str[0].str.strip()
    # Literal replacement: the previous pattern '\(alldata\)' only matched
    # in regex mode, which is no longer the default in pandas >= 2.0, and
    # was an invalid escape sequence in a non-raw string.
    df['classifier name'] = df['classifier name'].str.replace('(alldata)', '', regex=False).str.strip()

    # NOTE(review): unexplained +8 offset on the training-set size. The
    # original carried a commented-out guard restricting it to
    # 'results/data_classified_v6/25-11-2020.csv' -- confirm whether it
    # should really apply to every results file.
    df['train size']+=8
    CACHED_DF=(fpath,df)
    return df

#df=loadDataResults(fpath)
#df
#df[(df['value']>=0.9) & (df['metric name']=='f1_macro') & (df['train size']>=950)]
#df[df['classifier name'].str.split(' ').str[0]=='convnet']

In [11]:
def interact_results(methods, dataset_path, query_strategy='entropy', metric='f1_macro'):
    """Plot `metric` against training-set size for the selected methods.

    Parameters
    ----------
    methods : iterable of str (or a single str)
        Classifier names to include, compared against the stripped
        'classifier name' column.
    dataset_path : str
        Results file handed to `loadDataResults`.
    query_strategy : iterable of str (or a single str)
        Query strategies to draw as curves.
    metric : str
        Value of the 'metric name' column to plot.

    Rows of the selected methods whose query strategy is 'alldata' are
    drawn as dotted horizontal reference lines across the plotted range.
    """
    # A bare string (including the default 'entropy') would make
    # Series.isin raise, so single names are promoted to one-element lists.
    if isinstance(methods, str):
        methods = [methods]
    if isinstance(query_strategy, str):
        query_strategy = [query_strategy]

    df=loadDataResults(dataset_path)
    base_mask = (df['metric name']==metric) & df['classifier name'].str.strip().isin(methods)

    df_alldata = df[base_mask & (df['query strategy']=='alldata')]
    df_selected = df[base_mask & df['query strategy'].isin(query_strategy)]

    # Colour by classifier when several are shown, otherwise by strategy.
    n_cname = len(df_selected['classifier name'].unique())
    hue = 'classifier name' if n_cname>1 else 'query strategy'
    # The line style always encodes the query strategy; the earlier
    # branching on classifier/space counts was unconditionally overwritten
    # and has been removed.
    style = 'query strategy'

    min_train_size = df_selected['train size'].min()
    max_train_size = df_selected['train size'].max()

    for v in df_alldata['value']:
        plt.plot([min_train_size,max_train_size],[v,v], linestyle=':', linewidth=3)
    sns.lineplot(data=df_selected,x='train size',y='value', hue=hue, marker='o', style=style);
    plt.ylabel(metric)
    plt.xticks(df_selected['train size'].unique())
    plt.ylim([0.0,1.0]);


results_dir='results/data_classified_v6'

# Sorted so the dropdown order does not depend on the arbitrary order
# returned by os.listdir.
result_files = sorted(results_dir+'/'+f for f in os.listdir(results_dir))

# Preselect the 25-11-2020 run when it exists; otherwise fall back to the
# first available file, because Dropdown rejects a value that is not one of
# its options.
default_results_file = results_dir+'/'+'25-11-2020.csv'
if default_results_file not in result_files:
    default_results_file = result_files[0] if result_files else None

# Selector for the results file to analyse.
widget_dataset = widgets.Dropdown(
    options=result_files,
    value=default_results_file,
    description='Results file:',
    disabled=False,
    layout=widgets.Layout(width='30%')
)


# Multi-selects created empty; their options are assigned later in this cell.
widget_methods = widgets.SelectMultiple(
    value=[],
    description='Methods:',
    disabled=False,
    layout=widgets.Layout(width='30%', height='120px')
)

widget_querystrat = widgets.SelectMultiple(
    value=[],
    description='Query Strategy:',
    disabled=False
)

def update_widgets(*args):
    """Refresh the method and query-strategy choices from the selected file.

    Used as an observer callback (hence *args) and also called directly
    once to populate the widgets.
    """
    df=loadDataResults(widget_dataset.value)
    widget_methods.options = list(df['classifier name'].str.strip().unique())
    widget_querystrat.options = list(df['query strategy'].unique())

# Re-populate the selectors whenever another results file is chosen.
widget_dataset.observe(update_widgets, 'value')
update_widgets()

# The metric choices are read from the selected results file rather than
# from a global `df`, which no cell defines on a fresh kernel.
interact(interact_results,
         methods=widget_methods,
         dataset_path=widget_dataset,
         query_strategy=widget_querystrat,
         metric=list(loadDataResults(widget_dataset.value)['metric name'].unique()));

interactive(children=(SelectMultiple(description='Methods:', layout=Layout(height='120px', width='30%'), optio…

$\newcommand{\PP}{\mathbf{P}}$
$\newcommand{\xx}{\mathbf{x}}$

$p_i=\PP(y_i=1|\xx)$

-  Quatro query functions (três baseadas em incerteza e uma aleatória): 
   -  Top Margin: $1-\max_i p_i$
   -  1-2 Margin: $\max_i(p_i) - \max^{(2)}_i (p_i)$
   -  Entropy:  $-\sum_i p_i \log(p_i)$
   -  Random.

# 19/11/2020
### Configuração experimental

-  5600 exemplos (v6)
-  Base de treino inicial (Estratificada): 100
-  Query size: 50
-  Budget: +900
-  Exemplos de teste: 1835
-  Hand-crafted space: 8 Features do ICTAI2016.

### Observações

-  Os classificadores no hand-crafted space já começam alto, em especial o DT e RF. 
-  Triplet-net com knn ou RF foram bons.
-  O triplet-net com DT ficou bom no final.
-  A convergência é bem rápida para os métodos no hand-crafted space. 
-  Parece que três deles convergem para um mesmo ponto, mas não é exatamente o mesmo ponto.

# 26/11/2020
### Configuração experimental

-  5616 exemplos (v6)
-  Base de treino inicial: 40 (5 de cada defeito e 20 normais)
-  Query size: 15
-  Budget: +420
-  Exemplos de teste: 1855
-  Hand-crafted space: 8 Features do ICTAI2016.

### Observações
-  Os métodos por triplet loss conseguiram alcançar um desempenho bem mais rápido, comparado aos experimentos de 19/11/2020.
-  MC dropout é muito bom! (ele usa uma média de 20 predições)
-  Qualquer query strategy é melhor do que o random em quase todos os casos.
-  O tripletnet é horrível no início porque ele obtém precisão ou recall 0 em certas classes.

In [22]:
from scipy.special import logit

# Bind `df` explicitly so this cell runs on a fresh kernel; previously it
# relied on a global `df` that no cell defines. A copy is taken so the
# extra column below does not leak into the frame cached by
# loadDataResults.
# NOTE(review): assumes the analysis below targets the results file
# currently selected in `widget_dataset` -- confirm.
df = loadDataResults(widget_dataset.value).copy()
df['value_logit'] = logit(df['value'])

def interact_results(method, metric='f1_macro', base_classifier='ALL'):
    """Plot `metric` against training-set size from the global `df`.

    Parameters
    ----------
    method : iterable of str (or a single str)
        Leading tokens of 'classifier name' (text before the first space)
        to include.
    metric : str
        Value of the 'metric name' column to plot.
    base_classifier : str or None
        'ALL' keeps every row; None keeps rows with no base classifier;
        any other value keeps rows whose 'base_classifier' equals it.

    NOTE(review): this definition shadows the earlier
    interact_results(methods, dataset_path, ...) defined in the notebook.
    """
    # A bare string would make Series.isin raise.
    if isinstance(method, str):
        method = [method]

    mask = df['metric name']==(metric)
    mask &= ~df['classifier name'].str.contains('alldata')
    mask &= df['classifier name'].str.split(' ').str[0].isin(method)

    if base_classifier is None:
        # `== None` is False for every row, so the missing-classifier
        # option has to be matched with isna().
        mask &= df['base_classifier'].isna()
    elif(base_classifier!='ALL'):
        mask &= df['base_classifier']==base_classifier
    sns.lineplot(data=df[mask],x='train size',y='value', hue='query strategy', marker='o', style='base_classifier');
    # Label the axis with the metric actually plotted; the friendly name is
    # kept for the default metric.
    plt.ylabel('Macro F-measure' if metric=='f1_macro' else metric)
    plt.ylim([0.0,1.0]);

    
# Deliberately not named `widget_methods`: the update_widgets() callback
# defined earlier in the notebook assigns to that global, so rebinding it
# here would redirect the results-file dropdown's updates onto this widget.
widget_method_prefixes = widgets.SelectMultiple(
    options=list(df['classifier name'].str.split(' ').str[0].unique()),
    value=[],
    description='Methods:',
    disabled=False,
)


interact(interact_results,
         method=widget_method_prefixes,
         metric=list(df['metric name'].unique()),
         base_classifier=['ALL']+list(df['base_classifier'].unique()));

interactive(children=(SelectMultiple(description='Methods:', options=('tripletnet_mcdropout', 'ensemble_triple…

## Observações

-  Agora, todos os tripletnet iniciais são iguais. (Mesmo desempenho na primeira iteração)

In [None]:
# Select f1_macro rows whose classifier name starts with the token
# 'tripletnet', contains 'RF', and has a train size of at most 200.
# NOTE(review): `mask` is only consumed by the commented-out display at the
# bottom of this cell; `df` must already exist in the kernel.
mask = df['metric name']=='f1_macro'
mask &= df['classifier name'].str.split(' ').str[0]=='tripletnet'
mask &= df['classifier name'].str.contains('RF')
mask &= df['train size']<=200

# NOTE(review): overwrites, in place, the 'value' of the row labelled 526
# with a hand-typed 0.71. The reason is not recorded here -- confirm this
# manual correction is intended, since any later use of `df` will see it.
df.loc[526,'value']=0.71
# display(df[mask])