In [44]:
import json
import os
from statsmodels.stats.contingency_tables import mcnemar
from collections import defaultdict
from tqdm import tqdm
import plotly.plotly as py
import plotly.graph_objs as go

results_dir = os.path.abspath("../results/pairtest/classifier_results/")



In [45]:
noise_types = ["AA","RV","VA"]
accuracy_tables = {ntype:defaultdict(list) for ntype in noise_types}

for filename in tqdm(os.listdir(results_dir)):
    fullpath = os.path.join(results_dir, filename)
    results = []
    if not os.path.isdir(fullpath):
        parameters = filename.split("_") 
        exp_id = parameters[0]
        condition = parameters[1]
        exp_name = "_".join(parameters[0:-2])
        noise_type = parameters[-2]
        with open(fullpath, "r") as res_file:
            for line in res_file.readlines():
                obj = json.loads(line)
                results.append(obj)
        for result in results:
            is_correct = result["gold_label"] == result["predicted_label"]
            accuracy_tables[noise_type][exp_name].append(is_correct)

  

100%|██████████| 75/75 [02:18<00:00,  1.93s/it]


In [46]:
print(list(accuracy_tables['AA']))

['exp-1_no-WS_350-1000_lab', 'exp-1.3_16-512', 'exp-2.1_2uni-2-layer', 'exp-1.3_256-512', 'exp-1_all-WS_350-1000_lab', 'exp-1.3_512-512', 'exp-2.1_6bi-3-layer', 'exp-1.1_512-512', 'exp-1.1_256-256', 'exp-1.1_32-32', 'exp-2.1_1uni-1-layer', 'exp-1.2_pretrained-freeze', 'exp-1.2_pretrained-cont', 'exp-1.1_128-128', 'exp-1.1_64-64', 'exp-2.1_5bi-2-layer', 'exp-1_quarter-WS_350-1000_lab', 'exp-1.3_128-512', 'exp-2.1_3uni-3-layer', 'exp-1.2_random', 'exp-1_half-WS_350-1000_lab', 'exp-2.1_4bi-1-layer', 'exp-1.1_512-1024', 'exp-1.3_32-512', 'exp-1.3_64-512']


In [47]:
def build_contingency_table( first_table, second_table):
    yes_yes = 0
    yes_no = 0
    no_yes = 0
    no_no = 0
    if not first_table or not second_table:
        print("Warning, one of your experiments is empty, double check experiment names")
        return None
    if len(first_table) != len(second_table):
        print("Warning, tables are different lengths, please reevaluate,  your life")
        return None

    for first,second in zip(first_table, second_table):
        if first and second:
            yes_yes += 1
        elif not (first or second):
            no_no +=1
        elif first and not second:
            yes_no +=1
        elif second and not first:
            no_yes +=1
    contingency_table = [[yes_yes,yes_no],[no_yes,no_no]]
    return contingency_table
    

In [48]:
trial = build_contingency_table(accuracy_tables["VA"]["exp-1.1_128-128"], 
                        accuracy_tables["VA"]["exp-1_all-WS_350-1000_lab"])

result = mcnemar(trial, exact=False, correction=True)
print('statistic={}, p-value={}'.format(result.statistic, result.pvalue))

statistic=136323.33587582494, p-value=0.0


In [49]:
trial

[[24133, 165502], [10572, 7367]]

In [50]:
# Pairwise compare the values
noise_type = "VA"
pairwise_mcnemar = {name:{} for name in expe}defaultdict(dict)
pairwise_p_value = defaultdict(dict)
experiment_names = sorted(list(accuracy_tables[noise_type]))
for ind, first_experiment in tqdm(enumerate(experiment_names)):
    first_table = accuracy_tables[noise_type][first_experiment]
    for second_experiment in experiment_names[ind+1:]:
        second_table = accuracy_tables[noise_type][second_experiment]
        cont_table = build_contingency_table(first_table,second_table)
        if cont_table[0][1] + cont_table[1][0] < 25:
            stats_result = mcnemar(cont_table, exact=True)
            print("Comparisson between {} and {} has less than 25 disagreement".format(first_experiment,second_experiment))
        if cont_table[0][1] + cont_table[1][0]:
            stats_result = mcnemar(cont_table, exact=False, correction=True)
        pairwise_mcnemar[first_experiment][second_experiment] = stats_result.statistic
        pairwise_p_value[first_experiment][second_experiment] = stats_result.pvalue


4it [00:00,  4.86it/s]

Comparisson between exp-1.1_512-1024 and exp-2.1_1uni-1-layer has less than 25 disagreement
Comparisson between exp-1.1_512-512 and exp-1.3_512-512 has less than 25 disagreement


25it [00:02,  9.38it/s]


In [56]:
#pairwise_p_value
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)



In [59]:
def convert_dict_matrix(results_dict):
    experiment_list = sorted(list(results_dict))
    matrix = [[0 for i in range(len(experiment_list))] for j in range(len(experiment_list))]
    print(len(matrix))
    print(len(matrix[0]))
    exp2_id = {name:i for (i,name) in enumerate(experiment_list)}
    id2_exp = {i:name for (i,name) in enumerate(experiment_list)}
    for ind, exp1 in enumerate(experiment_list):
        for exp2 in experiment_list[ind+1:]:
            matrix[exp2_id[exp1]][exp2_id[exp2]] = results_dict[exp1][exp2]
            matrix[exp2_id[exp2]][exp2_id[exp1]] = results_dict[exp1][exp2]
    return matrix
    
pvalue_matrix = convert_dict_matrix(pairwise_p_value)

trace = go.Table(header = {'values':["X"] + experiment_names},
                cells = {'values': pvalue_matrix})
data = [trace]
py.iplot(data, file_id="myplot")

24
24
Aw, snap! We didn't get a username with your request.

Don't have an account? https://plot.ly/api_signup

Questions? accounts@plot.ly


PlotlyError: Because you didn't supply a 'file_id' in the call, we're assuming you're trying to snag a figure from a url. You supplied the url, '', we expected it to start with 'https://plot.ly'.
Run help on this function for more information.