In [None]:
import os

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from ipywidgets import interact
from tqdm import tqdm

from metrics import SubsetLoss,HammingLoss, JaccardDistance, Fmeasure
from mllmethods import classifier_chain, br, clr, LabelPowerset,dbr
from ProbDistribution import ProbDistTree

# Experimental Setup

In [None]:
def jaccard_distance_optimizer(P):
    """Return the risk-minimising prediction for the Jaccard distance.

    Delegates to ``JaccardDistance().minrisk(P)``, which yields a
    ``(solution, value)`` pair, and keeps only the solution.

    Args:
        P: the distribution object handed to ``JaccardDistance.minrisk``
            (presumably a ``ProbDistTree`` like the other classifiers
            receive -- confirm against the caller).

    Returns:
        The first element of the pair returned by ``minrisk``.
    """
    minimizer, _min_risk = JaccardDistance().minrisk(P)
    return minimizer

def fmeasure_optimizer(P):
    """Return the risk-minimising prediction for the F-measure.

    Delegates to ``Fmeasure().minrisk(P)``, which yields a
    ``(solution, value)`` pair, and keeps only the solution.

    Args:
        P: the distribution object handed to ``Fmeasure.minrisk``; in this
            notebook classifiers are called with the result of
            ``ProbDistTree.random``.

    Returns:
        The first element of the pair returned by ``minrisk``.
    """
    minimizer, _min_risk = Fmeasure().minrisk(P)
    return minimizer


# Classifiers under comparison: display name -> callable that takes a
# distribution P and returns a prediction.
# NOTE: `jaccard_distance_optimizer` is defined above but is not registered
# here; add it under the key 'JD-opt' to include it in the comparison.
classifiers = {
    'CC': classifier_chain,
    'BR': br,
    "CLR": clr,
    'PCC': LabelPowerset,
    'Fm-opt': fmeasure_optimizer,
    'DBR': dbr,
}

# Loss objects used for evaluation: display name -> metric instance exposing
# `.risk(pred, P)`. Trim this dict to evaluate a single metric.
metrics = {
    'Subset 0/1 loss': SubsetLoss(),
    'Hamming loss': HammingLoss(),
    'F-measure': Fmeasure(),
    'Jaccard distance': JaccardDistance(),
}

# Column names of the raw result records, in the order the fields are emitted.
variables = ['Pid', 'k', 'classifier', 'metric', 'ld', 'dd', 'n', 'risk']

# Grid of `ld` / `dd` values forwarded to ProbDistTree.random(n, ld=..., dd=...).
# The order below is the order in which the grid is traversed.
ld_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.025, 0.05, 0.99]
dd_list = [0.05, 0.45]

n = 12     # first positional argument of ProbDistTree.random
k = 10000  # number of random distributions drawn per (ld, dd) combination

# Experimental loop

In [None]:
from multiprocessing import Pool

# Base seed of the experiment. The parent process is still seeded for any
# randomness used outside the workers; every task derives its own seed from it.
SEED = 1
np.random.seed(SEED)


def _task_seed(i, ld, dd):
    """Build a deterministic NumPy seed (list of uint32) for one task."""
    return [
        SEED,
        i % 2**32,
        int(round(ld * 1e6)) % 2**32,
        int(round(dd * 1e6)) % 2**32,
    ]


def run_experiment(args):
    """Evaluate every classifier under every metric on one random distribution.

    Args:
        args: tuple ``(i, ld, dd)`` -- the id of the distribution and the
            ``ld`` / ``dd`` values forwarded to ``ProbDistTree.random``.

    Returns:
        A list of records ``(i, k, clf_name, metric_name, ld, dd, n, risk)``,
        one per (classifier, metric) pair, matching the ``variables`` columns.
    """
    i, ld, dd = args

    # Worker processes created by fork start with an identical copy of the
    # parent's global NumPy RNG state, so without re-seeding different workers
    # replay the same random stream and the outcome depends on how tasks are
    # scheduled. Seeding per task makes every draw reproducible and distinct.
    # NOTE(review): this assumes ProbDistTree.random (and the classifiers, if
    # they are stochastic) use NumPy's global RNG -- confirm.
    np.random.seed(_task_seed(i, ld, dd))

    P = ProbDistTree.random(n, ld=ld, dd=dd)

    records = []
    for clf_name, clf in classifiers.items():
        # The prediction is computed once per classifier and scored by all metrics.
        pred = clf(P)
        records.extend(
            (i, k, clf_name, metric_name, ld, dd, n, metric.risk(pred, P))
            for metric_name, metric in metrics.items()
        )
    return records

    
def callback(res):
    """Advance the module-level progress bar ``pbar`` by one step.

    ``res`` is accepted but ignored. Not referenced by the visible cells.
    """
    pbar.update(1)
# All tasks of the experiment, in the order they are evaluated:
# for every (ld, dd) combination, k distributions with ids 0..k-1.
tasks = [(i, ld, dd) for ld in ld_list for dd in dd_list for i in range(k)]

Results = []
# A single worker pool serves the whole grid; creating a fresh pool for every
# (ld, dd) combination would repeatedly pay the process start-up cost.
# You can change the number of cpu cores used here: Ex: Pool(4)
with tqdm(total=len(tasks)) as pbar, Pool() as pool:
    # imap yields results in task order, so Results keeps the grid order.
    for result in pool.imap(run_experiment, tasks):
        Results.extend(result)
        pbar.update()

# Average the risk over the k distributions of each configuration.
R = pd.DataFrame(Results, columns=variables)
Rg = R.groupby(['k','classifier','metric','ld','dd','n']).mean()
R = Rg.reset_index()[['k','classifier','metric','ld','dd','n','risk']]

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('results', exist_ok=True)
R.to_csv('results/results.csv')

# Visualize results

In [None]:
# Columns identifying one experimental configuration evaluated under one metric.
group_keys = ['ld', 'dd', 'metric', 'n', 'k']

# Lowest mean risk reached by any classifier for each configuration/metric.
Rg = (
    R.groupby(group_keys)[['risk']]
    .min()
    .rename(columns={'risk': 'minrisk'})
)

# Drop columns derived by an earlier run of this cell before merging; otherwise
# a re-run would produce 'minrisk_x' / 'minrisk_y' and R['minrisk'] would fail.
R = (
    R.drop(columns=['minrisk', 'regret_relative'], errors='ignore')
    .merge(Rg, on=group_keys)
)

# Risk relative to the best classifier (1.0 means "as good as the best").
R['regret_relative'] = R['risk'] / R['minrisk']
R

In [None]:
@interact(dd=widgets.Select(options=dd_list),
          metric=widgets.Select(options=metrics.keys()))
def plotLD(dd,metric):
    """Plot the relative regret of every classifier as a function of ``ld``.

    Args:
        dd: value of the ``dd`` column to select (chosen from ``dd_list``).
        metric: name of the metric to select (a key of ``metrics``).

    Draws one line per classifier on a new figure; returns nothing.
    """
    # Rows of the aggregated results matching the widget selection.
    selected = R[(R['dd'] == dd) & (R['metric'] == metric)]

    fig, ax = plt.subplots(figsize=(14, 10), dpi=128)
    sns.lineplot(
        data=selected,
        x='ld',
        y='regret_relative',
        hue='classifier',
        marker='o',
        ax=ax,
        lw=2,
        markersize=10,
    )
    # Fixed y-range so plots for different selections are directly comparable.
    ax.set_ylim(0.99, 1.45)