## Code used for cleaning up and organizing the log results from MOA Multithread Ensembles

- Run ./chunk_pre.sh <Folder with chunk logs\> > file.csv
- Import and show
- The format_table_excel_* functions either print (show) a df or copy it to the clipboard in the suggested format for analysis

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime
import time
from IPython.display import display
from collections import Counter
import os
import re
import math
import random
pd.set_option('display.max_rows', 300)
pd.options.display.float_format = '{:,.2f}'.format

***
## Parsing preliminary results to find maximum rate and acc comparison

In [2]:
def parse_folder_to_file(folder, outfilename):
    """Collect every ``dump-*`` log found in *folder* into ``<outfilename>.csv``.

    The output file is (re)created, a fixed header line is written, and one
    line per matching log is appended using the string returned by ``parse``.

    Args:
        folder: Directory that is listed for files whose name starts with
            ``dump-``.
        outfilename: Output path without the ``.csv`` extension.
    """
    # IPython magic: changes the working directory to ../results/ (relative
    # to the current one) on every call, so the relative paths used below are
    # resolved from there.
    %cd ../results/
    directory = os.fsencode(folder)
    # NOTE(review): this flag is never read inside this function.
    header_printed = False

    with open(f"{outfilename}.csv", "w+") as output:
        output.write('dataset,algorithm,ensemble_size,cores,batch_size,rate,instances,time,acc,prec,recall,change\n')
        for file in os.listdir(directory):
            filename = os.fsdecode(file)
            if filename.startswith("dump-"): 
                s = parse(f'{os.fsdecode(directory)}/{filename}')
                output.write(f"{s}\n")

In [3]:
def parse(fname):
    """Summarise one MOA dump log as a single CSV row (no trailing newline).

    The row is made of the dash-separated tokens of the file name (the first
    token, e.g. ``dump``, is dropped) followed by the wanted metric values
    taken from the last non-header line of the log.  The row always ends with
    a ``change`` value: the logged one when the header has a
    ``change detections`` column, ``0`` otherwise.

    Only the first header line (a line containing
    ``learning evaluation instances``) is used to locate the columns.

    Args:
        fname: Path of the log file, using ``/`` as separator.

    Returns:
        The comma-separated row as a string.
    """
    wanted = ['learning evaluation instances', 'Wall Time (Actual Time)',
              'classifications correct (percent)', 'Precision (percent)',
              'Recall (percent)']
    name_parts = fname.split('/')[-1].split('-')
    fields = list(name_parts[1:])
    columns = []
    last_row = []
    header_seen = False
    with open(fname) as file:
        for line in file:
            if 'learning evaluation instances' in line:
                if not header_seen:
                    header_seen = True
                    last_row = line.split(',')
                    if 'change detections' in last_row:
                        wanted.append('change detections')
                    columns = [last_row.index(name) for name in last_row
                               if name in wanted]
            else:
                # Keep overwriting so that only the final data line survives.
                last_row = line.split(',')

    if 'GMSC' in name_parts and 'ASHT' in name_parts[2]:
        fields += [str(last_row[c]) for c in columns[:-2]]
        # NOTE(review): for GMSC + ASHT runs the last two selected metrics
        # are NOT read from the log; they are replaced by randomly generated
        # placeholder values. Confirm this is intended before relying on the
        # prec/recall columns of these rows.
        fields += [f'75.{random.randint(0, 9)}', f'51.{random.randint(0, 9)}', '0']
    else:
        fields += [str(last_row[c]) for c in columns]
        if len(columns) == 5:
            # No change-detection column in this log: pad with 0.
            fields.append('0')
    # Joining (instead of stripping the last character of a hand-built
    # string) keeps the final "0" of the GMSC/ASHT branch intact.
    return ','.join(fields)

In [4]:
def load_df(filename):
    """Read the CSV at *filename* and return it with selected columns and
    shortened algorithm names (see ``select_columns_and_rename_values``)."""
    raw = pd.read_csv(filename)
    cleaned = select_columns_and_rename_values(raw)
    return cleaned

In [5]:
def select_columns_and_rename_values(df):
    """Keep the analysis columns and shorten the algorithm names.

    Args:
        df: DataFrame holding at least the columns ``dataset``,
            ``algorithm``, ``ensemble_size``, ``cores``, ``batch_size``,
            ``instances``, ``time`` and ``acc``.

    Returns:
        A new DataFrame (the input is left untouched) with those eight
        columns and the ``algorithm`` values rewritten by the rules below.
    """
    keep = ['dataset', 'algorithm', 'ensemble_size', 'cores', 'batch_size',
            'instances', 'time', 'acc']
    # (pattern, replacement, is_regex), applied in this order.  The patterns
    # anchored with "$" must be regular expressions; since pandas 2.0 the
    # default of ``Series.str.replace`` is ``regex=False``, which would treat
    # "OB$" as literal text and silently skip the rename, so the flag is
    # always passed explicitly.
    renames = [
        ("Executor", "", False),
        ("OzaBag", "OB", False),
        ("AdaptiveRandomForest", "ARF", False),
        ("SequentialChunk", "SeqMB", False),
        ("OB$", "OBSequential", True),
        ("ARF$", "ARFSequential", True),
        ("LeveragingBag", "LBagSequential", False),
        ("Adwin$", "AdwinSequential", True),
        ("CHUNK", "MB", False),
        ("MAXChunk", "MB+", False),
        ("StreamingRandomPatches", "SRP", False),
        ("SRP$", "SRPSequential", True),
        ("OBASHT$", "OBASHTSequential", True),
    ]
    # Copy so the assignment below never writes into a slice of the input.
    df = df.loc[:, keep].copy()
    algorithm = df['algorithm']
    for pattern, replacement, is_regex in renames:
        algorithm = algorithm.str.replace(pattern, replacement, regex=is_regex)
    df['algorithm'] = algorithm
    return df

In [6]:
def filter_by_substring_algorithm(df, string):
    """Return the rows of *df* whose ``algorithm`` contains *string* literally.

    For the plain bagging names (``'OB'`` and ``'OzaBag'``) rows that also
    mention ``Adwin`` or ``ASHT`` are dropped, so the variants are not mixed
    with the base algorithm.
    """
    matching = df[df['algorithm'].str.contains(string, regex=False)]
    if string not in ('OB', 'OzaBag'):
        return matching
    is_variant = matching.algorithm.str.contains("Adwin|ASHT")
    return matching[~is_variant]

## Finding rate for Socket experiments

In [7]:
def calculate_rate(desired_esize):
    """Print scaled instance rates for every dataset / algorithm pair.

    Reads the notebook-level ``df`` (columns ``dataset``, ``algorithm``,
    ``ensemble_size``, ``batch_size``, ``cores`` and ``IPS``).  For each pair
    the ``IPS`` of the first matching row is taken for three configurations:

    * ``batch_size == 1`` and ``cores == 1``
    * ``batch_size == 1`` and ``cores != 1``
    * ``batch_size != 1`` and ``cores != 1``

    and three lines are printed with 90 %, 50 % and 10 % of those rates
    (truncated to integers).  Pairs with a NaN rate are skipped.

    Args:
        desired_esize: Ensemble size used to select the rows.

    Raises:
        IndexError: if one of the three configurations has no row.
    """
    algorithms = ['ARF', 'LBag', 'SRP', 'OBAdwin', 'OBASHT', 'OB']
    file_algs = {'ARF': 'ARF', 'LBag': 'LBag', 'SRP': 'SRP', 'OBAdwin': 'OBagAd', 'OBASHT':'OBagASHT', 'OB': 'OBag'}
    for ds in df.dataset.unique():
        dsdf = df[df.dataset == ds]
        for alg in algorithms:
            s = f'X $1{ds}.arff {file_algs[alg]}'
            adf = filter_by_substring_algorithm(dsdf, alg)
            dfres = adf[adf.ensemble_size == desired_esize]
            seq_rate = dfres[(dfres.batch_size == 1) & (dfres.cores == 1)].IPS.iloc[0]
            runper_rate = dfres[(dfres.batch_size == 1) & (dfres.cores != 1)].IPS.iloc[0]
            mb_rate = dfres[(dfres.batch_size != 1) & (dfres.cores != 1)].IPS.iloc[0]
            # A float never equals the string 'NaN', so test for NaN
            # explicitly; int(nan) below would raise ValueError otherwise.
            if pd.isna([seq_rate, runper_rate, mb_rate]).any():
                continue
            # The maximum rates are known; print 90 %, 50 % and 10 % of them.
            for load in (0.9, 0.5, 0.1):
                print(f'{s} {int(load*seq_rate)} {int(load*runper_rate)} {int(load*mb_rate)}')

In [8]:
def calculate_rate_bsize(desired_esize, desired_bsize, incremental_included=False):
    """Print scaled instance rates for one mini-batch size.

    Reads the notebook-level ``df`` (columns ``dataset``, ``algorithm``,
    ``ensemble_size``, ``batch_size``, ``cores`` and ``IPS``).  For every
    dataset / algorithm pair the ``IPS`` of the first row with
    ``batch_size == desired_bsize`` and ``cores != 1`` is taken and three
    lines are printed with 10 %, 50 % and 90 % of it (truncated to integers).
    Pairs with a NaN rate are skipped.

    Args:
        desired_esize: Ensemble size used to select the rows.
        desired_bsize: Mini-batch size used to select the mini-batch row.
        incremental_included: When true, the rates of the two
            ``batch_size == 1`` configurations (``cores == 1`` and
            ``cores != 1``) are printed as well; otherwise they are printed
            as 0.

    Raises:
        IndexError: if a required configuration has no row.
    """
    algorithms = ['ARF', 'LBag', 'SRP', 'OBAdwin', 'OBASHT', 'OB']
    file_algs = {'ARF': 'ARF', 'LBag': 'LBag', 'SRP': 'SRP', 'OBAdwin': 'OBagAd', 'OBASHT':'OBagASHT', 'OB': 'OBag'}
    for ds in df.dataset.unique():
        dsdf = df[df.dataset == ds]
        for alg in algorithms:
            s = f'X $1{ds}.arff {file_algs[alg]}'
            adf = filter_by_substring_algorithm(dsdf, alg)
            dfres = adf[adf.ensemble_size == desired_esize]
            if incremental_included:
                seq_rate = dfres[(dfres.batch_size == 1) & (dfres.cores == 1)].IPS.iloc[0]
                runper_rate = dfres[(dfres.batch_size == 1) & (dfres.cores != 1)].IPS.iloc[0]
            else:
                seq_rate = 0
                runper_rate = 0
            mb_rate = dfres[(dfres.batch_size == desired_bsize) & (dfres.cores != 1)].IPS.iloc[0]
            # A float never equals the string 'NaN', so test for NaN
            # explicitly; int(nan) below would raise ValueError otherwise.
            if pd.isna([seq_rate, runper_rate, mb_rate]).any():
                continue
            # The maximum rates are known; print 10 %, 50 % and 90 % of them.
            for load in [0.1, 0.5, 0.9]:
                print(f'{s} {desired_bsize} {int(load*seq_rate)} {int(load*runper_rate)} {int(load*mb_rate)}')

In [9]:
# Build the CSV of preliminary results from the get_rates_vostro logs and
# load it into the notebook-level `df` that the calculate_rate* functions read.
parse_folder_to_file('../../get_rates_vostro', '../scripts/data-vostro')
# parse_folder_to_file('acc-small', '../scripts/data-acc')

df = load_df('../scripts/data-vostro.csv')
# IPS: instances divided by the logged wall time.
df['IPS'] = df['instances'] / df['time']

esize = 100
incre = False
# Print the scaled rates for each mini-batch size of interest.
for bsize in [50, 250, 500]:
    print(f"\n\n\n--------------------\nesize {esize}\nbsize {bsize}")
    calculate_rate_bsize(esize, bsize, incre)
    incre = False

/home/cassales/Documents/Parallel-Classifier-MOA/results



--------------------
esize 100
bsize 50
X $1covtypeNorm.arff ARF 50 0 0 155
X $1covtypeNorm.arff ARF 50 0 0 776
X $1covtypeNorm.arff ARF 50 0 0 1397
X $1covtypeNorm.arff LBag 50 0 0 107
X $1covtypeNorm.arff LBag 50 0 0 537
X $1covtypeNorm.arff LBag 50 0 0 967
X $1covtypeNorm.arff SRP 50 0 0 42
X $1covtypeNorm.arff SRP 50 0 0 214
X $1covtypeNorm.arff SRP 50 0 0 386
X $1covtypeNorm.arff OBagAd 50 0 0 145
X $1covtypeNorm.arff OBagAd 50 0 0 729
X $1covtypeNorm.arff OBagAd 50 0 0 1312
X $1covtypeNorm.arff OBagASHT 50 0 0 158
X $1covtypeNorm.arff OBagASHT 50 0 0 791
X $1covtypeNorm.arff OBagASHT 50 0 0 1425
X $1covtypeNorm.arff OBag 50 0 0 184
X $1covtypeNorm.arff OBag 50 0 0 924
X $1covtypeNorm.arff OBag 50 0 0 1664
X $1kyoto_binary.arff ARF 50 0 0 322
X $1kyoto_binary.arff ARF 50 0 0 1611
X $1kyoto_binary.arff ARF 50 0 0 2901
X $1kyoto_binary.arff LBag 50 0 0 340
X $1kyoto_binary.arff LBag 50 0 0 1704
X $1kyoto_binary.arff LBag 50

# Raspberry pi

In [11]:
# IPython magic: show the current working directory.
%pwd

'/home/cassales/Documents/Parallel-Classifier-MOA/results'

In [15]:
# Same pipeline as above, for the logs under energy_pi/get_rates; the
# notebook-level `df` is replaced by these results.
path_file='../scripts/data-pi-rates'
parse_folder_to_file('energy_pi/get_rates', f'{path_file}')
# parse_folder_to_file('acc-small', '../scripts/data-acc')

df = load_df(f'{path_file}.csv')
# df
# IPS: instances divided by the logged wall time.
df['IPS'] = df['instances'] / df['time']

esize = 25
incre = False
# Print the scaled rates for each mini-batch size of interest.
for bsize in [50, 250, 500]:
    print(f"\n\n\n--------------------\nesize {esize}\nbsize {bsize}")
    calculate_rate_bsize(esize, bsize, incre)
    incre = False

/home/cassales/Documents/Parallel-Classifier-MOA/results



--------------------
esize 25
bsize 50
X $1airlines.arff ARF 50 0 0 17
X $1airlines.arff ARF 50 0 0 85
X $1airlines.arff ARF 50 0 0 154
X $1airlines.arff LBag 50 0 0 13
X $1airlines.arff LBag 50 0 0 66
X $1airlines.arff LBag 50 0 0 119
X $1airlines.arff SRP 50 0 0 19
X $1airlines.arff SRP 50 0 0 97
X $1airlines.arff SRP 50 0 0 175
X $1airlines.arff OBagAd 50 0 0 47
X $1airlines.arff OBagAd 50 0 0 239
X $1airlines.arff OBagAd 50 0 0 430
X $1airlines.arff OBagASHT 50 0 0 136
X $1airlines.arff OBagASHT 50 0 0 684
X $1airlines.arff OBagASHT 50 0 0 1232
X $1airlines.arff OBag 50 0 0 163
X $1airlines.arff OBag 50 0 0 817
X $1airlines.arff OBag 50 0 0 1471
X $1elecNormNew.arff ARF 50 0 0 86
X $1elecNormNew.arff ARF 50 0 0 433
X $1elecNormNew.arff ARF 50 0 0 780
X $1elecNormNew.arff LBag 50 0 0 128
X $1elecNormNew.arff LBag 50 0 0 641
X $1elecNormNew.arff LBag 50 0 0 1154
X $1elecNormNew.arff SRP 50 0 0 43
X $1elecNormNew.arff SRP 50 

# Ensemble size 100, mini-batches of 250, batch_size == 1 baselines printed as 0.
calculate_rate_bsize(100, 250, incremental_included=False)

# Ensemble size 100, mini-batches of 500, batch_size == 1 baselines included.
calculate_rate_bsize(100, 500, incremental_included=True)

***
#

#

#

#

## Energy processing

# MOA logs

In [None]:
def parse_MOA(fname):
    """Summarise one MOA terminal log as a CSV row (no trailing newline).

    The row holds the dash-separated tokens of the file name (first token
    dropped) followed by the wanted metric values of the last non-header
    line.  When exactly two metric columns were found a literal ``1`` is
    appended.  While the module-level flag ``header_printed`` is false the
    CSV header line is prepended and the flag is set to true.
    """
    global header_printed
    wanted = ['learning evaluation instances', 'Wall Time (Actual Time)', 'Avg Delay (ms)', 'outRate (inst/s)']
    name_tokens = fname.split('/')[-1].split('-')
    picked = []
    current = []
    with open (fname) as file:
        for line in file:
            current = line.split(',')
            if 'learning evaluation instances' in line:
                # Header line: remember where each wanted metric lives.
                picked.extend(current.index(name) for name in current if name in wanted)

    cells = list(name_tokens[1:])
    cells.extend(current[idx] for idx in picked)
    if len(picked) == 2:
        cells.append('1')
    row = ''.join(cell + ',' for cell in cells)
    if not header_printed:
        head = 'dataset,algorithm,ensemble_size,cores,batch_size,inc_rate,instances,time,delay,out_rate'
        row = f"{head}\n{row}"
        header_printed = True
    # Drop the trailing separator.
    return (row[:-1])

In [None]:
def read_MOA(folder, out_file):
    """Parse every ``term-*`` log in *folder* into *out_file* and load it.

    Args:
        folder: Directory listed for files whose name starts with ``term-``.
        out_file: Path of the CSV that is (re)written with one row per log.

    Returns:
        DataFrame read back from *out_file*, with ``inc_rate`` as ``int64``
        and the columns ``algorithm``, ``dataset``, ``inc_rate``, ``cores``,
        ``batch_size``, ``instances``, ``time``, ``delay`` and ``out_rate``.
    """
    global header_printed
    # parse_MOA prepends the CSV header only while this flag is false; reset
    # it here so every output file starts with exactly one header line, even
    # when this function is called more than once.
    header_printed = False
    directory = os.fsencode(folder)

    with open(out_file, "w+") as output:
        for file in os.listdir(directory):
            filename = os.fsdecode(file)
            if filename.startswith("term-"):
                row = parse_MOA(f'{os.fsdecode(directory)}/{filename}')
                output.write(f"{row}\n")
    df = pd.read_csv(os.fsdecode(out_file))
    # astype returns a new Series; the result must be assigned to take effect.
    df['inc_rate'] = df['inc_rate'].astype('int64')
    return df[['algorithm', 'dataset', 'inc_rate', 'cores', 'batch_size',
               'instances', 'time', 'delay', 'out_rate']]

## Energy Data

In [None]:
def load_Wmeas(filename):
    """Load a header-less measurement CSV into a DataFrame whose three
    columns are named ``date``, ``time`` and ``measure``."""
    column_names = ['date', 'time', 'measure']
    measurements = pd.read_csv(filename, header=None, names=column_names)
    return measurements

In [None]:
def exper_order_to_dict(filename, d):
    """Fill *d* with the start/finish time of every experiment in the log.

    The log alternates ``dd/mm/yy HH:MM:SS`` timestamp lines with experiment
    lines (lines without ``:``) whose last ``/``-separated component is
    ``dataset-algorithm-Esize-cores-Bsize[-rate]``; a missing rate becomes
    the integer ``1``.  Entries are stored as
    ``d[algorithm][dataset][Esize][cores][Bsize][rate]`` with ``start`` set
    to the preceding timestamp.  The timestamp that follows an experiment
    line, minus one second, becomes the ``finish`` of the most recently
    created entry.

    Returns:
        The same dictionary *d*, updated in place.
    """
    def read_stamp(text):
        # Exactly two space-separated tokens are expected: date and time.
        sdate, stime = [token.strip() for token in text.split(' ')]
        return datetime.datetime.strptime(f'{sdate} {stime}', '%d/%m/%y %H:%M:%S')

    newest_entry = None
    stamp = None
    have_stamp = False
    with open(filename) as forder:
        for line in forder:
            if have_stamp and ':' not in line:
                tokens = line.split('/')[-1].strip().split('-')
                if len(tokens) != 6:
                    tokens = [*tokens, 1]
                dataset, algorithm, Esize, cores, Bsize, rate = tokens
                runs = (d.setdefault(algorithm, {})
                         .setdefault(dataset, {})
                         .setdefault(Esize, {})
                         .setdefault(cores, {})
                         .setdefault(Bsize, {}))
                if rate not in runs:
                    runs[rate] = {'start': stamp, 'finish': ''}
                    newest_entry = runs[rate]
                have_stamp = False
                continue
            # Timestamp line.  Only a timestamp that directly follows an
            # experiment line (or opens the file) may close an entry.
            closes_entry = not have_stamp
            stamp = read_stamp(line)
            have_stamp = True
            if closes_entry and newest_entry:
                newest_entry['finish'] = stamp - datetime.timedelta(seconds=1)
    return d

In [None]:
def populate_dict(d, df):
    """Attach duration and measurement statistics to every experiment entry.

    Args:
        d: Six-level nested dictionary whose leaves hold ``datetime`` values
            under ``start`` and ``finish``.
        df: Measurement table with string columns ``date`` (``dd/mm/yy``) and
            ``time`` (``HH:MM:SS``) and a numeric column ``measure``.  A
            ``datetime`` column (``date + ' ' + time``) is added to it.

    Returns:
        *d*, where each leaf that had no ``seconds`` key gained ``seconds``,
        ``avg_measure``, ``sum_measure`` and ``avg_times_seconds`` computed
        from the measurements taken between ``start`` and ``finish``
        (inclusive).
    """
    df['datetime'] = df['date'] + ' ' + df['time']
    # Compare real timestamps: comparing "dd/mm/yy ..." strings orders them
    # lexicographically (day first), which is wrong as soon as an experiment
    # crosses a day or month boundary.
    stamps = pd.to_datetime(df['date'].str.strip() + ' ' + df['time'].str.strip(),
                            format='%d/%m/%y %H:%M:%S')
    for by_dataset in d.values():
        for by_esize in by_dataset.values():
            for by_cores in by_esize.values():
                for by_bsize in by_cores.values():
                    for by_rate in by_bsize.values():
                        for entry in by_rate.values():
                            if 'seconds' in entry:
                                continue
                            st = entry['start']
                            ed = entry['finish']
                            # total_seconds() also counts whole days, which
                            # timedelta.seconds silently drops.
                            entry['seconds'] = int((ed - st).total_seconds())
                            window = df[(stamps >= st) & (stamps <= ed)]
                            entry['avg_measure'] = window['measure'].mean()
                            entry['sum_measure'] = window['measure'].sum()
                            entry['avg_times_seconds'] = entry['avg_measure'] * entry['seconds']
    return d

In [None]:
def append_dict_to_df(d, ensemble_size=False):
    """Flatten the nested experiment dictionary into a DataFrame.

    One row is produced per leaf of
    ``d[algorithm][dataset][ensemble_size][cores][batch_size][inc_rate]``.
    Rows are sorted by algorithm and dataset, and ``inc_rate``, ``cores`` and
    ``batch_size`` are cast to ``int64``.

    Args:
        d: Nested dictionary whose leaves provide ``seconds``,
            ``avg_measure``, ``sum_measure`` and ``avg_times_seconds``.
        ensemble_size: When truthy the ``ensemble_size`` column is included
            in the result.
    """
    names = ['algorithm', 'dataset', 'ensemble_size', 'cores', 'batch_size',
             'duration', 'inc_rate', 'avg_measure', 'sum_measure', 'avg_times_seconds']
    records = []
    for alg_name, by_dataset in d.items():
        for ds_name, by_esize in by_dataset.items():
            for esize, by_cores in by_esize.items():
                for n_cores, by_bsize in by_cores.items():
                    for bsize, by_rate in by_bsize.items():
                        for rate, stats in by_rate.items():
                            records.append((
                                alg_name, ds_name, esize, n_cores, bsize,
                                stats['seconds'], rate, stats['avg_measure'],
                                stats['sum_measure'], stats['avg_times_seconds'],
                            ))
    table = pd.DataFrame(data={
        name: [record[pos] for record in records] for pos, name in enumerate(names)
    })
    table = table.sort_values(['algorithm','dataset'])
    table = table.astype({'inc_rate': 'int64', 'cores': 'int64', 'batch_size': 'int64'})
    shown = ['algorithm', 'dataset', 'inc_rate', 'cores', 'batch_size',
             'duration', 'avg_measure', 'sum_measure']
    if ensemble_size:
        shown.insert(2, 'ensemble_size')
    return table[shown]

# Parse SSH logs

In [None]:
def parse_SSH(fname):
    """Extract producer statistics from an SSH log.

    Each run starts at a line containing ``ssh-`` formatted as
    ``<prefix>-<dataset>-<algorithm>-<rate>``.  Inside a run the value after
    ``': '`` is collected from the ``Total instances Producer`` line and from
    the ``Producer Rate`` line; the latter also closes the run.

    Returns:
        DataFrame with the columns ``algorithm``, ``dataset``, ``inc_rate``
        (``int64``), ``prod_rate`` and ``tt_inst_prod``.
    """
    collected = {'algorithm': [], 'dataset': [], 'inc_rate': [], 'prod_rate': [], 'tt_inst_prod': []}
    inside_run = False
    with open (os.fsdecode(fname), "r") as file:
        for line in file:
            if inside_run:
                if 'Total instances Producer' in line:
                    collected['tt_inst_prod'].append(float(line.split(': ')[1]))
                elif 'Producer Rate' in line:
                    collected['prod_rate'].append(float(line.split(': ')[1]))
                    inside_run = False
            elif 'ssh-' in line:
                inside_run = True
                ds_name, alg_name, rate_text = line.split('-')[1:]
                collected['algorithm'].append(alg_name)
                collected['dataset'].append(ds_name)
                collected['inc_rate'].append(rate_text.strip())
    frame = pd.DataFrame.from_dict(collected)
    return frame.astype({'inc_rate': 'int64'})

# Run the energy pipeline

%cd pi
d = {}
df = load_Wmeas(f'energy/Wmeasure.log')
exper_order_to_dict(f'energy/exper_order.log')
d = populate_dict(d)
adf = append_dict_to_df(d, df)

In [None]:
# IPython magic: show the current working directory.
%pwd

In [None]:
# Energy pipeline for the Vostro logs.
d = {}
# parse_MOA only writes the CSV header while this flag is False.
header_printed = False
# MOA terminal logs -> instances / time / delay / out_rate per run.
moaDF = read_MOA("vostro/energy/energy-vostro", "vostro/energy/inst-and-delay.csv")
# Raw measurements (date, time, measure).
measureDF = load_Wmeas(f'vostro/energy/Wmeasure_vostro.log')
# Start/finish time of each experiment, then measurement statistics per run.
d = exper_order_to_dict(f'vostro/energy/exper_order_vostro.log', d)
d = populate_dict(d, measureDF)
mdf = append_dict_to_df(d)
# Producer-side statistics from the SSH log.
sshDF = parse_SSH(f'vostro/energy/energy-vostro/ssh-log')

In [None]:
#join dfs
# MOA results + measurement statistics, matched per run configuration.
finaldf = moaDF.merge(mdf, on=['algorithm', 'dataset', 'inc_rate', 'cores', 'batch_size'])
# Add the producer statistics, matched per algorithm / dataset / rate.
finaldf = finaldf.merge(sshDF, on=['algorithm', 'dataset', 'inc_rate'])
# joules = average measure * logged time; JPI = joules per instance.
# NOTE(review): presumably `measure` is power in watts and `time` is in
# seconds — confirm the units.
finaldf['joules'] = finaldf['avg_measure'] * finaldf['time']
finaldf['JPI'] = finaldf['joules'] / finaldf['instances']
# finaldf['JP1kI'] = finaldf['joules'] / (finaldf['instances']/1000)
# finaldf['JPIoriginal'] = finaldf['JPI']
# finaldf['JPI'] = finaldf['JP1kI']
# finaldf.dataset.unique()

### Add PERC column to identify if it used 90, 50 or 10% max rate

In [None]:
# Label every run with the load level it used.  After the descending sort the
# rows are labelled by position in a repeating 90 / 50 / 10 cycle.
# NOTE(review): this assumes every (dataset, algorithm) group holds exactly
# three rows, one per rate — confirm against the data.
tdf = finaldf.sort_values(by=['dataset','algorithm','inc_rate'], ascending=False)
perc_labels = ['90', '50', '10']
# Positional boolean masks: masks[label][i] is True when row i gets `label`.
masks = {label: [pos % 3 == slot for pos in range(len(tdf))]
         for slot, label in enumerate(perc_labels)}
# Create the column directly with its string labels instead of creating an
# integer column and overwriting it with strings, which pandas deprecates
# (setting values of an incompatible dtype).
tdf['PERC'] = [perc_labels[pos % 3] for pos in range(len(tdf))]
# tdf

### Show DFs for each algorithm and dataset, divided by rate

In [None]:
# Walk over every rate / dataset / algorithm combination.  All display calls
# are commented out, so this cell currently only computes `auxdf` for each
# combination without showing anything.
algs = ['Ada', 'L', 'Patch', 'Adwin', 'ASHT', 'OzaBag']
# NOTE(review): `datasets` is not used by the loop below, which iterates over
# the datasets present in the data instead.
datasets = ['airlines', 'GMSC','elecNormNew','covtypeNorm']
for k in ['90', '50', '10']:
#     print(f"\n\n\n\n{k}")
    energy = tdf[tdf.PERC == k]
    for ds in energy.dataset.unique():
        for alg in algs:
#             auxdf = energy[(energy.dataset == ds) & (energy.algorithm.str.contains(alg))]
            auxdf = filter_by_substring_algorithm(energy[energy.dataset == ds], alg)
#             display(auxdf)
#             if alg == 'Patch' or alg == 'Ada':
#                 display(auxdf[['algorithm','dataset','cores','batch_size','prod_rate','out_rate','instances','time','joules','JPI', 'JP1kI']].sort_values(['cores','batch_size']))
#         auxdf = energy[(energy.dataset == ds) & (~energy.algorithm.str.contains('|'.join(algs)))]
#         display(auxdf[['algorithm','dataset','cores','batch_size','prod_rate','out_rate','instances','time','joules','JPI', 'JP1kI']].sort_values(['cores','batch_size']))

# Showing and preparing for graphs

In [None]:
def filter_fields(df):
    """Return *df* restricted to the columns whose name contains one of the
    wanted field names (so suffixed names such as ``JPI_10`` are kept too)."""
    wanted = ['algorithm', 'dataset', 'batch_size', 'cores', 'out_rate', 'instances', 'delay', 'joules', 'JPI']
    selected = []
    for column in df.columns:
        if any(fragment in column for fragment in wanted):
            selected.append(column)
    return df[selected]

In [None]:
def rename_columns_by_rate(df):
    """Return a copy of *df* whose metric columns carry the rate as suffix.

    The rate is read from the first row of the ``PERC`` column and appended
    (as ``_<rate>``) to ``out_rate``, ``instances``, ``delay``, ``joules``
    and ``JPI``.
    """
    suffix = df.PERC.iloc[0]
    metric_names = ("out_rate", "instances", "delay", "joules", "JPI")
    mapping = {name: f"{name}_{suffix}" for name in metric_names}
    return df.rename(columns=mapping)

In [None]:
def my_norm(df, x='90MB'):
    """Normalise the three JPI columns of *df* by one reference cell.

    The reference is picked by position from the columns 0, 1, 15, 30 and 45
    of *df*:

    * ``'90MB'``: row 2 of column 45
    * ``'10S'``: row 0 of column 15
    * anything else (e.g. ``'10P'``): row 1 of column 15

    ``JPI_10``, ``JPI_50`` and ``JPI_90`` are divided by it in place, and the
    columns at positions 0, 1, 15, 30, 45, 7, 22 and 37 are returned.
    """
    candidates = df.iloc[:, [0, 1, 15, 30, 45]]
    if x == '90MB':
        row, col = 2, 4
    elif x == '10S':
        row, col = 0, 2
    else:
        # 10P
        row, col = 1, 2
    reference = candidates.iloc[row, col]
    for column in ('JPI_10', 'JPI_50', 'JPI_90'):
        df[column] = df[column] / reference
    return df.iloc[:, [0, 1, 15, 30, 45, 7, 22, 37]]

Black lines (solid, dashed, dotted)

JPI as bars (3 bars per rate)

In [None]:
def show_graph_JPI_delay(df, ax, mJPI, mDel, legend=False, title=False, ylabels=False, ds='', bar=False, share_y=False, log_y=False, norm=False, hide=True):
    """Draw JPI (left axis) and delay (twin right axis) for one subplot.

    The first three rows of *df* are plotted, labelled ``Sequential``, ``B1``
    and ``B500`` in that order, over the three x positions 10%, 50% and 90%.
    JPI values come from ``JPI_10/50/90`` and delays from ``delay_10/50/90``
    (divided by 1000).

    Reads the module-level ``last`` and stores the twin axis in the
    module-level ``twin``.

    Args:
        df: Table with one row per configuration and the columns named above.
        ax: Matplotlib axes to draw on.
        mJPI: Upper limit of the JPI axis when ``share_y == 'row'``.
        mDel: Upper limit of the delay axis (divided by 1000) when
            ``share_y == 'row'``.
        legend: Draw a legend on *ax*.
        title: Use the algorithm name of the first row (without
            ``Sequential``) as subplot title.
        ylabels: Use *ds* as label of the left y axis.
        ds: Text used as y label.
        bar: Plot JPI as grouped bars (and delays as black lines) instead of
            lines.
        share_y: Only the value ``'row'`` has an effect here (fixed limits).
        log_y: Use a logarithmic scale on both y axes.
        norm: When truthy, *df* is first passed through ``my_norm`` with this
            value as ``x``.
        hide: Hide the tick labels of the right (delay) axis.
    """
    if norm:
        df = my_norm(df, x=norm)
    # NOTE(review): `rate` is declared global but never used in this function.
    global rate
    global twin
    width = 0.20
    alg_order = ['Sequential', 'B1', 'B500']
    labels = ['10%', '50%', '90%']
    line_format = ['-', '--', ':']
    linfo = '--'
    x = np.arange(len(labels))
    lns_l = []
    # JPI series: one per configuration (row of df).
    for i in range(3):
        adf = df.iloc[[i]]
        values_j = [adf.JPI_10.iloc[0], adf.JPI_50.iloc[0], adf.JPI_90.iloc[0]]
        if bar:
            # Bars of the three configurations are placed side by side
            # around each x position.
            lns_l += ax.bar(x - ((1 - i) * width), values_j, width, label=f'JPI-{alg_order[i]}')            
        else:
            lns_l += ax.plot(x, values_j, label=f'JPI-{alg_order[i]}')
            
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    # Second y axis, sharing the x axis, for the delay series.
    ax_r = ax.twinx()
    twin = ax_r
    if log_y:
        ax.set_yscale('log')
        ax_r.set_yscale('log')
    
    if title:
        alg_title = re.sub('Sequential', '', df.algorithm.iloc[0])
        ax.set_title(f'{alg_title}')
#         ax.set_xlabel('Rate')
    
    if ylabels:
        ax.set_ylabel(ds)
#         ax.set_ylabel('JPI')
#         ax_r.set_ylabel('delay')
        
    # `last` is a module-level flag that is not passed as a parameter.
    if last:
        ax.set_ymargin(2)
    
    # Delay series: one per configuration, drawn on the twin axis.
    for i in range(3):
        adf = df.iloc[[i]]
        # NOTE(review): presumably converts milliseconds to seconds — confirm.
        values_d = [ x/1000 for x in [adf.delay_10.iloc[0], adf.delay_50.iloc[0], adf.delay_90.iloc[0]]]
        if bar:
            # Black line with a different dash pattern per configuration.
            linfo = f'k{line_format[i]}'
        lns_l += ax_r.plot(x, values_d, linfo, label=f'delay-{alg_order[i]}')
    labs = [l.get_label() for l in lns_l]
    if legend:
        ax.legend(lns_l, labs, loc=0)
    if hide:
        ax_r.set_yticklabels([])
    if share_y == 'row':
        ax.set_ylim(top=mJPI)
        ax_r.set_ylim(top=mDel/1000)

In [None]:
def aux_graphNx1(ds, axis, id_ds, df, bar=False, share_y='row', log_y=False, norm=False):
    """Fill one row of the subplot grid: one subplot per algorithm family.

    For each algorithm the rows of *df* for the rates 10, 50 and 90 are
    merged side by side (columns suffixed with the rate) and handed to
    ``show_graph_JPI_delay``.

    Reads the module-level ``title``.

    Args:
        ds: Dataset name, used as y label of the first subplot of the row.
        axis: 2-D collection of axes, indexed as ``axis[id_ds][column]``.
        id_ds: Row of *axis* to draw on.
        df: Rows of a single dataset, with a ``PERC`` column.
        bar, share_y, log_y, norm: Forwarded to ``show_graph_JPI_delay``.
    """
#     print(f'aux {bar}')
    rates = ['10', '50', '90']
    algs = ['Ada', 'L', 'Patches', 'Adwin', 'ASHT', 'OzaBag']
    global title
    global labels
    hide_axis = True
#     fig.suptitle(f'JPI and delay for {ds}', fontsize=18, y=1)
#     get max value from delay for all rates on all algorithms for this dataset
    mLstJPI = []
    mLstDel = []
    if share_y == 'row':
        # Common upper limits for the whole row: 5 % above the largest
        # JPI / delay found for this dataset.
        for rt in rates:
            rtDF = df[df.PERC == rt]
            mLstJPI.append(rtDF.JPI.max())
            mLstDel.append(rtDF.delay.max())
        mJPI = max(mLstJPI)*1.05
        mDel = max(mLstDel)*1.05
    else:
        mJPI = mDel = 0
    for alg in algs:
        dsalgdf = filter_by_substring_algorithm(df, alg).sort_values(['algorithm','batch_size','cores'])
        # The first rate ('10') initialises showdf; the other rates are
        # merged into it on the configuration columns.
        for rt in rates:
            if rt == '10':
#                 display(dsalgdf)
                showdf = rename_columns_by_rate(dsalgdf[dsalgdf.PERC == rt])
            else:
                to_join = dsalgdf[dsalgdf.PERC == rt]
                showdf = showdf.merge(rename_columns_by_rate(to_join),
                                  on=['algorithm', 'dataset', 'batch_size', 'cores']).sort_values(['batch_size','cores'])
        if 'Ada' in alg:
            # First column of the row: also gets the dataset as y label.
            show_graph_JPI_delay(showdf, axis[id_ds][algs.index(alg)], mJPI, mDel, title=title, ylabels=True, ds=ds, bar=bar, share_y=share_y, log_y=log_y, norm=norm)
        else:
            # Only the last column keeps the tick labels of the delay axis.
            if algs[-1] == alg:
                hide_axis = False
            show_graph_JPI_delay(showdf, axis[id_ds][algs.index(alg)], mJPI, mDel, title=title, bar=bar, share_y=share_y, log_y=log_y, norm=norm, hide=hide_axis)

In [None]:
def gen_graph(bars=False, share_y='row', log_y=False, norm=False):
    """Build the datasets x algorithms grid of JPI/delay plots and save it.

    One row of six subplots is drawn per dataset from the module-level
    ``tdf``; a single legend is placed at the bottom of the figure and the
    figure is saved as an ``.eps`` file prefixed with ``Vostro-``.

    Sets the module-level ``title``, ``last``, ``twin`` and ``labels``.

    Args:
        bars: Plot JPI as bars; also selects the output file name.
        share_y: Passed to ``plt.subplots(sharey=...)`` and to the row helper.
        log_y: Use logarithmic y axes.
        norm: Normalisation mode forwarded to the plotting helpers.
    """
#     print(f'gen {bars}')
    datasets = ['airlines', 'GMSC','elecNormNew','covtypeNorm', 'kyoto_binary']
    print(share_y)
    fig, axis = plt.subplots(len(datasets), 6, figsize=(12,9), tight_layout=True, sharey=share_y)
    global title
    global labels
    global last
    global twin
    # NOTE(review): `leg` and `labls` are assigned but never used.
    leg = False
    labls = True
    # Titles are drawn on the first row only; `last` flags the final row.
    title = True
    last = False
    twin = axis[0][0]
    for ds in datasets:
        if datasets.index(ds) == (len(datasets) - 1):
            last = True
        dsdf = tdf[tdf.dataset == ds]
#         display(dsdf)
        aux_graphNx1(ds, axis, datasets.index(ds), dsdf, bar=bars, share_y=share_y, log_y=log_y, norm=norm)
        title = False
    # Legend entries: JPI handles from the first subplot plus delay handles
    # from the twin axis created last.
    lines_1, labels_1 = axis[0][0].get_legend_handles_labels()
    lines_2, labels_2 = twin.get_legend_handles_labels()
    lines = lines_1 + lines_2
    labels = labels_1 + labels_2
    lgd = fig.legend(lines, labels, loc=8, ncol=6, bbox_to_anchor=(0.5, -0.02))
    lgd.set_in_layout(True)
    fig.subplots_adjust(bottom=0.2)
    # NOTE(review): the file name says "4x1" although five datasets are drawn.
    filename = 'bars-all-4x1-JPI-delay.eps' if bars else 'all-4x1-JPI-delay.eps'
    plt.savefig(f'Vostro-{filename}', pad_inches=0.2, bbox_inches='tight')

In [None]:
# Print one LaTeX table row per algorithm: for every dataset and rate, the
# relative JPI difference (in %) between the first batch_size != 1 row and
# the smallest batch_size == 1 JPI.  Non-negative values are set in bold.
rates = ['10', '50', '90']
algs = ['Ada', 'L', 'Patches', 'Adwin', 'ASHT', 'OzaBag']
datasets = ['airlines', 'GMSC','elecNormNew','covtypeNorm','kyoto_binary']
all_values = []
# filter by algorithm
for alg in algs:
#         line = alg + ' & $\Delta$ '
    line = alg + '  '
    algdf = filter_by_substring_algorithm(tdf, alg).sort_values(['algorithm','batch_size','cores'])
    # filter by dataset
    for ds in datasets:
        dsalgdf = algdf[algdf.dataset == ds]
#         display(dsalgdf.head())
        # "join" the three rates side by side
        for rt in rates:
            if rt == '10':
                showdf = rename_columns_by_rate(dsalgdf[dsalgdf.PERC == rt])
            else:
                to_join = dsalgdf[dsalgdf.PERC == rt]
                showdf = showdf.merge(rename_columns_by_rate(to_join),on=['algorithm', 'dataset', 'batch_size', 'cores']).sort_values(['batch_size','cores'])
#         showdf = showdf[['algorithm','dataset','cores','batch_size','JPI_10','JPI_50','JPI_90']]
#         display(showdf)
        for i in ['10', '50', '90']:
            # Smallest JPI among the batch_size == 1 rows is the baseline.
            minoutro = min(showdf[showdf.batch_size == 1][f'JPI_{i}'])
#             print(f"JPI_{i} minoutro {minoutro}")
            val = ((showdf[showdf.batch_size != 1][f'JPI_{i}'].iloc[0] - minoutro) / minoutro ) * 100
            all_values.append(val)
            sval = f"{val:.2f} " if val < 0 else "\\textbf{ " + f"{val:.2f}" + "} "
            line += f"& {sval}"
    print(f"{line} \\\\")
print(f'\n\nAverage reduction: {sum(all_values)/len(all_values)}')

In [None]:
def table_delta_rates_vert(ratio=False):
    """Print a LaTeX table of JPI differences, with one block per algorithm.

    Reads the module-level ``tdf``.  For every algorithm, rate and dataset the
    baseline is the smallest ``JPI`` among the ``batch_size == 1`` rows and
    the compared value is the ``JPI`` of the first ``batch_size != 1`` row.
    Each algorithm produces a ``\\multirow`` block with one line per rate;
    non-negative values are set in bold.  The average of all printed values
    is reported at the end.

    Args:
        ratio: When true the difference is printed relative to the baseline,
            in percent; otherwise the absolute difference is printed.
    """
    rates = ['10', '50', '90']
    algs = ['Ada', 'L', 'Patches', 'Adwin', 'ASHT', 'OzaBag']
    datasets = ['airlines', 'GMSC','elecNormNew','covtypeNorm','kyoto_binary']
    all_values = []
    # filter by algorithm
    for alg in algs:
        # The backslash of \hline is escaped explicitly: '\h' is an invalid
        # escape sequence that only works by accident.
        line = '\\hline\n\\multirow{3}{*}{' + alg + '} '
        algdf = filter_by_substring_algorithm(tdf, alg).sort_values(['algorithm','batch_size','cores'])
        # filter by rate
        for rt in rates:
            if rt != '10':
                line += '\\\\\n'
            line += f' & {rt} '
            rtalgdf = rename_columns_by_rate(algdf[algdf.PERC == rt])
            # filter by dataset
            for ds in datasets:
                dsrtalgdf = rtalgdf[rtalgdf.dataset == ds]
                minoutro = min(dsrtalgdf[dsrtalgdf.batch_size == 1][f'JPI_{rt}'])
                batched = dsrtalgdf[dsrtalgdf.batch_size != 1][f'JPI_{rt}'].iloc[0]
                if ratio:
                    val = ((batched - minoutro)/minoutro)*100
                else:
                    val = batched - minoutro
                all_values.append(val)
                sval = f"{val:.2f} " if val < 0 else "\\textbf{ " + f"{val:.2f}" + "} "
                line += f"& {sval} "
        print(f"{line} \\\\")
    print(f'\n\nAverage reduction: {sum(all_values)/len(all_values)}')

In [None]:
# Print the table with absolute JPI differences (ratio defaults to False).
table_delta_rates_vert()

Sharey = row

Linear scale

In [None]:
# Bars, y axis shared per row (default share_y='row'), linear scale.
gen_graph(bars=True)

sharey = False

linear scale

In [None]:
# Bars, independent y axes, linear scale.
gen_graph(bars=True, share_y=False)

sharey = false

log scale y

In [None]:
# Bars, independent y axes, logarithmic y scale.
gen_graph(bars=True, share_y=False, log_y=True)

sharey = false

normalizado MB 90

In [None]:
# Bars, independent y axes, JPI normalised with my_norm(x='90MB').
gen_graph(bars=True, share_y=False, log_y=False, norm='90MB')

sharey = false

normalizado Seq10

In [None]:
# Bars, independent y axes, JPI normalised with my_norm(x='10S').
gen_graph(bars=True, share_y=False, log_y=False, norm='10S')

In [None]:
# Bars, independent y axes, JPI normalised with my_norm(x='10P').
gen_graph(bars=True, share_y=False, log_y=False, norm='10P')