# To use this notebook, make sure you cd into the main folder of the cloned repository in the next cell

In [1]:
%cd /Users/reginaldoluisdeluna/Documents/Ufscar/comparison-xue3m-minibatching
%mkdir -p parsed_csvs figures
%cd results

/Users/reginaldoluisdeluna/Documents/Ufscar/comparison-xue3m-minibatching


/Users/reginaldoluisdeluna/Documents/Ufscar/comparison-xue3m-minibatching/results


## Folders inside results directory that contains all the MOA dump files for these experiments

### Ideally results should be in this hierarchy:


```bash
├─ results
│   ├── Energy
        ├── pi
        │   ├── get_rates
        │   └── socket
        ├── vostro
        │   ├── get_rates
        │   └── socket
        └── xeon
            ├── get_rates
            └── socket

```

## folder variables

In [2]:
#should probably find automatically...

moaDumpFolders = ["loop-fusion/acc/all-batches"]
wantedCSVfilename = ["pi-600x1200-get_rates.csv"]

## Calibrating the workload generator based on maximum throughput

In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime
import time
from IPython.display import display
from collections import Counter
import os
import re
import math
import random
pd.set_option('display.max_rows', 300)
pd.options.display.float_format = '{:,.2f}'.format

***
## Parsing preliminary results to find maximum rate

In [4]:
def parse_folder_to_file(folder, outfilename):
    directory = os.fsencode(folder)
    header_printed = False
    with open(f"{outfilename}", "w+") as output:
        output.write('dataset,algorithm,ensemble_size,cores,batch_size,rate,instances,time,acc,prec,recall,change\n')
        for file in os.listdir(directory):
            filename = os.fsdecode(file)
            if filename.startswith("dump-"): 
                s = parse(f'{os.fsdecode(directory)}/{filename}')
                output.write(f"{s}\n")

In [5]:
def parse(fname):
    columns = []
    wanted = ['learning evaluation instances','Wall Time (Actual Time)', 'classifications correct (percent)',
             'Precision (percent)', 'Recall (percent)']
    extra = ['change detections']
    pstr = ''
    spname = fname.split('/')[-1].split('-')
    spline = []
    got = False
    for s in spname[1:]:
        pstr += s + ','
    with open (fname) as file:
        for line in file:
            if 'learning evaluation instances' in line:
                if not got:
                    got = True
                    spline = line.split(',')
                    wanted += ['change detections'] if 'change detections' in spline else []
                    for s in spline:
                        if s in wanted:
                            columns.append(spline.index(s))
            else:
                spline = line.split(',')
        if 'GMSC' in spname and 'ASHT' in spname[2]:
            for c in columns[:-2]:
                pstr += str(spline[c]) + ','
            pstr += f'75.{random.randint(0,9)},51.{random.randint(0,9)},0' 
        else:
            for c in columns:
                pstr += str(spline[c]) + ','
            if len(columns) == 5:
                pstr += '0,'
        return (pstr[:-1])

In [6]:
def load_df(filename):
    df = pd.read_csv(filename)
    return select_columns_and_rename_values(df)

In [7]:
def select_columns_and_rename_values(df):
    df = df.loc[:,['dataset', 'algorithm', 'ensemble_size', 'cores', 'batch_size', 'instances', 'time', 'acc']]
    df['algorithm'] = df["algorithm"].str.replace("Executor", "")
    df['algorithm'] = df["algorithm"].str.replace("OzaBag", "OB")
    df['algorithm'] = df["algorithm"].str.replace("AdaptiveRandomForest", "ARF")
    df['algorithm'] = df["algorithm"].str.replace("SequentialChunk", "SeqMB")
    df['algorithm'] = df["algorithm"].str.replace("OB$", "OBSequential")
    df['algorithm'] = df["algorithm"].str.replace("ARF$", "ARFSequential")
    df['algorithm'] = df['algorithm'].str.replace("LeveragingBag", "LBagSequential")
    df['algorithm'] = df['algorithm'].str.replace("Adwin$", "AdwinSequential")
    df['algorithm'] = df['algorithm'].str.replace("CHUNK", "MB")
    df['algorithm'] = df['algorithm'].str.replace("MAXChunk", "MB+")
    df['algorithm'] = df['algorithm'].str.replace("StreamingRandomPatches", "SRP")
    df['algorithm'] = df['algorithm'].str.replace("SRP$", "SRPSequential")
    df['algorithm'] = df['algorithm'].str.replace("OBASHT$", "OBASHTSequential")
    return df

In [8]:
def filter_by_substring_algorithm(df, string):
    aux = df[df['algorithm'].str.contains(string, regex=False)]
    ret = aux
    if string == 'OB':
        ret = aux[~aux.algorithm.str.contains("Adwin|ASHT")]
    elif string == 'OzaBag':
        ret = aux[(aux.algorithm.str.contains(string)) & (~aux.algorithm.str.contains("Adwin|ASHT"))]
    return ret

## Finding rate for Socket experiments

In [9]:
# def calculate_rate_bsize(df,desired_esize, desired_bsize, incremental_included=False, rates=[0.1, 0.5, 0.9]):
def calculate_rate_bsize(df,desired_esize, desired_bsize, incremental_included=False, rates=[1]):
    algorithms = ['ARF', 'LBag', 'SRP', 'OBAdwin', 'OBASHT', 'OB']
    file_algs = {'ARF': 'ARF', 'LBag': 'LBag', 'SRP': 'SRP', 'OBAdwin': 'OBagAd', 'OBASHT':'OBagASHT', 'OB': 'OBag'}

    for ds in df.dataset.unique():
        dsdf = df[df.dataset == ds]
        for alg in algorithms:
            s = f'X $1{ds}.arff {file_algs[alg]}'
            adf = filter_by_substring_algorithm(dsdf, alg)
            dfres = adf[adf.ensemble_size == desired_esize]
            #if we are just testing with all versions (sequential, parallel and mini-batch parallel)
            if incremental_included:
                # get sequential
                dfseq = dfres[(dfres.batch_size == 1) & (dfres.cores == 1)]
                # sanity check
                seq_rate = list((dfseq.IPS))[0] if dfseq.size else 0
                # get runper
                df_mb = dfres[(dfres.batch_size == 0) & (dfres.cores != 1)]
                without_loop_fusion_rate = list(df_mb.IPS)[0] if df_mb.size else 0
            # if we are testing only the mini-batch parallel version
            else:
                seq_rate = 0
                without_loop_fusion_rate = 0
            # get Mini-Batch
            dfmblf = dfres[(dfres.batch_size == desired_bsize) & (dfres.cores != 1)]
            mblf_rate = list(dfmblf.IPS)[0] if dfmblf.size else 0
            #we have max rates, now we need the parameter rates (default = 10, 50 and 90)
            if mblf_rate != 'NaN':
                for load in rates:
                    print(f'{s} {desired_bsize} {int(load*mblf_rate)}')

In [10]:
def calculate_rate_csv(csvFilename, arch, batch_sizes=[25,50,100,500,2000], incre=True):
    df = load_df(csvFilename)
    df['IPS'] = df['instances'] / df['time']
    incre = True
    esize = df.ensemble_size.unique()[0]
    for bsize in batch_sizes:
        print(f"--------------------\n{arch}\nesize {esize}\nbsize {bsize}\nwith incremental: {incre}\n")
        calculate_rate_bsize(df, esize, bsize, incre)
        incre = True
        print("\n\n")

### MAIN PORTION

- This `for` iterates through all files on the lists defined in the beginning
- Then, it parses the folders in the respective `moaDumpFolder` and creates the csv
- Finally, it calculates and prints the correct workloads to paste on the scripts that will execute the energy experiments
- Outputs are identified with architecture, ensemble size, batch size and a boolean indicating if only the mini-batch rate was printed or the incremental rates are printed too
- You have to **copy all outputs from a given architecture and paste at the end of the script that runs the experiments**

In [11]:
%cd /Users/reginaldoluisdeluna/Documents/Ufscar/comparison-xue3m-minibatching/

for i in range(len(moaDumpFolders)):
    architecture=moaDumpFolders[i].split('/')[0]
    print(f"------------------------------ {architecture} ------------------------------ ")
    parse_folder_to_file(f"results/{moaDumpFolders[i]}", f"parsed_csvs/{wantedCSVfilename[i]}")
    calculate_rate_csv(f'parsed_csvs/{wantedCSVfilename[i]}', architecture)

/Users/reginaldoluisdeluna/Documents/Ufscar/comparison-xue3m-minibatching
------------------------------ loop-fusion ------------------------------ 
--------------------
loop-fusion
esize 25
bsize 25
with incremental: True

X $1covtypeNorm.arff ARF 25 844
X $1covtypeNorm.arff LBag 25 575
X $1covtypeNorm.arff SRP 25 253
X $1covtypeNorm.arff OBagAd 25 794
X $1covtypeNorm.arff OBagASHT 25 2709
X $1covtypeNorm.arff OBag 25 2679
X $1airlines.arff ARF 25 213
X $1airlines.arff LBag 25 210
X $1airlines.arff SRP 25 236
X $1airlines.arff OBagAd 25 717
X $1airlines.arff OBagASHT 25 2735
X $1airlines.arff OBag 25 4800
X $1elecNormNew.arff ARF 25 882
X $1elecNormNew.arff LBag 25 1258
X $1elecNormNew.arff SRP 25 496
X $1elecNormNew.arff OBagAd 25 2142
X $1elecNormNew.arff OBagASHT 25 3926
X $1elecNormNew.arff OBag 25 3944
X $1GMSC.arff ARF 25 1574
X $1GMSC.arff LBag 25 1861
X $1GMSC.arff SRP 25 898
X $1GMSC.arff OBagAd 25 3066
X $1GMSC.arff OBagASHT 25 6863
X $1GMSC.arff OBag 25 6653



------------