# Perf Tools Analisys

Neste notebook iremos verificar as analises referentes ao perf tools da implementação do loop fusion, verificando quanto de acesso ao cache misses e cache references é utilizado.

In [150]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime
import time
from IPython.display import display
from collections import Counter
import os
import re
import math
import random
pd.set_option('display.max_rows', 300)
pd.options.display.float_format = '{:,.2f}'.format

In [151]:
#loads the csv as a dataframe and standardizes the algorithm names 
def load_df(filename):
    df = pd.read_csv(filename, index_col=False)
    return select_columns_and_rename_values(df)

In [152]:
#filters by substring (there are multiple OzaBag algorithms)
def filter_by_substring_algorithm(df, string):
    aux = df[df['algorithm'].str.contains(string, regex=False)]
    ret = aux
    if string == 'OB':
        ret = aux[~aux.algorithm.str.contains("Adwin|ASHT")]
    elif string == 'OzaBag':
        ret = aux[(aux.algorithm.str.contains(string)) & (~aux.algorithm.str.contains("Adwin|ASHT"))]
    return ret

In [153]:
#standardize algorithm names
def select_columns_and_rename_values(df):
    df.algorithm = df.algorithm.str.replace("Executor", "")
    df['algorithm'] = df["algorithm"].str.replace("OzaBag", "OB")
    df['algorithm'] = df["algorithm"].str.replace("AdaptiveRandomForest", "ARF")
    df['algorithm'] = df["algorithm"].str.replace("SequentialChunk", "SeqMB")
    df['algorithm'] = df["algorithm"].str.replace("OB$", "OBSequential")
    df['algorithm'] = df['algorithm'].str.replace("LeveragingBag", "LBagSequential")
    df['algorithm'] = df['algorithm'].str.replace("Adwin$", "AdwinSequential")
    df['algorithm'] = df['algorithm'].str.replace("CHUNK", "MB")
    df['algorithm'] = df['algorithm'].str.replace("MAXChunk", "MB")
    df['algorithm'] = df['algorithm'].str.replace("StreamingRandomPatches", "SRP")
    df['algorithm'] = df['algorithm'].str.replace("SRP$", "SRPSequential")
    df['algorithm'] = df['algorithm'].str.replace("OBASHT$", "OBASHTSequential")
    df.batch_size.unique()
    return df

In [154]:
#Folder inside results directory that contains all the MOA dump files for these experiments
%cd /Users/reginaldoluisdeluna/Documents/Ufscar/comparison-xue3m-minibatching/
folderMOADumps = "/Users/reginaldoluisdeluna/Documents/Ufscar/comparison-xue3m-minibatching/results/sem-coletor/mini-batching/perf/first"
wantedCSVfilename = "loop-fusion-perf"

/Users/reginaldoluisdeluna/Documents/Ufscar/comparison-xue3m-minibatching


In [171]:
import re
MILLION = 1000000
def parse_perf_file(fname, methodology='MB'):
    spname = fname.split('/')[-1].split('-')

    lineDict = {
        'dataset': spname[1],
        'algorithm': spname[2],
        'ensemble': spname[3],
        'methodology': methodology,
        'cores': 4,
        'batch_size': spname[5],
        'rate': spname[6],
        'cache-misses': None,
        'cache-references': None
    }

    with open (fname) as file:
        cache_misses = "0"
        cache_references = "0"
        for line in file:
            line = re.findall(r'\S+', line)
            try:
                if line[1]:
                    if line[1] == "cache-misses:u":
                        cache_misses = line[0]
                        
                    if line[1] == "cache-references:u":
                        cache_references = line[0]
    
                    response.append(lineDict)
            except IndexError:
                pass

    lineDict['cache-misses'] = float(cache_misses.replace(',', '')) / MILLION
    lineDict['cache-references'] = float(cache_references.replace(',', '')) / MILLION
            
    return lineDict

In [172]:
resultsFolder = f"{folderMOADumps}"
csvFile = f"parsed_csvs/{wantedCSVfilename}"
directory = os.fsencode(resultsFolder)
header_printed = False

response_mb = []
with open(f"{csvFile}", "w+") as output:
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.startswith("perf-"):
            response_mb.append(
                parse_perf_file(f'{os.fsdecode(directory)}/{filename}', 'MB')
            )

df_mini_batching = pd.DataFrame.from_dict(response_mb)
display(df_mini_batching.sort_values(by=['dataset', 'algorithm']).drop_duplicates())

KeyError: 'dataset'

In [157]:
#Folder inside results directory that contains all the MOA dump files for these experiments
%cd /Users/reginaldoluisdeluna/Documents/Ufscar/comparison-xue3m-minibatching/
folderMOADumps = "/Users/reginaldoluisdeluna/Documents/Ufscar/comparison-xue3m-minibatching/results/loop-fusion/loop-fusion-perf/second"
wantedCSVfilename = "loop-fusion-perf"

/Users/reginaldoluisdeluna/Documents/Ufscar/comparison-xue3m-minibatching


In [158]:
resultsFolder = f"{folderMOADumps}"
csvFile = f"parsed_csvs/{wantedCSVfilename}"
directory = os.fsencode(resultsFolder)
header_printed = False

response = []
with open(f"{csvFile}", "w+") as output:
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.startswith("perf-"):
            response.append(
                parse_perf_file(f'{os.fsdecode(directory)}/{filename}', 'MB-LF')
            )

df = pd.DataFrame.from_dict(response)
display(df.sort_values(by=['dataset', 'algorithm']).drop_duplicates())

Unnamed: 0,dataset,algorithm,ensemble,methodology,cores,batch_size,rate,cache-misses,cache-references
464,GMSC,AdaptiveRandomForestExecutorMAXChunk,25,MB-LF,4,50,1,1854.57,72717.01
145,GMSC,LBagExecutorMAXChunk,25,MB-LF,4,50,1,1378.6,57385.89
29,GMSC,OzaBagASHTExecutorMAXChunk,25,MB-LF,4,50,1,360.43,12057.65
290,GMSC,OzaBagAdwinExecutorMAXChunk,25,MB-LF,4,50,1,547.88,23702.32
523,GMSC,OzaBagExecutorMAXChunk,25,MB-LF,4,50,1,400.55,12859.9
435,GMSC,StreamingRandomPatchesExecutorMAXChunk,25,MB-LF,4,50,1,3656.74,141587.31
522,airlines,AdaptiveRandomForestExecutorMAXChunk,25,MB-LF,4,50,1,0.0,0.0
58,airlines,LBagExecutorMAXChunk,25,MB-LF,4,50,1,61329.64,2574339.12
493,airlines,OzaBagASHTExecutorMAXChunk,25,MB-LF,4,50,1,1752.4,150673.11
261,airlines,OzaBagAdwinExecutorMAXChunk,25,MB-LF,4,50,1,17816.01,820965.91


In [163]:
#Folder inside results directory that contains all the MOA dump files for these experiments
%cd /Users/reginaldoluisdeluna/Documents/Ufscar/comparison-xue3m-minibatching/
folderMOADumps = "/Users/reginaldoluisdeluna/Documents/Ufscar/comparison-xue3m-minibatching/results/loop-fusion/sequential-perf/first"
wantedCSVfilename = "sequential-perf"

/Users/reginaldoluisdeluna/Documents/Ufscar/comparison-xue3m-minibatching


In [164]:
resultsFolder = f"{folderMOADumps}"
csvFile = f"parsed_csvs/{wantedCSVfilename}"
directory = os.fsencode(resultsFolder)
header_printed = False

response = []
with open(f"{csvFile}", "w+") as output:
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.startswith("perf-"):
            response.append(
                parse_perf_file(f'{os.fsdecode(directory)}/{filename}', 'Sequential')
            )

df = pd.DataFrame.from_dict(response)
display(df.sort_values(by=['dataset', 'algorithm']).drop_duplicates())

Unnamed: 0,dataset,algorithm,ensemble,methodology,cores,batch_size,rate,cache-misses,cache-references
228,GMSC,AdaptiveRandomForestSequential,25,Sequential,4,1,1,0.0,0.0
39,GMSC,LeveragingBag,25,Sequential,4,1,1,0.0,0.0
41,GMSC,OzaBag,25,Sequential,4,1,1,0.0,0.0
161,GMSC,OzaBagASHT,25,Sequential,4,1,1,0.0,0.0
224,GMSC,OzaBagAdwin,25,Sequential,4,1,1,0.0,0.0
163,GMSC,StreamingRandomPatches,25,Sequential,4,1,1,360.51,11197.19
101,airlines,AdaptiveRandomForestSequential,25,Sequential,4,1,1,44836.27,1989353.67
223,airlines,LeveragingBag,25,Sequential,4,1,1,0.0,0.0
0,airlines,OzaBag,25,Sequential,4,1,1,0.0,0.0
31,airlines,OzaBagASHT,25,Sequential,4,1,1,0.0,0.0
