In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as pltticker
import numpy as np
import pandas as pd
import os

%matplotlib inline

# Typeset all figures in a serif face, with one uniform small size for the
# body text, the axes titles, the legend entries and the legend titles.
_uniform_size_keys = ('font.size', 'axes.titlesize', 'legend.fontsize', 'legend.title_fontsize')
plt.rcParams.update({'font.family': 'serif', **dict.fromkeys(_uniform_size_keys, 6)})

In [None]:
# Total memory range covered by the clusters: dividing it by a number of
# clusters yields the size of one cluster (printed as MB in the figures below).
MEMORY_RANGE = 2048

# Every figure is saved below ./figures; create the directory if it is missing.
os.makedirs(name='figures', exist_ok=True)

# Figures about ML evaluation (fig.4 & fig.5)

## Comparison of ML algorithms and cluster sizes

In [None]:
# Load the model-accuracy results of the audio, image and video functions and
# merge them into one frame indexed by (cluster_size, algorithm).
data = pd.read_csv('results/memory/models_accuracy/audio.csv', sep=';')

# In the raw image file, function IDs start at 1 again: shift them past the
# largest audio ID to avoid conflicts with audio functions.
function_id_shift = data['function_id'].max()

data_image = pd.read_csv('results/memory/models_accuracy/image.csv', sep=';')
data_image['function_id'] += function_id_shift
# DataFrame.append() was removed in pandas 2.0; pd.concat() is the supported equivalent.
data = pd.concat([data, data_image], ignore_index=True)

# In the raw video file, function IDs start at *2*: shift them past the audio
# and image IDs, minus 1 so that the first video ID directly follows the last image ID.
function_id_shift = data['function_id'].max() - 1

data_video = pd.read_csv('results/memory/models_accuracy/video.csv', sep=';')
data_video['function_id'] += function_id_shift
# Only the video file gets its prediction rates multiplied by 100.
# NOTE(review): presumably it stores fractions while the other files store percentages -- confirm.
data_video['pred_exact'] *= 100
data_video['pred_above'] *= 100

data = pd.concat([data, data_video], ignore_index=True)

# Express the configuration as a cluster size instead of a number of clusters
data['cluster_size'] = MEMORY_RANGE / data['clusters']
data = data.drop(columns='clusters')

# Index by (cluster_size, algorithm): largest clusters first, algorithms in ascending order
data = (
    data
    .set_index(['cluster_size', 'algorithm'])
    .sort_index(level=[0, 1], ascending=[False, True])
)

In [None]:
# Average every metric over all functions for each (cluster_size, algorithm) pair,
# keeping the same ordering as the input: largest clusters first, algorithms ascending.
data_result_ml = (
    data
    .groupby(level=[0, 1])
    .mean()
    .sort_index(level=[0, 1], ascending=[False, True])
    # the mean of the function IDs is meaningless, drop it from the summary
    .drop(columns='function_id')
)

print(data_result_ml)

## Maturation quickness

In [None]:
# Load the maturation results of the audio, image and video functions into one frame.
# Only these columns are read; the literal string 'None' in the raw files becomes NaN.
maturation_columns = ['function_id', 'runs', 'pred_above', 'pred_1under_error']

data = pd.read_csv('results/memory/maturation/audio.csv', sep=';', usecols=maturation_columns, na_values='None')

# In the raw image file, function IDs start at 1 again: shift them past the
# largest audio ID to avoid conflicts with audio functions.
function_id_shift = data['function_id'].max()

data_image = pd.read_csv('results/memory/maturation/image.csv', sep=';', usecols=maturation_columns, na_values='None')
data_image['function_id'] += function_id_shift

# DataFrame.append() was removed in pandas 2.0; pd.concat() is the supported equivalent.
data = pd.concat([data, data_image], ignore_index=True)

# In the raw video file, function IDs start at *2*: shift them past the audio
# and image IDs, minus 1 so that the first video ID directly follows the last image ID.
function_id_shift = data['function_id'].max() - 1

data_video = pd.read_csv('results/memory/maturation/video.csv', sep=';', usecols=maturation_columns, na_values='None')
data_video['function_id'] += function_id_shift

data = pd.concat([data, data_video], ignore_index=True)

In [None]:
# Maturity criteria: a row qualifies when it has at least MATURITY_MINIMUM_RUNS runs,
# a 'pred_above' of at least MATURITY_ABOVEPREDICTIONS, and a 'pred_1under_error'
# that is either missing or at least MATURITY_MINUS1_UNDERPREDICTIONS.
# NOTE(review): both prediction thresholds look like percentages -- confirm against the raw files.
MATURITY_MINIMUM_RUNS = 100
MATURITY_ABOVEPREDICTIONS = 90
MATURITY_MINUS1_UNDERPREDICTIONS = 50

is_mature = (
    (data['runs'] >= MATURITY_MINIMUM_RUNS)
    & (data['pred_above'] >= MATURITY_ABOVEPREDICTIONS)
    & ((data['pred_1under_error'] >= MATURITY_MINUS1_UNDERPREDICTIONS) | data['pred_1under_error'].isna())
)
mature_rows = data[is_mature]

# For every function keep the qualifying row with the fewest runs, i.e. the point
# at which the function first satisfies all the criteria.
first_mature_row = mature_rows.groupby('function_id')['runs'].idxmin()
maturation_quickness = mature_rows.loc[first_mature_row].set_index('function_id').sort_index()

print(maturation_quickness)

runs = maturation_quickness['runs']
print(f'Maturation quickness: average = {runs.mean()}, minimum = {runs.min()}, maximum = {runs.max()}')
print(f'10% = {runs.quantile(0.1)}, 50% = {runs.quantile(0.5)}, 75% = {runs.quantile(0.75)}, 90% = {runs.quantile(0.9)}, 95% = {runs.quantile(0.95)}')

# Use the threshold constant rather than a literal 100, so that this summary
# stays correct when MATURITY_MINIMUM_RUNS is tuned.
matured_at_minimum = int((runs == MATURITY_MINIMUM_RUNS).sum())
print(f'Maturation quickness: {matured_at_minimum} functions matured in {MATURITY_MINIMUM_RUNS} runs or less (which is a fixed minimum).')

## Prediction time

In [None]:
def _append_prediction_times(path, function_id_shift, data_dict):
    """Parse one raw prediction-speed file and append its measurements to data_dict.

    The file is read two lines at a time: a header line of ';'-separated fields
    whose second and third fields are the function ID and the number of
    clusters (the first field is ignored), followed by a line of ', '-separated
    times that may end with a trailing comma. A last unpaired line, if any, is
    ignored.

    path: file to read.
    function_id_shift: offset added to every function ID found in the file.
    data_dict: dict of lists ('function_id', 'cluster_size', 'time'), extended
        in place with one entry per measured time.
    """
    with open(path, 'r') as infile:
        # zip() over the same iterator twice yields consecutive (header, data) line pairs
        for header, dataline in zip(infile, infile):
            function_id, nb_clusters = (int(e) for e in header.rstrip().split(';')[1:])
            cluster_size = MEMORY_RANGE / nb_clusters
            # remove trailing newline and trailing comma
            times = [float(e) for e in dataline.rstrip().rstrip(',').split(', ')]

            data_dict['function_id'].extend([function_id + function_id_shift] * len(times))
            data_dict['cluster_size'].extend([cluster_size] * len(times))
            data_dict['time'].extend(times)


data_dict = {
    'function_id': [],
    'cluster_size': [],
    'time': [],
}

# Audio function IDs are kept as they are
_append_prediction_times('results/memory/prediction_speed/audio.txt', 0, data_dict)

# In the raw image file, function IDs start at 1 again: shift them past the
# largest audio ID to avoid conflicts with audio functions.
function_id_shift = max(data_dict['function_id'])
_append_prediction_times('results/memory/prediction_speed/image.txt', function_id_shift, data_dict)

# In the raw video file, function IDs start at 2: shift them past the audio
# and image IDs, minus 1 so that the first video ID directly follows the last image ID.
function_id_shift = max(data_dict['function_id']) - 1
_append_prediction_times('results/memory/prediction_speed/video.txt', function_id_shift, data_dict)

# One row per measured prediction time
data = pd.DataFrame(data_dict)

In [None]:
# Empirical CDF of the prediction times, one curve per cluster size,
# saved to figures/prediction_speed.pdf.
fig, axes = plt.subplots(figsize=((7 - 0.33) / 2 / 2 - 0.05, 1))

axes.set_title('Distribution of times (%)')
axes.set_xlabel('Prediction times (µs)', labelpad=0)

# Fraction of the largest times dropped from every curve (very outlier values),
# for visualization only.
OUTLIER_FRACTION = 0.01

for cluster_size, cluster_data in data.groupby('cluster_size'):
    times = cluster_data['time'].sort_values()
    nb_outliers = int(OUTLIER_FRACTION * len(times))
    # Slice up to an explicit end position: the former `[:-nb_outliers]` silently
    # produced an EMPTY series whenever nb_outliers was 0 (fewer than 100 samples).
    data_sorted = times.iloc[:len(times) - nb_outliers]
    # x = sorted times (the index), y = cumulative fraction of samples
    cdf = pd.Series(np.linspace(0., 1., len(data_sorted)), index=data_sorted)
    axes.plot(cdf, label=f'{cluster_size:.0f}MB')

    # Note that these statistics are computed on the trimmed data
    print(f'{cluster_size:.0f}MB clusters: median = {data_sorted.median()}, 99% = {data_sorted.quantile(0.99)}')

axes.grid(True, axis='both')

# Y axis: cumulative fractions displayed as integer percentages
axes.set_yticks([0.1, 0.25, 0.5, 0.75, 0.9, 1.0])
axes.yaxis.set_major_formatter(pltticker.FuncFormatter(lambda y, pos: int(y * 100)))

# X axis: one tick every 10000 raw units, labels divided by 1000.
# NOTE(review): presumably the raw times are in ns and shown in µs -- confirm.
axes.xaxis.set_major_locator(pltticker.MultipleLocator(10000))
axes.xaxis.set_major_formatter(pltticker.FuncFormatter(lambda x, pos: x / 1000))

# Nudge the y tick labels towards the axis
plt.setp(axes.get_yticklabels(), position=(0.04,0))

axes.legend(title='Interval size')

fig.savefig('figures/prediction_speed.pdf', bbox_inches='tight', pad_inches=0)

## Prediction errors

In [None]:
# Load the individual predictions of the audio, image and video functions and
# reduce them to one signed error per J48 prediction.
prediction_columns = ['algorithm', 'function_id', 'clusters', 'truth', 'prediction']

data = pd.read_csv('results/memory/predictions/audio.csv', sep=';', usecols=prediction_columns)

# In the raw image file, function IDs start at 1 again: shift them past the
# largest audio ID to avoid conflicts with audio functions.
function_id_shift = data['function_id'].max()

data_image = pd.read_csv('results/memory/predictions/image.csv', sep=';', usecols=prediction_columns)
data_image['function_id'] += function_id_shift
# DataFrame.append() was removed in pandas 2.0; pd.concat() is the supported equivalent.
data = pd.concat([data, data_image], ignore_index=True)

# In the raw video file, function IDs start at *2*: shift them past the audio
# and image IDs, minus 1 so that the first video ID directly follows the last image ID.
function_id_shift = data['function_id'].max() - 1

data_video = pd.read_csv('results/memory/predictions/video.csv', sep=';', usecols=prediction_columns)
data_video['function_id'] += function_id_shift
data = pd.concat([data, data_video], ignore_index=True)

# Keep only the J48 predictions. The explicit copy() makes the filtered frame
# independent of the full one, so the column assignments below do not trigger
# pandas' SettingWithCopyWarning.
data = data.loc[data['algorithm'] == 'J48'].copy()

data['cluster_size'] = MEMORY_RANGE / data['clusters']
# Signed error: positive when the prediction is above the truth.
# NOTE(review): truth/prediction look like cluster indices (the plot scales errors by the cluster size) -- confirm.
data['error'] = data['prediction'] - data['truth']
data = data.drop(columns=['algorithm', 'clusters', 'truth', 'prediction'])

In [None]:
# Histogram (log-scaled counts) of the prediction errors for one cluster size,
# saved to figures/prediction_errors.pdf, plus two over-prediction statistics.
fig, axes = plt.subplots(figsize=((7 - 0.33) / 2 / 2 - 0.05, 1))

axes.set_title('Distribution of errors')
axes.set_xlabel('Difference to truth (MB)', labelpad=0)
axes.set_ylabel('Predictions (log)', labelpad=0)

# Cluster size studied here; also the factor converting an error expressed in
# clusters into MB for the tick labels and the average waste.
ERRORS_CLUSTER_SIZE_MB = 16

data_16MB = data[data['cluster_size'] == float(ERRORS_CLUSTER_SIZE_MB)]
errors = data_16MB['error']

axes.set_xlim(left=-15, right=15)

# One unit-wide bin centred on every integer error value. Passing only a bin
# COUNT of (max - min + 1) over [min, max] makes each bin slightly narrower
# than 1, so the bars drift away from the integer values they represent.
bin_edges = np.arange(errors.min() - 0.5, errors.max() + 1.5)
axes.hist(errors, bins=bin_edges)

axes.set_yscale('log')
axes.grid(True, axis='y')

# X axis: one tick every 8 clusters, labelled in MB
axes.xaxis.set_major_locator(pltticker.MultipleLocator(8))
axes.xaxis.set_major_formatter(pltticker.FuncFormatter(lambda x, pos: int(x * ERRORS_CLUSTER_SIZE_MB)))
# I don't know how to use numticks, but this produces the result I want
axes.yaxis.set_major_locator(pltticker.LogLocator(numticks=5))

# Plain integers below 100, the axis' own log formatter from 100 upwards
log_formatter = axes.yaxis.get_major_formatter()
def formatter(y, _):
    return str(int(y)) if y < 100 else log_formatter(y)
axes.yaxis.set_major_formatter(pltticker.FuncFormatter(formatter))

# Nudge the y tick labels towards the axis
plt.setp(axes.get_yticklabels(), position=(0.04,0))

overpredictions = errors[errors > 0]
# Without any over-prediction the share is undefined: report NaN instead of
# raising ZeroDivisionError.
if len(overpredictions) > 0:
    over3 = (overpredictions <= 3).sum() / len(overpredictions)
else:
    over3 = float('nan')
# Mean over-prediction converted from clusters to MB (NaN when there is none)
avrg_overwaste = overpredictions.mean() * ERRORS_CLUSTER_SIZE_MB

print(f'Overpredictions within 3 clusters of truth: {over3:.2%}')
print(f'Average waste due to overpredictions: {avrg_overwaste}')

fig.savefig('figures/prediction_errors.pdf', bbox_inches='tight', pad_inches=0)

## Prediction errors: ETL times

In [None]:
# Load the caching predictions of the audio, image and video functions and
# group them by algorithm.
data = pd.read_csv('results/caching/predictions/audio.csv', sep=';')

# In the raw image file, function IDs start at 1 again: shift them past the
# largest audio ID to avoid conflicts with audio functions.
function_id_shift = data['function_id'].max()

data_image = pd.read_csv('results/caching/predictions/image.csv', sep=';')
data_image['function_id'] += function_id_shift
# DataFrame.append() was removed in pandas 2.0; pd.concat() is the supported equivalent.
data = pd.concat([data, data_image], ignore_index=True)

# In the raw video file, function IDs start at 2: shift them past the audio
# and image IDs, minus 1 so that the first video ID directly follows the last image ID.
function_id_shift = data['function_id'].max() - 1

data_video = pd.read_csv('results/caching/predictions/video.csv', sep=';')
data_video['function_id'] += function_id_shift
data = pd.concat([data, data_video], ignore_index=True)

# From here on `data` is a GroupBy object: iterating it yields (algorithm, rows) pairs
data = data.groupby('algorithm')

In [None]:
# Confusion matrix, precision, recall and F-measure of every algorithm, where
# a truth/prediction value of 1 is the positive class and 0 the negative one.
mlfunc_results = {}
for mlfunc, groupdata in data:
    truth_pos = groupdata['truth'] == 1
    truth_neg = groupdata['truth'] == 0
    predicted_pos = groupdata['prediction'] == 1
    predicted_neg = groupdata['prediction'] == 0

    true_pos = groupdata[truth_pos & predicted_pos]
    false_pos = groupdata[truth_neg & predicted_pos]
    false_neg = groupdata[truth_pos & predicted_neg]
    true_neg = groupdata[truth_neg & predicted_neg]

    tp = len(true_pos)
    fp = len(false_pos)
    fn = len(false_neg)
    tn = len(true_neg)

    print(mlfunc)
    print(f'TP: {tp}; FP: {fp}; FN: {fn}; TN: {tn}')

    # A ratio with an empty denominator is undefined: report NaN instead of
    # aborting the whole loop with ZeroDivisionError.
    precision = tp / (tp + fp) if (tp + fp) > 0 else float('nan')
    recall = tp / (tp + fn) if (tp + fn) > 0 else float('nan')
    # Harmonic mean of precision and recall; NaN when either one is undefined
    # or when both are 0 (the comparison below is False in all these cases).
    if precision + recall > 0:
        f_measure = 2 * precision * recall / (precision + recall)
    else:
        f_measure = float('nan')

    print(f'Precision: {precision:.1%}; recall: {recall:.1%}; F-measure: {f_measure:.1%}')