In [None]:
import pandas as pd
import numpy as np
from scipy.stats import zipf

In [None]:
# Bandwidth factor per storage medium (units are not stated in this notebook).
# 'EBS' has no fixed factor; calc_storage_speed substitutes the network speed
# wherever the computed bandwidth is missing.
MODEL_FACTORS_BANDWIDTH = dict(
    RAM=50,
    NVMe=2,
    SSD=0.5,
    HDD=0.25,
    EBS=None,
)

# Base workload parameters.
# NOTE(review): presumably CPU hours and a total read volume -- confirm units.
CPU_H, TOTAL_READS = 20, 800

# Caching parameters.
# NOTE(review): CACHE_SKEW looks like the Zipf shape for distr_maker and
# FIRST_READ_FROM_S3 like the split_first_read flag -- confirm against callers.
CACHE_SKEW, FIRST_READ_FROM_S3 = 1.2, True

# Load the instance table used by the model functions. The path is relative,
# so it is resolved against the current working directory of the kernel.
instances = pd.read_csv("data/ec2-instances.info.csv")

In [None]:
def calc_storage_speed(inst, network_speed):
    """Compute a storage bandwidth value for every row of ``inst``.

    For each row the factor registered in ``MODEL_FACTORS_BANDWIDTH`` for its
    ``'storage.type'`` is multiplied by ``'id.slice.sto'`` and
    ``'id.slice.factor'``. Rows whose result is NaN (for example because the
    storage type has no fixed factor, such as ``'EBS': None``) receive the
    matching entry of ``network_speed`` instead.

    Parameters
    ----------
    inst : mapping of column name -> array-like (e.g. a pandas DataFrame)
        Must provide ``'storage.type'``, ``'id.slice.sto'`` and
        ``'id.slice.factor'``.
    network_speed : array-like
        Fallback values, indexable with a boolean mask over the rows of
        ``inst``.

    Returns
    -------
    Array-like of the same kind as the product of the input columns
    (a pandas Series when the columns are Series).

    Raises
    ------
    KeyError
        If a storage type (after conversion to ``str``) is not a key of
        ``MODEL_FACTORS_BANDWIDTH``.
    """
    # dtype=float converts the None factor into NaN. Without it the array is
    # object-dtype and np.isnan() below raises TypeError.
    factors = np.array(
        [MODEL_FACTORS_BANDWIDTH[x] for x in inst['storage.type'].astype(str)],
        dtype=float,
    )
    bws = factors * inst['id.slice.sto'] * inst['id.slice.factor']

    # Fall back to the network speed wherever no storage bandwidth is known.
    missing = np.isnan(bws)
    bws[missing] = network_speed[missing]
    return bws


def distr_maker(shape, size):
    """Return a Zipf-shaped list of weights scaled so that they sum to ``size``.

    Parameters
    ----------
    shape : float
        Exponent passed to ``scipy.stats.zipf.pmf``.
    size : number
        Number of ranks (1..size) and the total the weights are scaled to.

    Returns
    -------
    list
        ``[]`` when ``size`` is NaN, ``[size]`` when ``size <= 1``, otherwise
        the Zipf probabilities of ranks 1..size renormalised to sum to
        ``size``.
    """
    if np.isnan(size):
        return []
    if size <= 1:
        return [size]

    ranks = np.arange(1, size + 1)
    weights = zipf.pmf(ranks, shape)
    # Renormalise over the truncated support first, then scale up to `size`.
    scaled = (weights / weights.sum()) * size
    return scaled.tolist()

def model_distr_hsplit(distr, lim):
    """Split ``distr`` horizontally at the level ``lim``.

    Returns a two-element list ``[low, high]`` where ``low`` is ``distr``
    capped element-wise at ``lim`` and ``high`` is the non-negative remainder
    ``distr - low``.
    """
    capped = np.minimum(distr, lim)
    # The remainder is clamped at zero so it can never become negative.
    excess = np.maximum(distr - capped, 0)
    return [capped, excess]


def model_distr_split_fn(distr, split_first_read):
    """Split an access distribution into an "initial" and a "working" part.

    When ``split_first_read`` is truthy, ``distr`` is split at level 1 with
    ``model_distr_hsplit``: the capped part becomes ``"initial"`` and the
    remainder ``"working"``. Otherwise ``"initial"`` is all zeros (one per
    element of ``distr``) and ``"working"`` is ``distr`` itself.

    Returns
    -------
    dict
        With the keys ``"initial"`` and ``"working"``.
    """
    if not split_first_read:
        return {"initial": np.zeros(len(distr)), "working": distr}

    first_access, repeated_access = model_distr_hsplit(distr, 1)
    return {"initial": first_access, "working": repeated_access}

def calc_inst_speeds(inst):
    """Add the derived ``calc_*`` columns to ``inst`` in place.

    Columns written, in this order:

    - ``calc_net_speed``: ``network_performance.value.Gib`` divided by 8
    - ``calc_mem_speed``: the ``'RAM'`` factor of ``MODEL_FACTORS_BANDWIDTH``
    - ``calc_cpu_real``: half of ``vcpu.value.count``
    - ``calc_mem_caching``: half of ``memory.value.gib``
    - ``calc_sto_caching``: half of ``storage.sum.gib``
    - ``calc_sto_speed``: the ``'SSD'`` factor of ``MODEL_FACTORS_BANDWIDTH``
    - ``calc_s3_speed``: 80% of ``calc_net_speed``

    Returns ``None``; ``inst`` is modified.
    """
    # TODO: change column names when working on clean data
    # NOTE(review): the division by 8 looks like a bits -> bytes conversion -- confirm.
    inst['calc_net_speed'] = inst['network_performance.value.Gib'] / 8
    inst['calc_mem_speed'] = MODEL_FACTORS_BANDWIDTH['RAM']

    # These three quantities are each taken at half of the raw value.
    halved_columns = (
        ('calc_cpu_real', 'vcpu.value.count'),
        ('calc_mem_caching', 'memory.value.gib'),
        ('calc_sto_caching', 'storage.sum.gib'),
    )
    for target, source in halved_columns:
        inst[target] = inst[source] / 2

    inst['calc_sto_speed'] = MODEL_FACTORS_BANDWIDTH['SSD']
    inst['calc_s3_speed'] = inst['calc_net_speed'] * 0.8


def calc_time_for_config(inst, query, distr_cache):
    bins_cache = {
        'data_mem': {'size': round(inst['calc.mem.caching']), 'prio': inst['calc.mem.speed']},
        'data_sto': {'size': round(inst['calc.sto.caching']), 'prio': inst['calc.sto.speed']},
        'data_s3': {'size': [len(distr_cache['working'])] * len(inst), 'prio': inst['calc.net.speed']}
    }
