In [252]:
import pandas as pd
import numpy as np
from scipy.stats import zipf

In [253]:
MODEL_FACTORS_BANDWIDTH = {
    'RAM': 50,
    'NVMe': 2,
    'SSD': 0.5,
    'HDD': 0.25,
    'EBS': None,
}

# base workload
CPU_H = 20
TOTAL_READS = 800

# caching
CACHE_SKEW = 1.2
FIRST_READ_FROM_S3 = True

instances = pd.read_csv("../data/ec2-instances.info.csv")

In [256]:
def calc_storage_speed(inst, network_speed):
    bws = np.array([MODEL_FACTORS_BANDWIDTH[x] for x in inst['storage.type'].astype(str)]) * inst['id.slice.sto'] * inst['id.slice.factor']
    bws[np.isnan(bws)] = network_speed[np.isnan(bws)]
    return bws


def distr_maker(shape, size):
    if np.isnan(size):
        return []
    if size <= 1:
        return [size]

    distr = zipf.pmf(np.arange(1, size+1), shape)
    normd = distr / np.sum(distr) * size
    return normd.tolist()

def model_distr_hsplit(distr, lim):
    dist_low = np.minimum(distr, lim)
    dist_high = np.maximum(distr - dist_low, 0)
    return {'low': dist_low, 'high': dist_high}


def model_distr_split_fn(distr, split_first_read):
    if split_first_read:
        split_dist = model_distr_hsplit(distr, 1)
    else:
        split_dist = [np.zeros(len(distr)), distr]
    return {"initial": split_dist['low'], "working": split_dist['high']}

def calc_inst_speeds(inst):
    # TODO: change column names when working on clean data
    inst['calc_net_speed'] = inst['network_performance.value.Gib'] / 8
    inst['calc_mem_speed']= MODEL_FACTORS_BANDWIDTH['RAM']
    inst['calc_cpu_real']= inst['vcpu.value.count'] / 2
    inst['calc_mem_caching']= inst['memory.value.gib'] / 2
    inst['calc_sto_caching']= inst['storage.sum.gib'] / 2
    inst['calc_sto_speed']= MODEL_FACTORS_BANDWIDTH['SSD']
    inst['calc_s3_speed'] = inst['calc_net_speed'] * 0.8


def calc_groups(sizes, distr_len):
    if len(sizes) == 1:
        return [sizes.index[0]] * min(sizes[0], distr_len)
    elif sizes[0] > distr_len:
        return []
    else:
        return [sizes.index[1]] * (min(sizes[1], distr_len) - sizes[0])


def distr_pack_helper(bins, distr):
    distr_len = len(distr)
    bins = bins.sort_values(by='prio', ascending=False)
    bins['acc_size'] = bins['size'].cumsum().astype('int32')
    size_windows = bins['acc_size'].rolling(window=2)
    res = []
    for size_window in size_windows:
        res.extend(calc_groups(size_window, distr_len))

    result = pd.DataFrame(data={
        'distr_val': distr,
        'group': res
    }).groupby('group').sum().transpose()

    return result.reset_index()


def model_distr_pack(bins, distr):
    n = len(bins['data_mem']['prio'])
    res = pd.DataFrame()
    for i in range(n):
        next_ = distr_pack_helper(
            bins=pd.DataFrame(
                data={
                    'prio': [bins['data_mem']['prio'][i], bins['data_sto']['prio'][i], bins['data_s3']['prio'][i]],
                    'size': [bins['data_mem']['size'][i], bins['data_sto']['size'][i], bins['data_s3']['size'][i]],
                },
                index=['data_mem', 'data_sto', 'data_s3']
            ),
            distr=distr
        )
        res = pd.concat([res, next_], ignore_index=True).fillna(0)


    return res #.drop('group', axis=1)




def calc_time_for_config(inst, distr_cache):
    data_mem = pd.DataFrame(data = {'size': instances['calc_mem_caching'].round(decimals=0), 'prio': instances['calc_mem_speed']})
    data_sto = pd.DataFrame(data =  {'size': instances['calc_sto_caching'].round(decimals=0), 'prio': instances['calc_sto_speed'].tolist()})
    data_s3 = pd.DataFrame(data={'size': [len(distr_cache['working'])] * len(instances), 'prio': instances['calc_net_speed'].tolist()})

    bins_cache = {
        'data_mem': data_mem,
        'data_sto': data_sto,
        'data_s3': data_s3
    }

    mem_read_distribution = model_distr_pack(bins_cache, distr_cache['working'])
    print(mem_read_distribution.dtypes)
    print(len(mem_read_distribution))
    cpu_time = CPU_H / instances['vcpu.value.count']
    inst['data_mem'] = mem_read_distribution['data_mem']
    inst['data_sto'] = mem_read_distribution['data_sto']
    inst['data_s3'] = mem_read_distribution['data_s3']

    scan_time = mem_read_distribution['data_mem'] / inst['calc_mem_speed'] +  mem_read_distribution['data_sto'] / inst['calc_sto_speed'] + mem_read_distribution['data_s3'] / inst['calc_net_speed']
    inst['execution_time_m2'] = cpu_time + scan_time



In [258]:
distr_caching_precomputed = distr_maker(shape=CACHE_SKEW, size=TOTAL_READS)
distr_cache = model_distr_split_fn(distr_caching_precomputed, FIRST_READ_FROM_S3)
calc_inst_speeds(instances)
calc_time_for_config(instances, distr_cache)

instances

group
index        object
data_mem    float64
data_s3     float64
data_sto    float64
dtype: object
257


Unnamed: 0,longname,id,memory.text,memory.value.gib,vcpu.text,vcpu.value.count,processorName,clockSpeed.text,clockSpeed.value.ghz,storage.text,...,calc_mem_speed,calc_cpu_real,calc_mem_caching,calc_sto_caching,calc_sto_speed,calc_s3_speed,data_mem,data_sto,data_s3,execution_time_m2
0,Z1D Extra Large,z1d.xlarge,32.0 GiB,32.0,4 vCPUs,4,Intel Xeon Platinum 8151,4 GHz,4.0,150 GiB NVMe SSD,...,50,2.0,16.0,75.0,0.5,1.0,495.875061,0.0,80.994223,79.712880
1,Z1D Metal,z1d.metal,384.0 GiB,384.0,48 vCPUs,48,Intel Xeon Platinum 8151,4 GHz,4.0,1800 GiB (2 * 900 GiB NVMe SSD),...,50,24.0,192.0,900.0,0.5,2.5,576.869284,0.0,0.000000,11.954052
2,Z1D Large,z1d.large,16.0 GiB,16.0,2 vCPUs,2,Intel Xeon Platinum 8151,4 GHz,4.0,75 GiB NVMe SSD,...,50,1.0,8.0,37.5,0.5,1.0,428.232839,0.0,148.636445,137.473813
3,Z1D 6xlarge,z1d.6xlarge,192.0 GiB,192.0,24 vCPUs,24,Intel Xeon Platinum 8151,4 GHz,4.0,900 GiB NVMe SSD,...,50,12.0,96.0,450.0,0.5,1.0,576.869284,0.0,0.000000,12.370719
4,Z1D 3xlarge,z1d.3xlarge,96.0 GiB,96.0,12 vCPUs,12,Intel Xeon Platinum 8151,4 GHz,4.0,450 GiB NVMe SSD,...,50,6.0,48.0,225.0,0.5,1.0,567.376163,0.0,9.493122,20.608687
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252,A1 Metal,a1.metal,32.0 GiB,32.0,16 vCPUs,16,AWS Graviton Processor,2.3 GHz,2.3,EBS only,...,50,8.0,16.0,0.0,0.5,1.0,495.875061,0.0,80.994223,75.962880
253,A1 Medium,a1.medium,2.0 GiB,2.0,1 vCPUs,1,AWS Graviton Processor,2.3 GHz,2.3,EBS only,...,50,0.5,1.0,0.0,0.5,1.0,185.982177,0.0,390.887107,336.429329
254,A1 Large,a1.large,4.0 GiB,4.0,2 vCPUs,2,AWS Graviton Processor,2.3 GHz,2.3,EBS only,...,50,1.0,2.0,0.0,0.5,1.0,266.370896,0.0,310.498388,263.726128
255,A1 Quadruple Extra Large,a1.4xlarge,32.0 GiB,32.0,16 vCPUs,16,AWS Graviton Processor,2.3 GHz,2.3,EBS only,...,50,8.0,16.0,0.0,0.5,1.0,495.875061,0.0,80.994223,75.962880
