In [1]:
import pandas as pd
import numpy as np
from scipy.stats import zipf

In [2]:
# Bandwidth factor per storage medium. 'EBS' deliberately has no fixed factor (None).
# NOTE(review): the unit of these factors is not stated in this notebook - confirm.
MODEL_FACTORS_BANDWIDTH = dict(
    RAM=50,
    NVMe=2,
    SSD=0.5,
    HDD=0.25,
    EBS=None,
)

# --- base workload ---
CPU_H = 20         # divided by each instance's vcpu_count in calc_time_for_config
TOTAL_READS = 800  # passed as `size` to distr_maker

# --- caching ---
CACHE_SKEW = 1.2            # passed as the Zipf `shape` to distr_maker
FIRST_READ_FROM_S3 = True   # passed as `split_first_read` to model_distr_split_fn

# Instance catalogue; the path is relative, so it resolves against the kernel's working directory.
# NOTE(review): the displayed frame carries 'Unnamed: 0' / 'Unnamed: 0.1' columns, which suggests
# the CSV was written with its index (twice) - consider re-exporting without it; confirm.
instances = pd.read_csv("../../input/instances.csv")

In [5]:
def distr_maker(shape, size):
    """Build a Zipf-shaped access distribution over ``size`` items.

    Parameters
    ----------
    shape : float
        Zipf shape parameter handed to ``scipy.stats.zipf.pmf``.
    size : number
        Number of items; also the total the returned values are scaled to.

    Returns
    -------
    list
        ``[]`` when ``size`` is NaN, ``[size]`` when ``size <= 1``, otherwise
        ``size`` floats following the Zipf pmf over ranks ``1..size`` and
        rescaled so that they sum to ``size``.
    """
    if np.isnan(size):
        return []
    if size <= 1:
        return [size]

    ranks = np.arange(1, size + 1)
    weights = zipf.pmf(ranks, shape)
    # Renormalise the truncated pmf, then scale it so the total equals `size`.
    scaled = weights / weights.sum() * size
    return scaled.tolist()

def model_distr_hsplit(distr, lim):
    """Split a distribution horizontally at the level ``lim``.

    Returns a dict with two element-wise parts:
    ``'low'`` is each value capped at ``lim`` and ``'high'`` is whatever
    exceeds that cap (never negative). ``low + high`` reproduces ``distr``
    for non-negative inputs.
    """
    capped = np.minimum(distr, lim)
    excess = np.maximum(distr - capped, 0)
    return dict(low=capped, high=excess)


def model_distr_split_fn(distr, split_first_read):
    """Split an access distribution into an 'initial' and a 'working' part.

    Parameters
    ----------
    distr : sequence of numbers
        Per-item access values (e.g. the output of ``distr_maker``).
    split_first_read : bool
        If true, up to one access per item is carved off into the initial
        part via ``model_distr_hsplit(distr, 1)`` and the remainder becomes
        the working part. If false, the initial part is all zeros and the
        working part is ``distr`` itself.

    Returns
    -------
    dict
        ``{"initial": array-like, "working": array-like}``, each with one
        entry per item of ``distr``.
    """
    if split_first_read:
        split_dist = model_distr_hsplit(distr, 1)
    else:
        # This branch used to build a list and then index it with 'low'/'high',
        # which raised TypeError; use the same dict layout as model_distr_hsplit.
        split_dist = {'low': np.zeros(len(distr)), 'high': np.asarray(distr)}
    return {"initial": split_dist['low'], "working": split_dist['high']}

def calc_groups(sizes, distr_len):
    """Return the group labels contributed by one window of cumulative sizes.

    Parameters
    ----------
    sizes : pandas.Series
        A window of one or two cumulative bin sizes, indexed by bin label
        (as yielded by iterating ``Series.rolling(window=2)``).
    distr_len : int
        Number of positions available in the distribution being packed.

    Returns
    -------
    list
        For a one-element window: the first bin's label repeated
        ``min(size, distr_len)`` times. For a two-element window: the second
        bin's label repeated once per position between the previous
        cumulative size and ``min(current cumulative size, distr_len)``;
        empty when the previous bins already cover the whole distribution.
    """
    if len(sizes) == 0:
        return []

    # Use .iloc for positional access: the Series is indexed by bin label, and
    # integer keys on a non-integer index are deprecated (an error in pandas 3).
    first_end = sizes.iloc[0]
    if len(sizes) == 1:
        return [sizes.index[0]] * min(first_end, distr_len)
    if first_end > distr_len:
        return []
    return [sizes.index[1]] * (min(sizes.iloc[1], distr_len) - first_end)


def distr_pack_helper(bins, distr):
    """Assign the positions of ``distr`` to bins by priority and total each bin.

    ``bins`` is a DataFrame indexed by bin label with 'prio' and 'size'
    columns. Bins are ordered by descending 'prio'; their cumulative sizes
    (cast to int32) determine, through ``calc_groups``, which bin label each
    position of ``distr`` receives. The result is a one-row DataFrame with an
    'index' column (holding 'distr_val') and one column per bin that received
    at least one position, containing the sum of its ``distr`` values.

    NOTE(review): the produced labels must number exactly ``len(distr)``,
    otherwise building the intermediate DataFrame fails - i.e. the bins'
    total size is expected to cover the distribution; confirm with callers.
    """
    n_items = len(distr)

    ordered = bins.sort_values(by='prio', ascending=False)
    ordered['acc_size'] = ordered['size'].cumsum().astype('int32')

    # Iterating a rolling(window=2) yields a 1-element window first, then pairs
    # (previous cumulative size, current cumulative size).
    labels = [
        label
        for window in ordered['acc_size'].rolling(window=2)
        for label in calc_groups(window, n_items)
    ]

    per_item = pd.DataFrame(data={'distr_val': distr, 'group': labels})
    totals = per_item.groupby('group').sum().transpose()
    return totals.reset_index()


def model_distr_pack(bins, distr):
    """Pack ``distr`` into the memory / storage / S3 tiers, once per configuration.

    Parameters
    ----------
    bins : dict
        Maps 'data_mem', 'data_sto' and 'data_s3' to DataFrames with 'prio'
        and 'size' columns, holding one row per configuration (all three must
        have the same number of rows).
    distr : sequence of numbers
        The distribution to pack; passed unchanged to ``distr_pack_helper``.

    Returns
    -------
    pandas.DataFrame
        One row per configuration, in input order, with a fresh 0..n-1 index.
        Contains the 'index' column produced by ``distr_pack_helper`` plus the
        columns 'data_mem', 'data_sto' and 'data_s3' with the summed ``distr``
        values placed in each tier (0 where a tier received nothing).
    """
    tiers = ['data_mem', 'data_sto', 'data_s3']
    n = len(bins['data_mem']['prio'])

    # Rows are addressed by position (.iloc) so the result does not depend on
    # the index labels of the input frames.
    packed_rows = [
        distr_pack_helper(
            bins=pd.DataFrame(
                data={
                    'prio': [bins[tier]['prio'].iloc[i] for tier in tiers],
                    'size': [bins[tier]['size'].iloc[i] for tier in tiers],
                },
                index=tiers,
            ),
            distr=distr,
        )
        for i in range(n)
    ]

    # Concatenate once instead of growing a frame inside the loop (quadratic).
    if packed_rows:
        res = pd.concat(packed_rows, ignore_index=True).fillna(0)
    else:
        res = pd.DataFrame()

    # A tier that never received anything has no column yet; callers index all
    # three by name, so make sure each one exists.
    for tier in tiers:
        if tier not in res.columns:
            res[tier] = 0.0

    return res




def calc_time_for_config(inst, distr_cache):
    """Estimate the execution time of every instance and store it on ``inst``.

    Parameters
    ----------
    inst : pandas.DataFrame
        One row per instance. Reads the columns 'calc_mem_caching',
        'calc_sto_caching', 'calc_mem_speed', 'calc_sto_speed',
        'calc_net_speed' and 'vcpu_count'.
    distr_cache : dict
        Must provide ``distr_cache['working']``, the distribution that is
        packed into the memory / storage / S3 tiers.

    Returns
    -------
    None
        ``inst`` is modified in place: the columns 'data_mem', 'data_sto' and
        'data_s3' receive the amount placed in each tier, and
        'execution_time_m2' receives ``CPU_H / vcpu_count`` plus the sum of
        each tier's amount divided by that tier's speed.
    """
    n_items = len(distr_cache['working'])

    # Build the tier frames positionally (plain arrays -> default RangeIndex),
    # so packing works regardless of the index labels `inst` carries.
    bins_cache = {
        'data_mem': pd.DataFrame(data={
            'size': inst['calc_mem_caching'].round(decimals=0).to_numpy(),
            'prio': inst['calc_mem_speed'].to_numpy(),
        }),
        'data_sto': pd.DataFrame(data={
            'size': inst['calc_sto_caching'].round(decimals=0).to_numpy(),
            'prio': inst['calc_sto_speed'].to_numpy(),
        }),
        # The S3 tier is sized to hold the whole distribution. Its length must
        # follow the `inst` argument, not the notebook-global `instances`.
        'data_s3': pd.DataFrame(data={
            'size': [n_items] * len(inst),
            'prio': inst['calc_net_speed'].to_numpy(),
        }),
    }

    mem_read_distribution = model_distr_pack(bins_cache, distr_cache['working'])

    # Assign by position: the packed frame has a 0..n-1 index, and index
    # alignment would produce NaN for any `inst` with different labels.
    for tier in ('data_mem', 'data_sto', 'data_s3'):
        inst[tier] = mem_read_distribution[tier].to_numpy()

    cpu_time = CPU_H / inst['vcpu_count']
    scan_time = (
        inst['data_mem'] / inst['calc_mem_speed']
        + inst['data_sto'] / inst['calc_sto_speed']
        + inst['data_s3'] / inst['calc_net_speed']
    )
    inst['execution_time_m2'] = cpu_time + scan_time



In [6]:
# Build the access distribution and split it into its 'initial' and 'working' parts.
distr_caching_precomputed = distr_maker(shape=CACHE_SKEW, size=TOTAL_READS)
distr_cache = model_distr_split_fn(distr_caching_precomputed, FIRST_READ_FROM_S3)
# Mutates `instances` in place: adds data_mem / data_sto / data_s3 / execution_time_m2.
calc_time_for_config(instances, distr_cache)

# Display the augmented frame (the whole frame is rendered, not just a preview).
instances

Unnamed: 0.1,Unnamed: 0,id,memory_Gib,vcpu_count,clock_ghz,storage_Gib,storage_count,storage_type,network_Gbps,network_is_steady,...,calc_sto_speed,calc_cpu_real,calc_mem_caching,calc_sto_caching,calc_mem_spooling,calc_sto_spooling,data_mem,data_sto,data_s3,execution_time_m2
0,8627,m5.24xlarge,384.0,96.0,3.1,0.0,0.0,EBS,25,True,...,3.125,48.0,192.0,0.0,192.0,0.0,576.869284,0.0,0.0,11.745719
1,8635,c5.24xlarge,192.0,96.0,3.0,0.0,0.0,EBS,25,True,...,3.125,48.0,96.0,0.0,96.0,0.0,576.869284,0.0,0.0,11.745719
2,8646,r5n.24xlarge,768.0,96.0,2.5,0.0,0.0,EBS,100,True,...,12.5,48.0,384.0,0.0,384.0,0.0,576.869284,0.0,0.0,11.745719
3,8656,r5d.24xlarge,768.0,96.0,3.1,3600.0,4.0,NVMe,25,True,...,8.0,48.0,384.0,1800.0,384.0,1800.0,576.869284,0.0,0.0,11.745719
4,8673,i3en.24xlarge,768.0,96.0,3.1,60000.0,8.0,NVMe,100,True,...,16.0,48.0,384.0,30000.0,384.0,30000.0,576.869284,0.0,0.0,11.745719
5,8674,c5n.18xlarge,192.0,72.0,3.0,0.0,0.0,EBS,100,True,...,12.5,36.0,96.0,0.0,96.0,0.0,576.869284,0.0,0.0,11.815163
6,8683,m5d.24xlarge,384.0,96.0,3.1,3600.0,4.0,NVMe,25,True,...,8.0,48.0,192.0,1800.0,192.0,1800.0,576.869284,0.0,0.0,11.745719
7,8698,c5d.24xlarge,192.0,96.0,3.0,3600.0,4.0,NVMe,25,True,...,8.0,48.0,96.0,1800.0,96.0,1800.0,576.869284,0.0,0.0,11.745719
8,8712,r5.24xlarge,768.0,96.0,3.1,0.0,0.0,EBS,25,True,...,3.125,48.0,384.0,0.0,384.0,0.0,576.869284,0.0,0.0,11.745719
9,8717,x1e.32xlarge,3904.0,128.0,2.3,3840.0,2.0,SSD,25,True,...,1.0,64.0,1952.0,1920.0,1952.0,1920.0,576.869284,0.0,0.0,11.693636
