In [1]:
import pandas as pd
from models.utils import model_make_scaling, distr_maker, model_distr_hsplit, model_distr_split_fn, distr_pack_helper, model_distr_pack


In [2]:
# base workload
CPU_H = 20
TOTAL_READS = 800

# caching
CACHE_SKEW = 0.1
FIRST_READ_FROM_S3 = False

# materialization
SPOOLING_FRACTION = 0.2
SPOOLING_SKEW = 0.1
SPOOLING_READ_SUM = TOTAL_READS * SPOOLING_FRACTION

# scaling
SCALING_PARAM = 0.98  # - portion of the workload that can be parallelized across instances
MAX_INSTANCE_COUNT = 32

In [3]:
inst = pd.read_csv("../../input/instances.csv")

In [4]:
distr_caching_precomputed = [
    distr_maker(shape=CACHE_SKEW, size=round(TOTAL_READS / n))
    for n in range(1, MAX_INSTANCE_COUNT + 1)
]
distr_cache = list(map(lambda x: model_distr_split_fn(x, FIRST_READ_FROM_S3), distr_caching_precomputed))

spooling_distr = [
    0 if round(SPOOLING_READ_SUM/n) < 1 else distr_maker(shape=SPOOLING_SKEW, size=round(SPOOLING_READ_SUM / n))
    for n in range(1, MAX_INSTANCE_COUNT + 1)
]

scaling = [model_make_scaling(SCALING_PARAM, n) for n in range(1, MAX_INSTANCE_COUNT+1)]


In [5]:
def calc_time_for_config_m4(inst, count,distr_cache, distr_spooling, scale):
    inst = inst.reset_index()
    bins_cache = {
        'data_mem': pd.DataFrame(data={'size': inst['calc_mem_caching'].round(decimals=0), 'prio': inst['calc_mem_speed']}),
        'data_sto': pd.DataFrame(data={'size': inst['calc_sto_caching'].round(decimals=0), 'prio': inst['calc_sto_speed']}),
        'data_s3': pd.DataFrame(data={'size': [len(distr_cache['working'])] * len(inst), 'prio': inst['calc_net_speed']})
    }

    bins_spooling = {
        'data_mem': pd.DataFrame(data={'size': inst['calc_mem_spooling'].round(decimals=0), 'prio': inst['calc_mem_speed']}),
        'data_sto': pd.DataFrame(data={'size': inst['calc_sto_spooling'].round(decimals=0), 'prio': inst['calc_sto_speed']}),
        'data_s3': pd.DataFrame(data={'size': [len(distr_spooling)] * len(inst), 'prio': inst['calc_net_speed']})
    }

    mem_read_distribution = model_distr_pack(bins_cache, distr_cache['working'])
    spool_read_distribution = model_distr_pack(bins_spooling, distr_spooling)

    spool_sum = sum(distr_spooling)
    inv_eff = count * scale

    result = pd.DataFrame(
        columns=[
            "id_name",
            "count",
            "id",
            "cost_usdph",
            "read_cache_load",
            "read_cache_mem",
            "read_cache_sto",
            "read_cache_s3",
            "read_spool_mem",
            "read_spool_sto",
            "read_spool_s3",
            "rw_mem",
            "rw_sto",
            "rw_s3",
            "rw_xchg",
            "stat_read_spool",
            "stat_read_work",
            "time_cpu",
            "time_mem",
            "time_sto",
            "time_s3",
            "time_xchg",
            "stat_time_sum",
            "stat_time_max",
            "stat_time_period"
            ]
    )

    result["id_name"] = inst["id"]
    result["count"] = count
    result["id"] = result["id_name"] + "/" + str(count)
    result["cost_usdph"] = inst["cost_usdph"] * count

    result["read_cache_load"] = sum(distr_cache["initial"])
    result["read_cache_mem"] = mem_read_distribution['data_mem']
    result["read_cache_sto"] = mem_read_distribution['data_sto']
    result["read_cache_s3"] = mem_read_distribution['data_s3']

    result["read_spool_mem"] = spool_read_distribution['data_mem']
    result["read_cache_sto"] = spool_read_distribution['data_sto']
    result["read_cache_s3"] = spool_read_distribution['data_s3']

    result["rw_mem"] = result["read_cache_mem"] + 2 * result["read_spool_mem"]
    result["rw_sto"] = result["read_cache_sto"] + 2 * result["read_cache_sto"]
    result["rw_s3"] = result["read_cache_s3"] + 2 * result["read_cache_s3"]

    result["rw_xchg"] = 0 if count == 0 else 2 * spool_sum

    result["stat_read_spool"] = spool_sum
    result["stat_read_work"] = sum(distr_cache["working"])

    result["time_cpu"] = (CPU_H * 3600 / inst['calc_cpu_real']) * inv_eff
    result["time_mem"] = (result["rw_mem"] / inst["calc_mem_speed"]) * inv_eff
    result["time_sto"] = (result["rw_sto"] / inst["calc_sto_speed"]) * inv_eff
    result["time_s3"] = (result["rw_s3"] / inst["calc_s3_speed"]) * inv_eff

    result["time_xchg"] = (result["rw_xchg"] / 2 / inst["calc_net_speed"]) * inv_eff
    result["time_load"] = (result["read_cache_load"] / inst["calc_s3_speed"]) * inv_eff
    result["stat_time_sum"] = result["time_s3"] + result["time_sto"] + result["time_mem"] + result["time_xchg"]\
                              + result["time_load"] + result["time_cpu"]
    result["stat_time_max"] = result[["time_s3", "time_sto", "time_mem", "time_xchg", "time_load", "time_cpu" ]].max(axis=1)

    return result

In [9]:
result = [calc_time_for_config_m4(inst, i, distr_cache[i-1], spooling_distr[i-1], scaling[i-1]) for i in range(1, MAX_INSTANCE_COUNT+1)]
result[3]

Unnamed: 0,id_name,count,id,cost_usdph,read_cache_load,read_cache_mem,read_cache_sto,read_cache_s3,read_spool_mem,read_spool_sto,...,stat_read_work,time_cpu,time_mem,time_sto,time_s3,time_xchg,stat_time_sum,stat_time_max,stat_time_period,time_load
0,m5.24xlarge,4,m5.24xlarge/4,18.432,0.0,192.770163,0.0,0.0,40.0,,...,200.0,1590.0,5.782727,0.0,0.0,13.568,1609.350727,1590.0,,0.0
1,c5.24xlarge,4,c5.24xlarge/4,16.32,0.0,103.116733,0.0,0.0,40.0,,...,200.0,1590.0,3.882075,0.0,0.0,13.568,1607.450075,1590.0,,0.0
2,r5n.24xlarge,4,r5n.24xlarge/4,28.608,0.0,200.0,0.0,0.0,40.0,,...,200.0,1590.0,5.936,0.0,0.0,3.392,1599.328,1590.0,,0.0
3,r5d.24xlarge,4,r5d.24xlarge/4,27.648,0.0,200.0,0.0,0.0,40.0,,...,200.0,1590.0,5.936,0.0,0.0,13.568,1609.504,1590.0,,0.0
4,i3en.24xlarge,4,i3en.24xlarge/4,43.392,0.0,200.0,0.0,0.0,40.0,,...,200.0,1590.0,5.936,0.0,0.0,3.392,1599.328,1590.0,,0.0
5,c5n.18xlarge,4,c5n.18xlarge/4,15.552,0.0,103.116733,0.0,0.0,40.0,,...,200.0,2120.0,3.882075,0.0,0.0,3.392,2127.274075,2120.0,,0.0
6,m5d.24xlarge,4,m5d.24xlarge/4,21.696,0.0,192.770163,0.0,0.0,40.0,,...,200.0,1590.0,5.782727,0.0,0.0,13.568,1609.350727,1590.0,,0.0
7,c5d.24xlarge,4,c5d.24xlarge/4,18.432,0.0,103.116733,0.0,0.0,40.0,,...,200.0,1590.0,3.882075,0.0,0.0,13.568,1607.450075,1590.0,,0.0
8,r5.24xlarge,4,r5.24xlarge/4,24.192,0.0,200.0,0.0,0.0,40.0,,...,200.0,1590.0,5.936,0.0,0.0,13.568,1609.504,1590.0,,0.0
9,x1e.32xlarge,4,x1e.32xlarge/4,106.752,0.0,200.0,0.0,0.0,40.0,,...,200.0,1192.5,5.936,0.0,0.0,13.568,1212.004,1192.5,,0.0
