In [2]:
from models.const import SCALING_PARAM, MAX_INSTANCE_COUNT, CACHE_SKEW, TOTAL_READS, FIRST_READ_FROM_S3, SPOOLING_READ_SUM, SPOOLING_SKEW, CPU_H
from models.utils import model_make_scaling, distr_maker, model_distr_hsplit, model_distr_split_fn, distr_pack_helper, model_distr_pack

from preprocessing.instances import instSet_transform

TOTAL_READS = 5000


In [3]:
import pandas as pd
import numpy as np

In [4]:
inst = instSet_transform()

In [5]:
FIRST_READ_FROM_S3 = False
distr_caching_precomputed = [
    distr_maker(shape=CACHE_SKEW, size=round(TOTAL_READS / n))
    for n in range(1, MAX_INSTANCE_COUNT + 1)
]
distr_cache = list(map(lambda x: model_distr_split_fn(x, FIRST_READ_FROM_S3), distr_caching_precomputed))

spooling_distr = [
    0 if round(SPOOLING_READ_SUM/n) < 1 else distr_maker(shape=SPOOLING_SKEW, size=round(SPOOLING_READ_SUM / n))
    for n in range(1, MAX_INSTANCE_COUNT + 1)
]

scaling = [model_make_scaling(SCALING_PARAM, n) for n in range(1, MAX_INSTANCE_COUNT+1)]


In [7]:
def calc_time_for_config_m4(inst, count,distr_cache, distr_spooling, scale):
    inst = inst.reset_index()
    bins_cache = {
        'data_mem': pd.DataFrame(data={'size': inst['calc_mem_caching'].round(decimals=0), 'prio': inst['calc_mem_speed']}),
        'data_sto': pd.DataFrame(data={'size': inst['calc_sto_caching'].round(decimals=0), 'prio': inst['calc_sto_speed']}),
        'data_s3': pd.DataFrame(data={'size': [len(distr_cache['working'])] * len(inst), 'prio': inst['calc_net_speed']})
    }

    bins_spooling = {
        'data_mem': pd.DataFrame(data={'size': inst['calc_mem_spooling'].round(decimals=0), 'prio': inst['calc_mem_speed']}),
        'data_sto': pd.DataFrame(data={'size': inst['calc_sto_spooling'].round(decimals=0), 'prio': inst['calc_sto_speed']}),
        'data_s3': pd.DataFrame(data={'size': [len(distr_spooling)] * len(inst), 'prio': inst['calc_net_speed']})
    }

    mem_read_distribution = model_distr_pack(bins_cache, distr_cache['working'])
    spool_read_distribution = model_distr_pack(bins_spooling, distr_spooling)

    spool_sum = sum(distr_spooling)
    inv_eff = count * scale

    result = pd.DataFrame(
        columns=[
            "id_name",
            "count",
            "id",
            "cost_usdph",
            "read_cache_load",
            "read_cache_mem",
            "read_cache_sto",
            "read_cache_s3",
            "read_spool_mem",
            "read_spool_sto",
            "read_spool_s3",
            "rw_mem",
            "rw_sto",
            "rw_s3",
            "rw_xchg",
            "stat_read_spool",
            "stat_read_work",
            "time_cpu",
            "time_mem",
            "time_sto",
            "time_s3",
            "time_xchg",
            "stat_time_sum",
            "stat_time_max",
            "stat_time_period"
            ]
    )

    result["id_name"] = inst["id"]
    result["count"] = count
    result["id"] = result["id_name"] + "/" + str(count)
    result["cost_usdph"] = inst["cost_usdph"] * count

    result["read_cache_load"] = sum(distr_cache["initial"])
    result["read_cache_mem"] = mem_read_distribution['data_mem']
    result["read_cache_sto"] = mem_read_distribution['data_sto']
    result["read_cache_s3"] = mem_read_distribution['data_s3']

    result["read_spool_mem"] = spool_read_distribution['data_mem']
    result["read_cache_sto"] = spool_read_distribution['data_sto']
    result["read_cache_s3"] = spool_read_distribution['data_s3']

    result["rw_mem"] = result["read_cache_mem"] + 2 * result["read_spool_mem"]
    result["rw_sto"] = result["read_cache_sto"] + 2 * result["read_cache_sto"]
    result["rw_s3"] = result["read_cache_s3"] + 2 * result["read_cache_s3"]

    result["rw_xchg"] = 0 if count == 0 else 2 * spool_sum

    result["stat_read_spool"] = spool_sum
    result["stat_read_work"] = sum(distr_cache["working"])

    result["time_cpu"] = (CPU_H * 3600 / inst['calc_cpu_real']) * inv_eff
    result["time_mem"] = (result["rw_mem"] / inst["calc_mem_speed"]) * inv_eff
    result["time_sto"] = (result["rw_sto"] / inst["calc_sto_speed"]) * inv_eff
    result["time_s3"] = (result["rw_s3"] / inst["calc_s3_speed"]) * inv_eff

    result["time_xchg"] = (result["rw_xchg"] / 2 / inst["calc_net_speed"]) * inv_eff
    result["time_load"] = (result["read_cache_load"] / inst["calc_s3_speed"]) * inv_eff
    result["stat_time_sum"] = result["time_s3"] + result["time_sto"] + result["time_mem"] + result["time_xchg"]\
                              + result["time_load"] + result["time_cpu"]
    result["stat_time_max"] = result[["time_s3", "time_sto", "time_mem", "time_xchg", "time_load", "time_cpu" ]].max(axis=1)

    return result

In [8]:
result = [calc_time_for_config_m4(inst, i, distr_cache[i-1], spooling_distr[i-1], scaling[i-1]) for i in range(1, MAX_INSTANCE_COUNT+1)]


In [9]:
result[31]

Unnamed: 0,id_name,count,id,cost_usdph,read_cache_load,read_cache_mem,read_cache_sto,read_cache_s3,read_spool_mem,read_spool_sto,read_spool_s3,rw_mem,rw_sto,rw_s3,rw_xchg,stat_read_spool,stat_read_work,time_cpu,time_mem,time_sto,time_s3,time_xchg,stat_time_sum,stat_time_max,stat_time_period,time_load
0,m5.24xlarge,32,m5.24xlarge/32,147.456,0.0,156.0,0,0,47.0,,,250.0,0,0,94.0,47.0,156.0,15450.0,51.5,0.0,0.0,154.912,15656.412,15450.0,,0.0
1,c5.24xlarge,32,c5.24xlarge/32,130.56,0.0,148.356914,0,0,47.0,,,242.356914,0,0,94.0,47.0,156.0,15450.0,49.925524,0.0,0.0,154.912,15654.837524,15450.0,,0.0
2,r5n.24xlarge,32,r5n.24xlarge/32,228.864,0.0,156.0,0,0,47.0,,,250.0,0,0,94.0,47.0,156.0,15450.0,51.5,0.0,0.0,38.728,15540.228,15450.0,,0.0
3,r5d.24xlarge,32,r5d.24xlarge/32,221.184,0.0,156.0,0,0,47.0,,,250.0,0,0,94.0,47.0,156.0,15450.0,51.5,0.0,0.0,154.912,15656.412,15450.0,,0.0
4,i3en.24xlarge,32,i3en.24xlarge/32,347.136,0.0,156.0,0,0,47.0,,,250.0,0,0,94.0,47.0,156.0,15450.0,51.5,0.0,0.0,38.728,15540.228,15450.0,,0.0
5,c5n.18xlarge,32,c5n.18xlarge/32,124.416,0.0,148.356914,0,0,47.0,,,242.356914,0,0,94.0,47.0,156.0,20600.0,49.925524,0.0,0.0,38.728,20688.653524,20600.0,,0.0
6,m5d.24xlarge,32,m5d.24xlarge/32,173.568,0.0,156.0,0,0,47.0,,,250.0,0,0,94.0,47.0,156.0,15450.0,51.5,0.0,0.0,154.912,15656.412,15450.0,,0.0
7,c5d.24xlarge,32,c5d.24xlarge/32,147.456,0.0,148.356914,0,0,47.0,,,242.356914,0,0,94.0,47.0,156.0,15450.0,49.925524,0.0,0.0,154.912,15654.837524,15450.0,,0.0
8,r5.24xlarge,32,r5.24xlarge/32,193.536,0.0,156.0,0,0,47.0,,,250.0,0,0,94.0,47.0,156.0,15450.0,51.5,0.0,0.0,154.912,15656.412,15450.0,,0.0
9,x1e.32xlarge,32,x1e.32xlarge/32,854.016,0.0,156.0,0,0,47.0,,,250.0,0,0,94.0,47.0,156.0,11587.5,51.5,0.0,0.0,154.912,11793.912,11587.5,,0.0


In [20]:
spooling_distr = [
    0 if round(SPOOLING_READ_SUM/n) < 1 else distr_maker(shape=SPOOLING_SKEW, size=round(SPOOLING_READ_SUM / n))
    for n in range(1, MAX_INSTANCE_COUNT + 1)
]
len(spooling_distr[0]) * len(inst)

22500

In [28]:
bins_spooling = {
    'data_mem': pd.DataFrame(data={'size': inst['calc_mem_spooling'].round(decimals=0), 'prio': inst['calc_mem_speed']}),
    'data_sto': pd.DataFrame(data={'size': inst['calc_sto_spooling'].round(decimals=0), 'prio': inst['calc_sto_speed']}),
    'data_s3': pd.DataFrame(data={'size': [len(spooling_distr[0])] * len(inst), 'prio': inst['calc_net_speed']})
}

spool_read_distribution = model_distr_pack(bins_spooling, spooling_distr[0])

spool_read_distribution.round()

group,index,data_mem,data_s3,data_sto
0,distr_val,1367.0,133.0,0.0
1,distr_val,1300.0,200.0,0.0
2,distr_val,1421.0,79.0,0.0
3,distr_val,1421.0,0.0,79.0
4,distr_val,1421.0,0.0,79.0
5,distr_val,1300.0,200.0,0.0
6,distr_val,1367.0,0.0,133.0
7,distr_val,1300.0,0.0,200.0
8,distr_val,1421.0,79.0,0.0
9,distr_val,1500.0,0.0,0.0
