In [80]:
import pandas as pd
import sqlalchemy
from dotenv import dotenv_values
import numpy as np

from models.utils import distr_maker, model_distr_pack
from preprocessing.instances import instSet_transform

# Connection settings are read from the local .env file so that no credentials
# live in the notebook itself.
config = dotenv_values()
engine = sqlalchemy.create_engine(
    f'postgresql+psycopg2://{config["USERNAME"]}:{config["PASSWORD"]}@{config["HOST"]}/{config["DATABASE"]}'
)

# Keep only the row(s) of the instance table whose id is "c5d.24xlarge"; the
# estimation cells below read their bin sizes and speeds from this frame.
SNOWFLAKE_INSTANCE = instSet_transform().loc[
    lambda instances: instances["id"] == "c5d.24xlarge"
]
SNOWFLAKE_INSTANCE

Unnamed: 0,id,memory_Gib,vcpu_count,clock_ghz,storage_Gib,storage_count,storage_type,network_Gbps,network_is_steady,cost_usdph,loading_comment,id_prefix,id_numstr,id_number,id_slice,id_slice_factor,id_slice_of,id_slice_net,id_slice_sto,calc_net_speed,calc_s3_speed,calc_mem_speed,calc_sto_speed,calc_cpu_real,calc_mem_caching,calc_sto_caching,calc_mem_spooling,calc_sto_spooling
8698,c5d.24xlarge,192.0,96.0,3.0,3600.0,4.0,NVMe,25,True,4.608,,c5d,24,24.0,24.0,1.0,8698,25.0,4.0,3.125,2.5,50,8.0,48.0,96.0,1800.0,96.0,1800.0


In [82]:
# Per-warehouse aggregates over a 0.1 % block-level sample of the snowset table.
# REPEATABLE pins the sampling seed: without it every execution draws a
# different random sample, so the notebook could not be re-run to the same
# numbers (as long as the table itself is unchanged).
sql_statement = sqlalchemy.text("""
   SELECT warehouseId,
   sum(systemCpuTime) + sum(userCpuTime) AS cpu_micros,
   sum(persistentReadBytesS3)            AS scan_s3,
   sum(persistentReadBytesCache)         AS scan_cache,
   sum(intDataReadBytesLocalSSD)         AS spool_ssd,
   sum(intDataReadBytesS3)               AS spool_s3,
   avg(warehousesize)                    AS warehouse_size
  FROM snowset TABLESAMPLE SYSTEM (0.1) REPEATABLE (42)
  group by warehouseId
""")

# The column names are applied positionally, so this list must stay in the
# same order as the SELECT list above.
with engine.connect() as conn:
    result_df = pd.DataFrame(conn.execute(sql_statement).fetchall(), columns=[
        'warehouse_id',
        'cpu_micros',
        'scan_s3',
        'scan_cache',
        'spool_ssd',
        'spool_s3',
        'warehouse_size'
    ])

result_df

Unnamed: 0,warehouse_id,cpu_micros,scan_s3,scan_cache,spool_ssd,spool_s3,warehouse_size
0,8535908784919966172,1966027455,64,854,1971,0,14648.625000000000
1,5434552103031488794,70001,1,2,0,0,260.0000000000000000
2,3283200638972591640,116356245,33,7257,0,0,15514.600000000000
3,3320613879277808115,4617509248,16,7369,51414,0,452713.500000000000
4,1308453160768928463,71020000,8,0,0,0,5410.0000000000000000
...,...,...,...,...,...,...,...
781,6882855326578019261,4590260,140,40,0,0,632.2857142857142857
782,798213079456588986,15744391875,1710,13260,0,0,3510.6690140845070423
783,5396797649563403368,2217983796,804,0,5572,0,3545.0495867768595041
784,8230729311302445343,1629159654,50,2459,0,0,11516.000000000000


In [75]:
def snowset_estimate_cache_skew(row):
    """Fit a cache-access skew for one aggregated warehouse row.

    Starting from a near-zero skew, the loop builds a distribution with
    ``distr_maker``, packs it into the memory / storage / S3 bins derived from
    ``SNOWFLAKE_INSTANCE`` with ``model_distr_pack``, and nudges the skew until
    the packed ``data_s3`` value is within 1 % of the observed S3 tail, the
    skew drops to zero or below, or 99 iterations have run.

    Args:
        row: Mapping-like record (e.g. one DataFrame row) providing
            ``scan_s3``, ``scan_cache`` and ``warehouse_size``.

    Returns:
        The same ``row`` object with ``data_scan``, ``cache_skew_tail``,
        ``cache_skew``, ``cache_skew_error`` and ``cache_skew_iter`` set.
        When the S3 tail is zero no fit is attempted and the initial values
        (skew 0.00001, error 1, iteration 1) are reported.
    """
    # Scan volumes are divided by warehouse_size and 1024**3
    # (presumably bytes -> GiB per node; TODO confirm the unit of warehouse_size).
    scanned = float((row['scan_s3'] + row['scan_cache']) / row['warehouse_size'] / 1024**3)
    tail = float(row['scan_s3'] / row['warehouse_size'] / 1024**3)

    skew = 0.00001
    error = 1
    i = 1

    # The relative error below divides by `tail`; with nothing read from S3 it
    # is undefined, so skip the fit (same rule as snowset_row_est_spool_skew).
    if tail != 0:
        bins = {
            'data_mem': pd.DataFrame(data={'size': SNOWFLAKE_INSTANCE['calc_mem_caching'].round(decimals=0), 'prio': SNOWFLAKE_INSTANCE['calc_mem_speed']}),
            'data_sto': pd.DataFrame(data={'size': SNOWFLAKE_INSTANCE['calc_sto_caching'].round(decimals=0), 'prio': SNOWFLAKE_INSTANCE['calc_sto_speed']}),
            'data_s3': pd.DataFrame(data={'size': scanned, 'prio': SNOWFLAKE_INSTANCE['calc_net_speed']})
        }

        while error > 0.01 and i < 100 and skew > 0:
            distribution = distr_maker(skew, round(scanned))
            pack = model_distr_pack(bins, distribution)
            if pack.iloc[0]["data_s3"] == 0:
                break

            err_abs = round(pack.iloc[0]['data_s3'] - tail, 2)
            error = round(abs(err_abs / tail), 2)
            # Step in the direction of the error, capped at 0.1 and damped by
            # the iteration number so later corrections get smaller.
            skew = skew + np.sign(err_abs) * min(0.1, error / (i * 0.5))
            # Count iterations one by one; the previous `i += i` doubled the
            # counter, which hit the cap after only seven passes.
            i += 1

    row['data_scan'] = scanned
    row['cache_skew_tail'] = tail
    row['cache_skew'] = skew
    row['cache_skew_error'] = error
    row['cache_skew_iter'] = i

    return row



In [76]:
def snowset_spool_frac_estimation(row):
    """Add ``spool_frac``: spooled volume as a fraction of scanned volume.

    Args:
        row: Mapping-like record (e.g. one DataFrame row) providing
            ``scan_s3``, ``scan_cache``, ``spool_s3`` and ``spool_ssd``.

    Returns:
        The same ``row`` object with ``spool_frac`` set to
        ``(spool_s3 + spool_ssd) / (scan_s3 + scan_cache)``, or to ``0`` when
        the scanned volume is zero.
    """
    scanned = row['scan_s3'] + row['scan_cache']
    spooled = row['spool_s3'] + row['spool_ssd']
    # A warehouse with no scan volume in the sample would otherwise divide by
    # zero and abort the whole DataFrame.apply; report a zero fraction instead.
    row['spool_frac'] = spooled / scanned if scanned else 0
    return row


In [83]:
# Row-wise enrichment: first fit the cache skew (adds the data_scan and
# cache_skew_* columns), then derive spool_frac from the scan/spool totals.
result_df = result_df.apply(snowset_estimate_cache_skew, axis="columns")
result_df = result_df.apply(snowset_spool_frac_estimation, axis="columns")

Sizes data_mem    96
Name: acc_size, dtype: int64
Sizes data_mem      96
data_sto    1896
Name: acc_size, dtype: int64
Sizes data_sto    1896
data_s3     1896
Name: acc_size, dtype: int64
Sizes data_mem    96
Name: acc_size, dtype: int64
Sizes data_mem      96
data_sto    1896
Name: acc_size, dtype: int64
Sizes data_sto    1896
data_s3     1896
Name: acc_size, dtype: int64
Sizes data_mem    96
Name: acc_size, dtype: int64
Sizes data_mem      96
data_sto    1896
Name: acc_size, dtype: int64
Sizes data_sto    1896
data_s3     1896
Name: acc_size, dtype: int64
Sizes data_mem    96
Name: acc_size, dtype: int64
Sizes data_mem      96
data_sto    1896
Name: acc_size, dtype: int64
Sizes data_sto    1896
data_s3     1896
Name: acc_size, dtype: int64
Sizes data_mem    96
Name: acc_size, dtype: int64
Sizes data_mem      96
data_sto    1896
Name: acc_size, dtype: int64
Sizes data_sto    1896
data_s3     1896
Name: acc_size, dtype: int64
Sizes data_mem    96
Name: acc_size, dtype: int64
Sizes data

In [78]:
def snowset_row_est_spool_skew(row):
    """Fit a spool-access skew for one row and attach the result to it.

    The spooled volume is ``spool_frac * data_scan``. Volumes below 1 are not
    fitted and receive fixed defaults. Otherwise the skew is adjusted
    iteratively until the ``data_s3`` value packed by ``model_distr_pack`` is
    within 1 % of the observed S3 spool tail, the skew becomes non-positive,
    or the iteration counter reaches 100.

    Args:
        row: Mapping-like record providing ``spool_frac``, ``data_scan``,
            ``spool_s3`` and ``warehouse_size``.

    Returns:
        The same ``row`` object with ``spool_skew_tail``, ``spool_skew``,
        ``spool_skew_error`` and ``spool_skew_iter`` set.
    """
    spool_volume = float(row['spool_frac']) * float(row['data_scan'])
    s3_tail = float(row['spool_s3'] / row['warehouse_size'] / 1024 ** 3)

    # Too little spooled data to fit anything: report defaults and stop here.
    if spool_volume < 1:
        defaults = {
            'spool_skew_tail': s3_tail,
            'spool_skew': 0.0001,
            'spool_skew_error': 0,
            'spool_skew_iter': 0,
        }
        for column, value in defaults.items():
            row[column] = value
        return row

    instance = SNOWFLAKE_INSTANCE
    bins = {
        'data_mem': pd.DataFrame(data={
            'size': instance['calc_mem_caching'].round(decimals=0),
            'prio': instance['calc_mem_speed'],
        }),
        'data_sto': pd.DataFrame(data={
            'size': instance['calc_sto_caching'].round(decimals=0),
            'prio': instance['calc_sto_speed'],
        }),
        'data_s3': pd.DataFrame(data={
            'size': spool_volume,
            'prio': instance['calc_net_speed'],
        }),
    }

    skew = 0.00001
    error = 1
    iter_count = 1
    # The distribution size does not change between iterations.
    distr_size = round(spool_volume)

    while error > 0.01 and iter_count < 100 and skew > 0:
        pack = model_distr_pack(bins, distr_maker(skew, distr_size))
        packed_s3 = pack.iloc[0]['data_s3']

        # Nothing packed into S3, or no observed tail to compare against.
        if not packed_s3 or s3_tail == 0:
            break

        print("Data S3: ", packed_s3)
        print("Tail: ", s3_tail)
        err_abs = packed_s3 - s3_tail
        error = abs(err_abs / s3_tail)
        # Move towards the target; the step is capped at 0.1 and shrinks as
        # the iteration count grows.
        step = min(0.1, error / (iter_count * 0.5))
        skew += np.sign(err_abs) * step
        iter_count += 1

    if iter_count >= 100:
        print(["Aborted after 100 iterations, skew might not be very accurate", skew])
    if skew <= 0:
        print(["Skew < 0, this is a weird row.", skew])

    fitted = {
        'spool_skew_tail': s3_tail,
        'spool_skew': skew,
        'spool_skew_error': error,
        'spool_skew_iter': iter_count,
    }
    for column, value in fitted.items():
        row[column] = value

    return row


In [84]:
# Bind the enriched frame to its own name instead of only displaying it, so
# the spool_skew_* columns remain available to later cells and result_df is
# left untouched when this cell is re-run.
spool_skew_df = result_df.apply(snowset_row_est_spool_skew, axis=1)
spool_skew_df

Unnamed: 0,warehouse_id,cpu_micros,scan_s3,scan_cache,spool_ssd,spool_s3,warehouse_size,data_scan,cache_skew_tail,cache_skew,cache_skew_error,cache_skew_iter,spool_frac,spool_skew_tail,spool_skew,spool_skew_error,spool_skew_iter
0,8535908784919966172,1966027455,64,854,1971,0,14648.625000000000,5.836412e-11,4.068958e-12,0.00001,1,1,2.147058823529411764705882353,0.0,0.0001,0,0
1,5434552103031488794,70001,1,2,0,0,260.0000000000000000,1.074603e-11,3.582010e-12,0.00001,1,1,0,0.0,0.0001,0,0
2,3283200638972591640,116356245,33,7257,0,0,15514.600000000000,4.376098e-10,1.980950e-12,0.00001,1,1,0,0.0,0.0001,0,0
3,3320613879277808115,4617509248,16,7369,51414,0,452713.500000000000,1.519243e-11,3.291521e-14,0.00001,1,1,6.961949898442789438050101557,0.0,0.0001,0,0
4,1308453160768928463,71020000,8,0,0,0,5410.0000000000000000,1.377187e-12,1.377187e-12,0.00001,1,1,0,0.0,0.0001,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
781,6882855326578019261,4590260,140,40,0,0,632.2857142857142857,2.651302e-10,2.062124e-10,0.00001,1,1,0,0.0,0.0001,0,0
782,798213079456588986,15744391875,1710,13260,0,0,3510.6690140845070423,3.971294e-09,4.536348e-10,0.00001,1,1,0,0.0,0.0001,0,0
783,5396797649563403368,2217983796,804,0,5572,0,3545.0495867768595041,2.112194e-10,2.112194e-10,0.00001,1,1,6.930348258706467661691542289,0.0,0.0001,0,0
784,8230729311302445343,1629159654,50,2459,0,0,11516.000000000000,2.029080e-10,4.043603e-12,0.00001,1,1,0,0.0,0.0001,0,0
