# Clustering

We want to check if it's possible to put similar workloads into one cluster (with joint influence coefficients).

To do that we need to identify workloads that we know are similar and check similarity of their influence coefficients.

We'll take our `redis_ycsb` workload and change `workload_profile` parameter (more on https://github.com/brianfrankcooper/YCSB/wiki/Core-Workloads).
Redis YCSB has `workload_profile=d` by default. We'll check how other profiles are affected by our four basic workloads.

In [4]:
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm

from helpers.load_data import (
    clean_column_names,
    get_data_with_cpu,
    get_experiments_paths,
    trim_experiment,
)
from helpers.regression import (
    boxplot_two,
    draw_regression_graph,
    fit_regression,
    get_coeff,
)

experiments_path = '../../data/redisy'

#### Coefficients from 4 rounds of experiments:

In [35]:
from helpers.load_data import remove_setup_datapoints

def get_coeff(path, instances_n, trim, perf_metric):
    df = get_data_with_cpu(path, instances_n=instances_n, cpu_window=30)
    df['time'] = pd.to_datetime(df['cbtool_time'], unit='s')
    df = remove_setup_datapoints(df, path)
    df = trim_experiment(df, trim)
    results = fit_regression(data=df, formula=f'{perf_metric} ~ instances_n')
    return results.params[1]

In [44]:
rounds_n = 4
basic_loads = ['linpack', 'redis_ycsb_d', 'hadoop_pagerank', 'sysbench']
redis_loads = [
    ('redis_ycsb_a', 'app_latency'),
    ('redis_ycsb_b', 'app_latency'),
    ('redis_ycsb_c', 'app_latency'),
    ('redis_ycsb_d', 'app_latency'),
    ('redis_ycsb_e', 'app_throughput_inv'),
    ('redis_ycsb_f', 'app_latency'),
]
df_coeffs = pd.DataFrame(columns=['impact_on'] + basic_loads)

for i in range(rounds_n):
    print(f'Round {i}')
    for load_name, perf_metric in redis_loads:
        row = {'impact_on': load_name}

        for basic_load in basic_loads:
            experiment_name = f'{load_name}_1_{basic_load}_9'
            path = get_experiments_paths(experiment_name, experiments_path)[i]
            try:
                coeff = get_coeff(path, instances_n=8, trim=7, perf_metric=perf_metric)
            except:
                coeff = None
            row[basic_load] = coeff

        df_coeffs = df_coeffs.append(row, ignore_index=True)

df_coeffs

Round 0
Round 1
Round 2
Round 3


Unnamed: 0,impact_on,linpack,redis_ycsb_d,hadoop_pagerank,sysbench
0,redis_ycsb_a,0.004582,0.002718,0.00297546,0.000918
1,redis_ycsb_b,0.005603,0.002847,0.00335927,0.001082
2,redis_ycsb_c,0.004971,0.002917,0.00323429,0.001064
3,redis_ycsb_d,0.005081,0.002953,0.0032223,0.00104
4,redis_ycsb_e,0.000261,0.000145,0.000159179,5e-05
5,redis_ycsb_f,0.004464,0.002668,0.00293994,0.000971
6,redis_ycsb_a,0.004645,0.00271,0.00291722,0.001065
7,redis_ycsb_b,0.005513,0.002791,0.00327849,0.00102
8,redis_ycsb_c,0.005144,0.00293,0.00323842,0.001055
9,redis_ycsb_d,0.005032,0.002987,0.00325326,0.001057


In [59]:
for i in range(3):
    experiment_name = f'redis_ycsb_d_1_redis_ycsb_d_9'
    path = get_experiments_paths(experiment_name, experiments_path)[rounds_n+i]
    coeff = get_coeff(path, instances_n=8, trim=7, perf_metric=perf_metric)
    print(f'{path}: {coeff / 0.00505075}')
    

../../data/redisy/redis_ycsb_d_1_redis_ycsb_d_9_m17: 0.5675396347028535
../../data/redisy/redis_ycsb_d_1_redis_ycsb_d_9_m18: 0.5732742007250929
../../data/redisy/redis_ycsb_d_1_redis_ycsb_d_9_m19: 0.569143295098402


#### Normalized:

In [45]:
for i in range(len(df_coeffs)):
    df_coeffs.iloc[i, 1:] /= df_coeffs.at[i, 'linpack']

df_coeffs

Unnamed: 0,impact_on,linpack,redis_ycsb_d,hadoop_pagerank,sysbench
0,redis_ycsb_a,1.0,0.593226,0.649405,0.200294
1,redis_ycsb_b,1.0,0.508085,0.599541,0.193093
2,redis_ycsb_c,1.0,0.58674,0.650657,0.214013
3,redis_ycsb_d,1.0,0.58113,0.63419,0.20468
4,redis_ycsb_e,1.0,0.556277,0.610743,0.193029
5,redis_ycsb_f,1.0,0.597797,0.658627,0.217484
6,redis_ycsb_a,1.0,0.583434,0.628048,0.229369
7,redis_ycsb_b,1.0,0.506257,0.594632,0.184938
8,redis_ycsb_c,1.0,0.56964,0.629495,0.205007
9,redis_ycsb_d,1.0,0.593587,0.646454,0.209994


#### Mean of all

In [46]:
df_coeffs.mean(axis=0)

linpack            1.000000
redis_ycsb_d       0.561007
hadoop_pagerank    0.632734
sysbench           0.200576
dtype: float64

#### Standard deviation of all:

In [47]:
df_coeffs.std(axis=0)

linpack            0.000000
redis_ycsb_d       0.031555
hadoop_pagerank    0.022588
sysbench           0.012402
dtype: float64

#### Mean of each group

In [48]:
df_coeffs.groupby('impact_on').mean()

Unnamed: 0_level_0,linpack,redis_ycsb_d,sysbench
impact_on,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
redis_ycsb_a,1.0,0.57876,0.201549
redis_ycsb_b,1.0,0.501582,0.185981
redis_ycsb_c,1.0,0.562777,0.202309
redis_ycsb_d,1.0,0.576297,0.20333
redis_ycsb_e,1.0,0.556,0.196029
redis_ycsb_f,1.0,0.590627,0.21426


#### Standard deviation within load type:

In [49]:
df_coeffs.groupby('impact_on').std()

Unnamed: 0_level_0,linpack,redis_ycsb_d,sysbench
impact_on,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
redis_ycsb_a,0.0,0.018229,0.019394
redis_ycsb_b,0.0,0.006588,0.00664
redis_ycsb_c,0.0,0.019122,0.009163
redis_ycsb_d,0.0,0.013762,0.005377
redis_ycsb_e,0.0,0.002618,0.004939
redis_ycsb_f,0.0,0.006945,0.00671


#### What is the largest difference?

In [51]:
(df_coeffs.groupby('impact_on').mean() - df_coeffs.mean())

Unnamed: 0_level_0,hadoop_pagerank,linpack,redis_ycsb_d,sysbench
impact_on,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
redis_ycsb_a,,0.0,0.017753,0.000973
redis_ycsb_b,,0.0,-0.059425,-0.014596
redis_ycsb_c,,0.0,0.00177,0.001733
redis_ycsb_d,,0.0,0.01529,0.002754
redis_ycsb_e,,0.0,-0.005007,-0.004548
redis_ycsb_f,,0.0,0.02962,0.013684


In [52]:
(df_coeffs.groupby('impact_on').mean() - df_coeffs.mean()).abs().max()

hadoop_pagerank         NaN
linpack            0.000000
redis_ycsb_d       0.059425
sysbench           0.014596
dtype: float64

We see that the largest difference between subtype mean and mean is `0.098976`. It caused by `redis_ycsb_f`.

In [53]:
df_no_f = df_coeffs[df_coeffs['impact_on'] != 'redis_ycsb_f']

In [54]:
df_no_f.groupby('impact_on').mean() - df_no_f.mean()

Unnamed: 0_level_0,hadoop_pagerank,linpack,redis_ycsb_d,sysbench
impact_on,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
redis_ycsb_a,,0.0,0.023677,0.00371
redis_ycsb_b,,0.0,-0.053501,-0.011859
redis_ycsb_c,,0.0,0.007694,0.00447
redis_ycsb_d,,0.0,0.021214,0.00549
redis_ycsb_e,,0.0,0.000917,-0.001811


In [34]:
(df_no_f.groupby('impact_on').mean() - df_no_f.mean()).abs().max()

linpack            0.0
redis_ycsb_d       0.0
hadoop_pagerank    0.0
sysbench           0.0
dtype: float64

If we exclude `redis_ycsb_f`, then the maximum difference is `0.058481`.