In [171]:
import numpy as np
import pandas as pd
from scipy.stats import binned_statistic
from datetime import datetime

# Notebook display settings: show at most 15 rows per frame and render
# floats with thousands separators and three decimals.
pd.set_option('display.max_rows', 15)
pd.set_option('display.float_format', '{:,.3f}'.format)

In [172]:
# Directory that holds the raw per-metric CSV exports (trailing slash required,
# because file names are appended by plain string concatenation).
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
file_dir = '/Users/hmohamed/github/data-research-spring2020/raw-data-linode-run3-rules/'

# One CSV per metric, listed in the order in which they are to be processed.
files = [
    '10_svc_resp_size.csv',
    '13_svc_cpu_use.csv',
    '1_svc_req.csv',
    '4_system_net_use.csv',
    '7_system_disk_io.csv',
    '11_svc_pods.csv',
    '14_svc_net_use.csv',
    '2_system_cpu_use.csv',
    '5_system_net_sat.csv',
    '8_svc_ltcy_200.csv',
    '12_svc_err.csv',
    '15_svc_disk_use.csv',
    '3_system_cpu_sat.csv',
    '6_system_disk_sat.csv',
    '9_svc_req_size.csv',
]

# Combine individual files

In [174]:
# Reset all data to start from datetime 2020-02-27 22:49:50.
#
# Each CSV holds one metric as (metric, date, value) rows. Every file is read,
# ordered by its own timestamps, and then re-based onto a synthetic grid that
# starts at `start_dt` and advances by `time_span` per sample. The re-based
# metrics are joined column-wise into `metrics_df`; the first file in `files`
# supplies the left-hand index of the join.

start_dt = '2020-02-27 22:49:50'
time_span = '15s'  # lowercase alias: the uppercase 'S' spelling is deprecated in recent pandas

metrics_df = None
for file in files:
    df = pd.read_csv(file_dir + file, header=None, skiprows=1, names=['metric', 'date', 'value'])
    df['date'] = pd.to_datetime(df['date'])
    df['value'] = pd.to_numeric(df['value'])
    # sort_index() returns a new frame, so the result must be kept; otherwise
    # the rows stay in file order when they are laid onto the synthetic grid.
    # A stable sort keeps file order for rows that share a timestamp.
    df = df.set_index('date').sort_index(kind='mergesort')

    n_samples = df.shape[0]
    # Positional access: label-style `[0]` on a datetime-indexed Series relies
    # on a deprecated integer fallback.
    metric = df['metric'].iloc[0]

    print("There are {} rows in {} before reindexing".format(n_samples, metric))
    # Replace the real timestamps with the shared synthetic grid.
    df.index = pd.date_range(start_dt, periods=n_samples, freq=time_span)

    print("There are {} rows in {} after reindexing".format(df.shape[0], metric))

    # Keep a single column named after the metric (keyword form: the
    # positional `axis` argument of drop() is rejected by pandas 2.x).
    df = df.drop(columns='metric').rename(columns={"value": metric})

    if metrics_df is None:
        metrics_df = df
    else:
        # Backward as-of join on the index; rows of the left frame with no
        # sample within 15 seconds get NaN for this metric.
        metrics_df = pd.merge_asof(metrics_df, df, left_index=True, right_index=True, tolerance=pd.Timedelta('15 second'))

    print("")

metrics_df.shape

There are 5228 rows in svc_resp_size before reindexing
There are 5228 rows in svc_resp_size after reindexing

There are 5253 rows in svc_cpu_use before reindexing
There are 5253 rows in svc_cpu_use after reindexing

There are 4774 rows in svc_req_rate before reindexing
There are 4774 rows in svc_req_rate after reindexing

There are 4548 rows in system_net_use before reindexing
There are 4548 rows in system_net_use after reindexing

There are 4499 rows in system_disk_io before reindexing
There are 4499 rows in system_disk_io after reindexing

There are 5205 rows in svc_pods before reindexing
There are 5205 rows in svc_pods after reindexing

There are 5253 rows in svc_net_use before reindexing
There are 5253 rows in svc_net_use after reindexing

There are 4249 rows in system_cpu_use before reindexing
There are 4249 rows in system_cpu_use after reindexing

There are 4501 rows in system_net_sat before reindexing
There are 4501 rows in system_net_sat after reindexing

There are 4727 rows in

(5228, 15)

In [175]:
# Preview the first three rows of the combined frame.
metrics_df.head(3)

Unnamed: 0,svc_resp_size,svc_cpu_use,svc_req_rate,system_net_use,system_disk_io,svc_pods,svc_net_use,system_cpu_use,system_net_sat,svc_ltcy_200,svc_err,svc_disk_use,system_cpu_sat,system_disk_sat,svc_req_size
2020-02-27 22:49:50,20951.47,2.167,1.29,2704635.44,0.0,7.0,-0.572,2.77,0.0,0.96,0.0,5.032,33.0,0.99,1968.01
2020-02-27 22:50:05,33829.4,1.942,2.34,3515630.91,0.0,7.0,-3.788,3.42,0.0,0.87,0.0,511.058,36.1,0.99,3769.44
2020-02-27 22:50:20,40144.34,-2.268,3.24,3923260.11,0.01,7.0,1.775,3.41,0.0,0.8,0.0,243.908,32.4,0.99,5238.26


# Remove null latency and handle missing values

In [176]:
# Percentage of missing values in each column.
metrics_df.isna().sum().mul(100).div(len(metrics_df))

svc_resp_size      0.038
svc_cpu_use        0.000
svc_req_rate       8.665
system_net_use    12.988
system_disk_io    13.925
svc_pods           1.607
svc_net_use        0.000
system_cpu_use    18.707
system_net_sat    13.887
svc_ltcy_200       9.564
svc_err           54.610
svc_disk_use       0.000
system_cpu_sat    19.740
system_disk_sat   13.676
svc_req_size       0.038
dtype: float64

In [177]:
# Rows without a latency reading are not valid records; discard them.
metrics_df = metrics_df.dropna(subset=['svc_ltcy_200'])

# Percentage of missing values per column after the filter.
metrics_df.isna().sum().mul(100).div(len(metrics_df))

svc_resp_size      0.042
svc_cpu_use        0.000
svc_req_rate       0.000
system_net_use     3.786
system_disk_io     4.822
svc_pods           1.311
svc_net_use        0.000
system_cpu_use    10.110
system_net_sat     4.780
svc_ltcy_200       0.000
svc_err           49.810
svc_disk_use       0.000
system_cpu_sat    11.252
system_disk_sat    4.547
svc_req_size       0.042
dtype: float64

In [178]:
# Drop the errors metric: about 50% of its values are missing.
# `columns=` is used because the positional `axis` argument of drop()
# is rejected by pandas 2.x.
metrics_df = metrics_df.drop(columns='svc_err')
metrics_df.isnull().sum() * 100 / metrics_df.shape[0]

svc_resp_size      0.042
svc_cpu_use        0.000
svc_req_rate       0.000
system_net_use     3.786
system_disk_io     4.822
svc_pods           1.311
svc_net_use        0.000
system_cpu_use    10.110
system_net_sat     4.780
svc_ltcy_200       0.000
svc_disk_use       0.000
system_cpu_sat    11.252
system_disk_sat    4.547
svc_req_size       0.042
dtype: float64

In [179]:
# The remaining columns have at most about 10% missing values, so fill the
# gaps by linear interpolation.
# NOTE(review): 'linear' treats rows as equally spaced and ignores the index;
# confirm that is intended if the index has gaps.
metrics_df = metrics_df.interpolate(method='linear')

# Percentage of missing values per column after interpolation.
metrics_df.isna().sum().mul(100).div(len(metrics_df))


svc_resp_size     0.000
svc_cpu_use       0.000
svc_req_rate      0.000
system_net_use    0.000
system_disk_io    0.000
svc_pods          0.000
svc_net_use       0.000
system_cpu_use    0.000
system_net_sat    0.000
svc_ltcy_200      0.000
svc_disk_use      0.000
system_cpu_sat    0.000
system_disk_sat   0.000
svc_req_size      0.000
dtype: float64

# Rescale size and network-usage measures

In [180]:

# Divide the size columns by 1024 and the network-usage column by 1024 * 1024
# (presumably bytes -> KiB / MiB; confirm the source units).
# NOTE: re-running this cell divides the columns again.
unit_divisors = {
    'svc_resp_size': 1024,
    'svc_req_size': 1024,
    'system_net_use': 1024 * 1024,
}
for column, divisor in unit_divisors.items():
    metrics_df[column] = metrics_df[column] / divisor

metrics_df.head()

Unnamed: 0,svc_resp_size,svc_cpu_use,svc_req_rate,system_net_use,system_disk_io,svc_pods,svc_net_use,system_cpu_use,system_net_sat,svc_ltcy_200,svc_disk_use,system_cpu_sat,system_disk_sat,svc_req_size
2020-02-27 22:49:50,20.46,2.167,1.29,2.579,0.0,7.0,-0.572,2.77,0.0,0.96,5.032,33.0,0.99,1.922
2020-02-27 22:50:05,33.037,1.942,2.34,3.353,0.0,7.0,-3.788,3.42,0.0,0.87,511.058,36.1,0.99,3.681
2020-02-27 22:50:20,39.203,-2.268,3.24,3.742,0.01,7.0,1.775,3.41,0.0,0.8,243.908,32.4,0.99,5.115
2020-02-27 22:50:35,40.236,-6.74,3.56,3.694,0.0,7.0,2.535,3.18,0.0,3.93,2.164,34.45,0.99,5.851
2020-02-27 22:50:50,38.784,1.955,3.38,3.636,0.01,7.0,-0.662,3.39,0.0,4.0,26.683,33.3,0.99,5.593


In [181]:
# Write the aligned dataset (index included) into the raw-data directory.
# Set the flag to False to skip the export when re-running the notebook.
save_aligned = True
if save_aligned:
    output_file = file_dir + 'aligned_dataset.csv'
    metrics_df.to_csv(output_file)