In [178]:
import numpy as np
import pandas as pd
from scipy.stats import binned_statistic
from datetime import datetime

In [179]:
# --- Locations -------------------------------------------------------------
# NOTE(review): absolute, user-specific path; adjust before running elsewhere.
home_dir = '/Users/hmohamed/github/data-research-spring2020'

# Raw run directory and the sub-directory holding the merged signal file.
file_dir = '{}/raw-data-linode-run3/'.format(home_dir)
merged_dir = '{}merged/'.format(file_dir)

# (sic) the variable name is kept as-is because the load cell refers to it.
raw_singal_file = 'all_data.csv'

# Destination directory for the per-metric CSV files written by the save cells.
out_dir = '{}/raw-data-linode-run3-rules/'.format(home_dir)

# --- Time windows (pandas offset aliases: S = seconds, T = minutes) ----------
resample_time_window = '15S'
rate_time_window = '1T'

# 60; not referenced by any cell visible in this part of the notebook.
skip = 60

# When False the save cells only print a notice instead of writing CSV files.
save = True

In [180]:
def extract_metric_series(df, name):
    """Return the samples of one metric as a numeric, time-indexed, sorted frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with at least ``date``, ``value`` and ``metric``
        columns. It is not modified.
    name : str
        Value of the ``metric`` column to select; also used as the name of
        the resulting value column.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` whose ``metric`` equals ``name``, with ``value``
        renamed to ``name`` and converted to numeric, indexed by the parsed
        ``date`` column in ascending order.
    """
    # The former rename(index=str, ...) stringified an index that set_index
    # replaces anyway, so only the column rename is kept.
    metric = (
        df.loc[df['metric'] == name]
        .drop(columns=['metric'])
        .rename(columns={'value': name})
    )
    metric['date'] = pd.to_datetime(metric['date'])
    metric[name] = pd.to_numeric(metric[name])
    # sort_index() returns a new frame; the original call threw the result
    # away, so the data was never actually sorted. A stable sort keeps rows
    # that share a timestamp in their original relative order.
    return metric.set_index('date').sort_index(kind='mergesort')

# df is a time series (DatetimeIndex), expected to be sampled per second.
# interval has the form nS, nT or nH, where n is an integer
# (S for seconds, T for minutes, H for hours).
def rate_series(df, col, interval='S'):
    """Replace ``col`` with its relative change over a fixed time offset.

    For every timestamp ``t`` the new value is
    ``value(t) / value(t - interval) - 1``. Missing samples are
    forward-filled before the ratio is taken.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a DatetimeIndex. It is left untouched; the result is a
        new frame.
    col : str
        Column to convert into a rate.
    interval : str
        pandas offset alias giving the distance between the compared samples.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``col`` replaced by its rate. Rows whose rate is
        NaN (no sample exactly ``interval`` earlier) are dropped.
    """
    rated = df.copy()
    # Forward-fill explicitly: the `fill_method` keyword of pct_change is
    # deprecated in recent pandas releases.
    filled = rated[col].ffill()
    # Work on the single column so frames with extra columns are supported
    # (the previous version assigned a whole-frame result to `col`).
    rated[col] = filled.pct_change(freq=interval)
    return rated.dropna(axis=0, subset=[col])
    
def rate_df(df, interval):
    """Rate the ``value`` column of a single-metric frame, keeping its label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``value`` and ``metric`` columns; the metric label is
        read from the first row.
    interval : str
        pandas offset alias forwarded to ``rate_series``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``rate_series`` with the ``metric`` column
        re-attached. The previous version had no ``return`` and therefore
        always produced ``None``.
    """
    # Positional access: a label lookup with [0] does not mean "first row"
    # once the frame carries a non-default (e.g. datetime) index.
    metric = df['metric'].iloc[0]
    # drop(columns=...) replaces the positional axis argument, which recent
    # pandas releases no longer accept.
    rated = rate_series(df.drop(columns=['metric']), 'value', interval)
    return rated.assign(metric=metric)

In [181]:
# Load the merged signal file. The first line of the file is skipped and
# explicit column names are applied instead of reading a header.
data_df = pd.read_csv(
    merged_dir + raw_singal_file,
    names=['date', 'value', 'metric'],
    header=None,
    skiprows=1,
)
data_df.head(5)

Unnamed: 0,date,value,metric
0,2020-02-27 22:49:53,3504.829879,service_cpu_use
1,2020-02-27 22:49:54,5742.951598,service_cpu_use
2,2020-02-27 22:49:55,7981.073317,service_cpu_use
3,2020-02-27 22:49:56,10242.4575,service_cpu_use
4,2020-02-27 22:49:57,11901.568761,service_cpu_use


In [182]:
# Distinct metric names present in the loaded data.
features = data_df['metric'].unique()
features

array(['service_cpu_use', 'service_memory_use', 'service_cpu_sat',
       'service_net_usage', 'service_disk_usage', 'service_req_total',
       'service_errors', 'service_request_size', 'service_response_size',
       'containers_count', 'system_cpu_usage', 'service_ltcy_200',
       'system_network_usage'], dtype=object)

# Constructing svc_cpu_use metric 

In [183]:
# Pull the service CPU samples and turn them into a relative change over
# rate_time_window, reporting shape and null counts before and after.
svc_cpu_use = extract_metric_series(data_df, 'service_cpu_use')

rows = len(svc_cpu_use)
nulls = svc_cpu_use.isnull().sum()
print("number of null before rating: {}".format(nulls))
print("before rating : {}".format(svc_cpu_use.shape))
print("")

# rate_series drops the rows for which no rate could be computed.
svc_cpu_use = rate_series(svc_cpu_use, 'service_cpu_use', rate_time_window)
print("after rating : {}".format(svc_cpu_use.shape))
nulls = svc_cpu_use.isnull().sum()
print("number of null after rating: {}".format(nulls))
print("")

removed = rows - len(svc_cpu_use)
print("number of rows removed after rating {} ".format(removed))
del removed  # keep the notebook namespace unchanged


svc_cpu_use.head()

number of null before rating: service_cpu_use    0
dtype: int64
before rating : (78845, 1)

after rating : (78785, 1)
number of null after rating: service_cpu_use    0
dtype: int64

number of rows removed after rating 60 


Unnamed: 0_level_0,service_cpu_use
date,Unnamed: 1_level_1
2020-02-27 22:50:53,1.425095
2020-02-27 22:50:54,0.583049
2020-02-27 22:50:55,0.214302
2020-02-27 22:50:56,0.036171
2020-02-27 22:50:57,-0.111615


In [184]:
# Aggregate the rated series into resample_time_window bins (15 seconds),
# then reshape it to the (val, variable) layout used for the output file.

nulls = svc_cpu_use.isnull().sum()
# Label fixed: this count is taken before resampling, not before rating.
print("number of null before resampling: {}".format(nulls))

# resample() already yields bins in chronological order, so the former bare
# `svc_cpu_use.sort_index()` call (whose result was discarded) is removed.
svc_cpu_use = svc_cpu_use.resample(resample_time_window).sum()

nulls = svc_cpu_use.isnull().sum()
print("number of null after resampling: {}".format(nulls))

# NOTE(review): sum() reports 0 rather than NaN for empty bins (default
# min_count=0), so this interpolation likely has nothing to fill - confirm
# whether min_count=1 was intended.
svc_cpu_use = svc_cpu_use.interpolate(method='linear')

nulls = svc_cpu_use.isnull().sum()
print("number of null after interpolation: {}".format(nulls))

svc_cpu_use = svc_cpu_use.rename(columns={"service_cpu_use" : "val"})
svc_cpu_use['variable'] = 'svc_cpu_use'

svc_cpu_use.head()

number of null before rating: service_cpu_use    0
dtype: int64
number of null after resampling: service_cpu_use    0
dtype: int64
number of null after interpolation: service_cpu_use    0
dtype: int64


Unnamed: 0_level_0,val,variable
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-27 22:50:45,2.166677,svc_cpu_use
2020-02-27 22:51:00,1.942357,svc_cpu_use
2020-02-27 22:51:15,-2.26765,svc_cpu_use
2020-02-27 22:51:30,-6.739642,svc_cpu_use
2020-02-27 22:51:45,1.954651,svc_cpu_use


In [185]:
# Turn the time index into a regular 'time' column and fix the column order
# to (variable, time, val).

svc_cpu_use = (
    svc_cpu_use
    .rename_axis('time')
    .reset_index()
    .loc[:, ['variable', 'time', 'val']]
)


svc_cpu_use.head()

Unnamed: 0,variable,time,val
0,svc_cpu_use,2020-02-27 22:50:45,2.166677
1,svc_cpu_use,2020-02-27 22:51:00,1.942357
2,svc_cpu_use,2020-02-27 22:51:15,-2.26765
3,svc_cpu_use,2020-02-27 22:51:30,-6.739642
4,svc_cpu_use,2020-02-27 22:51:45,1.954651


In [186]:
# Write the service CPU series to out_dir when the `save` flag is set.
if save:
    print('saving service_cpu_use to file {} with dimension {}'.format(out_dir + '13_svc_cpu_use.csv', svc_cpu_use.shape))
    svc_cpu_use.to_csv(path_or_buf=out_dir + '13_svc_cpu_use.csv', index=False)
else:
    # Message fixed: it named a different metric (system_cpu_usage) and
    # misspelled "Saving".
    print("service_cpu_use data is not saved. Saving flag is turned off!")
    


saving service_cpu_use to file /Users/hmohamed/github/data-research-spring2020/raw-data-linode-run3-rules/13_svc_cpu_use.csv with dimension (5253, 3)


# Constructing svc_net_use

In [187]:
# Pull the service network samples and turn them into a relative change over
# rate_time_window, reporting shape and null counts before and after.
svc_net_use = extract_metric_series(data_df, 'service_net_usage')

rows = len(svc_net_use)
nulls = svc_net_use.isnull().sum()
print("number of null before rating: {}".format(nulls))
print("before rating : {}".format(svc_net_use.shape))
print("")

# rate_series drops the rows for which no rate could be computed.
svc_net_use = rate_series(svc_net_use, 'service_net_usage', rate_time_window)
print("after rating : {}".format(svc_net_use.shape))
nulls = svc_net_use.isnull().sum()
print("number of null after rating: {}".format(nulls))
print("")

removed = rows - len(svc_net_use)
print("number of rows removed after rating {} ".format(removed))
del removed  # keep the notebook namespace unchanged
svc_net_use.head()

number of null before rating: service_net_usage    0
dtype: int64
before rating : (78847, 1)

after rating : (78787, 1)
number of null after rating: service_net_usage    0
dtype: int64

number of rows removed after rating 60 


Unnamed: 0_level_0,service_net_usage
date,Unnamed: 1_level_1
2020-02-27 22:50:51,0.046134
2020-02-27 22:50:52,0.017068
2020-02-27 22:50:53,-0.026434
2020-02-27 22:50:54,-0.062983
2020-02-27 22:50:55,-0.090031


In [188]:
# Aggregate the rated series into resample_time_window bins (15 seconds),
# then reshape it to the (val, variable) layout used for the output file.

nulls = svc_net_use.isnull().sum()
# Label fixed: this count is taken before resampling, not before rating.
print("number of null before resampling: {}".format(nulls))

# resample() already yields bins in chronological order, so the former bare
# `svc_net_use.sort_index()` call (whose result was discarded) is removed.
svc_net_use = svc_net_use.resample(resample_time_window).sum()

nulls = svc_net_use.isnull().sum()
print("number of null after resampling: {}".format(nulls))

# NOTE(review): sum() reports 0 rather than NaN for empty bins (default
# min_count=0), so this interpolation likely has nothing to fill - confirm
# whether min_count=1 was intended.
svc_net_use = svc_net_use.interpolate(method='linear')

nulls = svc_net_use.isnull().sum()
print("number of null after interpolation: {}".format(nulls))

svc_net_use = svc_net_use.rename(columns={"service_net_usage" : "val"})
svc_net_use['variable'] = 'svc_net_use'

svc_net_use.head()

number of null before rating: service_net_usage    0
dtype: int64
number of null after resampling: service_net_usage    0
dtype: int64
number of null after interpolation: service_net_usage    0
dtype: int64


Unnamed: 0_level_0,val,variable
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-27 22:50:45,-0.571974,svc_net_use
2020-02-27 22:51:00,-3.788081,svc_net_use
2020-02-27 22:51:15,1.775082,svc_net_use
2020-02-27 22:51:30,2.534885,svc_net_use
2020-02-27 22:51:45,-0.661786,svc_net_use


In [189]:
# Turn the time index into a regular 'time' column and fix the column order
# to (variable, time, val).

svc_net_use = (
    svc_net_use
    .rename_axis('time')
    .reset_index()
    .loc[:, ['variable', 'time', 'val']]
)


svc_net_use.head()

Unnamed: 0,variable,time,val
0,svc_net_use,2020-02-27 22:50:45,-0.571974
1,svc_net_use,2020-02-27 22:51:00,-3.788081
2,svc_net_use,2020-02-27 22:51:15,1.775082
3,svc_net_use,2020-02-27 22:51:30,2.534885
4,svc_net_use,2020-02-27 22:51:45,-0.661786


In [190]:
# Write the service network series to out_dir when the `save` flag is set.
if save:
    print('saving service_net_use to file {} with dimension {}'.format(out_dir + '14_svc_net_use.csv', svc_net_use.shape))
    svc_net_use.to_csv(path_or_buf=out_dir + '14_svc_net_use.csv', index=False)
else:
    # Message fixed: it named a different metric (system_net_usage) and
    # misspelled "Saving".
    print("service_net_use data is not saved. Saving flag is turned off!")

saving service_net_use to file /Users/hmohamed/github/data-research-spring2020/raw-data-linode-run3-rules/14_svc_net_use.csv with dimension (5253, 3)


# Constructing svc_disk_use

In [191]:
# Pull the service disk samples and turn them into a relative change over
# rate_time_window, reporting shape and null counts before and after.
svc_disk_use = extract_metric_series(data_df, 'service_disk_usage')

rows = len(svc_disk_use)
nulls = svc_disk_use.isnull().sum()
print("number of null before rating: {}".format(nulls))
print("before rating : {}".format(svc_disk_use.shape))
print("")

# rate_series drops the rows for which no rate could be computed.
svc_disk_use = rate_series(svc_disk_use, 'service_disk_usage', rate_time_window)
print("after rating : {}".format(svc_disk_use.shape))
nulls = svc_disk_use.isnull().sum()
print("number of null after rating: {}".format(nulls))
print("")

removed = rows - len(svc_disk_use)
print("number of rows removed after rating {} ".format(removed))
del removed  # keep the notebook namespace unchanged
svc_disk_use.head()

number of null before rating: service_disk_usage    0
dtype: int64
before rating : (78849, 1)

after rating : (78789, 1)
number of null after rating: service_disk_usage    0
dtype: int64

number of rows removed after rating 60 


Unnamed: 0_level_0,service_disk_usage
date,Unnamed: 1_level_1
2020-02-27 22:50:51,0.186792
2020-02-27 22:50:52,1.562741
2020-02-27 22:50:53,1.488394
2020-02-27 22:50:54,1.299468
2020-02-27 22:50:55,1.989912


In [192]:
# Aggregate the rated series into resample_time_window bins (15 seconds),
# then reshape it to the (val, variable) layout used for the output file.

nulls = svc_disk_use.isnull().sum()
# Label fixed: this count is taken before resampling, not before rating.
print("number of null before resampling: {}".format(nulls))

# resample() already yields bins in chronological order, so the former bare
# `svc_disk_use.sort_index()` call (whose result was discarded) is removed.
svc_disk_use = svc_disk_use.resample(resample_time_window).sum()

nulls = svc_disk_use.isnull().sum()
print("number of null after resampling: {}".format(nulls))

# NOTE(review): sum() reports 0 rather than NaN for empty bins (default
# min_count=0), so this interpolation likely has nothing to fill - confirm
# whether min_count=1 was intended.
svc_disk_use = svc_disk_use.interpolate(method='linear')

nulls = svc_disk_use.isnull().sum()
print("number of null after interpolation: {}".format(nulls))

svc_disk_use = svc_disk_use.rename(columns={"service_disk_usage" : "val"})
svc_disk_use['variable'] = 'svc_disk_use'

svc_disk_use.head()

number of null before rating: service_disk_usage    0
dtype: int64
number of null after resampling: service_disk_usage    0
dtype: int64
number of null after interpolation: service_disk_usage    0
dtype: int64


Unnamed: 0_level_0,val,variable
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-27 22:50:45,5.032348,svc_disk_use
2020-02-27 22:51:00,511.058311,svc_disk_use
2020-02-27 22:51:15,243.907914,svc_disk_use
2020-02-27 22:51:30,2.163715,svc_disk_use
2020-02-27 22:51:45,26.683041,svc_disk_use


In [193]:
# Turn the time index into a regular 'time' column and fix the column order
# to (variable, time, val).

svc_disk_use = (
    svc_disk_use
    .rename_axis('time')
    .reset_index()
    .loc[:, ['variable', 'time', 'val']]
)


svc_disk_use.head()

Unnamed: 0,variable,time,val
0,svc_disk_use,2020-02-27 22:50:45,5.032348
1,svc_disk_use,2020-02-27 22:51:00,511.058311
2,svc_disk_use,2020-02-27 22:51:15,243.907914
3,svc_disk_use,2020-02-27 22:51:30,2.163715
4,svc_disk_use,2020-02-27 22:51:45,26.683041


In [194]:
# Write the service disk series to out_dir when the `save` flag is set.
if save:
    # Copy-paste fix: the message used to name service_net_use and report
    # svc_net_use.shape while the frame being written is svc_disk_use.
    print('saving service_disk_use to file {} with dimension {}'.format(out_dir + '15_svc_disk_use.csv', svc_disk_use.shape))
    svc_disk_use.to_csv(path_or_buf=out_dir + '15_svc_disk_use.csv', index=False)
else:
    # Message fixed: it named a different metric (system_disk_usage) and
    # misspelled "Saving".
    print("service_disk_use data is not saved. Saving flag is turned off!")

saving service_net_use to file /Users/hmohamed/github/data-research-spring2020/raw-data-linode-run3-rules/15_svc_disk_use.csv with dimension (5253, 3)
