# 1- Raw Signal Metrics Data Transformation

This file to be executed first

This notebook transforms raw metrics signals into adequate values and saves them as time series data to different output files. The outputfiles will be having data in columns named as 'date', 'metric', 'value'. 

In [167]:
import numpy as np
import pandas as pd
from datetime import datetime

In [168]:
# 1- read processed file

home_dir = '/Users/hmohamed/github/data-research-spring2020'

file_dir = home_dir + '/raw-data-linode-run3/'
merged_dir = file_dir + 'merged/' 

# containers / istio input files
service_cpu_usage_file = '1_service_cpu_use.csv'
service_memory_usage_file = '2_service_memory_use.csv'
service_cpu_sat_file = '3_service_cpu_sat.csv'
service_net_usage_file = '4_service_net_usage.csv'
service_disk_usage_file = '5_service_disk_usage.csv'
service_req_total_file = '6_service_req_total.csv'
#service_ltcy_file = '7_service_ltcy_200.csv'
service_errors_file = '8_service_errors.csv'
service_request_size_file = '9_service_request_size.csv'
service_response_size_file = '10_service_response_size.csv'
containers_count_file = '11_containers_count.csv'
system_network_receive = '15_system_network_receive.csv'
system_network_transmit = '16_system_network_transmit.csv'
service_ltcy_sum = '17_service_ltcy_sum.csv'
service_ltcy_count = '18_service_ltcy_count.csv'

node_load1_file = '13_node_load1.csv'
node_cpu_seconds_total_file = '14_node_cpu_seconds_total.csv'

# system input files
system_cpu_usage_file = '12_system_cpu_usage.csv'


service_input_files = [service_cpu_usage_file
                      , service_memory_usage_file
                      , service_cpu_sat_file
                      , service_net_usage_file
                      , service_disk_usage_file
                      , service_req_total_file
                      #, service_ltcy_file
                      , service_ltcy_sum
                      , service_ltcy_count
                      , service_errors_file
                      , service_request_size_file
                      , service_response_size_file                       
                      , containers_count_file
                     ]

system_input_files = [
    system_cpu_usage_file
#    ,system_network_receive
#    ,system_network_transmit
    
]

concated_data_file = 'all_data.csv'

services = ['checkoutservice'
,'cartservice'
,'emailservice'
,'currencyservice'
,'paymentservice'
,'productcatalogservice'
,'shippingservice'
] 


save=True
frequency = '1S'  # S for second , T for minute


In [169]:
# This function merges and alines the metrics timeseries data into a data frame, a column for every feature
def expand(df, by_col, by_col_values, dup='sum'):
    # first convert to time series
    df = toTimeSeries(df, 'date')
    metrics_df = pd.DataFrame()
    i = 0
    for col_value in by_col_values:
        print("Processing metric for column: %", col_value)
        series = extractMetricSeries(df, by_col, col_value)
        series = resample(series, value_column_name=col_value, dup=dup)  
        if i == 0:
            metrics_df = series
        else:
            metrics_df = merge(metrics_df, series)
        i = i + 1
    return metrics_df 

# T for minutes, S for seconds
# remedy duplicates by either taking the maximum (=max) or average  (=mean) them
def resample(df, value_column_name, index_col_name='date', frequency = frequency, interpolate = True
             , interpolate_method = 'linear', base=6, dup = 'sum'):
    # eliminate dups in timestamp
    if dup == 'max':
        df = df.groupby([index_col_name])[value_column_name].max()   # taking max
        df = pd.DataFrame(df)
    elif dup == 'sum':
        df = df.groupby([index_col_name])[value_column_name].sum()
    else:
        df = df.groupby([index_col_name]).mean()   # taking mean
        
    df.index = pd.to_datetime(df.index)
    df.sort_index(inplace=True)  # order the timeseries
    
    # fill in missing interval (upsample)
    shape_before = df.shape
    resampled = df.resample(frequency, kind='timestamp', base=base)  #.bfill()
    
    if interpolate:
        resampled = resampled.interpolate(method=interpolate_method)    
        
    print("dimention before resampling is: {}".format(shape_before))
    return resampled

def toTimeSeries(df, index_col_name='date', value_col_name='value'):
    df[index_col_name] = pd.to_datetime(df[index_col_name])
    df[value_col_name] = pd.to_numeric(df[value_col_name])
    df.set_index(index_col_name, inplace=True)
    df.sort_index(inplace=True)
    return df


# This function extracts timeseries of one named service from the whole raw timeseries data
def extractMetricSeries(df, col_name, col_value):
    metric = df.loc[df[col_name] == col_value].drop([col_name], axis=1).rename(index=str, columns={"value": col_value})
    metric.sort_index(inplace=True)
    return metric  

# sum df rows, remove expanded columns and set a new column with a metric name
def sumTimeseries(df, columns_to_delete, metric_name, metric_col_name='metric'):
    df['value'] = df.sum(axis=1)
    df[metric_col_name] = metric_name
    for col in columns_to_delete:
        df = df.drop([col], axis=1)
    return df 

# max df rows, remove expanded columns and set a new column with a metric name
def maxTimeseries(df, columns_to_delete, metric_name, metric_col_name='metric'):
    df['value'] = df.max(axis=1)
    df[metric_col_name] = metric_name
    for col in columns_to_delete:
        df = df.drop([col], axis=1)
    return df

# average df rows, remove expanded columns and set a new column with a metric name
def avgTimeseries(df, columns_to_delete, metric_name, metric_col_name='metric'):
    df['value'] = df.mean(axis=1)
    df[metric_col_name] = metric_name
    for col in columns_to_delete:
        df = df.drop([col], axis=1)
    return df

# average df rows, remove expanded columns and set a new column with a metric name
def countTimeseries(df, columns_to_delete, metric_name, metric_col_name='metric'):
    df['value'] = df.count(axis=1)
    df[metric_col_name] = metric_name
    for col in columns_to_delete:
        df = df.drop([col], axis=1)
    return df

def merge(df, series):
    return pd.merge_asof(df, series, left_index=True, right_index=True, tolerance=pd.Timedelta('1 second')).bfill()    

    

In [170]:
# process istio and container metrics data signals
for file in service_input_files:
    orig_file = file
    print('processing input file {}'.format(file))
    pos = file.find('_')
    metric_name = file[pos+1:]
    pos = metric_name.find('.')
    metric_name = metric_name[:pos]
    print('processing metric {}'.format(metric_name))
    
    data_df = pd.read_csv(file_dir + file)
    data_df.rename(columns={'ztime': 'date', 'name':'metric'}, inplace=True)
        
    # expand and either average or max any duplicates in timestamps series
    expanded_df = expand(data_df, by_col='service', by_col_values=services)
    
    # sum timeseries rows (accumulate the services)
    sum_df = sumTimeseries(expanded_df, columns_to_delete=services, metric_name=metric_name)

    if save:
        print('saving {} data with shape {}'.format(orig_file, sum_df.shape))
        save_to_file = merged_dir + orig_file
        sum_df.to_csv(path_or_buf=save_to_file, index=True) 
        print("----------------")
    else:
        print("Metric data is not saved. Savig flag is turned off!")        


processing input file 1_service_cpu_use.csv
processing metric service_cpu_use
Processing metric for column: % checkoutservice
dimention before resampling is: (31755,)
Processing metric for column: % cartservice
dimention before resampling is: (33399,)
Processing metric for column: % emailservice
dimention before resampling is: (32712,)
Processing metric for column: % currencyservice
dimention before resampling is: (33053,)
Processing metric for column: % paymentservice
dimention before resampling is: (32376,)
Processing metric for column: % productcatalogservice
dimention before resampling is: (33172,)
Processing metric for column: % shippingservice
dimention before resampling is: (32077,)
saving 1_service_cpu_use.csv data with shape (78845, 2)
----------------
processing input file 2_service_memory_use.csv
processing metric service_memory_use
Processing metric for column: % checkoutservice
dimention before resampling is: (31682,)
Processing metric for column: % cartservice
dimention b

Processing metric for column: % productcatalogservice
dimention before resampling is: (5065,)
Processing metric for column: % shippingservice
dimention before resampling is: (5036,)
saving 10_service_response_size.csv data with shape (78841, 2)
----------------
processing input file 11_containers_count.csv
processing metric containers_count
Processing metric for column: % checkoutservice
dimention before resampling is: (5204,)
Processing metric for column: % cartservice
dimention before resampling is: (5208,)
Processing metric for column: % emailservice
dimention before resampling is: (5208,)
Processing metric for column: % currencyservice
dimention before resampling is: (5211,)
Processing metric for column: % paymentservice
dimention before resampling is: (5207,)
Processing metric for column: % productcatalogservice
dimention before resampling is: (5203,)
Processing metric for column: % shippingservice
dimention before resampling is: (5214,)
saving 11_containers_count.csv data with sh

# system_cpu_usage

In [171]:
# system_cpu_usage
system_cpu_usage_df = pd.read_csv(file_dir + system_cpu_usage_file)
system_cpu_usage_df.rename(columns={'ztime': 'date', 'name':'metric'}, inplace=True)

# find nodes in the system
nodes = system_cpu_usage_df.dropna(subset=['node']).node.unique()

system_cpu_usage_expanded = expand(system_cpu_usage_df, by_col='node'
                                   , by_col_values=nodes, dup='mean')

system_cpu_usage_sum = sumTimeseries(system_cpu_usage_expanded, columns_to_delete=nodes
                                          , metric_name='system_cpu_usage')

if save:
    print('saving system_cpu_usage to file {} with dimension {}'.format(merged_dir + system_cpu_usage_file, system_cpu_usage_sum.shape))       
    system_cpu_usage_sum.to_csv(path_or_buf=merged_dir + system_cpu_usage_file, index=True) 
else:
    print("system_cpu_usage data is not saved. Savig flag is turned off!") 

Processing metric for column: % 192.168.181.164
dimention before resampling is: (5257, 1)
Processing metric for column: % 192.168.228.12
dimention before resampling is: (5257, 1)
Processing metric for column: % 192.168.227.189
dimention before resampling is: (4676, 1)
Processing metric for column: % 192.168.189.71
dimention before resampling is: (5257, 1)
Processing metric for column: % 192.168.227.202
dimention before resampling is: (5256, 1)
saving system_cpu_usage to file /Users/hmohamed/github/data-research-spring2020/raw-data-linode-run3/merged/12_system_cpu_usage.csv with dimension (78841, 2)


# system_cpu_sat

--- skip ---

In [172]:
# skip system_cpu_sat
# system_cpu_sat
node_load_df = pd.read_csv(file_dir + node_load1_file)
node_load_df.rename(columns={'ztime': 'date', 'name':'metric'}, inplace=True)
node_load_expanded = expand(node_load_df, by_col='node'
                                   , by_col_values=nodes, dup='sum')
nodes = node_load_expanded.columns

node_load_expanded.head(3)



Processing metric for column: % 192.168.181.164
dimention before resampling is: (7485,)
Processing metric for column: % 192.168.228.12
dimention before resampling is: (7580,)
Processing metric for column: % 192.168.227.189
dimention before resampling is: (6922,)
Processing metric for column: % 192.168.189.71
dimention before resampling is: (7703,)
Processing metric for column: % 192.168.227.202
dimention before resampling is: (7130,)


Unnamed: 0_level_0,192.168.181.164,192.168.228.12,192.168.227.189,192.168.189.71,192.168.227.202
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-02-27 16:36:28,1.21,0.866,10.176,0.478,0.996667
2020-02-27 16:36:29,1.192,0.872,10.108,0.470667,1.013333
2020-02-27 16:36:30,1.174,0.878,10.04,0.463333,1.03


In [173]:
# skip system_cpu_sat
# system_cpu_sat
node_cpu_total_df = pd.read_csv(file_dir + node_cpu_seconds_total_file)
node_cpu_total_df.rename(columns={'ztime': 'date', 'name':'metric'}, inplace=True)
node_cpu_total_expanded = expand(node_cpu_total_df, by_col='node'
                                   , by_col_values=nodes, dup='count')
node_cpu_total_expanded.head(3)

Processing metric for column: % 192.168.181.164
dimention before resampling is: (8404, 1)
Processing metric for column: % 192.168.228.12
dimention before resampling is: (8422, 1)
Processing metric for column: % 192.168.227.189
dimention before resampling is: (7632, 1)
Processing metric for column: % 192.168.189.71
dimention before resampling is: (8419, 1)
Processing metric for column: % 192.168.227.202
dimention before resampling is: (8413, 1)


Unnamed: 0_level_0,192.168.181.164,192.168.228.12,192.168.227.189,192.168.189.71,192.168.227.202
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-02-27 16:36:28,847.72,104.499667,468.161,241.6715,241.963333
2020-02-27 16:36:29,847.805,104.514333,468.212167,241.692,241.985167
2020-02-27 16:36:30,847.89,104.529,468.263333,241.7125,242.007


# system_network_use

In [174]:
# System network recieve
system_net_receive_df = pd.read_csv(file_dir + system_network_receive)
system_net_receive_df.rename(columns={'ztime': 'date', 'name':'metric'}, inplace=True)

# find nodes in the system
nodes = system_net_receive_df.dropna(subset=['node']).node.unique()

system_net_receive_expanded = expand(system_net_receive_df, by_col='node'
                                   , by_col_values=nodes, dup='mean')

system_net_receive_sum = sumTimeseries(system_net_receive_expanded, columns_to_delete=nodes
                                          , metric_name='system_network_receive')

system_net_receive_sum.head(3)

Processing metric for column: % 192.168.228.12
dimention before resampling is: (8464, 1)
Processing metric for column: % 192.168.189.71
dimention before resampling is: (8470, 1)
Processing metric for column: % 192.168.227.189
dimention before resampling is: (7692, 1)
Processing metric for column: % 192.168.227.202
dimention before resampling is: (8467, 1)
Processing metric for column: % 192.168.181.164
dimention before resampling is: (8468, 1)


Unnamed: 0_level_0,value,metric
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-27 16:36:27,868665400.0,system_network_receive
2020-02-27 16:36:28,868739800.0,system_network_receive
2020-02-27 16:36:29,868818200.0,system_network_receive


In [175]:
system_net_receive_sum.shape

(127614, 2)

In [176]:
# System network recieve
system_net_transmit_df = pd.read_csv(file_dir + system_network_transmit)
system_net_transmit_df.rename(columns={'ztime': 'date', 'name':'metric'}, inplace=True)

# find nodes in the system
nodes = system_net_transmit_df.dropna(subset=['node']).node.unique()

system_net_transmit_expanded = expand(system_net_transmit_df, by_col='node'
                                   , by_col_values=nodes, dup='mean')

system_net_transmit_sum = sumTimeseries(system_net_transmit_expanded, columns_to_delete=nodes
                                          , metric_name='system_network_receive')

system_net_transmit_sum.head(3)

Processing metric for column: % 192.168.228.12
dimention before resampling is: (8462, 1)
Processing metric for column: % 192.168.189.71
dimention before resampling is: (8469, 1)
Processing metric for column: % 192.168.227.202
dimention before resampling is: (8470, 1)
Processing metric for column: % 192.168.227.189
dimention before resampling is: (7692, 1)
Processing metric for column: % 192.168.181.164
dimention before resampling is: (8465, 1)


Unnamed: 0_level_0,value,metric
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-27 16:36:27,421948300.0,system_network_receive
2020-02-27 16:36:28,422008600.0,system_network_receive
2020-02-27 16:36:29,422074700.0,system_network_receive


In [177]:
system_net_transmit_sum.shape

(127614, 2)

In [178]:
# sum network recieved and transmit
system_network_usage_df =  system_net_receive_sum['value'] + system_net_transmit_sum['value']

system_network_usage_df = pd.DataFrame(system_network_usage_df, columns=['value'])
system_network_usage_df['metric'] = 'system_network_usage'
system_network_usage_df.sort_index()

# check number of nulls
nulls = system_network_usage_df.isnull().sum()
if nulls['value'] > 0:
    ltcy_df = system_network_usage_df.fillna(ltcy_df.mean())
        
system_network_usage_df.head(5) 


Unnamed: 0_level_0,value,metric
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-27 16:36:27,1290614000.0,system_network_usage
2020-02-27 16:36:28,1290748000.0,system_network_usage
2020-02-27 16:36:29,1290893000.0,system_network_usage
2020-02-27 16:36:30,1291037000.0,system_network_usage
2020-02-27 16:36:31,1291182000.0,system_network_usage


# Concatenate all data together

In [179]:
i = 1
all_files = service_input_files + system_input_files
for file in all_files:
    input_file = merged_dir + file
    print('reading data from {}'.format(input_file))
    input_df = pd.read_csv(input_file)
    #input_df = toTimeSeries(input_df)
    if i == 1:
        timeseries_df = input_df
    else:
        timeseries_df = pd.concat([timeseries_df, input_df], ignore_index=True)
    i = i +1
    

reading data from /Users/hmohamed/github/data-research-spring2020/raw-data-linode-run3/merged/1_service_cpu_use.csv
reading data from /Users/hmohamed/github/data-research-spring2020/raw-data-linode-run3/merged/2_service_memory_use.csv
reading data from /Users/hmohamed/github/data-research-spring2020/raw-data-linode-run3/merged/3_service_cpu_sat.csv
reading data from /Users/hmohamed/github/data-research-spring2020/raw-data-linode-run3/merged/4_service_net_usage.csv
reading data from /Users/hmohamed/github/data-research-spring2020/raw-data-linode-run3/merged/5_service_disk_usage.csv
reading data from /Users/hmohamed/github/data-research-spring2020/raw-data-linode-run3/merged/6_service_req_total.csv
reading data from /Users/hmohamed/github/data-research-spring2020/raw-data-linode-run3/merged/17_service_ltcy_sum.csv
reading data from /Users/hmohamed/github/data-research-spring2020/raw-data-linode-run3/merged/18_service_ltcy_count.csv
reading data from /Users/hmohamed/github/data-research-s

# Average Latency


In [180]:
if True:    
    ltcy_sum_df = extractMetricSeries(timeseries_df,'metric','service_ltcy_sum')
    ltcy_sum_df = toTimeSeries(ltcy_sum_df, value_col_name='service_ltcy_sum')
    ltcy_sum_df.sort_index()

    ltcy_count_df = extractMetricSeries(timeseries_df,'metric','service_ltcy_count' )  
    ltcy_count_df = toTimeSeries(ltcy_count_df, value_col_name='service_ltcy_count')
    ltcy_count_df.sort_index()

    ltcy_df = ltcy_sum_df['service_ltcy_sum'] / ltcy_count_df['service_ltcy_count']

    ltcy_df = pd.DataFrame(ltcy_df, columns=['value'])
    ltcy_df['metric'] = 'service_ltcy_200'
    ltcy_df.sort_index()

    # check number of nulls
    nulls = ltcy_df.isnull().sum()
    if nulls['value'] > 0:
        ltcy_df = ltcy_df.fillna(ltcy_df.mean())
        
    # add that to the time series and remove the ltcy count and sum signal

    ltcy_df = ltcy_df.reset_index()

    timeseries_df = pd.concat([timeseries_df, ltcy_df], ignore_index=True)

    # remove service latency sum and count signals
    timeseries_df = timeseries_df[timeseries_df.metric != 'service_ltcy_sum']
    timeseries_df = timeseries_df[timeseries_df.metric != 'service_ltcy_count']

timeseries_df.head(5)          
       

Unnamed: 0,date,value,metric
0,2020-02-27 22:49:53,3504.829879,service_cpu_use
1,2020-02-27 22:49:54,5742.951598,service_cpu_use
2,2020-02-27 22:49:55,7981.073317,service_cpu_use
3,2020-02-27 22:49:56,10242.4575,service_cpu_use
4,2020-02-27 22:49:57,11901.568761,service_cpu_use


In [181]:
# add system_network_usage
system_network_usage_df.sort_index()
system_network_usage_df = system_network_usage_df.reset_index()
timeseries_df = pd.concat([timeseries_df, system_network_usage_df], ignore_index=True)

timeseries_df.head(5) 

Unnamed: 0,date,value,metric
0,2020-02-27 22:49:53,3504.829879,service_cpu_use
1,2020-02-27 22:49:54,5742.951598,service_cpu_use
2,2020-02-27 22:49:55,7981.073317,service_cpu_use
3,2020-02-27 22:49:56,10242.4575,service_cpu_use
4,2020-02-27 22:49:57,11901.568761,service_cpu_use


# Save to file

In [182]:
if save:
    print('saving system_cpu_usage to file {} with dimension {}'.format(merged_dir + concated_data_file, timeseries_df.shape))       
    timeseries_df.to_csv(path_or_buf=merged_dir + concated_data_file, index=True) 
else:
    print("system_cpu_usage data is not saved. Savig flag is turned off!")

saving system_cpu_usage to file /Users/hmohamed/github/data-research-spring2020/raw-data-linode-run3/merged/all_data.csv with dimension (1073721, 3)


In [183]:
timeseries_df.head(10)

Unnamed: 0,date,value,metric
0,2020-02-27 22:49:53,3504.829879,service_cpu_use
1,2020-02-27 22:49:54,5742.951598,service_cpu_use
2,2020-02-27 22:49:55,7981.073317,service_cpu_use
3,2020-02-27 22:49:56,10242.4575,service_cpu_use
4,2020-02-27 22:49:57,11901.568761,service_cpu_use
5,2020-02-27 22:49:58,10968.804868,service_cpu_use
6,2020-02-27 22:49:59,10126.117869,service_cpu_use
7,2020-02-27 22:50:00,11627.063366,service_cpu_use
8,2020-02-27 22:50:01,11425.092013,service_cpu_use
9,2020-02-27 22:50:02,10639.756226,service_cpu_use
