In [324]:
import numpy as np
import pandas as pd
from datetime import datetime

In [325]:
# 1- read processed file
# Raw per-metric CSVs are read from file_dir; merged results are written to merged_dir.
file_dir = '../../data/raw-data-linode-run3/'
merged_dir = file_dir + 'merged/'

# One input file per metric, named '<sequence number>_<metric name>'.
service_cpu_usage_file = '1_service_cpu_use.csv'
service_memory_usage_file = '2_service_memory_use.csv'
service_cpu_sat_file = '3_service_cpu_sat.csv'
service_net_usage_file = '4_service_net_usage.csv'
service_disk_usage_file = '5_service_disk_usage.csv'
service_req_total_file = '6_service_req_total.csv'
service_ltcy_file = '7_service_ltcy.csv'
service_errors_file = '8_service_errors.csv'
service_request_size_file = '9_service_request_size.csv'
service_response_size_file = '10_service_response_size.csv'
# NOTE(review): unlike the other entries this name has no '.csv' extension - confirm it is intended.
containers_count_file = '11_containers_count'

# Files whose per-service series are summed into a single 'value' column.
input_files_to_sum = [
    service_cpu_usage_file,
    service_memory_usage_file,
    service_cpu_sat_file,
    service_net_usage_file,
    service_disk_usage_file,
    service_req_total_file,
    service_ltcy_file,
    service_errors_file,
    service_request_size_file,
    service_response_size_file,
]

# make sure to use max for the containers count metric when fixing duplicates on timestamp entries
input_files_to_max = [containers_count_file]

# When False, the processing cells only print a notice instead of writing CSV files.
save = True


In [326]:
# Load the raw CPU usage samples. The file name is the one declared in the
# configuration cell (service_cpu_usage_file); the previous name
# `svc_cpu_usage_file` is not defined anywhere and fails on a fresh kernel.
svc_cpu_usage_df = pd.read_csv(file_dir + service_cpu_usage_file)
svc_cpu_usage_df.shape

(347791, 4)

In [327]:
# Rename the raw columns to the names used by the rest of the notebook
# ('date' is the default index column of toTimeSeries).
column_renames = {'ztime': 'date', 'name': 'metric'}
svc_cpu_usage_df.rename(columns=column_renames, inplace=True)
svc_cpu_usage_df.head()

Unnamed: 0,metric,date,service,value
0,service_cpu_usage,2020-02-28 05:59:52,productcatalogservice,0.036354
1,service_cpu_usage,2020-02-28 05:59:33,productcatalogservice,0.036354
2,service_cpu_usage,2020-02-28 05:59:15,productcatalogservice,0.036354
3,service_cpu_usage,2020-02-28 05:59:04,productcatalogservice,0.036354
4,service_cpu_usage,2020-02-28 05:58:40,productcatalogservice,0.036354


In [328]:
 # Values of the 'service' column that expand() turns into one column each.
 services = [
     'checkoutservice',
     'cartservice',
     'emailservice',
     'currencyservice',
     'paymentservice',
     'productcatalogservice',
     'shippingservice',
 ]
       

In [329]:
def toTimeSeries(df, index_col_name='date', value_col_name='value'):
    """Return a copy of ``df`` indexed and sorted by a parsed datetime column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``index_col_name`` and ``value_col_name``.
    index_col_name : str
        Column parsed with ``pd.to_datetime`` and used as the new index.
    value_col_name : str
        Column coerced with ``pd.to_numeric``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a sorted ``DatetimeIndex``; the input is not modified.

    Raises
    ------
    KeyError
        If either named column is missing from ``df``.
    """
    # Work on a copy: converting in place would consume the date column of the
    # caller's frame, so re-running the calling cell would fail with KeyError.
    series_df = df.copy()
    series_df[index_col_name] = pd.to_datetime(series_df[index_col_name])
    series_df[value_col_name] = pd.to_numeric(series_df[value_col_name])
    return series_df.set_index(index_col_name).sort_index()


# This function extracts the timeseries of one named service from the whole raw timeseries data
def extractMetricSeries(df, col_name, col_value):
    """Select the rows where ``df[col_name] == col_value`` as a single series frame.

    The selector column ``col_name`` is dropped, the ``"value"`` column is
    renamed to ``col_value``, index labels are converted to ``str`` and the
    result is returned sorted by that index. ``df`` itself is not modified.
    """
    row_mask = df[col_name] == col_value
    selected = df.loc[row_mask].drop(columns=[col_name])
    relabelled = selected.rename(index=str, columns={"value": col_value})
    return relabelled.sort_index()

# T for minutes, S for seconds
# aggregate of duplicates could be either by taking the maximum (=max) or average  (=rate)
def resample(df, index_col_name='date', frequency = 'S', interpolate = True
             , interpolate_method = 'linear', base=6, aggregate = 'rate'):
    """De-duplicate timestamps and upsample ``df`` onto a regular time grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index (or a column) is named ``index_col_name`` and holds
        values parseable by ``pd.to_datetime``.
    index_col_name : str
        Name of the timestamp key used to group duplicate entries.
    frequency : str
        Target frequency alias. The legacy single-letter aliases
        (``'S'``, ``'T'``, ...) are still accepted.
    interpolate : bool
        When true, ``DataFrame.interpolate`` is applied after the backfill.
    interpolate_method : str
        Method forwarded to ``DataFrame.interpolate``.
    base : int
        Legacy bin-origin shift, expressed in units of the frequency.
    aggregate : str
        ``'max'`` keeps the maximum of duplicate timestamps; any other value
        averages the numeric columns.

    Returns
    -------
    pandas.DataFrame
        Frame with a regular ``DatetimeIndex`` at ``frequency``.
    """
    # The upper-case aliases are deprecated in recent pandas; the lower-case
    # spellings are understood by old and new versions alike.
    legacy_aliases = {'S': 's', 'T': 'min', 'H': 'h', 'L': 'ms', 'U': 'us', 'N': 'ns'}
    frequency = legacy_aliases.get(frequency, frequency)

    # eliminate dups in timestamp
    grouped = df.groupby([index_col_name])
    if aggregate == 'max':
        df = grouped.max()   # taking max
    else:
        # numeric_only=True spells out the old implicit behaviour of dropping
        # non-numeric columns; recent pandas raises on them otherwise.
        df = grouped.mean(numeric_only=True)   # taking mean
    df.index = pd.to_datetime(df.index)

    # `base` (and the redundant `kind='timestamp'`) are no longer accepted by
    # DataFrame.resample. Translate `base` to the equivalent `offset`, and only
    # pass it when it actually shifts the bins (it never does for a step of 1).
    resample_kwargs = {}
    if base:
        step = pd.tseries.frequencies.to_offset(frequency)
        try:
            step_nanos = step.nanos
        except ValueError:
            step_nanos = 0   # non-fixed frequency: `base` had no effect there
        if step_nanos:
            shift_nanos = (base % step.n) * step_nanos // step.n
            if shift_nanos:
                resample_kwargs['offset'] = pd.Timedelta(shift_nanos, unit='ns')

    # fill in missing interval (upsample)
    resampled = df.resample(frequency, **resample_kwargs).bfill()
    if interpolate:
        # NOTE(review): the backfill above already fills the gaps between
        # samples, so this step can only touch NaNs that bfill left behind -
        # confirm whether interpolation was meant to replace the backfill.
        resampled = resampled.interpolate(method=interpolate_method)
    return resampled

# This function merges and aligns the metrics timeseries data into a data frame, a column for every feature
def expand(df, by_col, by_col_values):
    """Pivot long-format metric rows into one resampled column per value.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with a 'date' column, a 'value' column and ``by_col``.
    by_col : str
        Column whose values identify the individual series.
    by_col_values : iterable of str
        Values of ``by_col`` to extract; each becomes a column of the result.

    Returns
    -------
    pandas.DataFrame
        The resampled series merged on their time index, in the order given.
        An empty frame is returned when ``by_col_values`` is empty.
    """
    # first convert to time series
    df = toTimeSeries(df, 'date')
    metrics_df = None
    for col_value in by_col_values:
        # The old message passed the value as a second print argument, so a
        # literal '%' was printed in front of it.
        print("Processing metric for column: {}".format(col_value))
        series = resample(extractMetricSeries(df, by_col, col_value))
        # The first series seeds the result; later ones are aligned onto it.
        metrics_df = series if metrics_df is None else merge(metrics_df, series)
    return pd.DataFrame() if metrics_df is None else metrics_df

def sumMetrics(df, columns_to_delete, metric_name, metric_col_name='metric'):
    """Collapse the columns of ``df`` into a row-wise total.

    A 'value' column holding the sum across all columns of each row and a
    ``metric_col_name`` column holding ``metric_name`` are added to ``df``
    itself; the returned frame is that frame with every column listed in
    ``columns_to_delete`` dropped.
    """
    row_totals = df.sum(axis=1)
    df['value'] = row_totals
    df[metric_col_name] = metric_name
    trimmed = df
    for column in columns_to_delete:
        trimmed = trimmed.drop(columns=[column])
    return trimmed

def merge(df, series):
    """As-of join ``series`` onto ``df`` by index, then backfill the gaps.

    Rows of ``df`` are matched with ``pd.merge_asof`` using a tolerance of one
    second; values still missing after the join are filled with ``bfill``.
    """
    max_gap = pd.Timedelta('1 second')
    joined = pd.merge_asof(
        df,
        series,
        left_index=True,
        right_index=True,
        tolerance=max_gap,
    )
    return joined.bfill()

In [330]:
#svc_cpu_usage_df_expanded = expand(svc_cpu_usage_df, 'service', services)
#svc_cpu_usage_df_sum = sumMetrics(svc_cpu_usage_df_expanded, services, 'svc_cpu_usage')

#if False:
#    print("saving 1_svc_cpu_usage of shape {}".format(svc_cpu_usage_df_sum.shape))
#    save_to_file = merged_dir + '1_svc_cpu_usage'
#    svc_cpu_usage_df_sum.to_csv(path_or_buf=save_to_file, index=True)
        
        

In [331]:
# For every metric file: expand it to one column per service, sum the services
# into a single 'value' series and (optionally) write the result to merged_dir.
for file in input_files_to_sum:
    print('processing input file {}'.format(file))
    # Metric name = file name without the leading '<number>_' prefix and
    # without the extension; a name lacking either part is kept intact.
    _, separator, remainder = file.partition('_')
    metric_name = (remainder if separator else file).split('.', 1)[0]
    print('processing metric {}'.format(metric_name))

    data_df = pd.read_csv(file_dir + file)
    data_df.rename(columns={'ztime': 'date', 'name':'metric'}, inplace=True)
    expanded_df = expand(data_df, 'service', services)
    sum_df = sumMetrics(expanded_df, services, metric_name)

    if save:
        # Report the shape of the frame being written; the previous code read a
        # variable that only exists in a commented-out cell.
        print('saving {} data with shape {}'.format(file, sum_df.shape))
        save_to_file = merged_dir + file
        sum_df.to_csv(path_or_buf=save_to_file, index=True)
        print("----------------")
    else:
        print("Metric data is not saved. Savig flag is turned off!")


processing input file 1_service_cpu_use.csv
processing metric service_cpu_use
Processing metric for column: % checkoutservice
Processing metric for column: % cartservice
Processing metric for column: % emailservice
Processing metric for column: % currencyservice
Processing metric for column: % paymentservice
Processing metric for column: % productcatalogservice
Processing metric for column: % shippingservice
saving 1_service_cpu_use.csv of shape (78845, 2)
----------------
processing input file 2_service_memory_use.csv
processing metric service_memory_use
Processing metric for column: % checkoutservice
Processing metric for column: % cartservice
Processing metric for column: % emailservice
Processing metric for column: % currencyservice
Processing metric for column: % paymentservice
Processing metric for column: % productcatalogservice
Processing metric for column: % shippingservice
saving 2_service_memory_use.csv of shape (78845, 2)
----------------
processing input file 3_service_cpu

In [335]:
# convert latency to percentile (choosing 95)
Percentile = 0.95

# The merged latency file is read and, when `save` is set, overwritten in place.
ltcy_file = merged_dir + service_ltcy_file
ltcy_df = pd.read_csv(ltcy_file)
ltcy_df = toTimeSeries(ltcy_df)
# Keep one metric label per timestamp so it can be re-attached after the
# numeric quantile; a unique index keeps the assignment below well defined
# even when the file contains repeated timestamps.
metric_series = ltcy_df['metric'].groupby(level=0).first()
# `columns=` replaces the positional axis argument that newer pandas rejects.
ltcy_df = ltcy_df.drop(columns='metric')
ltcy_df = ltcy_df.groupby(['date']).quantile(Percentile)
ltcy_df['metric'] = metric_series

if save:
    print('saving {} latency quantile file to {} with dimension {}'.format(Percentile, ltcy_file, ltcy_df.shape))
    ltcy_df.to_csv(path_or_buf=ltcy_file, index=True)
else:
    print("Latency percentils data is not saved. Savig flag is turned off!")

saving 0.95 latency quantile file to ../../data/raw-data-linode-run3/merged/7_service_ltcy.csv with dimension (78841, 2)
