# Data Clean up

Haytham Mohamed - INFS890 Spring 2020

This notebook checks for missing data and outliers.

In [180]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression

# Plot styling: one call applies seaborn's theme and enables the short colour codes.
sns.set(color_codes=True)

# Table rendering: show at most 15 rows, and print floats with a thousands
# separator and two decimals.
pd.set_option('display.max_rows', 15)
pd.set_option('display.float_format', '{:,.2f}'.format)

# Read the raw data. 

The data is stored in long format, with the samples of every metric listed one under another. In addition to a leading `unit` column read by the loader below, each row has three columns:

1- metric: The name of the metric

2- date: the date and time of the metric

3- value: the metric value


In [181]:
# 1- locate the raw metric files and choose what to process

# Root of the sock-shop data set. Set the SOCK_SHOP_HOME environment variable to
# run the notebook on another machine; when it is unset the original local
# checkout path is used.
home_dir = os.environ.get(
    'SOCK_SHOP_HOME',
    '/Users/hmohamed/github/data-research-spring2020/sock-shop')

# Both directories keep their trailing slash: file names are appended to them
# by plain string concatenation.
file_dir = home_dir + '/metrics-1pod/'
processed_dir = home_dir + '/processed-data/'

# Latency files (used as prediction targets by the flow frames).
orders_ltcy_file = '1_orders_ltcy.csv'
carts_ltcy_file = '2_carts_ltcy.csv'

# Request-rate files.
carts_req_rate_file = '3_carts_req.csv'
orders_req_rate_file = '4_orders_req.csv'

# Per-service metric files.
service_pods_file = '5_svc_pods.csv'
service_cpu_usage_file = '6_svc_cpu_use.csv'
service_cpu_sat_file = '8_svc_cpu_sat.csv'
service_io_file = '10_svc_disk_io.csv'
service_net_use_file = '12_svc_net_use.csv'

# Per-node metric files.
nodes_cpu_usage_file = '7_nodes_cpu_use.csv'
nodes_cpu_sat_file = '9_nodes_cpu_sat.csv'
nodes_io_file = '11_nodes_disk_io.csv'
nodes_net_use_file = '13_nodes_net_use.csv'

# Only the service CPU-usage file is listed here; the other candidates are disabled.
service_input_files = [service_cpu_usage_file]

concated_data_file = 'all_data.csv'

# Switches controlling which assembled flow frames are written to processed_dir.
CARTS_SAVE = False
ORDERS_SAVE = True

frequency = '15S'  # S for second , T for minute

In [182]:
def read_df(file_dir, data_file):
    """Load one raw metrics CSV and hand it to ``to_time_series``.

    The first line of the file is skipped and the columns are named
    ``unit``, ``metric``, ``date`` and ``value``. The path is built by
    plain concatenation, so ``file_dir`` must already end with a separator.
    """
    column_names = ['unit', 'metric', 'date', 'value']
    csv_path = file_dir + data_file
    raw = pd.read_csv(csv_path, header=None, skiprows=1, names=column_names)
    return to_time_series(raw)

def to_time_series(df, index_col_name='date', value_col_name='value'):
    """Return a copy of ``df`` indexed and sorted by its timestamp column.

    Parameters:
        df: frame holding at least ``index_col_name`` and ``value_col_name``.
        index_col_name: column parsed with ``pd.to_datetime`` and used as index.
        value_col_name: column converted with ``pd.to_numeric``.

    Returns:
        A new frame sorted by the datetime index. The input frame is left
        untouched, so calling this function again on the same frame is safe
        (the earlier in-place version moved the timestamp column into the
        index of the caller's frame and failed on a second call).
    """
    converted = df.assign(**{
        index_col_name: pd.to_datetime(df[index_col_name]),
        value_col_name: pd.to_numeric(df[value_col_name]),
    })
    return converted.set_index(index_col_name).sort_index()
    

# This function merges and aligns the metric time series into one data frame, a column for every feature
def expand(df, by_col='unit', base=37, dup='mean'):
    """Turn a long metrics frame into a wide one with a column per ``by_col`` value.

    For every distinct value found in ``df[by_col]`` the matching rows are
    extracted, resampled (``dup`` and ``base`` are forwarded to ``resample``)
    and joined onto the frames built so far with ``merge``. The first series
    becomes the left-hand side of all later merges. An empty DataFrame is
    returned when ``df[by_col]`` has no values.
    """
    metrics_df = pd.DataFrame()
    for position, col_value in enumerate(np.unique(df[by_col])):
        extracted = extract_metric_series(df, by_col, col_value)
        column_frame = resample(extracted, value_column_name=col_value, dup=dup, base=base)
        metrics_df = column_frame if position == 0 else merge(metrics_df, column_frame)
    return metrics_df

# T for minutes, S for seconds
# Duplicate timestamps are collapsed with dup = 'max', 'sum' or (any other value) the mean.
def resample(df, value_column_name, index_col_name='date', frequency = frequency, interpolate = True
             , interpolate_method = 'linear', base=37, dup = 'mean'):
    """Put one metric on a regular time grid.

    Parameters:
        df: frame whose index level is named ``index_col_name`` and which
            contains the column ``value_column_name``.
        value_column_name: the column to aggregate and resample.
        index_col_name: name of the timestamp index level to group on.
        frequency: pandas resampling rule; defaults to the notebook-level
            ``frequency`` constant as it was when this function was defined.
        interpolate: when true, ``interpolate(method=interpolate_method)`` is
            applied after resampling.
        interpolate_method: method name forwarded to ``interpolate``.
        base: forwarded unchanged to ``DataFrame.resample``.
        dup: how rows sharing a timestamp are combined: ``'max'``, ``'sum'``,
            anything else means the mean.

    Returns:
        A single-column DataFrame (column ``value_column_name``) indexed by
        the resampled timestamps, for every ``dup`` mode.
    """
    # Aggregate the value column only. Every branch now yields a DataFrame:
    # previously 'sum' produced a Series, and 'mean' aggregated all columns,
    # relying on pandas silently discarding non-numeric ones such as 'metric'.
    per_timestamp = df.groupby([index_col_name])[[value_column_name]]
    if dup == 'max':
        df = per_timestamp.max()
    elif dup == 'sum':
        df = per_timestamp.sum()
    else:
        df = per_timestamp.mean()

    # The incoming index may hold timestamp strings; restore datetimes and order them.
    df.index = pd.to_datetime(df.index)
    df = df.sort_index()

    # Place the series on the regular grid, filling each new slot with the next observation.
    # NOTE(review): `base` (and `kind`) are legacy resample arguments; recent pandas
    # releases replace `base` with `offset`/`origin` - migrate when upgrading pandas.
    resampled = df.resample(frequency, kind='timestamp', base=base).bfill()

    # NOTE(review): the back-fill above already populates the newly created slots, so
    # interpolation presumably only affects NaNs present in the data itself - confirm
    # this ordering is intended.
    if interpolate:
        resampled = resampled.interpolate(method=interpolate_method)

    return resampled


# Pull the rows of one named service/metric out of the long raw frame
def extract_metric_series(df, col_name, col_value):
    """Return the rows where ``df[col_name] == col_value`` as their own frame.

    The ``col_name`` column is removed, the ``value`` column is renamed to
    ``col_value``, index labels are converted to strings, and the result is
    sorted by that (string) index.
    """
    matching_rows = df[df[col_name] == col_value]
    without_key = matching_rows.drop(columns=[col_name])
    renamed = without_key.rename(index=str, columns={"value": col_value})
    return renamed.sort_index()

# Shared implementation of the row-wise aggregations below.
def _aggregate_rows(df, how, columns_to_delete, metric_name, metric_col_name):
    """Aggregate each row of ``df`` and return a new, labelled frame.

    ``how`` is the name of a DataFrame reduction (``'sum'``, ``'max'``,
    ``'mean'`` or ``'count'``) evaluated with ``axis=1``. The result is stored
    in a ``value`` column, ``metric_col_name`` is set to ``metric_name`` on
    every row, and ``columns_to_delete`` are dropped. The input frame is not
    modified (the previous per-function versions added both columns to the
    caller's frame as a side effect).
    """
    row_values = getattr(df, how)(axis=1)
    result = df.assign(value=row_values)
    result[metric_col_name] = metric_name
    return result.drop(columns=list(columns_to_delete))


# sum df rows, remove expanded columns and set a new column with a metric name
def sum_timeseries(df, columns_to_delete, metric_name, metric_col_name='metric'):
    """Row-wise sum of ``df`` stored in ``value``; see ``_aggregate_rows``."""
    return _aggregate_rows(df, 'sum', columns_to_delete, metric_name, metric_col_name)


# max df rows, remove expanded columns and set a new column with a metric name
def max_timeseries(df, columns_to_delete, metric_name, metric_col_name='metric'):
    """Row-wise maximum of ``df`` stored in ``value``; see ``_aggregate_rows``."""
    return _aggregate_rows(df, 'max', columns_to_delete, metric_name, metric_col_name)


# average df rows, remove expanded columns and set a new column with a metric name
def avg_timeseries(df, columns_to_delete, metric_name, metric_col_name='metric'):
    """Row-wise mean of ``df`` stored in ``value``; see ``_aggregate_rows``."""
    return _aggregate_rows(df, 'mean', columns_to_delete, metric_name, metric_col_name)


# count non-missing values per row, remove expanded columns and set a new column with a metric name
def count_timeseries(df, columns_to_delete, metric_name, metric_col_name='metric'):
    """Row-wise count of non-missing cells stored in ``value``; see ``_aggregate_rows``."""
    return _aggregate_rows(df, 'count', columns_to_delete, metric_name, metric_col_name)

def merge(df, series):
    """As-of join ``series`` onto ``df`` by index, then back-fill the gaps.

    Rows are matched on the two indexes with a tolerance of one second;
    cells left empty by the join are filled from the next row that has a value.
    """
    one_second = pd.Timedelta('1 second')
    joined = pd.merge_asof(
        df,
        series,
        left_index=True,
        right_index=True,
        tolerance=one_second,
    )
    return joined.bfill()

    

In [183]:
# Orders latency: read the raw file and expand it into a resampled frame.
orders_ltcy_df = expand(read_df(file_dir, orders_ltcy_file), base=37)
orders_ltcy_df.head(5)

Unnamed: 0_level_0,orders_ltcy
date,Unnamed: 1_level_1
2020-03-26 19:47:07,0.24
2020-03-26 19:47:22,0.24
2020-03-26 19:47:37,0.24
2020-03-26 19:47:52,0.24
2020-03-26 19:48:07,0.24


In [184]:
# Carts latency: read the raw file and expand it into a resampled frame.
carts_ltcy_df = expand(read_df(file_dir, carts_ltcy_file), base=37)
carts_ltcy_df.head()

Unnamed: 0_level_0,carts_ltcy
date,Unnamed: 1_level_1
2020-03-26 19:46:52,1.96
2020-03-26 19:47:07,0.98
2020-03-26 19:47:22,0.92
2020-03-26 19:47:37,0.49
2020-03-26 19:47:52,0.48


In [185]:
# Compare the sizes of the two latency frames.
shape_report = 'orders latency shape {} and carts latency shape {}'
print(shape_report.format(orders_ltcy_df.shape, carts_ltcy_df.shape))


orders latency shape (6764, 1) and carts latency shape (6765, 1)


In [186]:
#services_io_use_df = read_df(file_dir, service_io_file)
#services_io_use_df = expand(services_io_use_df, base=37)
#services_io_use_df.head(5) 

In [187]:
#services_io_use_df.shape

In [188]:
#services_io_use_df.describe(include='all')

In [189]:
# Per-service network usage, one column per service.
services_net_use_df = expand(read_df(file_dir, service_net_use_file), base=37) / 1024   # to KB
services_net_use_df.head(5)

Unnamed: 0_level_0,carts-db_net_use,carts_net_use,catalogue-db_net_use,catalogue_net_use,front-end_net_use,orders-db_net_use,orders_net_use,payment_net_use,queue-master_net_use,rabbitmq_net_use,session-db_net_use,shipping_net_use,user-db_net_use,user_net_use
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-03-26 19:46:37,4.8,22.01,5.98,9.86,80.16,18.54,3.85,8.97,18.16,13.49,14.61,2.66,2.2,3.96
2020-03-26 19:46:52,16.65,27.34,15.6,26.21,78.63,11.3,9.13,17.0,1.94,12.04,14.82,13.57,16.07,18.62
2020-03-26 19:47:07,17.18,9.43,12.78,24.34,73.94,11.58,21.9,9.58,2.97,11.6,15.31,14.43,19.99,25.92
2020-03-26 19:47:22,6.25,9.18,4.68,4.47,71.3,4.36,12.26,2.8,3.5,2.92,4.14,3.74,10.45,12.56
2020-03-26 19:47:37,9.28,13.37,8.62,13.64,71.51,4.67,9.01,2.21,3.05,3.07,4.24,4.89,9.81,11.48


In [190]:
# (rows, columns) of the per-service network-usage frame.
services_net_use_df.shape

(6765, 14)

In [191]:
# Summary statistics for every network-usage column.
services_net_use_df.describe(include='all')

Unnamed: 0,carts-db_net_use,carts_net_use,catalogue-db_net_use,catalogue_net_use,front-end_net_use,orders-db_net_use,orders_net_use,payment_net_use,queue-master_net_use,rabbitmq_net_use,session-db_net_use,shipping_net_use,user-db_net_use,user_net_use
count,6765.0,6765.0,6765.0,6765.0,6765.0,6765.0,6765.0,6765.0,6765.0,6765.0,6765.0,6765.0,6765.0,6765.0
mean,50.61,81.98,82.37,162.57,827.7,9.93,88.97,19.07,6.45,7.52,27.66,20.04,80.61,144.6
std,50.17,74.14,65.86,120.39,618.55,5.07,51.42,11.12,4.01,4.93,19.76,11.92,62.9,105.04
min,1.09,1.11,1.41,1.12,1.54,1.8,1.28,1.92,1.1,1.37,1.75,0.86,2.2,3.96
25%,8.38,20.55,13.82,29.35,142.06,5.95,31.19,13.44,3.8,2.82,7.22,13.24,16.4,32.12
50%,34.16,59.96,81.64,163.35,873.77,8.75,88.86,14.78,4.92,6.95,27.92,16.19,76.68,138.96
75%,82.3,131.61,129.9,249.39,1287.59,12.66,128.72,22.14,7.69,10.06,41.8,24.87,126.23,218.84
max,383.28,454.97,297.22,539.07,2843.75,39.63,284.54,86.79,29.89,30.97,89.08,92.75,302.15,512.42


In [192]:
# Per-service pod counts, one column per service.
services_pods_df = expand(read_df(file_dir, service_pods_file), base=37)
services_pods_df.head(5)

Unnamed: 0_level_0,carts-db_pods,carts_pods,catalogue-db_pods,catalogue_pods,front-end_pods,orders-db_pods,orders_pods,payment_pods,queue-master_pods,rabbitmq_pods,session-db_pods,shipping_pods,user-db_pods,user_pods
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-03-26 19:46:37,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.5,1.0,1.0,1.0,1.0
2020-03-26 19:46:52,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.5,1.0,1.0,1.0,1.0
2020-03-26 19:47:07,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.5,1.0,1.0,1.0,1.0
2020-03-26 19:47:22,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.5,1.0,1.0,1.0,1.0
2020-03-26 19:47:37,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.5,1.0,1.0,1.0,1.0


In [193]:
# (rows, columns) of the per-service pod-count frame.
services_pods_df.shape

(6766, 14)

In [194]:
# Per-service CPU usage, one column per service.
services_cpu_use_df = expand(read_df(file_dir, service_cpu_usage_file), base=37)
services_cpu_use_df.head(5)

Unnamed: 0_level_0,carts-db_cpu_use,carts_cpu_use,catalogue-db_cpu_use,catalogue_cpu_use,front-end_cpu_use,orders-db_cpu_use,orders_cpu_use,payment_cpu_use,queue-master_cpu_use,rabbitmq_cpu_use,session-db_cpu_use,shipping_cpu_use,user-db_cpu_use,user_cpu_use
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-03-26 19:46:22,0.07,0.03,0.01,0.01,0.03,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
2020-03-26 19:46:37,0.18,0.18,0.01,0.02,0.06,0.02,0.02,0.01,0.02,0.02,0.02,0.01,0.02,0.02
2020-03-26 19:46:52,0.2,0.44,0.01,0.02,0.07,0.02,0.03,0.01,0.02,0.02,0.03,0.01,0.02,0.03
2020-03-26 19:47:07,0.17,0.23,0.01,0.02,0.07,0.02,0.03,0.01,0.02,0.02,0.03,0.01,0.02,0.02
2020-03-26 19:47:22,0.14,0.24,0.01,0.02,0.07,0.01,0.02,0.01,0.01,0.01,0.01,0.01,0.02,0.02


In [195]:
# (rows, columns) of the per-service CPU-usage frame.
services_cpu_use_df.shape

(6766, 14)

In [196]:
# Node-level CPU usage.
nodes_cpu_use_df = expand(read_df(file_dir, nodes_cpu_usage_file), base=37)
nodes_cpu_use_df.head()

Unnamed: 0_level_0,nodes_cpu_use
date,Unnamed: 1_level_1
2020-03-26 19:46:22,2.5
2020-03-26 19:46:37,1.89
2020-03-26 19:46:52,1.35
2020-03-26 19:47:07,1.26
2020-03-26 19:47:22,1.28


In [197]:
# Node-level disk I/O.
nodes_io_use_df = expand(read_df(file_dir, nodes_io_file), base=37)
nodes_io_use_df.head()

Unnamed: 0_level_0,nodes_disk_io
date,Unnamed: 1_level_1
2020-03-26 19:46:22,1.67
2020-03-26 19:46:37,1.01
2020-03-26 19:46:52,0.05
2020-03-26 19:47:07,0.04
2020-03-26 19:47:22,0.26


In [198]:
# Summary statistics of the node disk-I/O column.
nodes_io_use_df.describe(include='all')

Unnamed: 0,nodes_disk_io
count,6766.0
mean,0.35
std,0.58
min,0.0
25%,0.01
50%,0.03
75%,0.42
max,2.96


In [199]:
# Node-level network usage.
nodes_net_use_df = expand(read_df(file_dir, nodes_net_use_file), base=37) / (1024 * 1024)  # to MB
nodes_net_use_df.head()

Unnamed: 0_level_0,nodes_net_use
date,Unnamed: 1_level_1
2020-03-26 19:46:37,2.76
2020-03-26 19:46:52,2.59
2020-03-26 19:47:07,2.15
2020-03-26 19:47:22,1.81
2020-03-26 19:47:37,1.8


In [200]:
# Summary statistics of the node network-usage column.
nodes_net_use_df.describe(include='all')

Unnamed: 0,nodes_net_use
count,6765.0
mean,11.57
std,6.87
min,1.52
25%,3.25
50%,12.26
75%,17.29
max,27.28


In [201]:
# Orders request rate.
# (A filter keeping only rows with orders_req > 0 was tried here and is disabled.)
orders_req_rate_df = expand(read_df(file_dir, orders_req_rate_file), base=37)
orders_req_rate_df.head()

Unnamed: 0_level_0,orders_req
date,Unnamed: 1_level_1
2020-03-26 19:46:52,0.15
2020-03-26 19:47:07,0.13
2020-03-26 19:47:22,0.36
2020-03-26 19:47:37,0.44
2020-03-26 19:47:52,0.44


In [202]:
# Carts request rate.
# (A filter keeping only rows with carts_req > 0 was tried here and is disabled.)
carts_req_rate_df = expand(read_df(file_dir, carts_req_rate_file), base=37)
carts_req_rate_df.head()

Unnamed: 0_level_0,carts_req
date,Unnamed: 1_level_1
2020-03-26 19:46:37,0.47
2020-03-26 19:46:52,0.49
2020-03-26 19:47:07,1.04
2020-03-26 19:47:22,1.62
2020-03-26 19:47:37,1.78


# Orders Flow 

The features selected for the orders workflow are the request rate, the pod-level CPU utilization of the microservices involved (front-end, orders, user, shipping, payment, carts, user-db, orders-db, and carts-db), and the CPU utilization of the nodes that host these microservices.

In [203]:
# CPU usage of the services that take part in the orders flow; this frame is
# the left-hand side of every merge below.
selected_services_cpu = [
    'front-end_cpu_use',
    'orders_cpu_use',
    'orders-db_cpu_use',
    'user_cpu_use',
    'user-db_cpu_use',
    'shipping_cpu_use',
    'payment_cpu_use',
    'carts_cpu_use',
    'carts-db_cpu_use',
]
orders_flow_df = services_cpu_use_df[selected_services_cpu]

# Network usage of the same services.
selected_services_net = [
    'front-end_net_use',
    'orders_net_use',
    'orders-db_net_use',
    'user_net_use',
    'user-db_net_use',
    'shipping_net_use',
    'payment_net_use',
    'carts_net_use',
    'carts-db_net_use',
]
orders_net_use_df = services_net_use_df[selected_services_net]

# Pod counts; the database services (orders-db, user-db, carts-db) are left out.
selected_services_pods = [
    'front-end_pods',
    'orders_pods',
    'user_pods',
    'shipping_pods',
    'payment_pods',
    'carts_pods',
]
orders_pods_df = services_pods_df[selected_services_pods]

# Join the remaining frames onto the CPU frame, in this order: pods, service
# network use, node CPU, node disk I/O, node network use, orders request rate,
# and finally the orders latency target. Per-service disk I/O is not included.
for _extra_df in (
        orders_pods_df,
        orders_net_use_df,
        nodes_cpu_use_df,
        nodes_io_use_df,
        nodes_net_use_df,
        orders_req_rate_df,
        orders_ltcy_df,
):
    orders_flow_df = merge(orders_flow_df, _extra_df)

orders_flow_df.head(5)



Unnamed: 0_level_0,front-end_cpu_use,orders_cpu_use,orders-db_cpu_use,user_cpu_use,user-db_cpu_use,shipping_cpu_use,payment_cpu_use,carts_cpu_use,carts-db_cpu_use,front-end_pods,...,user-db_net_use,shipping_net_use,payment_net_use,carts_net_use,carts-db_net_use,nodes_cpu_use,nodes_disk_io,nodes_net_use,orders_req,orders_ltcy
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-03-26 19:46:22,0.03,0.01,0.01,0.01,0.01,0.01,0.01,0.03,0.07,1.0,...,2.2,2.66,8.97,22.01,4.8,2.5,1.67,2.76,0.15,0.24
2020-03-26 19:46:37,0.06,0.02,0.02,0.02,0.02,0.01,0.01,0.18,0.18,1.0,...,2.2,2.66,8.97,22.01,4.8,1.89,1.01,2.76,0.15,0.24
2020-03-26 19:46:52,0.07,0.03,0.02,0.03,0.02,0.01,0.01,0.44,0.2,1.0,...,16.07,13.57,17.0,27.34,16.65,1.35,0.05,2.59,0.15,0.24
2020-03-26 19:47:07,0.07,0.03,0.02,0.02,0.02,0.01,0.01,0.23,0.17,1.0,...,19.99,14.43,9.58,9.43,17.18,1.26,0.04,2.15,0.13,0.24
2020-03-26 19:47:22,0.07,0.02,0.01,0.02,0.02,0.01,0.01,0.24,0.14,1.0,...,10.45,3.74,2.8,9.18,6.25,1.28,0.26,1.81,0.36,0.24


In [204]:
# Summary statistics for every column of the assembled orders-flow frame.
orders_flow_df.describe(include='all')

Unnamed: 0,front-end_cpu_use,orders_cpu_use,orders-db_cpu_use,user_cpu_use,user-db_cpu_use,shipping_cpu_use,payment_cpu_use,carts_cpu_use,carts-db_cpu_use,front-end_pods,...,user-db_net_use,shipping_net_use,payment_net_use,carts_net_use,carts-db_net_use,nodes_cpu_use,nodes_disk_io,nodes_net_use,orders_req,orders_ltcy
count,6766.0,6766.0,6766.0,6766.0,6766.0,6766.0,6766.0,6766.0,6766.0,6766.0,...,6766.0,6766.0,6766.0,6766.0,6766.0,6766.0,6766.0,6766.0,6766.0,6766.0
mean,0.83,0.21,0.02,0.2,0.12,0.05,0.03,0.53,1.83,2.76,...,80.6,20.03,19.07,81.98,50.6,4.1,0.35,11.57,5.03,2.97
std,0.62,0.13,0.0,0.13,0.08,0.04,0.01,0.58,1.99,1.1,...,62.9,11.92,11.12,74.14,50.17,2.08,0.58,6.87,4.2,3.51
min,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.03,0.01,0.5,...,2.2,0.86,1.92,1.11,1.09,1.1,0.0,1.52,0.0,0.02
25%,0.19,0.08,0.02,0.05,0.03,0.04,0.02,0.12,0.02,2.0,...,16.3,13.24,13.43,20.55,8.38,1.92,0.01,3.25,0.49,0.24
50%,0.84,0.22,0.02,0.2,0.12,0.05,0.03,0.32,1.15,3.0,...,76.68,16.19,14.78,59.96,34.15,4.21,0.03,12.25,5.14,0.49
75%,1.29,0.3,0.02,0.3,0.18,0.06,0.04,0.58,3.28,4.0,...,126.23,24.87,22.13,131.57,82.3,5.78,0.42,17.29,8.2,5.81
max,2.73,0.91,0.04,0.64,0.34,0.63,0.1,2.53,6.58,4.0,...,302.15,92.75,86.79,454.97,383.28,9.66,2.96,27.28,20.13,9.75


In [205]:
# (rows, columns) of the assembled orders-flow frame.
orders_flow_df.shape

(6766, 29)

In [206]:
# Write the orders-flow frame to the processed-data directory when enabled;
# the timestamp index is kept as the first CSV column.
if ORDERS_SAVE:
    processed_file_path = processed_dir + 'orders_flow_data.csv'
    orders_flow_df.to_csv(processed_file_path, index=True)

# Cart Flow

The features selected for the <b>cart workflow</b> are the request rate, the pod-level CPU utilization of the microservices involved (front-end, orders, carts, and carts-db), and the CPU utilization of the nodes that host these microservices.

In [207]:
# CPU usage of the services that take part in the carts flow; this frame is
# the left-hand side of every merge below.
selected_services = [
    'front-end_cpu_use',
    'orders_cpu_use',
    'carts_cpu_use',
    'carts-db_cpu_use',
]
carts_flow_df = services_cpu_use_df[selected_services]

# Join, in order: node CPU usage, the carts request rate, and finally the
# carts latency target.
for _extra_df in (nodes_cpu_use_df, carts_req_rate_df, carts_ltcy_df):
    carts_flow_df = merge(carts_flow_df, _extra_df)

carts_flow_df.head(5)


Unnamed: 0_level_0,front-end_cpu_use,orders_cpu_use,carts_cpu_use,carts-db_cpu_use,nodes_cpu_use,carts_req,carts_ltcy
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-03-26 19:46:22,0.03,0.01,0.03,0.07,2.5,0.47,1.96
2020-03-26 19:46:37,0.06,0.02,0.18,0.18,1.89,0.47,1.96
2020-03-26 19:46:52,0.07,0.03,0.44,0.2,1.35,0.49,1.96
2020-03-26 19:47:07,0.07,0.03,0.23,0.17,1.26,1.04,0.98
2020-03-26 19:47:22,0.07,0.02,0.24,0.14,1.28,1.62,0.92


In [208]:
# (rows, columns) of the assembled carts-flow frame.
carts_flow_df.shape

(6766, 7)

In [209]:
# Write the carts-flow frame to the processed-data directory when enabled;
# the timestamp index is kept as the first CSV column.
if CARTS_SAVE:
    processed_file_path = processed_dir + 'cart_flow_data.csv'
    carts_flow_df.to_csv(processed_file_path, index=True)