# Generate Monitor Data as Alibaba Format

In [1]:
import pandas as pd
import json

In [2]:
usage_data = pd.read_csv('/data/clusterdata/cluster-trace-v2017/usage_data_small.csv')
init_data = pd.read_csv('/data/clusterdata/cluster-trace-v2017/init_data_small.csv')

In [3]:
usage_data.head()

Unnamed: 0.1,Unnamed: 0,start_time,pod_id,used_cpu,used_mem
0,88,39600,66,0.36,0.020187
1,152,39600,65,0.432,0.017345
2,1143,39600,32,0.3392,0.035539
3,1162,39600,31,1.146,0.037702
4,1189,39600,33,0.4984,0.02263


In [4]:
init_data.head()

Unnamed: 0.1,Unnamed: 0,start_time,pod_id,node_id,used_cpu,used_mem
0,0,39600,0,0,1.872,0.054504
1,1,39600,1,1,0.2784,0.024767
2,2,39600,2,2,0.088,0.015963
3,3,39600,3,3,0.328,0.022324
4,4,39600,4,4,0.4608,0.017227


In [5]:
with open('../../monitor_data.json','r') as f:
    monitor_data = json.load(f)

In [6]:
monitor_data

{'vm-4c8g-node2': [{'pod_name': 'buyservice-67dc97d8c6-tcslv',
   'data': [['2022-03-21T03:33:19.820772Z',
     13320192.0,
     0.00019961673333324844],
    ['2022-03-21T03:34:45.914658Z', 13320192.0, 0.0006103306333333336],
    ['2022-03-21T03:35:30.924394Z', 13320192.0, 0.00014586533333330227],
    ['2022-03-21T03:35:43.687006Z', 13320192.0, 0.00014850259999998154],
    ['2022-03-21T03:38:19.939410Z', 13385728.0, 0.0001573609666666679],
    ['2022-03-21T03:39:45.991732Z', 13385728.0, 0.0001428469999999275],
    ['2022-03-21T03:40:31.041279Z', 13393920.0, 0.00018408453333336184],
    ['2022-03-21T03:40:43.777566Z', 13393920.0, 0.0004348888666667013],
    ['2022-03-21T03:43:20.041896Z', 13303808.0, 0.00016514653333334905],
    ['2022-03-21T03:44:46.015356Z', 13336576.0, 0.0006909560000000426],
    ['2022-03-21T03:45:31.149692Z', 13336576.0, 0.00019220673333328147],
    ['2022-03-21T03:45:43.863301Z', 13336576.0, 0.00019220673333328147],
    ['2022-03-21T03:48:20.128052Z', 13369344.0, 

In [7]:
monitor_data_df = pd.DataFrame(columns=['start_time','pod_id','node_id','used_cpu','used_mem'])

In [8]:
pod_dict = {}
node_dict = {}
pod_counter = 0
node_counter = 0

for node,pods in monitor_data.items():
    if node not in node_dict:
        node_dict[node] = node_counter
        node_counter += 1
    for pod in pods:
        if pod['pod_name'] not in pod_dict:
            pod_dict[pod['pod_name']] = pod_counter
            pod_counter += 1 

In [9]:
pod_dict, node_dict

({'buyservice-67dc97d8c6-tcslv': 0,
  'influxdb-776b79db44-g668h': 1,
  'partservice-77dd859969-8qpm2': 2,
  'storageservice-77566757c9-7lpvq': 3,
  'timeservice-6d49cb4875-z6m6q': 4,
  'webapp-d5585d547-kzqn8': 5,
  'storageservice-1-588769c495-jwxn7': 6,
  'tableservice-7554b56f68-x95d4': 7,
  'storageservice-1-67c95799f-4ktpc': 8,
  'tableservice-7674f79f88-svwzd': 9},
 {'vm-4c8g-node2': 0, 'vm-4c8g-node3': 1, 'vm-8c16g-node10': 2})

In [12]:
monitor_data['vm-8c16g-node10'][0]['pod_name'], monitor_data['vm-8c16g-node10'][1]['pod_name']

('storageservice-1-67c95799f-4ktpc', 'tableservice-7674f79f88-svwzd')

In [20]:
import time

counter = 0
base_time = 1647804799
for node,pods in monitor_data.items():
    if node == 'vm-8c16g-node10':
        continue
    node_id = node_dict[node]
    for pod in pods:
        if pod['pod_name'] in ['storageservice-1-67c95799f-4ktpc', 'tableservice-7674f79f88-svwzd']:
            continue
        pod_id = pod_dict[pod['pod_name']]
        for data in pod['data']:
            start_time = time.mktime(time.strptime(data[0],'%Y-%m-%dT%H:%M:%S.%fZ'))-base_time
            used_cpu = data[1]
            used_mem = data[2]
            monitor_data_df.loc[counter] = [start_time,pod_id,node_id,used_cpu,used_mem]
            counter+=1

In [22]:
monitor_data_df[(monitor_data_df['used_cpu']>0) & (monitor_data_df['used_mem']>0)]

Unnamed: 0,start_time,pod_id,node_id,used_cpu,used_mem
0,0,0,0,13320192.0,0.000200
1,86,0,0,13320192.0,0.000610
2,131,0,0,13320192.0,0.000146
3,144,0,0,13320192.0,0.000149
4,300,0,0,13385728.0,0.000157
...,...,...,...,...,...
13715,175709,5,0,5144576.0,0.000058
13716,175809,5,0,5144576.0,0.000056
13717,175861,5,0,5144576.0,0.000044
13718,175956,5,0,5144576.0,0.000063


In [24]:
monitor_data_df['pod_id'].unique()

array([0, 1, 2, 3, 4, 5, 6, 7], dtype=object)

In [25]:
monitor_data_df['node_id'].unique()

array([0, 1], dtype=object)

In [27]:
monitor_data_df = monitor_data_df.sort_values('start_time')

In [30]:
monitor_data_df.head(10)

Unnamed: 0,start_time,pod_id,node_id,used_cpu,used_mem
0,0,0,0,13320192.0,0.0002
2283,0,1,0,110206976.0,0.023339
6860,0,3,0,79089664.0,0.00122
9146,0,4,0,13627392.0,0.000124
11428,0,5,0,5144576.0,6.5e-05
13720,0,6,1,45056.0,0.0
14059,0,7,1,40960.0,0.0
4579,0,2,0,80056320.0,0.000699
9147,86,4,0,13627392.0,6.3e-05
4580,86,2,0,80056320.0,0.000919


In [32]:
init_data_small = monitor_data_df[monitor_data_df['start_time']==0]

In [33]:
usage_data_small = monitor_data_df[['start_time','pod_id','node_id','used_cpu','used_mem']]

In [34]:
init_data_small.to_csv('/data/monitor_init_data.csv')
usage_data_small.to_csv('/data/monitor_usage_data.csv')

In [1]:
import pandas as pd

usage_data = pd.read_csv('/data/monitor_usage_data.csv')

In [5]:
usage_data['start_time'].unique()

array([0.00000e+00, 8.60000e+01, 1.31000e+02, ..., 1.76009e+05,
       1.76029e+05, 1.76046e+05])