In [2]:
import tensorflow as tf
import pandas as pd
import requests
from sklearn import preprocessing

In [3]:
# Let TensorFlow allocate GPU memory on demand instead of reserving it all up
# front. The flag is applied to every visible GPU (TensorFlow expects it to be
# the same on all of them), and the loop is simply a no-op on a CPU-only
# machine, where indexing physical_devices[0] would raise IndexError.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

In [4]:
import os
import sys
# Put the parent of the current working directory on the import path
# (presumably so the project-local `src.*` packages can be imported).
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [5]:
from src.kube.pod import get_pod_names
from src.prometheus.time_utils import generate_time
from src.prometheus.metrics import metric_labels
from src.prometheus.constants import prometheus_endpoint, prometheus_query, excluding_services

In [6]:
# Number of samples requested per window (one sample per second at step=1).
data_length = 420

# End timestamps of the recorded windows, as Unix epoch seconds.
offsets = [
    1598047832.6821344, 1598048432.9423163, 1598049032.9943507,
    1598049633.0863874, 1598050233.1645138, 1598050833.2558558,
    1598051433.366836, 1598052033.4203055, 1598052633.4842124,
    1598053233.699553, 1598053833.741907, 1598054433.789637,
    1598055033.8514934, 1598055634.24382, 1598056234.3267875,
    1598056834.3907578, 1598057434.4455614, 1598058034.5075765,
    1598058634.5720835, 1598059234.6254506, 1598059834.6723816,
    1598060434.7206242, 1598061034.7685883, 1598061634.8055122,
    1598062234.841223, 1598062834.9602003, 1598063435.252741,
    1598064035.3158479, 1598064635.5269578, 1598065235.6270418,
    1598065835.667722, 1598066435.7218814, 1598067035.7902415,
    1598067635.8587837, 1598068236.2302053, 1598068836.2907255,
    1598069436.3609302, 1598070036.4161923, 1598070636.4580853,
    1598071236.493694, 1598071836.6565282, 1598072436.7347512,
]

# Inclusive (start, end) range per window, spanning exactly `data_length`
# integer timestamps. NOTE(review): despite the "_in_ms" suffix the values are
# truncated whole seconds, not milliseconds.
offsets_in_ms = [
    (window_end - data_length + 1, window_end)
    for window_end in map(int, offsets)
]

In [7]:
# Services whose metrics are collected. The order matters: it fixes the column
# order of the frames built from the Prometheus responses.
pods = [
    'vehicles-service',
    'identity-service',
    'customers-service',
    'deliveries-service',
    'orders-service',
    'availability-service',
    'parcels-service',
    'pricing-service',
]

In [8]:
# Per-request timeout in seconds, so a stalled Prometheus cannot hang the cell.
request_timeout_s = 30

# One DataFrame per time window; columns are named "<metric name>_<pod>".
datasets = []

for offset in offsets_in_ms:
    prom_data_by_pods = {}
    for job_name in metric_labels:
        # Each entry is indexed as metric[0] (used for the column name) and
        # metric[1] (sent as the query expression).
        for metric in metric_labels[job_name]:
            response = requests.get(
                prometheus_endpoint + prometheus_query,
                params={'query': metric[1], 'start': offset[0], 'end': offset[1], 'step': 1},
                timeout=request_timeout_s,
            )
            # Fail on HTTP errors here rather than with a confusing KeyError
            # or JSON decoding error further down.
            response.raise_for_status()
            prometheus_data = response.json()['data']['result']

            for pod in pods:
                # Pick the series whose `app` label matches this pod.
                metric_data = next(
                    (data['values'] for data in prometheus_data
                     if data['metric'].get('app') == pod),
                    None,
                )
                if metric_data is None:
                    raise LookupError(
                        'No Prometheus series for metric %r and pod %r in window %r'
                        % (metric[0], pod, offset)
                    )
                # Each sample is indexed as val[1] for its value; materialise
                # the floats eagerly instead of storing a one-shot iterator.
                prom_data_by_pods[metric[0] + '_' + pod] = [float(val[1]) for val in metric_data]
    datasets.append(pd.DataFrame.from_dict(prom_data_by_pods))

In [9]:
# Min-max scale every column of each window frame and store the result back
# in `datasets` under the original column names.
# NOTE(review): a fresh scaler is fitted per window, so values are not
# comparable across windows and the scaling sees data that later ends up in
# the test split - confirm this is intended.
for idx, window_frame in enumerate(datasets):
    min_max_scaler = preprocessing.MinMaxScaler()
    datasets[idx] = pd.DataFrame(
        min_max_scaler.fit_transform(window_frame),
        columns=window_frame.columns,
    )

In [10]:
# Number of metric columns kept for a single service.
features_by_service = 9
# Index of the target column within a window (-1 = last column).
# NOTE(review): the windowing cell hard-codes -1 rather than reading this.
prediction_column_idx_in_batch = -1

In [11]:
# Position of the vehicles service inside `pods`; used to pick its columns
# out of the combined frame.
target_service = 'vehicles-service'
vehicle_service_index = pods.index(target_service)
vehicle_service_index

0

In [12]:
# Stack the per-window frames on top of each other (row-wise). The original
# per-frame row labels are kept, so labels repeat; later cells index by
# position with .iloc.
dataset = pd.concat(datasets, axis=0)
dataset.shape

# 42 batches. Each 420 rows

(17640, 72)

In [13]:
# Select one column per metric for the vehicles service: within each group of
# len(pods) columns, take the one at the service's position.
# (Assumes columns are ordered metric-major, pod-minor - as built by the
# Prometheus loading loop.)
vehicle_columns = [
    metric_idx * len(pods) + vehicle_service_index
    for metric_idx in range(features_by_service)
]
vehicle_dataset = dataset.iloc[:, vehicle_columns]
vehicle_dataset.shape

(17640, 9)

In [80]:
# Rows per recorded window in the concatenated frame.
batch_size = 420
# Half-open (start, stop) row ranges, one per complete batch. The batch count
# is derived from `batch_size` (not a repeated literal) with integer division,
# so changing `batch_size` keeps the ranges consistent.
batches = [
    (x * batch_size, (x + 1) * batch_size)
    for x in range(dataset.shape[0] // batch_size)
]

In [143]:
# One tf.data.Dataset per batch; every element is a single float32 row of the
# vehicles-service frame.
tf_dataset = []
for batch_start, batch_stop in batches:
    batch_values = vehicle_dataset.iloc[batch_start:batch_stop].values
    tf_dataset.append(
        tf.data.Dataset.from_tensor_slices(tf.cast(batch_values, tf.float32))
    )


In [105]:
len(tf_dataset)

42

In [144]:
# Rows per sliding window, and rows between consecutive window starts.
window_size = 60
shift = 10

# Number of complete windows that fit into one batch.
x_size = 1 + int((batch_size - window_size) / shift)
# Number of window starts covered by one window: skipping this many windows
# lands on the window that begins right after the current one ends.
skip_y = int(window_size / shift)

In [145]:
# Per-batch input windows (datasets_x) and the matching targets (datasets_y).
datasets_x = []
datasets_y = []
# Index-based loop on purpose: a loop variable named `dataset` would overwrite
# the concatenated DataFrame `dataset` that earlier cells read.
for idx in range(len(tf_dataset)):
    # NOTE: tf_dataset[idx] is replaced by its windowed version, so the cell
    # that builds `tf_dataset` must be re-run before re-running this one.
    windows = tf_dataset[idx].window(
        size=window_size, shift=shift, stride=1, drop_remainder=True
    )
    # window() yields nested datasets; batch each into one (window_size, n_cols) tensor.
    tf_dataset[idx] = windows.flat_map(lambda d: d.batch(window_size))
    # Inputs: the first (x_size - skip_y) windows, without the last (prediction) column.
    datasets_x.append(tf_dataset[idx].take(x_size - skip_y).map(lambda t: t[:, :-1]))
    # Targets: from the window starting right after each input window ends,
    # the first `shift` values of the last (prediction) column.
    datasets_y.append(tf_dataset[idx].skip(skip_y).map(lambda t: t[:shift, -1]))

In [146]:
def merge_batches(datasets):
    """Combine several datasets into one whose elements are stacked tensors.

    The datasets are zipped element-wise; the i-th element of the result is
    the i-th elements of all inputs stacked along a new leading axis.

    Args:
        datasets: iterable of tf.data.Dataset objects to combine.

    Returns:
        A tf.data.Dataset produced by zip followed by a stacking map.
    """
    def _stack_elements(*elements):
        # One positional argument per zipped dataset.
        return tf.stack(elements, axis=0)

    return tf.data.Dataset.zip(tuple(datasets)).map(_stack_elements)

In [147]:
import random

def train_test_split(X, y, split = 0.7, seed = None):
    """Randomly split paired per-batch datasets into train and test groups.

    Whole entries of `X`/`y` are assigned to one side or the other; the same
    indexes are used for both so inputs and targets stay aligned. Each group
    is then combined with `merge_batches`.

    Args:
        X: sequence of input datasets, one per batch.
        y: sequence of target datasets, index-aligned with `X`.
        split: fraction of entries assigned to the training group.
        seed: optional seed for a reproducible split; when None the shared
            `random` module state is used.

    Returns:
        Tuple (X_train, y_train, X_test, y_test) of merged datasets.

    Raises:
        ValueError: if `X` and `y` differ in length, or if `split` asks for
            a negative number of entries or more than are available.
    """
    if len(X) != len(y):
        raise ValueError(
            'X and y must have the same length, got %d and %d' % (len(X), len(y))
        )

    rng = random if seed is None else random.Random(seed)
    total = len(X)
    # Honour the `split` argument when sizing the training group.
    indexes_train = rng.sample(range(total), int(split * total))
    train_lookup = set(indexes_train)
    # Remaining indexes, in ascending order so the test group is deterministic.
    indexes_test = [i for i in range(total) if i not in train_lookup]

    X_train_all = [X[i] for i in indexes_train]
    y_train_all = [y[i] for i in indexes_train]

    X_test_all = [X[i] for i in indexes_test]
    y_test_all = [y[i] for i in indexes_test]

    return merge_batches(X_train_all), merge_batches(y_train_all), merge_batches(X_test_all), merge_batches(y_test_all)

In [148]:
# Split the per-batch input/target datasets into merged train and test sets
# (default training fraction).
(X_train, y_train,
 X_test, y_test) = train_test_split(X=datasets_x, y=datasets_y)

In [152]:
# Show the element shapes/dtypes of the four merged datasets.
for split_dataset in (X_train, y_train, X_test, y_test):
    print(split_dataset)

<MapDataset shapes: (29, None, 8), types: tf.float32>
<MapDataset shapes: (29, None), types: tf.float32>
<MapDataset shapes: (13, None, 8), types: tf.float32>
<MapDataset shapes: (13, None), types: tf.float32>


In [141]:
# Define `y` explicitly so this cell does not depend on leftover kernel state.
# Merging every per-batch target dataset presumably reproduces the value shown
# below (one stacked row per batch) - confirm against the intended definition.
y = merge_batches(datasets_y)
y # 42 batches with windowed prediction

<MapDataset shapes: (42, None), types: tf.float32>

In [142]:
# Peek at the first target window of the first batch (prints nothing if
# either the list or the dataset is empty).
for first_batch_targets in datasets_y[:1]:
    for first_window in first_batch_targets.take(1):
        print(first_window)

tf.Tensor([0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(10,), dtype=float32)
