In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from tqdm import tqdm
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import plotly.express as px
#pd.set_option('display.float_format', lambda x: '%.6f' % x)
#pd.set_option('display.max_rows', 110)
import warnings
warnings.filterwarnings("ignore")
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
#from IPython.core.interactiveshell import InteractiveShell
#
#InteractiveShell.ast_node_interactivity = "last_expr"

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# 25 functions with data in test set (last 3 days)
workload_ids = [1, 7, 10, 11, 21, 31, 37, 40, 46, 47, 49, 50, 51, 52, 55, 56, 65, 66, 67, 68, 69, 71, 72, 80, 81]

# Transformer

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Input, MultiHeadAttention, LayerNormalization, GlobalAveragePooling1D
import tensorflow.keras.backend as K

In [4]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    # Normalization and Attention
    x = LayerNormalization(epsilon=1e-6)(inputs)
    x = MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout
    )(x, x)
    x = Dropout(dropout)(x)
    res = x + inputs

    # Feed Forward Part
    x = LayerNormalization(epsilon=1e-6)(res)
    x = Dense(ff_dim, activation="relu")(x)
    x = Dropout(dropout)(x)
    x = Dense(inputs.shape[-1])(x)
    return x + res

def build_model(input_shape, head_size, num_heads, ff_dim, num_transformer_blocks, mlp_units, dropout=0, mlp_dropout=0):
    inputs = Input(shape=input_shape)
    x = inputs
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)

    x = GlobalAveragePooling1D(data_format="channels_first")(x)
    for dim in mlp_units:
        x = Dense(dim, activation="relu")(x)
        x = Dropout(mlp_dropout)(x)
    outputs = Dense(1, activation="sigmoid")(x)

    return Model(inputs, outputs)

#def f1_metric(y_true, y_pred):
#    y_pred = tf.round(y_pred)
#    tp = tf.reduce_sum(tf.cast(y_true * y_pred, 'float'), axis=0)
#    fp = tf.reduce_sum(tf.cast((1 - y_true) * y_pred, 'float'), axis=0)
#    fn = tf.reduce_sum(tf.cast(y_true * (1 - y_pred), 'float'), axis=0)
#
#    precision = tp / (tp + fp + tf.keras.backend.epsilon())
#    recall = tp / (tp + fn + tf.keras.backend.epsilon())
#
#    f1 = 2 * precision * recall / (precision + recall + tf.keras.backend.epsilon())
#    return tf.reduce_mean(f1)

In [9]:
data = pd.read_csv(f'../../data/training_data/1.txt', delimiter=',')
data.drop(columns=["Unnamed: 0"], inplace=True)

data.columns.values
train_features = [feature for feature in data if feature != 'invocations']

X = data[train_features]
y = data[['invocations']]

test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)

#time_steps = 1198956
num_features = 9
#input_shape = (time_steps, num_features)
X_train_array = X_train.to_numpy()
#X_train = X_train_array.reshape((time_steps, num_features))
y_train_array = y_train.to_numpy()
#y_train = y_train_array.reshape((time_steps, 1))

sequence_length = 3600
num_samples = len(X_train_array) // sequence_length

X_train_reshaped = X_train_array[:num_samples * sequence_length].reshape(-1, sequence_length, num_features)
y_train_reshaped = y_train_array[:num_samples].reshape(-1, 1)

input_shape = (sequence_length, num_features)

model = build_model(
    input_shape,
    head_size=4,
    num_heads=1,
    ff_dim=4,
    num_transformer_blocks=4,
    mlp_units=[8],
    mlp_dropout=0.4,
    dropout=0.25,
)

model.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics=["accuracy"],
)

history = model.fit(
    X_train_reshaped, y_train_reshaped,
    epochs=1,
    batch_size=32,
)



In [10]:
%%capture --no-display
from datetime import datetime

now = datetime.now()

model_preds = []
for i in range(200):
    x = np.reshape(X_test[i:i+3600], (-1, 3600, 9))
    prediction = model.predict(x)
    model_preds.append(round(prediction[0][0]))

later = datetime.now()
difference = (later - now).total_seconds()
difference

16.345417

In [25]:
y_pred = pd.DataFrame({"invocations": model_preds})
score = f1_score(y_test[3601:3801], y_pred)
score

0.0

In [38]:
x = np.reshape(X_test[:3600], (-1, 3600, 9))
prediction = model.predict(x)
prediction



array([[0.]], dtype=float32)

In [44]:
x = [1,1,1,0]
y = pd.DataFrame({"invocations": x})
y

Unnamed: 0,invocations
0,1
1,1
2,1
3,0


In [43]:
y_test

Unnamed: 0,invocations
1198956,0
1198957,0
1198958,0
1198959,0
1198960,0
...,...
1498691,1
1498692,1
1498693,1
1498694,1
