In [25]:
import time
import warnings
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import plotly.express as px
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

pd.set_option('display.float_format', lambda x: '%.6f' % x)
pd.set_option('display.max_rows', 110)
warnings.filterwarnings("ignore")

In [28]:
# Read the trace for workload 1 and discard the "Unnamed: 0" column
# (presumably a row index written out when the file was saved -- confirm).
data = (
    pd.read_csv('../../data/training_data/1.txt', delimiter=',')
    .drop(columns=["Unnamed: 0"])
)
data

Unnamed: 0,second,invocations,time_since_prev_invocation,year,month,day,hour,minute,second_of_day,is_weekday
0,0,0,1,2021,1,31,0,0,0,0
1,1,0,2,2021,1,31,0,0,1,0
2,2,0,3,2021,1,31,0,0,2,0
3,3,0,4,2021,1,31,0,0,3,0
4,4,0,5,2021,1,31,0,0,4,0
...,...,...,...,...,...,...,...,...,...,...
1498691,1209598,1,2,2021,2,13,23,59,86398,0
1498692,1209598,1,2,2021,2,13,23,59,86398,0
1498693,1209598,1,2,2021,2,13,23,59,86398,0
1498694,1209599,1,1,2021,2,13,23,59,86399,0


In [29]:
# Train one XGBoost classifier on the loaded workload and score it on a
# held-out tail of the data.

# Every column except the label is used as a model input.
train_features = [column for column in data.columns if column != 'invocations']

X = data[train_features]
y = data[['invocations']]  # one-column DataFrame (not a Series)

# shuffle=False keeps the original row order, so the final 20% of rows
# become the test set rather than a random sample.
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)

model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)
# Show which classes the model actually predicts on the test rows.
print(np.unique(y_pred))

score = f1_score(y_test, y_pred)
print(f'F1-score: {score}')

[0 1]
F1-score: 0.9136951924084818


In [32]:
# Put the true labels back next to the held-out features, then add the
# model's predictions as a further column so both can share one chart.
test_data = pd.concat([X_test, y_test], axis=1)
predictions = model.predict(test_data[train_features])
test_data["prediction"] = predictions

# Only the last 50,000 test rows are plotted.
fig = (
    px.scatter(
        test_data.tail(50000),
        x='second',
        y=['invocations', 'prediction'],
        labels={'second': 'Seconds'},
    )
    .update_traces(mode='markers')
    .update_layout(
        title='Invocations vs Prediction',
        xaxis_title='Seconds',
        yaxis_title='Count',
        hovermode='x unified',
    )
)
fig.show()

In [16]:
# Inspect the held-out feature rows produced by train_test_split.
X_test

Unnamed: 0,second,time_since_prev_invocation,year,month,day,hour,minute,second_of_day,is_weekday
1198956,951793,15917,2021,2,11,0,23,1393,1
1198957,951794,15918,2021,2,11,0,23,1394,1
1198958,951795,15919,2021,2,11,0,23,1395,1
1198959,951796,15920,2021,2,11,0,23,1396,1
1198960,951797,15921,2021,2,11,0,23,1397,1
...,...,...,...,...,...,...,...,...,...
1498691,1209598,2,2021,2,13,23,59,86398,0
1498692,1209598,2,2021,2,13,23,59,86398,0
1498693,1209598,2,2021,2,13,23,59,86398,0
1498694,1209599,1,2021,2,13,23,59,86399,0


In [17]:
# Inspect the held-out labels (the 'invocations' column) produced by train_test_split.
y_test

Unnamed: 0,invocations
1198956,0
1198957,0
1198958,0
1198959,0
1198960,0
...,...
1498691,1
1498692,1
1498693,1
1498694,1


In [18]:
# Rough latency check: total time for 200 back-to-back predict() calls on the
# full test frame.
n_runs = 200

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; datetime.now() follows the wall clock, which can be adjusted
# while the loop is running.
now = time.perf_counter()
for _ in range(n_runs):
    # Original note on the input: "3.5 days of historical data + latest datapoint"
    # (NOTE(review): confirm against the actual time span of test_data).
    predictions = model.predict(test_data[train_features])
later = time.perf_counter()

# Total elapsed seconds across all runs (not per call).
difference = later - now
difference

15.185093

In [31]:
# Per-row class probabilities for the test rows, as opposed to the hard labels
# returned by predict(). NOTE(review): column order presumably follows
# model.classes_ -- confirm before thresholding on a specific column.
class_probabilities = model.predict_proba(X_test)
class_probabilities

array([[9.9963129e-01, 3.6871457e-04],
       [9.9963129e-01, 3.6871457e-04],
       [9.9963129e-01, 3.6871457e-04],
       ...,
       [6.3324559e-01, 3.6675441e-01],
       [5.4296315e-01, 4.5703685e-01],
       [2.2445112e-01, 7.7554888e-01]], dtype=float32)

# Train all 100 models

In [27]:
# Train and evaluate one classifier per workload file (1.txt ... 100.txt).
# Note that `model` is overwritten on every iteration; only the metrics are kept.
test_size = 0.2  # loop-invariant: fraction of trailing rows held out for evaluation
results = []     # one record per workload, for filtering candidates afterwards

for i in range(100):
    data = pd.read_csv(f'../../data/training_data/{i+1}.txt', delimiter=',')
    data = data.drop(columns=["Unnamed: 0"])

    # Every column except the label is used as a model input.
    train_features = [feature for feature in data.columns if feature != 'invocations']

    X = data[train_features]
    y = data[['invocations']]

    # shuffle=False preserves row order, so the held-out set is the tail of the file.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)

    # An F1-score of 0 is ambiguous: it occurs both when the test set contains
    # no invocations at all and when it contains invocations that the model
    # never predicts. Counting the positive test labels tells the cases apart.
    test_positives = int((y_test['invocations'] == 1).sum())

    score = f1_score(y_test, y_pred)
    results.append({'model': i + 1, 'f1': score, 'test_positives': test_positives})
    print(f'Model: {i+1}, Unique preds: {np.unique(y_pred)}, F1-score: {score}, Test positives: {test_positives}')

Model: 1, Unique preds: [0 1], F1-score: 0.9136951924084818
Model: 2, Unique preds: [0], F1-score: 0.0
Model: 3, Unique preds: [0 1], F1-score: 0.05542635658914729
Model: 4, Unique preds: [0], F1-score: 0.0
Model: 5, Unique preds: [0], F1-score: 0.0
Model: 6, Unique preds: [0 1], F1-score: 0.21281772200525
Model: 7, Unique preds: [0 1], F1-score: 0.7904218120656841
Model: 8, Unique preds: [0 1], F1-score: 0.15895908449600252
Model: 9, Unique preds: [0 1], F1-score: 0.012368181226402812
Model: 10, Unique preds: [0 1], F1-score: 0.3777338892546266
Model: 11, Unique preds: [0 1], F1-score: 0.46189158171876055
Model: 12, Unique preds: [0], F1-score: 0.0
Model: 13, Unique preds: [0], F1-score: 0.0
Model: 14, Unique preds: [0], F1-score: 0.0
Model: 15, Unique preds: [0], F1-score: 0.0
Model: 16, Unique preds: [0], F1-score: 0.0
Model: 17, Unique preds: [0 1], F1-score: 0.0037896731406916154
Model: 18, Unique preds: [0], F1-score: 0.0
Model: 19, Unique preds: [0], F1-score: 0.0
Model: 20, Uni

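Where the F1-score is 0 and the only predicted class is 0, the model predicted no invocations on the test set. This most likely means the test set contains no invocations, but that should be confirmed by counting the positive labels in `y_test`, because the F1-score is also 0 when invocations exist and none are predicted. Workloads with no invocations in the test set need to be removed from the candidate set for workloads.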