In [1]:
import os
import sys

sys.path.append('../../txf_design-space/')
sys.path.append('../../txf_design-space/flexibert')
sys.path.append('../../boshnas/boshnas/')
sys.path.append('../utils')
sys.path.append('../')

import yaml
import json
import time
import torch
import shlex
import shutil
import argparse
import subprocess
import collections
import numpy as np
from tqdm import tqdm

from embeddings.utils import graph_util, print_util as pu

sys.path.append('../../txf_design-space/transformers/src/transformers')
import embedding_util, energy_util

from boshnas import BOSHNAS
from acq import gosh_acq as acq

from transformers import BertModel
from transformers import RobertaTokenizer, RobertaModel
from transformers.models.bert.configuration_bert import BertConfig
from transformers.models.bert.modeling_modular_bert import BertModelModular, BertForMaskedLMModular, BertForSequenceClassificationModular

import warnings
warnings.filterwarnings("ignore")

from run_energy_profiler import *

In [2]:
# Load design space
design_space = yaml.safe_load(open('../design_space/design_space.yaml'))

# Load dataset
dataset = json.load(open('../dataset/dataset.json'))
for key in dataset.keys():
    dataset[key]['embedding'] = eval(dataset[key]['embedding'])
    
X, latency, energy, peak_power = convert_to_tabular(dataset)
max_latency, max_energy, max_peak_power = np.amax(latency), np.amax(energy), np.amax(peak_power)

mean_X = np.mean(X, axis=0)

# Increase maximum values
max_latency, max_energy, max_peak_power = 1.2 * max_latency, 1.2 * max_energy, 1.2 * max_peak_power
print(f'Max latency: {max_latency : 0.3f}s/seq. Max energy: {max_energy : 0.3f}J/seq. Max peak power: {max_peak_power : 0.3f}W')
    
latency, energy, peak_power = latency/max_latency, energy/max_energy, peak_power/max_peak_power
    
# Get the embedding for model with hash in the first entry
model_hash = list(dataset.keys())[0]
print(np.array(dataset[model_hash]['embedding'])) 

surrogate_models = \
    init_surrogate_models('boshnas', dataset, design_space, '../dataset/surrogate_models/', True)

X_ds = convert_to_tabular({model_hash: dataset[model_hash]}, only_embeddings=True)

# We see a non-zero epistemic uncertainty
get_predictions(surrogate_models, X_ds-mean_X)

Max latency:  0.017s/seq. Max energy:  1.058J/seq. Max peak power:  343.896W
[  4   3 194 151   0 208   1   2 202 220   2 100 185   2 125 174   0  83
 154   1 214 126   3 128  38   2 223 106   2 192  60   0   0   0   0   0
   0]


(array([[5.20225889e-39, 5.09268604e-03]]),
 array([[0., 1.]]),
 array([[1., 1.]]),
 2.0050926860421896,
 0)

In [3]:
get_predictions(surrogate_models, (X-mean_X)/max_X)

([(tensor([0.5864]), (tensor([0.5780]), 0)),
  (tensor([0.5845]), (tensor([0.5760]), 0)),
  (tensor([0.5921]), (tensor([0.5802]), 0)),
  (tensor([0.5904]), (tensor([0.5779]), 0)),
  (tensor([0.5851]), (tensor([0.5775]), 0)),
  (tensor([0.5894]), (tensor([0.5811]), 0)),
  (tensor([0.5873]), (tensor([0.5749]), 0)),
  (tensor([0.5817]), (tensor([0.5772]), 0)),
  (tensor([0.5884]), (tensor([0.5722]), 0)),
  (tensor([0.5807]), (tensor([0.5745]), 0)),
  (tensor([0.5843]), (tensor([0.5707]), 0)),
  (tensor([0.5852]), (tensor([0.5775]), 0)),
  (tensor([0.5843]), (tensor([0.5737]), 0)),
  (tensor([0.5832]), (tensor([0.5757]), 0)),
  (tensor([0.5843]), (tensor([0.5773]), 0)),
  (tensor([0.5869]), (tensor([0.5772]), 0)),
  (tensor([0.5847]), (tensor([0.5777]), 0))],
 [(tensor([0.4938]), (tensor([0.4330]), 0)),
  (tensor([0.4937]), (tensor([0.4276]), 0)),
  (tensor([0.4958]), (tensor([0.4326]), 0)),
  (tensor([0.4952]), (tensor([0.4236]), 0)),
  (tensor([0.4896]), (tensor([0.4296]), 0)),
  (tensor

In [4]:
# Train surrogate models on the normalized dataset
train_surrogate_models(surrogate_models, (X-mean_X)/max_X, latency, energy, peak_power)

# We see a non-zero epistemic uncertainty
get_predictions(surrogate_models, (X-mean_X)/max_X)

([(tensor([0.4834]), (tensor([0.0625]), 0)),
  (tensor([0.4482]), (tensor([0.0661]), 0)),
  (tensor([0.5132]), (tensor([0.0630]), 0)),
  (tensor([0.5275]), (tensor([0.0663]), 0)),
  (tensor([0.2622]), (tensor([0.0646]), 0)),
  (tensor([0.8217]), (tensor([0.0621]), 0)),
  (tensor([0.5896]), (tensor([0.0653]), 0)),
  (tensor([0.7882]), (tensor([0.0620]), 0)),
  (tensor([0.5783]), (tensor([0.0616]), 0)),
  (tensor([0.5036]), (tensor([0.0645]), 0)),
  (tensor([0.6091]), (tensor([0.0650]), 0)),
  (tensor([0.2616]), (tensor([0.0651]), 0)),
  (tensor([0.2589]), (tensor([0.0644]), 0)),
  (tensor([0.3191]), (tensor([0.0645]), 0)),
  (tensor([0.2489]), (tensor([0.0648]), 0)),
  (tensor([0.2565]), (tensor([0.0650]), 0)),
  (tensor([0.2837]), (tensor([0.0652]), 0))],
 [(tensor([0.4703]), (tensor([0.0603]), 0)),
  (tensor([0.4575]), (tensor([0.0575]), 0)),
  (tensor([0.4970]), (tensor([0.0597]), 0)),
  (tensor([0.5043]), (tensor([0.0565]), 0)),
  (tensor([0.2690]), (tensor([0.0599]), 0)),
  (tensor

In [5]:
latency, energy, peak_power

(array([0.50794941, 0.46322158, 0.5299725 , 0.55653288, 0.28797305,
        0.83333333, 0.6225946 , 0.82762999, 0.5978553 , 0.50909825,
        0.63592478, 0.28402982, 0.28320672, 0.34924548, 0.2638582 ,
        0.27996118, 0.3165051 ]),
 array([0.46015746, 0.47726582, 0.51435174, 0.52287294, 0.2386085 ,
        0.83333333, 0.59214409, 0.81465537, 0.58527834, 0.50316834,
        0.62450576, 0.20652767, 0.24696664, 0.3017484 , 0.23711017,
        0.28845448, 0.22602613]),
 array([0.63790534, 0.67373874, 0.58835356, 0.4624932 , 0.3267284 ,
        0.61936777, 0.74753424, 0.83333333, 0.4472374 , 0.55160363,
        0.63335226, 0.39762648, 0.43721469, 0.56476028, 0.60987724,
        0.37214102, 0.3810698 ]))

In [3]:
# Get random samples of the entire dataset to obtain maximum uncertainty
random_samples = embedding_util.get_samples(design_space, num_samples=128, sampling_method='Random', debug=True)
X_ds = convert_to_tabular(random_samples, only_embeddings=True)

# get_predictions(surrogate_models, (X_ds-mean_X)/max_X)

Generating 128 samples using the Random sampler...
Model types: Counter({'deep_wide': 64, 'shallow_wide': 64})


In [10]:
# Test GP
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel

kernel = DotProduct() + WhiteKernel()
gpr = GaussianProcessRegressor(random_state=0)
gpr.fit(X, latency)

print(gpr.predict(X, return_std=True))
gpr.predict(X_ds, return_std=True)

(array([0.61496692, 0.44964377, 0.40962363, 0.3442319 , 0.49871354,
       0.32955583, 0.56445718, 0.40859325, 0.40107206, 0.36049002,
       0.47029914, 0.47251738, 0.46108461, 0.38952403, 0.49884428,
       0.39713076, 0.49151356, 0.50543872, 0.38385656, 0.37129304,
       0.36077238, 0.4901821 , 0.42445154, 0.54022113, 0.48228385,
       0.51868902, 0.48933703, 0.52521296, 0.41137793, 0.40591169,
       0.35041508, 0.38325614, 0.50964025, 0.48233635, 0.51517263,
       0.49524503, 0.52841742, 0.42845551, 0.44443552, 0.38324001,
       0.51485329, 0.55220043, 0.40400061, 0.36077808, 0.50787942,
       0.44718853, 0.41266617, 0.4111156 , 0.39350357, 0.39907721,
       0.59060093, 0.40562163, 0.54034379, 0.31737507, 0.55783334,
       0.47866685, 0.52184731, 0.4071087 , 0.39573855, 0.6681791 ,
       0.48098054, 0.38542373, 0.36697039, 0.45378423, 0.62098377,
       0.53040453, 0.45734412, 0.36712179, 0.57568841, 0.338662  ,
       0.66204074, 0.46739883, 0.45245265, 0.37204812, 0.5492

(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1.

In [9]:
count = 0
while len(set(random_samples.keys()).difference(set(dataset.keys()))) == 0:
    random_samples = embedding_util.get_samples(design_space, num_samples=128, sampling_method='Random', debug=False)
    count += 1; print(count)
len(set(random_samples.keys()).difference(set(dataset.keys())))

128