In [1]:
import numpy as np
import time

### Load Profiled Data

In [4]:
# Entropy thresholds for which a profiling run was saved to disk.
entropies = [0.0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]

# e -> [exit_layer_count, eval_time, actual_cost/full_cost, accuracy]
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load files you trust.
data_pl = {
    e: np.load('./../plotting2/saved_models/bert_base-SST-2-two_stage/entropy_{}.npy'.format(e), allow_pickle=True)
    for e in entropies
}

### Bucket Accuracy

In [5]:
# Group every profiled (entropy threshold, exit layer) pair by its accuracy,
# rounded down to a multiple of 5 percentage points. Buckets below 60% are dropped.
acc_buckets = {}  # accuracy bucket (%) -> [(latency, exit_layer, entropy, exit_layer_count), ...]
min_exit_samples = 0  # N/k

for threshold in entropies:
    profile = data_pl[threshold]
    for layer in profile[3]:
        # x * 20 truncated, then * 5, floors the accuracy fraction to a 5% step.
        bucket = int(profile[3][layer] * 20) * 5
        if not bucket >= 60:
            continue
        samples = profile[0][layer]
        # Skip exits that were hit by fewer samples than the configured minimum.
        if not samples >= min_exit_samples:
            continue
        # eval_time scaled by 1000 and truncated (presumably seconds -> whole ms; confirm).
        latency_scaled = int(profile[1][layer] * 1000)
        acc_buckets.setdefault(bucket, []).append((latency_scaled, layer, threshold, samples))

A greedy heuristic to get the entropy vector given accuracy and latency constraints

In [6]:
# Accuracy is prioritized. We achieve the highest accuracy possible within the latency budget.
def get_entropies(targt_accuracy, target_latency, buckets=None, num_layers=12):
    """Greedily choose one entropy threshold per exit layer.

    For each exit layer, the candidate from the highest accuracy bucket that
    satisfies both constraints is kept; candidates tied on accuracy are
    resolved in favour of the lower latency. Afterwards the deepest exit
    layer that received a non-zero threshold is overwritten with 1
    (presumably so that every remaining sample exits there; confirm against
    the consumer of this vector).

    Args:
        targt_accuracy: Minimum accuracy bucket (same unit as the bucket
            keys) a candidate must belong to.
        target_latency: Maximum latency a candidate may have, in the same
            unit as the latencies stored in the buckets.
        buckets: Mapping of accuracy bucket -> iterable of
            (latency, exit_layer, entropy, exit_layer_count) tuples.
            Defaults to the notebook-level ``acc_buckets``.
        num_layers: Number of exit layers, i.e. the length of the result.

    Returns:
        A tuple of length ``num_layers``. Position ``i`` holds the threshold
        chosen for exit layer ``i + 1`` (exit layers are indexed from 1
        here), or 0 when no candidate qualified for that layer.
    """
    if buckets is None:
        buckets = acc_buckets

    # Per exit layer: (entropy, accuracy, latency) of the best candidate so far.
    # The placeholder has accuracy 0, so any positive-accuracy candidate replaces it.
    best = [(0, 0, 10)] * num_layers
    for acc, candidates in buckets.items():
        if acc < targt_accuracy:
            continue
        for latency, exit_layer, e, _ in candidates:
            if latency > target_latency:
                continue
            _, best_acc, best_latency = best[exit_layer - 1]
            if best_acc < acc or (best_acc == acc and best_latency > latency):
                best[exit_layer - 1] = (e, acc, latency)

    res = [e for e, _, _ in best]

    # Scan from the deepest layer down to the first one *inclusive*; stopping
    # at index 1 would miss the case where only the first layer was selected.
    for i in range(len(res) - 1, -1, -1):
        if res[i] != 0:
            res[i] = 1
            break
    return tuple(res)

### Simulation

In [7]:
# Paired targets: target_accuracies[i] (%) goes with target_latencies[i] (ms).
target_accuracies = [90, 90, 90, 80, 80, 80, 70, 70, 70, 70]
target_latencies = [150, 100, 80, 150, 120, 100, 120, 100, 80, 50]

for i, acc_goal in enumerate(target_accuracies):
    # Pause 3/(i+1) seconds before each query, so the gaps shrink as the loop advances.
    time.sleep(3 / (i + 1))
    latency_goal = target_latencies[i]
    print(f'Entropy vector for {acc_goal}% acc, {latency_goal}ms latency:', get_entropies(acc_goal, latency_goal))

Entropy vector for 90% acc, 150ms latency: (0.5, 0.1, 0.1, 0, 0, 0.1, 0.01, 0.01, 1, 0, 0, 0)
Entropy vector for 90% acc, 100ms latency: (0.5, 0.1, 0.1, 0, 0, 1, 0, 0, 0, 0, 0, 0)
Entropy vector for 90% acc, 80ms latency: (0.5, 0.1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0)
Entropy vector for 80% acc, 150ms latency: (0.5, 0.1, 0.1, 0.15, 0, 0.1, 0.01, 0.01, 1, 0, 0, 0)
Entropy vector for 80% acc, 120ms latency: (0.5, 0.1, 0.1, 0.15, 0, 0.1, 0.01, 1, 0, 0, 0, 0)
Entropy vector for 80% acc, 100ms latency: (0.5, 0.1, 0.1, 0.15, 0, 1, 0, 0, 0, 0, 0, 0)
Entropy vector for 70% acc, 120ms latency: (0.5, 0.1, 0.1, 0.15, 0, 0.1, 0.01, 1, 0, 0, 0, 0)
Entropy vector for 70% acc, 100ms latency: (0.5, 0.1, 0.1, 0.15, 0, 1, 0, 0, 0, 0, 0, 0)
Entropy vector for 70% acc, 80ms latency: (0.5, 0.1, 0.1, 1, 0, 0, 0, 0, 0, 0, 0, 0)
Entropy vector for 70% acc, 50ms latency: (0.5, 0.1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0)


### Evaluation

In [8]:
def compute_overall_accuracy_latency(data_accLat):
    """Return the sample-weighted mean accuracy and latency over exit layers.

    Args:
        data_accLat: Indexable whose element 0 maps exit layer -> sample
            count, element 1 maps exit layer -> latency and element 3 maps
            exit layer -> accuracy. Exit layers missing from element 3 are
            ignored.

    Returns:
        (accuracy, latency), each averaged with the per-layer sample counts
        as weights.

    Raises:
        ZeroDivisionError: If no exit layer contributes any samples.
    """
    counts = data_accLat[0]
    latencies = data_accLat[1]
    accuracies = data_accLat[3]

    n_samples = 0
    weighted_acc = 0
    weighted_latency = 0
    for layer in counts:
        if layer not in accuracies:
            continue
        n = counts[layer]
        n_samples += n
        weighted_acc += n * accuracies[layer]
        weighted_latency += n * latencies[layer]

    return weighted_acc / n_samples, weighted_latency / n_samples

In [9]:
# Load the per-exit-layer results saved for the (90, 150) accuracy/latency target.
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load files you trust.
accLat_path = './../plotting2/saved_models/bert_base-SST-2-two_stage/accLat_{}_{}.npy'.format(90, 150)
data_accLat_90_150 = np.load(accLat_path, allow_pickle=True)

acc, latency = compute_overall_accuracy_latency(data_accLat_90_150)
print("Target Accuracy: 90%, Average Latency: 150ms ")
# Report accuracy as a whole percentage and latency scaled by 1000 (labelled ms).
achieved_acc_pct = round(acc*100)
achieved_latency_ms = round(latency * 1000)
print(f"Achieved Accuracy: {achieved_acc_pct}%, Average Latency: {achieved_latency_ms}ms")

Target Accuracy: 90%, Average Latency: 150ms 
Achieved Accuracy: 89%, Average Latency: 78ms


In [10]:
# Load the per-exit-layer results saved for the (80, 120) accuracy/latency target.
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load files you trust.
accLat_path = './../plotting2/saved_models/bert_base-SST-2-two_stage/accLat_{}_{}.npy'.format(80, 120)
data_accLat_80_120 = np.load(accLat_path, allow_pickle=True)

acc, latency = compute_overall_accuracy_latency(data_accLat_80_120)
print("Target Accuracy: 80%, Average Latency: 120ms ")
# Report accuracy as a whole percentage and latency scaled by 1000 (labelled ms).
achieved_acc_pct = round(acc*100)
achieved_latency_ms = round(latency * 1000)
print(f"Achieved Accuracy: {achieved_acc_pct}%, Average Latency: {achieved_latency_ms}ms")

Target Accuracy: 80%, Average Latency: 120ms 
Achieved Accuracy: 88%, Average Latency: 90ms


In [11]:
# Load the per-exit-layer results saved for the (75, 100) accuracy/latency target.
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load files you trust.
accLat_path = './../plotting2/saved_models/bert_base-SST-2-two_stage/accLat_{}_{}.npy'.format(75, 100)
data_accLat_75_100 = np.load(accLat_path, allow_pickle=True)
acc, latency = compute_overall_accuracy_latency(data_accLat_75_100)
print("Target Accuracy: 75%, Average Latency: 100ms ")
# Report accuracy as a whole percentage and latency scaled by 1000 (labelled ms).
achieved_acc_pct = round(acc*100)
achieved_latency_ms = round(latency * 1000)
print(f"Achieved Accuracy: {achieved_acc_pct}%, Average Latency: {achieved_latency_ms}ms")

Target Accuracy: 75%, Average Latency: 100ms 
Achieved Accuracy: 85%, Average Latency: 70ms
