In [2]:
import numpy as np
from scipy.optimize import minimize
import math

In [3]:
def optimize_n_samples_per_layer(l, n, desired_latency):
    # Total number of samples and the latency constraint
    n_total = n  # Replace with the actual total number of samples
    latency_constraint = desired_latency  # Replace with desired latency constraint

    # Objective function to minimize
    def objective(n_vec):
        weighted_latency = np.dot(n_vec, l) / np.sum(n_vec)
        return (weighted_latency - latency_constraint) ** 2

    # Equality constraint: sum of n_i = n_total
    def eq_constraint(n_vec):
        return np.sum(n_vec) - n_total

    # Inequality constraints: n_i >= 0 for all i
    def ineq_constraint(n_vec):
        return n_vec  # This will ensure all elements are non-negative

    # Set up constraints in the format required by scipy
    constraints = [
        {'type': 'eq', 'fun': eq_constraint},  # Equality constraint
        {'type': 'ineq', 'fun': ineq_constraint}  # Inequality constraints
    ]

    # Initial guess for n_i (can be uniform distribution across layers)
    initial_guess = np.full(len(l), n_total / len(l))

    # Bounds for each n_i to ensure n_i >= 0
    bounds = [(0, None) for _ in range(len(l))]

    # Solve the optimization problem
    result = minimize(objective, initial_guess, method='SLSQP', bounds=bounds, constraints=constraints)

    n_samples_per_layer = np.round(result.x).astype(int)
    # Output the optimized n_i values and the objective value
    if result.success:
        # print("Optimized n values:", n_samples_per_layer)
        print("Minimum objective value:", np.sqrt(result.fun))
    else:
        print("Optimization failed:", result.message)
    return n_samples_per_layer

In [4]:
def get_per_layer_latencies():
    data_pl = np.load('./../plotting2_profile/saved_models/bert_base-SST-2-two_stage/entropy_0.0.npy', allow_pickle=True)
    latencies = []
    for i in np.arange(0, 12):
        latencies.append(round(data_pl[4 * i + 1][i + 1] * 1000, 2))
    return latencies, data_pl[0][1]
latencies, n = get_per_layer_latencies()
print(latencies)
print(n)

[1.81, 3.62, 3.93, 4.97, 6.01, 7.16, 10.23, 9.22, 12.96, 11.38, 12.33, 13.49]
872


In [5]:
def get_latency_to_n_samples_per_layer(latencies, n):
    max_latency = math.ceil(max(latencies)) + 0.5
    min_latency = round(min(latencies))
    latency_to_n_samples_per_layer = {}
    for desired_latency in np.arange(min_latency, max_latency, 0.5):
        n_samples_per_layer = optimize_n_samples_per_layer(latencies, n, desired_latency)
        print(f"Desired latency: {desired_latency}, N samples per layer: {n_samples_per_layer}")
        latency_to_n_samples_per_layer[desired_latency] = tuple(n_samples_per_layer)
    return latency_to_n_samples_per_layer

In [7]:
latency_to_n_samples_per_layer = get_latency_to_n_samples_per_layer(latencies, n)
np.save("./../plotting2_profile/saved_models/bert_base-SST-2-two_stage/lat_n_samples_pl.npy", latency_to_n_samples_per_layer)

Minimum objective value: 0.04090341739029846
Desired latency: 2.0, N samples per layer: [761 111   0   0   0   0   0   0   0   0   0   0]
Minimum objective value: 0.147529146112781
Desired latency: 2.5, N samples per layer: [496 212 163   0   0   0   0   0   0   0   0   0]
Minimum objective value: 0.04809066425733066
Desired latency: 3.0, N samples per layer: [377 215 187  93   0   0   0   0   0   0   0   0]
Minimum objective value: 3.119185759015153e-06
Desired latency: 3.5, N samples per layer: [292 196 180 125  70   9   0   0   0   0   0   0]
Minimum objective value: 9.587974814095901e-08
Desired latency: 4.0, N samples per layer: [227 173 164 133 102  67   0   6   0   0   0   0]
Minimum objective value: 1.331852583774662e-08
Desired latency: 4.5, N samples per layer: [192 155 149 127 106  82  19  40   0   0   0   0]
Minimum objective value: 1.7142029129502134e-08
Desired latency: 5.0, N samples per layer: [169 141 136 120 104  86  38  54   0  20   5   0]
Minimum objective value: 5.

In [8]:
print(len(latency_to_n_samples_per_layer))
unique_lton = set()
for l in latency_to_n_samples_per_layer:
    unique_lton.add(latency_to_n_samples_per_layer[l])
print(len(unique_lton))

25
25


In [9]:
latencies = [float(x) for x in latency_to_n_samples_per_layer]
print(latencies[0:12])
print(latencies[12:])

[2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5]
[8.0, 8.5, 9.0, 9.5, 10.0, 10.5, 11.0, 11.5, 12.0, 12.5, 13.0, 13.5, 14.0]


In [10]:
temp_data = np.load('./../plotting2_profile/saved_models/bert_base-SST-2-two_stage/lat_entropies_4.0.npy', allow_pickle=True)
print(temp_data)

total_latency = 0
total_accuracy = 0
for layer in range(1, 13):
    total_latency += temp_data[0][layer] * temp_data[1][layer]
    if layer in temp_data[3]:
        total_accuracy += temp_data[0][layer] * temp_data[3][layer]
total_latency /= 872
total_accuracy /= 872
print(total_latency)
print(total_accuracy)

[{1: 107, 2: 118, 3: 93, 4: 84, 5: 39, 6: 68, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 363}
 {1: 0.001680842069821937, 2: 0.0027254698640209133, 3: 0.003777606512910576, 4: 0.004817437557947068, 5: 0.0058552301847017724, 6: 0.006891341770396513, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0.013220137472651878}
 0.5654625382262997
 {1: np.float64(0.9158878504672897), 2: np.float64(0.923728813559322), 12: np.float64(0.8760330578512396), 6: np.float64(0.9558823529411765), 3: np.float64(0.978494623655914), 4: np.float64(0.8571428571428571), 5: np.float64(0.8974358974358975)}]
0.0077446248006383216
0.9036697247706422


In [12]:
latency_to_n_samples_per_layer[4.0]

(np.int64(227),
 np.int64(173),
 np.int64(164),
 np.int64(133),
 np.int64(102),
 np.int64(67),
 np.int64(0),
 np.int64(6),
 np.int64(0),
 np.int64(0),
 np.int64(0),
 np.int64(0))