# Quick start

In this notebook, we demonstrate how to utilize our router datasets to evaluate router performance.

In [1]:
import pickle
import numpy as np
from sklearn.neighbors import NearestNeighbors
from utils import convert_arrays_to_shapes



### The format of our router dataset is as follows:

In [2]:
# Layout of a router dataset (every leaf is a None placeholder in this template):
#   <difficulty> -> <number of candidate LLMs> -> <candidate-set config> -> {'data', 'model'}
# alongside the 'split_index', 'embedding' and 'prompt' entries, each keyed by
# the train / val / test split.
router_dataset = {
    # 'easy' comes with 3 or 5 candidates, 'hard' with 10, 100 or 1000.
    **{
        difficulty_name: {
            pool_size: {
                pool_kind: {'data': None, 'model': None}
                for pool_kind in ('all_strong', 'all_weak', 'strong_to_weak')
            }
            for pool_size in pool_sizes
        }
        for difficulty_name, pool_sizes in (('easy', (3, 5)), ('hard', (10, 100, 1000)))
    },
    'split_index': {f'{split}_indices': None for split in ('train', 'val', 'test')},
    'embedding': {f'{split}_embed': None for split in ('train', 'val', 'test')},
    'prompt': {f'{split}_prompt': None for split in ('train', 'val', 'test')},
}

### Load and display the pre-built router dataset

Make sure that you have downloaded the `router_dataset` folder into the `data` directory.

In [3]:
# Pick the benchmark to load (stored in to_handle_dataset) from one of:
#   ['arc', 'hellaswag', 'mmlu', 'winogrande', 'gsm8k']
#   ['ifeval', 'bbh', 'gpqa', 'musr', 'math', 'mmlu_pro']
to_handle_dataset = 'gsm8k'

# Read the pre-built router dataset of that benchmark from the data directory.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only open
# dataset files that come from a source you trust.
router_dataset_path = f'data/router_dataset/{to_handle_dataset}_router_dataset.pkl'
with open(router_dataset_path, 'rb') as f:
    router_dataset = pickle.load(f)

# Print the summary returned by convert_arrays_to_shapes (judging by the
# printed output, the same nesting with array shapes in place of the arrays).
router_dataset_shapes = convert_arrays_to_shapes(router_dataset)
print(router_dataset_shapes)

# You can view the router dataset here

{'easy': {3: {'all_strong': {'data': {'train_score': (1055, 3), 'val_score': (132, 3), 'test_score': (132, 3)}, 'model': (3,)}, 'all_weak': {'data': {'train_score': (1055, 3), 'val_score': (132, 3), 'test_score': (132, 3)}, 'model': (3,)}, 'strong_to_weak': {'data': {'train_score': (1055, 3), 'val_score': (132, 3), 'test_score': (132, 3)}, 'model': (3,)}}, 5: {'all_strong': {'data': {'train_score': (1055, 5), 'val_score': (132, 5), 'test_score': (132, 5)}, 'model': (5,)}, 'all_weak': {'data': {'train_score': (1055, 5), 'val_score': (132, 5), 'test_score': (132, 5)}, 'model': (5,)}, 'strong_to_weak': {'data': {'train_score': (1055, 5), 'val_score': (132, 5), 'test_score': (132, 5)}, 'model': (5,)}}}, 'hard': {10: {'all_strong': {'data': {'train_score': (1055, 10), 'val_score': (132, 10), 'test_score': (132, 10)}, 'model': (10,)}, 'all_weak': {'data': {'train_score': (1055, 10), 'val_score': (132, 10), 'test_score': (132, 10)}, 'model': (10,)}, 'strong_to_weak': {'data': {'train_score': 

### Implement a router method (using KNN as an example)

In [4]:
def knn_router(X_train, Y_train, X_test, Y_test, knearest=5):
    """
    Route every test inquiry to one LLM with a kNN correctness predictor and score the routing.

    For each test embedding, the k nearest training embeddings (cosine distance)
    are looked up and their per-LLM scores are averaged; the LLM with the highest
    average is selected (ties go to the lowest column index, as np.argmax does).

    Parameters:
    - X_train: numpy array of shape (N, m), training embeddings.
    - Y_train: numpy array of shape (N, p), correctness score of each of the p LLMs
      on each training inquiry (e.g. binary labels).
    - X_test: numpy array of shape (N', m), test embeddings.
    - Y_test: numpy array of shape (N', p), correctness score of each LLM on each
      test inquiry.
    - knearest: int, number of nearest neighbors to use (default is 5).

    Returns:
    - mu: mean test score of the LLMs selected by the router.
    - vb: mu / bsm, where bsm is the best mean test score reached by any single
      LLM of the candidate set (the largest column mean of Y_test).
    - ep: classification bias, computed as the entropy (in bits) of the softmax
      over the predicted scores, averaged over the test inquiries.
    """

    # Index the training embeddings for cosine-distance neighbor search.
    nn_model = NearestNeighbors(n_neighbors=knearest, metric='cosine')
    nn_model.fit(X_train)

    # indices has one row per test inquiry listing its nearest training rows;
    # the distances themselves are not needed.
    _, indices = nn_model.kneighbors(X_test)

    # Y_train[indices] gathers the neighbor scores into (N', knearest, p); the
    # mean over the neighbor axis is the predicted score of every LLM.
    # The float64 cast keeps the downstream math in double precision whatever
    # the dtype of Y_train is.
    predicted_probs = np.asarray(np.mean(Y_train[indices], axis=1), dtype=np.float64)

    # For each test inquiry, select the LLM with the highest predicted score.
    predicted_llm_indices = np.argmax(predicted_probs, axis=1)

    # Score actually obtained by the selected LLM on each test inquiry.
    mu = np.mean(Y_test[np.arange(Y_test.shape[0]), predicted_llm_indices])
    vb = mu / np.max(np.mean(Y_test, axis=0))

    # Row-wise softmax, computed once; subtracting the row maximum first keeps
    # the exponentials numerically stable.
    exp_scores = np.exp(predicted_probs - np.max(predicted_probs, axis=1, keepdims=True))
    routing_dist = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    # Classification bias Ep: mean entropy of the routing distribution.
    # Entries at or below 1e-10 contribute 0 to the sum.
    terms = np.where(routing_dist > 1e-10, routing_dist * np.log2(routing_dist), 0)
    ep = -np.sum(terms) / routing_dist.shape[0]

    return mu, vb, ep

### Train and test the router

Use the data from the router dataset to train and test the router, and output the performance metrics.

In [5]:
# --- Settings of the router dataset -----------------------------------------
# Difficulty level: 'easy' / 'hard'
difficulty = 'easy'

# Size of the candidate set:
#   "easy" setting -> choose from [3, 5]
#   "hard" setting -> choose from [10, 100, 1000]
num_candidates = 5   # 3 / 5 / 10 / 100 / 1000

# The parameter k of the KNN router, default = 5
knearest = 5


# Accuracy of the reference LLM on each benchmark: a representative LLM with
# strong performance is selected per benchmark, such as GPT-4.
acc_ref_dict = {
    'arc': 0.852, 'hellaswag': 0.953, 'mmlu': 0.864, 'harness_truthfulqa_mc_0': 0.669,
    'winogrande': 0.875, 'gsm8k': 0.92,
    'ifeval': 0.7689, 'bbh': 0.8303, 'gpqa': 0.397, 'math': 0.4, 'musr': 0.699, 'mmlu_pro': 0.637,
}

# Reference accuracy for the benchmark that was loaded.
acc_ref = acc_ref_dict[to_handle_dataset]

# The embeddings do not depend on the candidate-set configuration, so they are
# looked up once, together with the sub-dictionary of the chosen setting.
train_embed = router_dataset['embedding']['train_embed']
test_embed = router_dataset['embedding']['test_embed']
candidate_sets = router_dataset[difficulty][num_candidates]

# One metric value is collected per configuration.
mu, vr, vb, ep = [], [], [], []
for config in ['all_strong', 'all_weak', 'strong_to_weak']:
    config_scores = candidate_sets[config]['data']
    train_score = config_scores['train_score']
    test_score = config_scores['test_score']

    print(f"{to_handle_dataset:<10}", f"{'knn':<10}", f"num={num_candidates:<10}", f"{config:<15}")
    mu1, vb1, ep1 = knn_router(train_embed, train_score, test_embed, test_score, knearest)
    # Vr relates the router's performance to the reference LLM's accuracy.
    vr1 = mu1 / acc_ref
    print(f"mu: {mu1:.4f},  Vr: {vr1:.4f},  Vb: {vb1:.4f},  Ep: {ep1:.4f}")
    print()

    for collected, value in zip((mu, vr, vb, ep), (mu1, vr1, vb1, ep1)):
        collected.append(value)


# Metrics averaged over the three configurations.
print(f"{to_handle_dataset:<10}", f"{'knn':<10}", f"num={num_candidates:<10}", f"{'avg_metrics':<15}")
print(f"mu: {np.mean(mu):.4f},  Vr: {np.mean(vr):.4f},  Vb: {np.mean(vb):.4f},  Ep: {np.mean(ep):.4f}")
print()



gsm8k      knn        num=5          all_strong     
mu: 0.8258,  Vr: 0.8976,  Vb: 0.9397,  Ep: 2.3123

gsm8k      knn        num=5          all_weak       
mu: 0.4545,  Vr: 0.4941,  Vb: 1.1321,  Ep: 2.3025

gsm8k      knn        num=5          strong_to_weak 
mu: 0.8485,  Vr: 0.9223,  Vb: 0.9333,  Ep: 2.2711

gsm8k      knn        num=5          avg_metrics    
mu: 0.7096,  Vr: 0.7713,  Vb: 1.0017,  Ep: 2.2953

