## Configure the environment

In [1]:
import llmselector, os
if not os.path.exists('../cache/db_livecodebench.sqlite'): 
    !wget -P ../cache https://github.com/LLMSELECTOR/LLMSELECTOR/releases/download/0.0.1/db_livecodebench.sqlite
llmselector.config.config(
    db_path=f"../cache/db_livecodebench.sqlite" )

## Load the LiveCodeBench dataset

In [2]:
from llmselector.data_utils.livecodebench import DataLoader_livecodebench
from sklearn.model_selection import train_test_split

# Pull the query table from the livecodebench loader.
# NOTE(review): presumably a pandas DataFrame, judging by `get_query_df` -- confirm.
Mydataloader = DataLoader_livecodebench()
q_data = Mydataloader.get_query_df()

# Hold out half of the queries for testing; the fixed random_state keeps the
# split identical across runs.
train_df, test_df = train_test_split(
    q_data,
    test_size=0.5,
    random_state=2025,
)

## Use a single LLM

In [3]:
from llmselector.compoundai.module.selfrefine import SelfRefine
from llmselector.compoundai.optimizer import OptimizerFullSearch
from llmselector.compoundai.metric import Metric, compute_score

# Baselines: one SelfRefine pipeline per candidate model. Each pipeline is
# optimized over a search space that contains only that one model.
model_list = [
    'gpt-4o-2024-05-13',
    'claude-3-5-sonnet-20240620',
    'gemini-1.5-pro',
]
Agents_SameModel = {}
for name in model_list:
    agent = SelfRefine()
    Agents_SameModel[name] = agent
    Opt0 = OptimizerFullSearch(model_list=[name])
    Opt0.optimize(train_df, Metric('em'), agent)

# Score every single-model pipeline on the held-out split with the 'em' metric.
results = compute_score(
    Agents_SameModel,
    test_df,
    Metric('em'),
)
print(results)

Processing: 100%|██████████| 239/239 [00:00<00:00, 963.07it/s]
100%|██████████| 1/1 [00:00<00:00,  3.49it/s]


('gpt-4o-2024-05-13', 'gpt-4o-2024-05-13', 'gpt-4o-2024-05-13')


Processing: 100%|██████████| 239/239 [00:00<00:00, 995.61it/s]
100%|██████████| 1/1 [00:00<00:00,  3.80it/s]


('claude-3-5-sonnet-20240620', 'claude-3-5-sonnet-20240620', 'claude-3-5-sonnet-20240620')


Processing: 100%|██████████| 239/239 [00:00<00:00, 1318.96it/s]
100%|██████████| 1/1 [00:00<00:00,  4.77it/s]


('gemini-1.5-pro', 'gemini-1.5-pro', 'gemini-1.5-pro')


100%|██████████| 240/240 [00:00<00:00, 2033.36it/s]
100%|██████████| 240/240 [00:00<00:00, 10008.38it/s]
100%|██████████| 240/240 [00:00<00:00, 1900.45it/s]
100%|██████████| 240/240 [00:00<00:00, 17657.44it/s]
100%|██████████| 240/240 [00:00<00:00, 2190.40it/s]
100%|██████████| 240/240 [00:00<00:00, 12998.88it/s]

                         Name  Mean_Score
0           gpt-4o-2024-05-13    0.862500
1  claude-3-5-sonnet-20240620    0.891667
2              gemini-1.5-pro    0.866667





## Optimize model selection

In [5]:
from llmselector.compoundai.optimizer import OptimizerLLMDiagnoser

# Fresh SelfRefine pipeline whose model assignment is chosen by the optimizer.
LLMSELECTOR = SelfRefine()

# Default-constructed diagnoser-based optimizer, fitted on the training split
# with the 'em' metric.
Optimizer = OptimizerLLMDiagnoser()
Optimizer.optimize(
    train_df,
    Metric('em'),
    LLMSELECTOR,
)

pre-compute the score... with allocations: [['gpt-4o-2024-05-13', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'claude-3-haiku-20240307'], ['gpt-4-turbo-2024-04-09', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'claude-3-haiku-20240307'], ['gpt-4o-mini-2024-07-18', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'claude-3-haiku-20240307'], ['claude-3-5-sonnet-20240620', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'claude-3-haiku-20240307'], ['claude-3-haiku-20240307', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'claude-3-haiku-20240307'], ['gemini-1.5-pro', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'claude-3-haiku-20240307'], ['gemini-1.5-flash', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'claude-3-haiku-20240307'], ['meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'claude-3-haiku-20240307'], ['meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'claude-3-haiku-2

Processing: 100%|██████████| 239/239 [00:00<00:00, 1607.60it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1616.16it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1539.92it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1763.72it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1965.40it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1627.49it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1797.34it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1803.68it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1814.25it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1796.62it/s]
100%|██████████| 10/10 [00:01<00:00,  5.92it/s]
100%|██████████| 239/239 [00:03<00:00, 75.14it/s]


pre-compute the score... with allocations: [['gemini-1.5-pro', 'gpt-4o-2024-05-13', 'claude-3-haiku-20240307'], ['gemini-1.5-pro', 'gpt-4-turbo-2024-04-09', 'claude-3-haiku-20240307'], ['gemini-1.5-pro', 'gpt-4o-mini-2024-07-18', 'claude-3-haiku-20240307'], ['gemini-1.5-pro', 'claude-3-5-sonnet-20240620', 'claude-3-haiku-20240307'], ['gemini-1.5-pro', 'claude-3-haiku-20240307', 'claude-3-haiku-20240307'], ['gemini-1.5-pro', 'gemini-1.5-pro', 'claude-3-haiku-20240307'], ['gemini-1.5-pro', 'gemini-1.5-flash', 'claude-3-haiku-20240307'], ['gemini-1.5-pro', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'claude-3-haiku-20240307'], ['gemini-1.5-pro', 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo', 'claude-3-haiku-20240307'], ['gemini-1.5-pro', 'Qwen/Qwen2.5-72B-Instruct-Turbo', 'claude-3-haiku-20240307']]


Processing: 100%|██████████| 239/239 [00:00<00:00, 1497.90it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1579.61it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1638.78it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1886.39it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1723.99it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1761.16it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1762.92it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1574.94it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1623.55it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1621.61it/s]
100%|██████████| 10/10 [00:01<00:00,  5.76it/s]
100%|██████████| 239/239 [00:03<00:00, 72.54it/s]


pre-compute the score... with allocations: [['gemini-1.5-pro', 'gpt-4o-2024-05-13', 'gpt-4o-2024-05-13'], ['gemini-1.5-pro', 'gpt-4o-2024-05-13', 'gpt-4-turbo-2024-04-09'], ['gemini-1.5-pro', 'gpt-4o-2024-05-13', 'gpt-4o-mini-2024-07-18'], ['gemini-1.5-pro', 'gpt-4o-2024-05-13', 'claude-3-5-sonnet-20240620'], ['gemini-1.5-pro', 'gpt-4o-2024-05-13', 'claude-3-haiku-20240307'], ['gemini-1.5-pro', 'gpt-4o-2024-05-13', 'gemini-1.5-pro'], ['gemini-1.5-pro', 'gpt-4o-2024-05-13', 'gemini-1.5-flash'], ['gemini-1.5-pro', 'gpt-4o-2024-05-13', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo'], ['gemini-1.5-pro', 'gpt-4o-2024-05-13', 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo'], ['gemini-1.5-pro', 'gpt-4o-2024-05-13', 'Qwen/Qwen2.5-72B-Instruct-Turbo']]


Processing: 100%|██████████| 239/239 [00:00<00:00, 1805.79it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 888.74it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1885.78it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1600.13it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1768.20it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1947.67it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1836.72it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1738.55it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1612.15it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1641.65it/s]
100%|██████████| 10/10 [00:01<00:00,  5.52it/s]
100%|██████████| 239/239 [00:03<00:00, 74.21it/s]


pre-compute the score... with allocations: [['gpt-4o-2024-05-13', 'gpt-4o-2024-05-13', 'claude-3-5-sonnet-20240620'], ['gpt-4-turbo-2024-04-09', 'gpt-4o-2024-05-13', 'claude-3-5-sonnet-20240620'], ['gpt-4o-mini-2024-07-18', 'gpt-4o-2024-05-13', 'claude-3-5-sonnet-20240620'], ['claude-3-5-sonnet-20240620', 'gpt-4o-2024-05-13', 'claude-3-5-sonnet-20240620'], ['claude-3-haiku-20240307', 'gpt-4o-2024-05-13', 'claude-3-5-sonnet-20240620'], ['gemini-1.5-pro', 'gpt-4o-2024-05-13', 'claude-3-5-sonnet-20240620'], ['gemini-1.5-flash', 'gpt-4o-2024-05-13', 'claude-3-5-sonnet-20240620'], ['meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'gpt-4o-2024-05-13', 'claude-3-5-sonnet-20240620'], ['meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo', 'gpt-4o-2024-05-13', 'claude-3-5-sonnet-20240620'], ['Qwen/Qwen2.5-72B-Instruct-Turbo', 'gpt-4o-2024-05-13', 'claude-3-5-sonnet-20240620']]


Processing: 100%|██████████| 239/239 [00:00<00:00, 1889.46it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1786.66it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 2082.60it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1921.19it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1834.14it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1892.77it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1642.40it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1596.46it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1577.28it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1554.65it/s]
100%|██████████| 10/10 [00:01<00:00,  6.07it/s]
100%|██████████| 239/239 [00:03<00:00, 73.58it/s]


pre-compute the score... with allocations: [['claude-3-5-sonnet-20240620', 'gpt-4o-2024-05-13', 'claude-3-5-sonnet-20240620'], ['claude-3-5-sonnet-20240620', 'gpt-4-turbo-2024-04-09', 'claude-3-5-sonnet-20240620'], ['claude-3-5-sonnet-20240620', 'gpt-4o-mini-2024-07-18', 'claude-3-5-sonnet-20240620'], ['claude-3-5-sonnet-20240620', 'claude-3-5-sonnet-20240620', 'claude-3-5-sonnet-20240620'], ['claude-3-5-sonnet-20240620', 'claude-3-haiku-20240307', 'claude-3-5-sonnet-20240620'], ['claude-3-5-sonnet-20240620', 'gemini-1.5-pro', 'claude-3-5-sonnet-20240620'], ['claude-3-5-sonnet-20240620', 'gemini-1.5-flash', 'claude-3-5-sonnet-20240620'], ['claude-3-5-sonnet-20240620', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'claude-3-5-sonnet-20240620'], ['claude-3-5-sonnet-20240620', 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo', 'claude-3-5-sonnet-20240620'], ['claude-3-5-sonnet-20240620', 'Qwen/Qwen2.5-72B-Instruct-Turbo', 'claude-3-5-sonnet-20240620']]


Processing: 100%|██████████| 239/239 [00:00<00:00, 1678.53it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1928.62it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1781.34it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 2149.13it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 2253.82it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1921.22it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1846.83it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1789.99it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1681.99it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1707.48it/s]
100%|██████████| 10/10 [00:01<00:00,  6.46it/s]
100%|██████████| 239/239 [00:03<00:00, 77.21it/s]


pre-compute the score... with allocations: [['claude-3-5-sonnet-20240620', 'gpt-4o-2024-05-13', 'gpt-4o-2024-05-13'], ['claude-3-5-sonnet-20240620', 'gpt-4o-2024-05-13', 'gpt-4-turbo-2024-04-09'], ['claude-3-5-sonnet-20240620', 'gpt-4o-2024-05-13', 'gpt-4o-mini-2024-07-18'], ['claude-3-5-sonnet-20240620', 'gpt-4o-2024-05-13', 'claude-3-5-sonnet-20240620'], ['claude-3-5-sonnet-20240620', 'gpt-4o-2024-05-13', 'claude-3-haiku-20240307'], ['claude-3-5-sonnet-20240620', 'gpt-4o-2024-05-13', 'gemini-1.5-pro'], ['claude-3-5-sonnet-20240620', 'gpt-4o-2024-05-13', 'gemini-1.5-flash'], ['claude-3-5-sonnet-20240620', 'gpt-4o-2024-05-13', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo'], ['claude-3-5-sonnet-20240620', 'gpt-4o-2024-05-13', 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo'], ['claude-3-5-sonnet-20240620', 'gpt-4o-2024-05-13', 'Qwen/Qwen2.5-72B-Instruct-Turbo']]


Processing: 100%|██████████| 239/239 [00:00<00:00, 1701.06it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1718.56it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1712.89it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1688.54it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1641.67it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1578.96it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1643.09it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1650.37it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1827.39it/s]
Processing: 100%|██████████| 239/239 [00:00<00:00, 1651.49it/s]
100%|██████████| 10/10 [00:01<00:00,  5.45it/s]
100%|██████████| 239/239 [00:02<00:00, 80.39it/s]


In [6]:
# Evaluate the optimized pipeline on the held-out split with the 'em' metric.
results = compute_score(
    {"LLMSELECTOR": LLMSELECTOR},
    test_df,
    Metric('em'),
)
print(results)

100%|██████████| 240/240 [00:00<00:00, 1750.47it/s]
100%|██████████| 240/240 [00:00<00:00, 17027.23it/s]

          Name  Mean_Score
0  LLMSELECTOR    0.954167



