## Set up the environment

In [None]:
from llmselector.data_utils.fever import DataLoader_FEVER
from llmselector.compoundai.module.debate import MultiAgentDebateMultiRound
from llmselector.compoundai.metric import Metric, compute_score
from llmselector.compoundai.optimizer import OptimizerFullSearch, OptimizerLLMDiagnoser
import llmselector, os
import os
db_path = '../../cache/db_fever.sqlite'
if not os.path.exists(db_path): 
    !wget -P ../cache https://github.com/LLMSELECTOR/LLMSELECTOR/releases/download/0.0.1/db_fever.sqlite

In [None]:
# Register the SQLite file path with llmselector.
llmselector.config.config(db_path=db_path)

## 1. Load dataset

In [None]:
from sklearn.model_selection import train_test_split

# --- Experiment settings -------------------------------------------------
# Seed for the single train/test split used by the fixed-model baselines.
random_state = 2025
# Split seeds and optimizer seeds consumed pairwise by the experiment sweep.
random_state_list = [2027, 2028, 2029, 2025, 2026]
opt_seed_list = [0] * len(random_state_list)
# Passed to the optimizer as max_worker.
num_worker = 40

# --- Data ----------------------------------------------------------------
Mydataloader = DataLoader_FEVER()
q_data = Mydataloader.get_query_df()
# Disabled filter, kept for reference:
#q_data = q_data.drop(q_data.index[[1588,2272]]) # these points invoke the output filtering by claude

# 50/50 split of the query frame.
train_df, test_df = train_test_split(
    q_data,
    test_size=0.5,
    random_state=random_state,
)

## 2. Specify model and eval metric

In [None]:
# Candidate model identifiers, one per line and grouped by provider.
model_list = [
    # OpenAI
    'gpt-5-2025-08-07minimal',
    'gpt-5-mini-2025-08-07minimal',
    'gpt-5-nano-2025-08-07minimal',
    # Anthropic
    'claude-opus-4-1-20250805minimal',
    'claude-sonnet-4-20250514minimal',
    #'claude-3-5-haiku-20241022',
    # Google
    'gemini-2.5-prominimal',
    'gemini-2.5-flashminimal',
    'gemini-2.5-flash-liteminimal',
]

# Evaluation metric shared by every scoring / optimisation call below.
M1 = Metric('em_direct')

## 3. Standard systems using one fixed model

In [None]:
# Debate configuration shared by every system built in this notebook.
num_debator = 3
round = 2  # NOTE: shadows the builtin round(); name kept because run_experiment reads it.

# Prompt for the opening round: each agent answers the claim on its own.
prompt_template_initdebate = '''Verify the following statement accurately. Give your answer as (X), where X is one of SUPPORTS, REFUTES, and NOT ENOUGH INFO. Give a one-sentence explanation.
[Claim]: {query}
'''

# Prompt for later rounds: each agent revises its answer given the others' answers.
prompt_template_debate = '''Below is a user question, your own answer, and other agents' answers. Can you please update your answer? Critically analyze your solution, that of the other agents, as well as your own knowledge. Then give your final answer at the end as (X), where X is one of SUPPORTS, REFUTES, and NOT ENOUGH INFO.
[User Question]:{query}
[Your Answer]: {response}
[Other agents' answers]: {other_responses}
'''

# One debate system per model; the optimizer is given a single-element model list.
Agents_SameModel = {}
for name in model_list:
    Agents_SameModel[name] = MultiAgentDebateMultiRound(
        num_debator=num_debator,
        round=round,
        prompt_template_initdebate=prompt_template_initdebate,
        prompt_template_debate=prompt_template_debate,
    )
    Opt0 = OptimizerFullSearch(model_list=[name])
    Opt0.optimize(train_df, M1, Agents_SameModel[name])

In [None]:
# Shallow copy so later additions do not touch Agents_SameModel itself.
All_systems = dict(Agents_SameModel)

# Score every fixed-model system on the train half, then on the test half.
results_train = compute_score(All_systems, train_df, M1)
display("train accuracy", results_train)

results = compute_score(All_systems, test_df, M1)
display("test accuracy", results)

## 4. LLMSELECTOR

In [None]:
def run_experiment(q_data, random_seed=2025, opt_seed=0, log_path=r'../../log/TableArithmetic/',train_size=500):
    """Run one LLMSELECTOR experiment and write train/test scores to CSV.

    The query frame is split 50/50 with ``random_seed``. A fresh
    ``MultiAgentDebateMultiRound`` system is optimised with
    ``OptimizerLLMDiagnoser`` on the first ``train_size`` rows of the train
    half, then scored together with the fixed-model systems in
    ``Agents_SameModel`` on both halves.

    Relies on notebook globals: ``num_debator``, ``round``,
    ``prompt_template_initdebate``, ``prompt_template_debate``, ``model_list``,
    ``num_worker``, ``M1`` and ``Agents_SameModel``.

    Args:
        q_data: Query frame to split into train and test halves.
        random_seed: Seed for ``train_test_split``; also used in output file names.
        opt_seed: Seed passed to the optimizer; also used in output file names.
        log_path: Directory that receives ``train_acc_*.csv`` and ``test_acc_*.csv``.
            NOTE(review): the default names a 'TableArithmetic' folder; the
            visible caller passes an explicit path instead.
        train_size: Number of leading train rows given to the optimizer.

    Returns:
        None. Results are displayed and written to ``log_path``.
    """
    # Create the output directory before any expensive work so that an invalid
    # or unwritable path fails immediately instead of after the optimisation.
    os.makedirs(log_path, exist_ok=True)

    train_df, test_df = train_test_split(q_data,test_size=0.5, random_state=random_seed)
    LLMSELECTOR = MultiAgentDebateMultiRound(num_debator=num_debator,round=round,
                                             prompt_template_initdebate=prompt_template_initdebate,
                                             prompt_template_debate=prompt_template_debate,
                                             )
    Optimizer = OptimizerLLMDiagnoser(model_list = model_list,max_budget=1000,max_worker=num_worker,seed=opt_seed)
    # Only a prefix of the train half is used for optimisation; scoring below uses all of it.
    Optimizer.optimize( train_df.head(train_size), M1, LLMSELECTOR,show_progress=False)

    All_systems = {"LLMSELECTOR": LLMSELECTOR, **Agents_SameModel}

    results_train = compute_score(All_systems, train_df, M1)
    display("train accuracy",results_train)
    # os.path.join avoids a doubled separator when log_path already ends with one.
    results_train.to_csv(os.path.join(log_path, f"train_acc_{random_seed}_{opt_seed}.csv"))

    results = compute_score(All_systems, test_df, M1)
    display("test accuracy",results)
    results.to_csv(os.path.join(log_path, f"test_acc_{random_seed}_{opt_seed}.csv"))
    return

In [None]:
from tqdm import tqdm

# Name of the log sub-directory for this sweep.
dataname = 'MAD_FEVER_generalmodel2025_20250918'

# zip() silently truncates to the shorter list, so make a mismatch loud.
assert len(random_state_list) == len(opt_seed_list), \
    "random_state_list and opt_seed_list must have the same length"

# Materialise the pairs so tqdm knows the total and can show progress / ETA.
seed_pairs = list(zip(random_state_list, opt_seed_list))

# Plain loop: run_experiment is called for its side effects and returns None.
for split_seed, optimizer_seed in tqdm(seed_pairs):
    run_experiment(q_data, random_seed=split_seed, opt_seed=optimizer_seed, log_path=f'../../log/{dataname}/')