In [12]:
import numpy as np

In [13]:
# Shared NumPy random Generator with a fixed seed; generate_random_problem
# draws all of its randomness from this object.
rng = np.random.default_rng(seed=20250306)

In [14]:
def generate_random_problem(n=64, k=5):
    """Draw ``n`` labels from a randomly weighted categorical distribution.

    Args:
        n: Number of labels to sample.
        k: Number of distinct labels; sampled values lie in ``range(k)``.

    Returns:
        Array of ``n`` integer labels produced by the module-level ``rng``.
    """
    # Unnormalised weights, one per label, drawn uniformly from [0, 1).
    weights = rng.uniform(size=k)
    # Rescale so the weights form a probability vector.
    probabilities = weights / weights.sum()
    return rng.choice(k, size=n, p=probabilities)

In [15]:
import typing
import sys

import pydantic
import langchain_openai
import langchain_core
import tqdm.auto
import pandas as pd

from consol.confidence_models import AbstractConfidenceModel, SbftConfidenceModel, SprtConfidenceModel, PValueConfidenceModel, BayesianConfidenceModel, VoteConfidenceModel

class ConfidentSolverConfig(pydantic.BaseModel):
    """Configuration container for ``ConfidentSolver`` (a pydantic model)."""
    # Upper bound on how many answers a single ``ConfidentSolver.invoke`` call
    # may collect; ``ConfidentSolver.__init__`` fills it in (default 64).
    max_trials: int

class ConfidentSolver:
    """Adaptive majority vote over a stream of pre-computed answers.

    Answers are consumed from the front of a sequence. After each batch the
    counts of the two most frequent answers are handed to a confidence model;
    sampling stops once the model accepts the current counts, the
    ``max_trials`` budget is spent, or the supplied answers run out.
    """

    def __init__(
        self,
        confidence_model: "typing.Union[str, AbstractConfidenceModel]",
        max_trials=64,
    ):
        """Create a solver.

        Args:
            confidence_model: One of the names ``"sbft"``, ``"sprt"``,
                ``"pvalue"``, ``"bayesian"``, ``"vote"`` (a default-constructed
                model of that kind is created), or a ready
                ``AbstractConfidenceModel`` instance.
            max_trials: Maximum number of answers one ``invoke`` call may use.

        Raises:
            ValueError: If ``confidence_model`` is neither a known name nor an
                ``AbstractConfidenceModel`` instance.
        """
        self.config = ConfidentSolverConfig(
            max_trials=max_trials,
        )
        # Name -> model class. Built here (not at class level) so the model
        # classes are only looked up when a solver is actually constructed.
        registry = {
            "sbft": SbftConfidenceModel,
            "sprt": SprtConfidenceModel,
            "pvalue": PValueConfidenceModel,
            "bayesian": BayesianConfidenceModel,
            "vote": VoteConfidenceModel,
        }
        if isinstance(confidence_model, str) and confidence_model in registry:
            self.confidence_model = registry[confidence_model]()
        elif isinstance(confidence_model, AbstractConfidenceModel):
            self.confidence_model = confidence_model
        else:
            raise ValueError(f"Unknown Confidence Model: {confidence_model}")

    def invoke(self, answers, debug=False):
        """Consume ``answers`` front-to-back until the confidence model is satisfied.

        Args:
            answers: Non-empty, indexable, sized collection of answers
                (e.g. a list or 1-D array). Only a prefix of it is used.
            debug: When true, return the consumed answers as a DataFrame
                instead of the summary dict.

        Returns:
            ``{"target", "total_invoke", "total_runs"}`` where ``target`` is the
            most frequent consumed answer (ties resolved by ``Series.mode``
            ordering), ``total_invoke`` is the number of confidence-evaluation
            rounds (including the final round that decided to stop) and
            ``total_runs`` is the number of answers consumed. With
            ``debug=True`` a one-column (``answer``) DataFrame is returned.

        Raises:
            ValueError: If ``answers`` is empty.
        """
        if len(answers) == 0:
            raise ValueError("answers must contain at least one sample")

        max_trials = self.config.max_trials

        # The first answer is always taken so there is something to count.
        collected = [answers[0]]
        cursor = 1  # index of the next unread answer

        total_invoke = 0
        while True:
            total_invoke += 1
            first, second = self._get_top_two_answers(collected)
            trials = self._determine_trials(first, second, max_trials, len(collected))
            # Never read past the end of the supplied answers.
            trials = min(trials, len(answers) - cursor)
            # ``<= 0`` (not ``== 0``) so an exhausted budget can never spin forever.
            if trials <= 0:
                break
            collected.extend(answers[idx] for idx in range(cursor, cursor + trials))
            cursor += trials

        df = self._create_dataframe(collected)
        if debug:
            return df
        return {"target": df['answer'].mode().iloc[0], "total_invoke": total_invoke, "total_runs": len(collected)}

    def _get_top_two_answers(self, total_raw_outputs):
        """Return the counts of the most and second-most frequent answers.

        A missing runner-up (or no answers at all) is reported as a count of 0.
        """
        # value_counts() already orders by descending frequency.
        counts = pd.Series(total_raw_outputs).value_counts().head(2).to_list()
        counts += [0] * (2 - len(counts))
        return counts[0], counts[1]

    def _determine_trials(self, first, second, max_trials, current_trials):
        """Compute how many more answers to draw before re-evaluating.

        Finds the smallest number of extra votes for the leader that would make
        ``self.confidence_model.test(first + extra, second)`` pass, limited to
        the remaining budget ``max_trials - current_trials`` (never negative).

        Returns:
            0 when the model already accepts the current counts or the budget
            is exhausted; otherwise the number of additional answers to draw.
        """
        budget = max(max_trials - current_trials, 0)
        # Anything beyond the budget would be clamped anyway, so stop the
        # search there instead of probing the model further.
        # NOTE(review): assumes ``test`` has no side effects callers rely on.
        for extra in range(budget + 1):
            if first + extra == 0:
                # Do not query the model with zero observations.
                continue
            if self.confidence_model.test(first + extra, second):
                return extra
        return budget

    def _create_dataframe(self, total_raw_outputs):
        """Wrap the consumed answers in a single-column (``answer``) DataFrame."""
        return pd.DataFrame({
            'answer': total_raw_outputs,
        })

In [16]:

# Experiment: random problems with k=2 answer classes.
MAX_EXPR = 10_000  # number of random problems to simulate
MAX_ITER = 64      # answers generated per problem, and the solver's sampling budget

records = []
for i in tqdm.tqdm(range(MAX_EXPR)):
    # MAX_ITER is passed explicitly so the constant above actually controls
    # the run instead of the functions' own defaults.
    problem = generate_random_problem(n=MAX_ITER, k=2)
    # Reference answer: the most frequent label over the full sample.
    answer = np.bincount(problem).argmax()

    for solver_name in ["sbft", "sprt", "pvalue","bayesian"]:
        # Every solver sees the same answer sequence for this problem.
        solver = ConfidentSolver(solver_name, max_trials=MAX_ITER)
        ret = solver.invoke(problem)
        ret["solver"] = solver_name
        ret["answer"] = answer
        # Correct when the early-stopped majority equals the full-sample majority.
        ret["correct"] = ret["answer"] == ret["target"]
        records.append(ret)




100%|██████████| 10000/10000 [00:55<00:00, 180.21it/s]


In [21]:
# Tabulate the current ``records`` list (one dict per problem/solver pair);
# presumably the k=2 run, given the variable name — re-run that cell first.
df_k2 = pd.DataFrame(records)

In [22]:
# Per-solver error rate: 1 minus the fraction of problems whose early-stopped
# target matched the full-sample majority answer.
1-df_k2.groupby("solver")["correct"].mean()

solver
bayesian    0.0342
pvalue      0.0155
sbft        0.0003
sprt        0.0519
Name: correct, dtype: float64

In [29]:
# Per-solver average number of answers consumed before stopping.
df_k2.groupby("solver")["total_runs"].mean()

solver
bayesian    22.1943
pvalue      26.8065
sbft        45.7483
sprt        34.9943
Name: total_runs, dtype: float64

In [23]:

# Experiment: random problems with k=5 answer classes.
MAX_EXPR = 10_000  # number of random problems to simulate
MAX_ITER = 64      # answers generated per problem, and the solver's sampling budget

records = []
for i in tqdm.tqdm(range(MAX_EXPR)):
    # MAX_ITER is passed explicitly so the constant above actually controls
    # the run instead of the functions' own defaults.
    problem = generate_random_problem(n=MAX_ITER, k=5)
    # Reference answer: the most frequent label over the full sample.
    answer = np.bincount(problem).argmax()

    for solver_name in ["sbft", "sprt", "pvalue","bayesian"]:
        # Every solver sees the same answer sequence for this problem.
        solver = ConfidentSolver(solver_name, max_trials=MAX_ITER)
        ret = solver.invoke(problem)
        ret["solver"] = solver_name
        ret["answer"] = answer
        # Correct when the early-stopped majority equals the full-sample majority.
        ret["correct"] = ret["answer"] == ret["target"]
        records.append(ret)




100%|██████████| 10000/10000 [01:30<00:00, 110.05it/s]


In [24]:
# Tabulate the current ``records`` list (one dict per problem/solver pair);
# presumably the k=5 run, given the variable name — re-run that cell first.
df_k5 = pd.DataFrame(records)

In [None]:
# Per-solver error rate: 1 minus the fraction of problems whose early-stopped
# target matched the full-sample majority answer.
1-df_k5.groupby("solver")["correct"].mean()

solver
bayesian    0.0216
pvalue      0.0066
sbft        0.0000
sprt        0.0848
Name: correct, dtype: float64

In [28]:
# Per-solver average number of answers consumed before stopping.
df_k5.groupby("solver")["total_runs"].mean()

solver
bayesian    50.4819
pvalue      54.7092
sbft        63.3170
sprt        58.1474
Name: total_runs, dtype: float64

In [30]:

# Experiment: degenerate case with a single answer class (k=1), so every
# sampled label is 0 and no solver can be wrong.
MAX_EXPR = 10_000  # number of random problems to simulate
MAX_ITER = 64      # answers generated per problem, and the solver's sampling budget

records = []
for i in tqdm.tqdm(range(MAX_EXPR)):
    # MAX_ITER is passed explicitly so the constant above actually controls
    # the run instead of the functions' own defaults.
    problem = generate_random_problem(n=MAX_ITER, k=1)
    # Reference answer: the most frequent label over the full sample.
    answer = np.bincount(problem).argmax()

    for solver_name in ["sbft", "sprt", "pvalue","bayesian"]:
        # Every solver sees the same answer sequence for this problem.
        solver = ConfidentSolver(solver_name, max_trials=MAX_ITER)
        ret = solver.invoke(problem)
        ret["solver"] = solver_name
        ret["answer"] = answer
        # Correct when the early-stopped majority equals the full-sample majority.
        ret["correct"] = ret["answer"] == ret["target"]
        records.append(ret)




100%|██████████| 10000/10000 [00:16<00:00, 620.41it/s]


In [31]:
# Tabulate the current ``records`` list (one dict per problem/solver pair);
# presumably the k=1 run, given the variable name — re-run that cell first.
df_k1 = pd.DataFrame(records)
# Per-solver error rate: 1 minus the fraction of problems whose early-stopped
# target matched the full-sample majority answer.
1-df_k1.groupby("solver")["correct"].mean()

solver
bayesian    0.0
pvalue      0.0
sbft        0.0
sprt        0.0
Name: correct, dtype: float64

In [32]:
# Per-solver average number of answers consumed before stopping.
df_k1.groupby("solver")["total_runs"].mean()

solver
bayesian    4.0
pvalue      5.0
sbft        9.0
sprt        9.0
Name: total_runs, dtype: float64