# Improving Large Language Model Reasoning Accuracy via Multi-Agent Optimization

## VECTRA demo (Transformers-only)

This notebook demonstrates the project thesis: **reasoning accuracy can be improved at inference-time** by using **multi-agent optimization** rather than a single-pass generation.

We compare:
- **Baseline**: one local Transformers generation per question
- **VECTRA**: tool-loop + parallel attempts + early-stop voting (an optimization over multiple candidate trajectories)

The benchmark uses 4 small reasoning suites and runs **without any OpenAI API calls**.

## Constraints
- Model is loaded locally via `transformers` with Hugging Face id: `openai/gpt-oss-20b`.
- If you can't load a 20B model on your machine, set `MODEL_ID` to a smaller model to validate the methodology.

## Setup
Install VECTRA with Transformers support. You must install an appropriate `torch` build for your machine (CPU or CUDA).

This demo is intentionally **Transformers-only** so the results reflect inference-time optimization (multi-attempt orchestration + tool execution), not API/provider effects.

In [None]:
# Bootstrap: use VECTRA directly from GitHub source when this notebook is not run inside the repo.
from __future__ import annotations
import importlib.util
import shutil
import subprocess
import sys
import zipfile
from pathlib import Path

# Where to fetch the VECTRA source when the notebook is not run from inside a checkout.
# REPO_GIT_URL is used for the shallow `git clone`; REPO_ZIP_URL is the fallback archive.
REPO_GIT_URL = "https://github.com/GokulPrasathM/VECTRA.git"
# NOTE: this URL hard-codes the `main` branch; keep it in sync with BRANCH below.
REPO_ZIP_URL = "https://github.com/GokulPrasathM/VECTRA/archive/refs/heads/main.zip"
# Branch passed to `git clone --branch`; also used to build the extracted zip folder name (VECTRA-<BRANCH>).
BRANCH = "main"
# Local directory (under the current working directory) that holds the fetched repo.
CLONE_DIR = Path.cwd() / "_vectra_repo"

def _pip_install(*args: str) -> None:
    """Run ``pip install -q <args>`` with the interpreter executing this notebook.

    The full command line is echoed before it runs.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    pip_cmd = [sys.executable, "-m", "pip", "install", "-q"]
    pip_cmd.extend(args)
    print("Running:", " ".join(pip_cmd))
    subprocess.check_call(pip_cmd)

def _ensure_importable(module: str, pip_spec: str | None = None) -> None:
    """Install *module* with pip unless an import spec for it can already be found.

    Args:
        module: import name to probe with ``importlib.util.find_spec``.
        pip_spec: requirement string handed to pip; falls back to *module*
            when omitted or empty.
    """
    already_available = importlib.util.find_spec(module) is not None
    if not already_available:
        requirement = pip_spec or module
        _pip_install(requirement)

def _ensure_repo_on_syspath() -> Path:
    """Locate (or fetch) the VECTRA sources and put their ``src/`` dir on ``sys.path``.

    Resolution order:
      1. the current working directory, if it contains ``src/vectra``;
      2. a previously fetched copy at ``CLONE_DIR``;
      3. a fresh shallow ``git clone`` of ``BRANCH`` into ``CLONE_DIR``;
      4. if the clone fails for any reason, the GitHub zip archive.

    Returns:
        The repository root whose ``src`` directory is on ``sys.path``.

    Raises:
        RuntimeError: if the zip fallback ran but did not produce
            ``CLONE_DIR/src/vectra``.
        Download and extraction errors from ``urllib`` / ``zipfile``
        propagate unchanged.
    """
    here = Path.cwd()
    if (here / "src" / "vectra").exists():
        # The notebook is already running from inside the repo.
        repo_root = here
    elif (CLONE_DIR / "src" / "vectra").exists():
        # Reuse the copy fetched by an earlier run.
        repo_root = CLONE_DIR
    else:
        # Try git clone; fall back to zip download if git isn't available.
        shutil.rmtree(CLONE_DIR, ignore_errors=True)
        try:
            subprocess.check_call(
                ["git", "--version"],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            subprocess.check_call([
                "git",
                "clone",
                "--depth",
                "1",
                "--branch",
                BRANCH,
                REPO_GIT_URL,
                str(CLONE_DIR),
            ])
        except Exception as clone_err:  # noqa: BLE001 - any clone failure triggers the fallback
            import urllib.request

            print(f"git clone failed ({clone_err!r}); falling back to zip download.")
            # A failed clone can leave a partial directory behind; renaming the
            # extracted folder onto it would fail, so clear it first.
            shutil.rmtree(CLONE_DIR, ignore_errors=True)
            zip_path = here / "_vectra_repo.zip"
            print("Downloading:", REPO_ZIP_URL)
            try:
                urllib.request.urlretrieve(REPO_ZIP_URL, zip_path)
                with zipfile.ZipFile(zip_path) as zf:
                    zf.extractall(here)
            finally:
                # Remove the archive even when the download or extraction fails.
                zip_path.unlink(missing_ok=True)
            # GitHub archives unpack into "<repo>-<branch>".
            extracted = here / f"VECTRA-{BRANCH}"
            if extracted.exists():
                extracted.rename(CLONE_DIR)
            if not (CLONE_DIR / "src" / "vectra").exists():
                # Fail here rather than with a confusing ImportError later.
                raise RuntimeError(
                    f"Could not obtain VECTRA sources: expected {CLONE_DIR / 'src' / 'vectra'} "
                    f"after extracting {REPO_ZIP_URL}"
                ) from clone_err
        repo_root = CLONE_DIR
    src_dir = repo_root / "src"
    if str(src_dir) not in sys.path:
        # Front of the path so this copy wins over any installed vectra.
        sys.path.insert(0, str(src_dir))
    return repo_root

# Ensure non-torch deps are present. (Torch install is environment-specific on Kaggle.)
# Each call is a no-op when the module can already be found; otherwise it pip-installs
# the given requirement (or the module name when no requirement is given).
_ensure_importable("httpx", "httpx>=0.26")
_ensure_importable("datasets")
_ensure_importable("transformers")
_ensure_importable("accelerate")
# Fetch VECTRA if necessary and put its `src/` directory at the front of sys.path.
repo_root = _ensure_repo_on_syspath()
print("Using VECTRA source from:", repo_root.resolve())

In [None]:
# Dependencies:
# - This notebook now bootstraps VECTRA from GitHub source (see the cell above)
# - You still need an appropriate `torch` build for your machine (CPU or CUDA).
# If you're running locally and want to manage deps manually, you can use:
# %pip install -q datasets transformers accelerate httpx
# %pip install torch  # choose the right wheel/index-url for your environment
import json
import math
import re
import statistics
import time
from vectra import (
    ScenarioSolveConfig,
    TransformersClient,
    TransformersClientConfig,
    solve_scenario_async,
)
from vectra.scenario.attempts import AttemptConfig
from vectra.tools.tool_loop import ToolLoopConfig
from vectra.types import ChatMessage

In [None]:
# Model + generation settings
# Hugging Face model id handed to TransformersClientConfig below.
MODEL_ID = 'openai/gpt-oss-20b'
# Sampling temperature; passed per call by the baseline (`client.chat`) and to VECTRA's ToolLoopConfig.
TEMPERATURE = 0.2
# Generation length cap; given to the client config and again to each baseline `client.chat` call.
MAX_NEW_TOKENS = 256

# Create ONE shared local Transformers client so we do not load 20B weights twice.
# NOTE(review): `device_map` / `torch_dtype` are presumably forwarded to the underlying
# transformers model loader - confirm against TransformersClientConfig.
client = TransformersClient(
    TransformersClientConfig(
        model_id=MODEL_ID,
        max_new_tokens=MAX_NEW_TOKENS,
        device_map='auto',
        torch_dtype='auto',
    )
)

print('Loaded Transformers model:', MODEL_ID)

In [None]:
# Real reasoning benchmarks (sampled) via Hugging Face `datasets`
#
# Benchmarks used (commonly reported for reasoning evaluation):
# - GSM8K (grade school math word problems)
# - SVAMP (math word problems with simple arithmetic)
# - MATH500 (subset of MATH; competition-style math problems)
# - CommonsenseQA (multiple-choice commonsense reasoning)
#
# NOTE: This notebook still runs Transformers-only (no OpenAI API).
# It may download datasets from Hugging Face the first time you run it.

import random

# Number of items drawn from each benchmark (capped at the dataset size by _sample_indices).
SAMPLES_PER_SUITE = 25  # increase for a more reliable estimate
# Base seed for sampling; load_real_benchmarks adds a different offset for each suite.
RANDOM_SEED = 7

def _norm(s: str) -> str:
    """Trim *s* and collapse each whitespace run to one space (``None`` -> ``''``)."""
    trimmed = (s or '').strip()
    return re.sub(r'\s+', ' ', trimmed)

def _norm_compact(s: str) -> str:
    """Lower-case *s* and delete all whitespace (``None`` -> ``''``).

    Used for whitespace- and case-insensitive exact matching.
    """
    lowered = (s or '').strip().lower()
    return re.sub(r'\s+', '', lowered)

def extract_final(text: str) -> str:
    """Pull the answer out of a ``FINAL: <answer>`` line in model output.

    The marker is matched case-insensitively and per line, so text after the
    ``FINAL:`` line no longer prevents extraction. When several lines carry
    the marker, the last one is used.

    Args:
        text: raw model output; ``None`` or empty yields ``''``.

    Returns:
        The stripped answer following the last ``FINAL:`` marker, or the
        whole stripped text when no marker is present.
    """
    if not text:
        return ''
    stripped = text.strip()
    # MULTILINE makes `$` match at every line end; without it, any text after
    # the FINAL line made the match fail and the whole output was returned.
    answers = re.findall(
        r'FINAL\s*:\s*(.+)$',
        stripped,
        flags=re.IGNORECASE | re.MULTILINE,
    )
    if answers:
        return answers[-1].strip()
    return stripped

def _normalize_numeric(s: str) -> str:
    """Reduce *s* to a canonical string for its first number.

    Whitespace and thousands separators are removed, the first integer or
    decimal literal is taken, and insignificant trailing zeros are dropped so
    that equal values written differently ("18", "18.0", "18.00") compare
    equal.

    Args:
        s: prediction or reference text; ``None`` is treated as ``''``.

    Returns:
        The canonical number string, or the compacted lower-cased input when
        it contains no digits.
    """
    # Same compaction as `_norm_compact` (lower-case, no whitespace), inlined
    # so this helper stands on its own.
    compact = re.sub(r'\s+', '', (s or '').strip().lower()).replace(',', '')
    found = re.search(r'-?\d+(?:\.\d+)?', compact)
    if found is None:
        return compact
    number = found.group(0)
    if '.' in number:
        # "18.0" -> "18", "0.50" -> "0.5": trailing zeros carry no value.
        number = number.rstrip('0').rstrip('.')
    if number == '-0':
        number = '0'
    return number

def _normalize_choice_letter(s: str) -> str:
    """Map an answer string to a single upper-case choice letter.

    Prefers the first standalone ``A``-``E`` token in the upper-cased text;
    otherwise falls back to the first character of the text (``''`` for
    empty or ``None`` input).
    """
    upper = (s or '').strip().upper()
    standalone = re.search(r'\b([A-E])\b', upper)
    return standalone.group(1) if standalone else upper[:1]

def is_correct(pred: str, ref: str, *, kind: str) -> bool:
    """Return whether *pred* matches *ref* under the comparison chosen by *kind*.

    ``'numeric'`` compares normalized numbers, ``'choice'`` compares choice
    letters, and any other value compares the compact (lower-cased,
    whitespace-free) strings.
    """
    if kind == 'numeric':
        normalize = _normalize_numeric
    elif kind == 'choice':
        normalize = _normalize_choice_letter
    else:
        normalize = _norm_compact
    return normalize(pred) == normalize(ref)

def _require_load_dataset():
    """Return ``datasets.load_dataset``, or raise a helpful error if it cannot be imported.

    Raises:
        RuntimeError: if the import fails for any reason; the original
            exception is chained as the cause.
    """
    try:
        from datasets import load_dataset as loader  # type: ignore
    except Exception as import_error:  # noqa: BLE001
        message = "Missing dependency: datasets. Install with: %pip install datasets"
        raise RuntimeError(message) from import_error
    else:
        return loader

def _sample_indices(n_total: int, n_sample: int, seed: int) -> list[int]:
    """Draw up to *n_sample* distinct indices from ``range(n_total)``.

    A private ``random.Random(seed)`` is used, so the result is reproducible
    and does not touch the global random state. The sample size is capped at
    *n_total*.
    """
    population = range(n_total)
    size = min(int(n_sample), int(n_total))
    return random.Random(seed).sample(population, size)

def _parse_gsm8k_reference(ans: str) -> str:
    """Extract the gold answer from a GSM8K solution string.

    GSM8K answers typically end with ``#### <number>``; the text after the
    last ``####`` is returned. Without the marker, the whole string is
    returned. ``None`` is treated as an empty string instead of raising.

    Returns:
        The stripped answer text.
    """
    text = ans or ''
    # rpartition splits on the LAST marker; `marker` is '' when none is present.
    _, marker, tail = text.rpartition('####')
    return (tail if marker else text).strip()

def _parse_math_reference(ans: str) -> str:
    r"""Extract the final answer from a MATH-style solution.

    MATH solutions usually wrap the final answer in ``\boxed{...}``. The
    braces are matched by depth, so nested groups such as
    ``\boxed{\frac{1}{2}}`` are returned in full rather than cut at the first
    closing brace.

    Lookup order:
      1. the last balanced ``\boxed{...}`` group;
      2. the rest of the line after the last ``\boxed`` (loose variants such
         as ``\boxed 7`` or an unbalanced group);
      3. the whole stripped text.

    Args:
        ans: solution text; ``None`` or empty yields ``''``.
    """
    if not ans:
        return ''

    opener = '\\boxed{'
    boxed = []
    start = ans.find(opener)
    while start != -1:
        body_start = start + len(opener)
        depth = 1
        pos = body_start
        # Walk forward until the brace opened by `\boxed{` is closed.
        while pos < len(ans) and depth:
            ch = ans[pos]
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
            pos += 1
        if depth == 0:
            boxed.append(ans[body_start:pos - 1])
            resume = pos
        else:
            # Unbalanced group: skip it and keep looking for later ones.
            resume = body_start
        start = ans.find(opener, resume)
    if boxed:
        return boxed[-1].strip()

    # fallback: also handle \boxed <...> variants loosely
    loose = re.findall(r'\\boxed\s*([^\n\r]+)', ans)
    if loose:
        return loose[-1].strip()
    return ans.strip()

def _format_freeform_problem(question: str) -> str:
    """Build the prompt for a free-form question.

    The question is whitespace-normalized and followed, after a blank line,
    by the instruction to answer with a single ``FINAL:`` line.
    """
    instruction = "Return exactly one line: FINAL: <answer>."
    return f"{_norm(question)}\n\n{instruction}"

def _format_mc_problem(stem: str, choices: list[tuple[str, str]]) -> str:
    """Build the prompt for a multiple-choice question.

    Args:
        stem: the question text (whitespace-normalized).
        choices: ``(label, text)`` pairs, rendered one per line as
            ``"<label>. <text>"`` under a ``Choices:`` header.

    Returns:
        The prompt, ending with the instruction to answer with a single
        ``FINAL: <choice-letter>`` line.
    """
    parts = [_norm(stem), "", "Choices:"]
    parts.extend(f"{label}. {_norm(text)}" for label, text in choices)
    parts += [
        "",
        "Return exactly one line: FINAL: <choice-letter>. Example: FINAL: C",
    ]
    return "\n".join(parts)

def _load_math500(load_dataset):
    """Load the MATH-500 test split from the first dataset id that works.

    Args:
        load_dataset: callable with the ``datasets.load_dataset`` calling
            convention (``name[, subset], split=...``).

    Returns:
        Whatever *load_dataset* returns for the first candidate that loads.

    Raises:
        RuntimeError: if every candidate fails; the last underlying error is
            chained as ``__cause__`` so its traceback is not lost.
    """
    # Prefer a stable MATH-500 dataset id; keep fallbacks for portability.
    candidates = [
        ('HuggingFaceH4/MATH-500', None),
        ('lighteval/MATH-500', None),
    ]
    last_err = None
    for name, subset in candidates:
        # Only pass a config/subset name when the candidate specifies one.
        positional = (name,) if subset is None else (name, subset)
        try:
            return load_dataset(*positional, split='test')
        except Exception as e:  # noqa: BLE001
            last_err = e
    raise RuntimeError(
        f"Failed to load MATH500 from {candidates}: {last_err}"
    ) from last_err

def load_real_benchmarks(*, samples_per_suite: int, seed: int) -> dict[str, list[dict]]:
    """Load four reasoning datasets and sample prompt/reference items from each.

    Every item is a dict with the keys ``id``, ``problem`` (prompt text
    including the ``FINAL:`` instruction), ``reference`` (gold answer as a
    string) and ``kind`` (``'numeric'``, ``'text'`` or ``'choice'``).

    Args:
        samples_per_suite: maximum number of items drawn from each dataset.
        seed: base random seed; the four suites sample with ``seed + 1``
            through ``seed + 4``.

    Returns:
        Mapping from suite name (``GSM8K``, ``SVAMP``, ``MATH500``,
        ``CommonsenseQA``) to its list of items.

    Raises:
        RuntimeError: if ``datasets`` cannot be imported or no MATH-500
            source can be loaded.
    """
    load_dataset = _require_load_dataset()
    suites: dict[str, list[dict]] = {}

    # Suite 1: GSM8K, test split; the gold answer follows the '####' marker.
    gsm_ds = load_dataset('gsm8k', 'main', split='test')
    gsm_rows = []
    for idx in _sample_indices(len(gsm_ds), samples_per_suite, seed + 1):
        record = gsm_ds[int(idx)]
        question = record.get('question', '')
        gold = _parse_gsm8k_reference(record.get('answer', ''))
        gsm_rows.append(
            dict(
                id=f"gsm8k:test:{int(idx)}",
                problem=_format_freeform_problem(question),
                reference=str(gold),
                kind='numeric',
            )
        )
    suites['GSM8K'] = gsm_rows

    # Suite 2: SVAMP, test split; field names are tried in both capitalizations.
    svamp_ds = load_dataset('svamp', split='test')
    svamp_rows = []
    for idx in _sample_indices(len(svamp_ds), samples_per_suite, seed + 2):
        record = svamp_ds[int(idx)]
        body = record.get('Body', '') or record.get('body', '')
        question = record.get('Question', '') or record.get('question', '')
        gold = record.get('Answer', None)
        if gold is None:
            gold = record.get('answer', '')
        prompt_text = f"{str(body).strip()}\n{str(question).strip()}".strip()
        svamp_rows.append(
            dict(
                id=f"svamp:test:{int(idx)}",
                problem=_format_freeform_problem(prompt_text),
                reference=str(gold),
                kind='numeric',
            )
        )
    suites['SVAMP'] = svamp_rows

    # Suite 3: MATH500 (competition-style math; free-form answer).
    math_ds = _load_math500(load_dataset)
    math_rows = []
    for idx in _sample_indices(len(math_ds), samples_per_suite, seed + 3):
        record = math_ds[int(idx)]
        # Common fields: 'problem', 'solution', sometimes 'answer'
        question = record.get('problem', '') or record.get('question', '')
        gold = record.get('answer', None)
        if gold is None:
            solution_text = record.get('solution', '') or record.get('final_answer', '') or ''
            gold = _parse_math_reference(solution_text)
        math_rows.append(
            dict(
                id=f"math500:test:{int(idx)}",
                problem=_format_freeform_problem(str(question)),
                reference=str(gold),
                # Scoring MATH exactly is non-trivial; we use normalized exact match as a strict baseline.
                kind='text',
            )
        )
    suites['MATH500'] = math_rows

    # Suite 4: CommonsenseQA, validation split; multiple choice.
    csqa_ds = load_dataset('commonsense_qa', split='validation')
    csqa_rows = []
    for idx in _sample_indices(len(csqa_ds), samples_per_suite, seed + 4):
        record = csqa_ds[int(idx)]
        stem = record.get('question', '')
        choice_block = record.get('choices', {}) or {}
        labels = choice_block.get('label', []) or []
        texts = choice_block.get('text', []) or []
        # strict=False: silently truncate if labels and texts differ in length.
        options = list(zip(labels, texts, strict=False))
        gold = record.get('answerKey', '')
        csqa_rows.append(
            dict(
                id=f"commonsense_qa:val:{int(idx)}",
                problem=_format_mc_problem(stem, options),
                reference=str(gold),
                kind='choice',
            )
        )
    suites['CommonsenseQA'] = csqa_rows

    return suites

# Build the sampled benchmark items once, when this cell runs (may download datasets).
BENCHMARKS = load_real_benchmarks(samples_per_suite=SAMPLES_PER_SUITE, seed=RANDOM_SEED)
# Sanity check: number of items per suite.
print({k: len(v) for k, v in BENCHMARKS.items()})

In [None]:
# Baseline: single-pass local Transformers inference (no tools, no parallel attempts)
# System prompt for the baseline: forbids tool calls and asks for the same
# `FINAL: <answer>` line that extract_final() parses.
BASELINE_SYSTEM = (
    'You are a careful reasoner. Do not call tools. '
    'Return exactly one line: FINAL: <answer>.'
)

async def baseline_answer(problem: str) -> str:
    """Answer *problem* with a single generation from the shared client.

    Sends the baseline system prompt plus the problem as one chat request
    (``n=1``) and returns the answer extracted from the first completion.
    """
    system_msg = ChatMessage(role='system', content=BASELINE_SYSTEM)
    user_msg = ChatMessage(role='user', content=problem)
    completions = await client.chat(
        [system_msg, user_msg],
        temperature=TEMPERATURE,
        max_tokens=MAX_NEW_TOKENS,
        n=1,
    )
    # NOTE(review): assumes chat() returns a sequence of generated texts - confirm.
    return extract_final(completions[0])

async def baseline_run_suite(items):
    """Run the single-pass baseline over *items* sequentially and score each one.

    Args:
        items: iterable of benchmark dicts with ``id``, ``problem``,
            ``reference`` and optionally ``kind`` (defaults to ``'text'``).

    Returns:
        One row per item with the keys ``id``, ``pred``, ``ref``, ``kind``,
        ``correct`` and ``latency_s`` (seconds spent in ``baseline_answer``).
    """
    rows = []
    for it in items:
        kind = it.get('kind', 'text')
        # perf_counter is monotonic; time.time() can jump when the system
        # clock is adjusted, which would corrupt latency measurements.
        started = time.perf_counter()
        pred = await baseline_answer(it['problem'])
        elapsed = time.perf_counter() - started
        rows.append({
            'id': it['id'],
            'pred': pred,
            'ref': it['reference'],
            'kind': kind,
            'correct': is_correct(pred, it['reference'], kind=kind),
            'latency_s': elapsed,
        })
    return rows

In [None]:
# VECTRA function-calling: tool loop + parallel attempts + early-stop consensus
# Passed to AttemptConfig as both `attempts` and `max_concurrency` in vectra_answer().
VECTRA_ATTEMPTS = 4
# Passed as AttemptConfig.early_stop; presumably the number of agreeing answers
# that ends the voting early - TODO confirm against AttemptConfig.
VECTRA_EARLY_STOP = 2
# Passed as ToolLoopConfig.max_turns.
VECTRA_MAX_TURNS = 16

# Example tool call, serialized once and embedded in the system prompt below.
tool_example = json.dumps({'tool': 'python', 'code': 'print(2+2)'})
# System prompt describing the JSON tool-call format and the FINAL: termination line.
VECTRA_SYSTEM = (
    'You may use an external Python tool. '
    f'To call it, output ONLY a JSON object like: {tool_example}. '
    'To finish, output: FINAL: <your answer>.'
)

async def vectra_answer(problem: str) -> str:
    """Answer *problem* with VECTRA's multi-attempt scenario solver.

    Builds the tool-loop, attempt and scenario configs from the notebook
    constants, runs ``solve_scenario_async`` on the shared client, and returns
    the result's answer after passing it through ``extract_final``.
    """
    tool_loop_cfg = ToolLoopConfig(
        max_turns=VECTRA_MAX_TURNS,
        temperature=TEMPERATURE,
    )
    attempt_cfg = AttemptConfig(
        attempts=VECTRA_ATTEMPTS,
        early_stop=VECTRA_EARLY_STOP,
        max_concurrency=VECTRA_ATTEMPTS,
        tool_loop=tool_loop_cfg,
    )
    solve_cfg = ScenarioSolveConfig(
        transformers_client=client,
        system_prompt=VECTRA_SYSTEM,
        attempts=attempt_cfg,
        return_trace=False,
    )
    result = await solve_scenario_async(problem, solve_cfg)
    # NOTE(review): assumes result.answer is a bare answer string without its
    # own "FINAL:" prefix - confirm against solve_scenario_async.
    return extract_final('FINAL: ' + result.answer)

async def vectra_run_suite(items):
    """Run the VECTRA solver over *items* sequentially and score each one.

    Args:
        items: iterable of benchmark dicts with ``id``, ``problem``,
            ``reference`` and optionally ``kind`` (defaults to ``'text'``).

    Returns:
        One row per item with the keys ``id``, ``pred``, ``ref``, ``kind``,
        ``correct`` and ``latency_s`` (seconds spent in ``vectra_answer``).
    """
    rows = []
    for it in items:
        kind = it.get('kind', 'text')
        # perf_counter is monotonic; time.time() can jump when the system
        # clock is adjusted, which would corrupt latency measurements.
        started = time.perf_counter()
        pred = await vectra_answer(it['problem'])
        elapsed = time.perf_counter() - started
        rows.append({
            'id': it['id'],
            'pred': pred,
            'ref': it['reference'],
            'kind': kind,
            'correct': is_correct(pred, it['reference'], kind=kind),
            'latency_s': elapsed,
        })
    return rows

In [None]:
def summarize(rows):
    """Aggregate result rows into count, accuracy and latency statistics.

    Args:
        rows: list of dicts carrying ``correct`` (truthy/falsy) and
            ``latency_s`` (seconds).

    Returns:
        Dict with ``n``, ``accuracy`` (0.0 for empty input),
        ``avg_latency_s`` and ``p95_latency_s`` (nearest-rank 95th
        percentile); both latencies are 0.0 when there are no rows.
    """
    total = len(rows)
    n_correct = sum(1 for row in rows if row['correct'])
    accuracy = n_correct / max(1, total)  # max() guards the empty case
    latencies = [row['latency_s'] for row in rows]

    avg_latency = 0.0
    p95_latency = 0.0
    if latencies:
        ordered = sorted(latencies)
        # Nearest-rank percentile: 1-based rank ceil(0.95 * N), as a 0-based index.
        rank = max(0, math.ceil(0.95 * len(ordered)) - 1)
        p95_latency = ordered[rank]
        avg_latency = statistics.mean(latencies)

    return {
        'n': total,
        'accuracy': accuracy,
        'avg_latency_s': avg_latency,
        'p95_latency_s': p95_latency,
    }

def print_table(rows):
    """Print summary rows as a tab-separated table with a header line.

    ``suite``, ``mode`` and ``n`` are printed as-is; ``accuracy`` and the two
    latency columns are formatted with three decimals.
    """
    columns = ['suite', 'mode', 'n', 'accuracy', 'avg_latency_s', 'p95_latency_s']
    plain_cols, float_cols = columns[:3], columns[3:]
    print('\t'.join(columns))
    for row in rows:
        cells = [str(row[col]) for col in plain_cols]
        cells += [f"{row[col]:.3f}" for col in float_cols]
        print('\t'.join(cells))

async def run_all():
    """Evaluate baseline and VECTRA on every suite and print the reports.

    All baseline suites run first, then all VECTRA suites. Prints a per-suite
    table, an aggregate table over all questions, the VECTRA-minus-baseline
    deltas, and a paste-ready text summary. Returns nothing.
    """
    baseline_all = {}
    vectra_all = {}

    for suite, items in BENCHMARKS.items():
        print(f"Baseline: {suite}")
        baseline_all[suite] = await baseline_run_suite(items)

    for suite, items in BENCHMARKS.items():
        print(f"VECTRA: {suite}")
        vectra_all[suite] = await vectra_run_suite(items)

    summary = []
    for suite in BENCHMARKS:
        summary.append({'suite': suite, 'mode': 'baseline', **summarize(baseline_all[suite])})
        summary.append({'suite': suite, 'mode': 'vectra', **summarize(vectra_all[suite])})

    print('\nPer-suite summary')
    print('---------------')
    print_table(summary)

    # Aggregate over the individual questions, not over per-suite averages.
    baseline_flat = [r for v in baseline_all.values() for r in v]
    vectra_flat = [r for v in vectra_all.values() for r in v]

    agg = [
        {'suite': 'ALL', 'mode': 'baseline', **summarize(baseline_flat)},
        {'suite': 'ALL', 'mode': 'vectra', **summarize(vectra_flat)},
    ]

    print('\nAggregate summary')
    print('-----------------')
    print_table(agg)

    acc_delta = agg[1]['accuracy'] - agg[0]['accuracy']
    lat_delta = agg[1]['avg_latency_s'] - agg[0]['avg_latency_s']

    print('\nImpact (VECTRA - Baseline)')
    print('--------------------------')
    print('Accuracy delta:', f"{acc_delta:+.3f}")
    print('Avg latency delta (s):', f"{lat_delta:+.3f}")

    print('\nPaste-ready summary')
    print('------------------')
    # Joined into ONE string: passing the lines as separate print() arguments
    # put the default ' ' separator in front of every "Method notes" line.
    summary_lines = [
        f"Model: {MODEL_ID} (Transformers local inference)",
        "Benchmarks: GSM8K, SVAMP, MATH500, CommonsenseQA (sampled subsets).",
        f"Scope: {agg[0]['n']} questions total (SAMPLES_PER_SUITE={SAMPLES_PER_SUITE}).",
        f"Baseline (single-pass): accuracy={agg[0]['accuracy']:.3f}, avg latency={agg[0]['avg_latency_s']:.3f}s.",
        f"VECTRA (tools + parallel attempts): accuracy={agg[1]['accuracy']:.3f}, avg latency={agg[1]['avg_latency_s']:.3f}s.",
        f"Net impact: accuracy delta={acc_delta:+.3f}, latency delta={lat_delta:+.3f}s.",
        "",
        "Method notes:",
        "- Baseline uses one local generation per question.",
        "- VECTRA runs multiple attempts and can execute Python when requested via a strict JSON tool protocol.",
        "- Early-stop consensus can reduce extra attempts when answers match.",
        "- MATH500 scoring here uses strict normalized string match; for papers, consider a domain-aware grader.",
    ]
    print("\n".join(summary_lines))

# Top-level `await` works in Jupyter/IPython cells; in a plain Python script
# use `asyncio.run(run_all())` instead.
await run_all()