# Architecture A — single-agent baseline

Questa notebook prepara l'ambiente su Colab, clona la repo, configura i modelli HuggingFace e lancia il grafo per l'architettura **A** (single-agent → tester). Salva i log in `log/`.

Note operative:
- Imposta un token HuggingFace valido (HF Inference API) quando richiesto.
- Lancia pochi task della APPS per evitare costi/tempo eccessivi.
- I log sono salvati sia su stdout sia in file JSONL/LOG per analisi successive.

In [1]:
import os
import sys
import subprocess
import pathlib

REPO_URL = "https://github.com/LLM4SE-group-15/ArchitecturesForCodeDevelopmentWithLLMs.git"
REPO_DIR = pathlib.Path("/content/ArchitecturesForCodeDevelopmentWithLLMs")

if not REPO_DIR.exists():
    subprocess.run(["git", "clone", REPO_URL, str(REPO_DIR)], check=True)

os.chdir(REPO_DIR)
subprocess.run([sys.executable, "-m", "pip", "install", "-U", "pip"], check=True)
subprocess.run([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"], check=True)

print(f"Using repo at {REPO_DIR.resolve()}")

Using repo at /content/ArchitecturesForCodeDevelopmentWithLLMs


In [2]:
!pip install -r requirements.txt



In [3]:
import os
import getpass
from huggingface_hub import login

# Forza sempre l'uso del token che inserisci
os.environ["HF_TOKEN"] = getpass.getpass("Enter HF_TOKEN (kept hidden): ")

login(token=os.environ["HF_TOKEN"], add_to_git_credential=False)

os.environ["ARCHITECTURE"] = "A"
print("ARCHITECTURE set to", os.environ["ARCHITECTURE"])

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


ARCHITECTURE set to A


In [4]:
from huggingface_hub import HfApi

api = HfApi()
try:
    user_info = api.whoami(token=os.environ["HF_TOKEN"])
    print("Logged in to Hugging Face as:", user_info.get("name") or user_info.get("user"))
except Exception as exc:
    print("Login check failed:", exc)

Logged in to Hugging Face as: Riaburger


In [5]:
import json
import time
import logging
import pathlib
import os

from datetime import datetime

DEFAULT_ROOT = pathlib.Path("/content/ArchitecturesForCodeDevelopmentWithLLMs")
ROOT = DEFAULT_ROOT if DEFAULT_ROOT.exists() else pathlib.Path.cwd()

sys.path.insert(0, str(ROOT))

LOG_DIR = ROOT / "log"
LOG_DIR.mkdir(exist_ok=True)

logger = logging.getLogger("architecture_A")
logger.setLevel(logging.INFO)
if logger.handlers:
    logger.handlers.clear()
formatter = logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
file_handler = logging.FileHandler(LOG_DIR / "architecture_A.log")
stream_handler = logging.StreamHandler()
for handler in (file_handler, stream_handler):
    handler.setFormatter(formatter)
    logger.addHandler(handler)

logger.info("Logger ready. Repo root: %s", ROOT)
logger.info("Log files: %s", LOG_DIR)
print("Logs ->", LOG_DIR)

2026-01-03 17:18:33,134 | INFO | Logger ready. Repo root: /content/ArchitecturesForCodeDevelopmentWithLLMs
INFO:architecture_A:Logger ready. Repo root: /content/ArchitecturesForCodeDevelopmentWithLLMs
2026-01-03 17:18:33,136 | INFO | Log files: /content/ArchitecturesForCodeDevelopmentWithLLMs/log
INFO:architecture_A:Log files: /content/ArchitecturesForCodeDevelopmentWithLLMs/log


Logs -> /content/ArchitecturesForCodeDevelopmentWithLLMs/log


In [6]:
import time
from src.data.task_loader import APPSTaskLoader
from src.graph.graph import run_graph
from src.agents.llm import Architecture

ARCH = Architecture.A

def run_sample_tasks(
    per_level: int = 10,
    split: str = "test",
    difficulties: tuple[str, ...] = ("introductory",),
    shuffle: bool = True,
):
    loader = APPSTaskLoader(split=split)
    if shuffle:
        loader._dataset = loader.dataset.shuffle(seed=int(time.time()))

    tasks = []
    for diff in difficulties:
        tasks.extend(loader.load_by_difficulty(diff, limit=per_level))

    results = []
    total = len(tasks)
    logger.info("Loaded %s tasks (%s per difficulty: %s)", total, per_level, ", ".join(difficulties))

    for idx, task in enumerate(tasks, 1):
        logger.info("Running %s/%s %s (%s)", idx, total, task.task_id, task.difficulty)
        start = time.time()
        state = run_graph(
            task_id=task.task_id,
            task_description=task.question,
            test_inputs=task.inputs,
            test_outputs=task.outputs,
            architecture=ARCH,
        )

        print("State:", state)
        print()

        elapsed = time.time() - start
        record = {
            "task_id": task.task_id,
            "difficulty": task.difficulty,
            "architecture": str(ARCH.value),
            "test_passed": state["test_passed"],
            "developer_tier": state.get("developer_tier"),
            "escalations": state["escalations"],
            "story_points_initial": state.get("story_points_initial"),
            "story_points_final": state.get("story_points_current"),
            "elapsed_seconds": elapsed,
        }
        results.append(record)
        logger.info(
            "Finished %s | pass=%s tier=%s escalations=%s elapsed=%.1fs",
            task.task_id,
            state["test_passed"],
            record["developer_tier"],
            record["escalations"],
            elapsed,
        )
        with open(LOG_DIR / "architecture_A.jsonl", "a", encoding="utf-8") as f:
            f.write(json.dumps(record) + "\n")
    return results

sample_results = run_sample_tasks(per_level=10, difficulties=("introductory",), shuffle=True)
sample_results

Loading APPS dataset (test split)...


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


Downloading builder script:   0%|          | 0.00/4.95k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.63k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/107M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.29G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

2026-01-03 17:19:22,724 | INFO | Loaded 10 tasks (10 per difficulty: introductory)
INFO:architecture_A:Loaded 10 tasks (10 per difficulty: introductory)
2026-01-03 17:19:22,725 | INFO | Running 1/10 apps_4590 (introductory)
INFO:architecture_A:Running 1/10 apps_4590 (introductory)


Loaded 5000 tasks.


2026-01-03 17:19:28,316 | INFO | Finished apps_4590 | pass=False tier=None escalations=0 elapsed=5.6s
INFO:architecture_A:Finished apps_4590 | pass=False tier=None escalations=0 elapsed=5.6s
2026-01-03 17:19:28,319 | INFO | Running 2/10 apps_4573 (introductory)
INFO:architecture_A:Running 2/10 apps_4573 (introductory)


State: {'task_id': 'apps_4590', 'task_description': 'We have two desks: A and B. Desk A has a vertical stack of N books on it, and Desk B similarly has M books on it.\nIt takes us A_i minutes to read the i-th book from the top on Desk A (1 \\leq i \\leq N), and B_i minutes to read the i-th book from the top on Desk B (1 \\leq i \\leq M).\nConsider the following action:\n - Choose a desk with a book remaining, read the topmost book on that desk, and remove it from the desk.\nHow many books can we read at most by repeating this action so that it takes us at most K minutes in total? We ignore the time it takes to do anything other than reading.\n\n-----Constraints-----\n - 1 \\leq N, M \\leq 200000\n - 1 \\leq K \\leq 10^9\n - 1 \\leq A_i, B_i \\leq 10^9\n - All values in input are integers.\n\n-----Input-----\nInput is given from Standard Input in the following format:\nN M K\nA_1 A_2 \\ldots A_N\nB_1 B_2 \\ldots B_M\n\n-----Output-----\nPrint an integer representing the maximum number o

2026-01-03 17:19:35,920 | INFO | Finished apps_4573 | pass=False tier=None escalations=0 elapsed=7.6s
INFO:architecture_A:Finished apps_4573 | pass=False tier=None escalations=0 elapsed=7.6s
2026-01-03 17:19:35,923 | INFO | Running 3/10 apps_4713 (introductory)
INFO:architecture_A:Running 3/10 apps_4713 (introductory)


State: {'task_id': 'apps_4573', 'task_description': 'When l is an odd number, the median of l numbers a_1, a_2, ..., a_l is the (\\frac{l+1}{2})-th largest value among a_1, a_2, ..., a_l.\nYou are given N numbers X_1, X_2, ..., X_N, where N is an even number.\nFor each i = 1, 2, ..., N, let the median of X_1, X_2, ..., X_N excluding X_i, that is, the median of X_1, X_2, ..., X_{i-1}, X_{i+1}, ..., X_N be B_i.\nFind B_i for each i = 1, 2, ..., N.\n\n-----Constraints-----\n - 2 \\leq N \\leq 200000\n - N is even.\n - 1 \\leq X_i \\leq 10^9\n - All values in input are integers.\n\n-----Input-----\nInput is given from Standard Input in the following format:\nN\nX_1 X_2 ... X_N\n\n-----Output-----\nPrint N lines.\nThe i-th line should contain B_i.\n\n-----Sample Input-----\n4\n2 4 4 3\n\n-----Sample Output-----\n4\n3\n3\n4\n\n - Since the median of X_2, X_3, X_4 is 4, B_1 = 4.\n - Since the median of X_1, X_3, X_4 is 3, B_2 = 3.\n - Since the median of X_1, X_2, X_4 is 3, B_3 = 3.\n - Since

2026-01-03 17:19:38,906 | INFO | Finished apps_4713 | pass=True tier=None escalations=0 elapsed=3.0s
INFO:architecture_A:Finished apps_4713 | pass=True tier=None escalations=0 elapsed=3.0s
2026-01-03 17:19:38,909 | INFO | Running 4/10 apps_4241 (introductory)
INFO:architecture_A:Running 4/10 apps_4241 (introductory)


State: {'task_id': 'apps_4713', 'task_description': 'You have an integer variable x.\nInitially, x=0.\nSome person gave you a string S of length N, and using the string you performed the following operation N times.\nIn the i-th operation, you incremented the value of x by 1 if S_i=I, and decremented the value of x by 1 if S_i=D.\nFind the maximum value taken by x during the operations (including before the first operation, and after the last operation).\n\n-----Constraints-----\n - 1≤N≤100\n - |S|=N\n - No characters except I and D occur in S.\n\n-----Input-----\nThe input is given from Standard Input in the following format:\nN\nS\n\n-----Output-----\nPrint the maximum value taken by x during the operations.\n\n-----Sample Input-----\n5\nIIDID\n\n-----Sample Output-----\n2\n\nAfter each operation, the value of x becomes 1, 2, 1, 2 and 1, respectively. Thus, the output should be 2, the maximum value.', 'plan': None, 'story_points_initial': None, 'story_points_current': None, 'escalati

2026-01-03 17:19:45,472 | INFO | Finished apps_4241 | pass=False tier=None escalations=0 elapsed=6.6s
INFO:architecture_A:Finished apps_4241 | pass=False tier=None escalations=0 elapsed=6.6s
2026-01-03 17:19:45,475 | INFO | Running 5/10 apps_4553 (introductory)
INFO:architecture_A:Running 5/10 apps_4553 (introductory)


State: {'task_id': 'apps_4241', 'task_description': 'Given are two strings S and T.\nLet us change some of the characters in S so that T will be a substring of S.\nAt least how many characters do we need to change?\nHere, a substring is a consecutive subsequence. For example, xxx is a substring of yxxxy, but not a substring of xxyxx.\n\n-----Constraints-----\n - The lengths of S and T are each at least 1 and at most 1000.\n - The length of T is at most that of S.\n - S and T consist of lowercase English letters.\n\n-----Input-----\nInput is given from Standard Input in the following format:\nS\nT\n\n-----Output-----\nPrint the minimum number of characters in S that need to be changed.\n\n-----Sample Input-----\ncabacc\nabc\n\n-----Sample Output-----\n1\n\nFor example, changing the fourth character a in S to c will match the second through fourth characters in S to T.\nSince S itself does not have T as its substring, this number of changes - one - is the minimum needed.', 'plan': None, 

2026-01-03 17:19:48,571 | INFO | Finished apps_4553 | pass=False tier=None escalations=0 elapsed=3.1s
INFO:architecture_A:Finished apps_4553 | pass=False tier=None escalations=0 elapsed=3.1s
2026-01-03 17:19:48,574 | INFO | Running 6/10 apps_4624 (introductory)
INFO:architecture_A:Running 6/10 apps_4624 (introductory)


State: {'task_id': 'apps_4553', 'task_description': 'The postal code in Atcoder Kingdom is A+B+1 characters long, its (A+1)-th character is a hyphen -, and the other characters are digits from 0 through 9.\nYou are given a string S. Determine whether it follows the postal code format in Atcoder Kingdom.\n\n-----Constraints-----\n - 1≤A,B≤5\n - |S|=A+B+1\n - S consists of - and digits from 0 through 9.\n\n-----Input-----\nInput is given from Standard Input in the following format:\nA B\nS\n\n-----Output-----\nPrint Yes if S follows the postal code format in AtCoder Kingdom; print No otherwise.\n\n-----Sample Input-----\n3 4\n269-6650\n\n-----Sample Output-----\nYes\n\nThe (A+1)-th character of S is -, and the other characters are digits from 0 through 9, so it follows the format.', 'plan': None, 'story_points_initial': None, 'story_points_current': None, 'escalations': 0, 'developer_tier': None, 'generated_code': "def check_postal_code(A: int, B: int, S: str) -> str:\n    if len(S) != A

2026-01-03 17:19:53,481 | INFO | Finished apps_4624 | pass=True tier=None escalations=0 elapsed=4.9s
INFO:architecture_A:Finished apps_4624 | pass=True tier=None escalations=0 elapsed=4.9s
2026-01-03 17:19:53,484 | INFO | Running 7/10 apps_4819 (introductory)
INFO:architecture_A:Running 7/10 apps_4819 (introductory)


State: {'task_id': 'apps_4624', 'task_description': "Vasya goes to visit his classmate Petya. Vasya knows that Petya's apartment number is $n$. \n\nThere is only one entrance in Petya's house and the distribution of apartments is the following: the first floor contains $2$ apartments, every other floor contains $x$ apartments each. Apartments are numbered starting from one, from the first floor. I.e. apartments on the first floor have numbers $1$ and $2$, apartments on the second floor have numbers from $3$ to $(x + 2)$, apartments on the third floor have numbers from $(x + 3)$ to $(2 \\cdot x + 2)$, and so on.\n\nYour task is to find the number of floor on which Petya lives. Assume that the house is always high enough to fit at least $n$ apartments.\n\nYou have to answer $t$ independent test cases.\n\n\n-----Input-----\n\nThe first line of the input contains one integer $t$ ($1 \\le t \\le 1000$) — the number of test cases. Then $t$ test cases follow.\n\nThe only line of the test case

2026-01-03 17:19:55,971 | INFO | Finished apps_4819 | pass=True tier=None escalations=0 elapsed=2.5s
INFO:architecture_A:Finished apps_4819 | pass=True tier=None escalations=0 elapsed=2.5s
2026-01-03 17:19:55,973 | INFO | Running 8/10 apps_4357 (introductory)
INFO:architecture_A:Running 8/10 apps_4357 (introductory)


State: {'task_id': 'apps_4819', 'task_description': 'Given two integers $A$ and $B$, $A$ modulo $B$ is the remainder when dividing $A$ by $B$. For example, the numbers $7$, $14$, $27$ and $38$ become $1$, $2$, $0$ and $2$, modulo $3$. Write a program that accepts $10$ numbers as input and outputs the number of distinct numbers in the input, if the numbers are considered modulo $42$.\n\n-----Input-----\nThe input will contain 10 non-negative integers, each smaller than $1000$, one per line.\n\n-----Output-----\nOutput the number of distinct values when considered modulo $42$ on a single line.\n\n-----Explanation of Sample Inputs-----\nIn sample input $1$, the numbers modulo $42$ are $1, 2, 3, 4, 5, 6, 7, 8, 9$ and $10$.\n\nIn sample input $2$, all numbers modulo $42$ are $0$.\n\nIn sample input $3$, the numbers modulo $42$ are $39, 40, 41, 0, 1, 2, 40, 41, 0$ and $1$. There are $6$ distinct numbers.\n\n-----Examples-----\nSample Input 1:\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\nSample Output 1:\

2026-01-03 17:20:00,093 | INFO | Finished apps_4357 | pass=True tier=None escalations=0 elapsed=4.1s
INFO:architecture_A:Finished apps_4357 | pass=True tier=None escalations=0 elapsed=4.1s
2026-01-03 17:20:00,096 | INFO | Running 9/10 apps_4823 (introductory)
INFO:architecture_A:Running 9/10 apps_4823 (introductory)


State: {'task_id': 'apps_4357', 'task_description': 'You have decided to give an allowance to your child depending on the outcome of the game that he will play now.\nThe game is played as follows:\n - There are three "integer panels", each with a digit between 1 and 9 (inclusive) printed on it, and one "operator panel" with a + printed on it.\n - The player should construct a formula of the form X + Y, by arranging the four panels from left to right. (The operator panel should not be placed at either end of the formula.)\n - Then, the amount of the allowance will be equal to the resulting value of the formula.\nGiven the values A, B and C printed on the integer panels used in the game, find the maximum possible amount of the allowance.\n\n-----Constraints-----\n - All values in input are integers.\n - 1 \\leq A, B, C \\leq 9\n\n-----Input-----\nInput is given from Standard Input in the following format:\nA B C\n\n-----Output-----\nPrint the maximum possible amount of the allowance.\n\n

2026-01-03 17:20:03,572 | INFO | Finished apps_4823 | pass=False tier=None escalations=0 elapsed=3.5s
INFO:architecture_A:Finished apps_4823 | pass=False tier=None escalations=0 elapsed=3.5s
2026-01-03 17:20:03,574 | INFO | Running 10/10 apps_4048 (introductory)
INFO:architecture_A:Running 10/10 apps_4048 (introductory)


State: {'task_id': 'apps_4823', 'task_description': 'You are given a five-card hand drawn from a standard $52$-card deck. The strength of your hand is the maximum value $k$ such that there are $k$ cards in your hand that have the same rank.\n\nCompute the strength of your hand.\n\n-----Input-----\nThe input will consist of a single line, with five two-character strings separated by spaces.\n\nThe first character in each string will be the rank of the card, and will be one of A23456789TJQK. The second character in the string will be the suit of the card, and will be one of CDHS.\n\nYou may assume all the strings are distinct.\n\n-----Output-----\nOutput, on a single line, the strength of your hand.\n\n-----Examples-----\nSample Input 1:\nAC AD AH AS KD\nSample Output 1:\n4\n\nSample Input 2:\n2C 4D 4H 2D 2H\nSample Output 2:\n3', 'plan': None, 'story_points_initial': None, 'story_points_current': None, 'escalations': 0, 'developer_tier': None, 'generated_code': "import collections\n\nde

2026-01-03 17:20:10,903 | INFO | Finished apps_4048 | pass=False tier=None escalations=0 elapsed=7.3s
INFO:architecture_A:Finished apps_4048 | pass=False tier=None escalations=0 elapsed=7.3s


State: {'task_id': 'apps_4048', 'task_description': 'Takahashi is standing on a multiplication table with infinitely many rows and columns.\nThe square (i,j) contains the integer i \\times j. Initially, Takahashi is standing at (1,1).\nIn one move, he can move from (i,j) to either (i+1,j) or (i,j+1).\nGiven an integer N, find the minimum number of moves needed to reach a square that contains N.\n\n-----Constraints-----\n - 2 \\leq N \\leq 10^{12}\n - N is an integer.\n\n-----Input-----\nInput is given from Standard Input in the following format:\nN\n\n-----Output-----\nPrint the minimum number of moves needed to reach a square that contains the integer N.\n\n-----Sample Input-----\n10\n\n-----Sample Output-----\n5\n\n(2,5) can be reached in five moves. We cannot reach a square that contains 10 in less than five moves.', 'plan': None, 'story_points_initial': None, 'story_points_current': None, 'escalations': 0, 'developer_tier': None, 'generated_code': 'import math\n\ndef min_moves_to_r

[{'task_id': 'apps_4590',
  'difficulty': 'introductory',
  'architecture': 'A',
  'test_passed': False,
  'developer_tier': None,
  'escalations': 0,
  'story_points_initial': None,
  'story_points_final': None,
  'elapsed_seconds': 5.587212800979614},
 {'task_id': 'apps_4573',
  'difficulty': 'introductory',
  'architecture': 'A',
  'test_passed': False,
  'developer_tier': None,
  'escalations': 0,
  'story_points_initial': None,
  'story_points_final': None,
  'elapsed_seconds': 7.5999555587768555},
 {'task_id': 'apps_4713',
  'difficulty': 'introductory',
  'architecture': 'A',
  'test_passed': True,
  'developer_tier': None,
  'escalations': 0,
  'story_points_initial': None,
  'story_points_final': None,
  'elapsed_seconds': 2.982330083847046},
 {'task_id': 'apps_4241',
  'difficulty': 'introductory',
  'architecture': 'A',
  'test_passed': False,
  'developer_tier': None,
  'escalations': 0,
  'story_points_initial': None,
  'story_points_final': None,
  'elapsed_seconds': 6.56

In [39]:
!cd log && cat architecture_A.jsonl

{"task_id": "apps_4000", "difficulty": "introductory", "architecture": "A", "test_passed": false, "developer_tier": null, "escalations": 0, "story_points_initial": null, "story_points_final": null, "elapsed_seconds": 21.43400001525879}
{"task_id": "apps_4000", "difficulty": "introductory", "architecture": "A", "test_passed": false, "developer_tier": null, "escalations": 0, "story_points_initial": null, "story_points_final": null, "elapsed_seconds": 24.04499888420105}
{"task_id": "apps_4406", "difficulty": "introductory", "architecture": "A", "test_passed": false, "developer_tier": null, "escalations": 0, "story_points_initial": null, "story_points_final": null, "elapsed_seconds": 2.9327504634857178}
{"task_id": "apps_4326", "difficulty": "introductory", "architecture": "A", "test_passed": false, "developer_tier": null, "escalations": 0, "story_points_initial": null, "story_points_final": null, "elapsed_seconds": 2.3970892429351807}
{"task_id": "apps_4118", "difficulty": "introductory",

Miao