In [1]:
import concurrent.futures
import json
from pathlib import Path

from virtual_lab.constants import CONSISTENT_TEMPERATURE, CREATIVE_TEMPERATURE
from virtual_lab.prompts import (
    CODING_RULES,
    REWRITE_PROMPT,
    create_merge_prompt,
)
from virtual_lab.run_meeting import run_meeting
from virtual_lab.utils import load_summaries

from knowledge_base_constants import (
    background_prompt,
    num_iterations,
    num_rounds,
    discussions_phase_to_dir,
    principal_investigator,
    team_members,
    scientific_critic,
    tech_lead,
    biomedical_ontologist,
    data_scientist,
)

## Team selection

In [2]:
# Team selection - prompts
# Agenda for the team-selection phase: the shared `background_prompt`, the task
# statement, and one worked `Agent(...)` example the model is asked to imitate.
# The same string is used by the discussion cell and by `create_merge_prompt`
# in the merge cell, so any edit here affects both.
team_selection_agenda = f"""{background_prompt}
TASK: Define 3 distinct Agents to form the AlzKB Implementation Team.

OUTPUT FORMAT: Python `Agent()` objects ONLY. No conversational filler.
Do not include yourself.
TEMPLATE:
Agent(
    title="Principal Investigator (Alzheimer's KG)",
    expertise=(
        "Lead scientist specializing in Alzheimer's Disease (AD) data integration. "
        "Expert in constructing heterogeneous Knowledge Graphs connecting clinical phenotypes, "
        "neuroimaging features, genetic biomarkers (e.g., APOE), and tau/amyloid pathology."
    ),
    role=(
        "1. Define rigorous schemas aligning with standard ontologies (e.g., SNOMED CT, Gene Ontology). "
        "2. Direct the Tech Lead to prioritize high-confidence data sources (e.g., ADNI, AMP-AD). "
        "3. Review extraction pipelines for precision over recall to prevent hallucinated associations. "
        "4. Enforce strict validation protocols for entity resolution across multi-modal datasets."
    ),
)
"""

In [3]:
# Team selection - discussion
# Launch `num_iterations` independent individual meetings in parallel. Each
# meeting gets the save name discussion_<n> (1-based) in the team-selection
# directory so the merge cell can pick them up with a glob.
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Map each future to its 1-based discussion number for error reporting.
    team_selection_futures = {
        executor.submit(
            run_meeting,
            meeting_type="individual",
            team_member=principal_investigator,
            agenda=team_selection_agenda,
            save_dir=discussions_phase_to_dir["team_selection"],
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
        ): iteration_num + 1
        for iteration_num in range(num_iterations)
    }
    concurrent.futures.wait(team_selection_futures)

# `concurrent.futures.wait` only blocks; it never re-raises exceptions from the
# workers. Without this check a failed meeting disappears silently and the
# merge step runs on fewer discussions than requested.
team_selection_failures = {
    discussion_num: future.exception()
    for future, discussion_num in team_selection_futures.items()
    if future.exception() is not None
}
if team_selection_failures:
    team_selection_failure_report = "; ".join(
        f"discussion_{discussion_num}: {error!r}"
        for discussion_num, error in sorted(team_selection_failures.items())
    )
    raise RuntimeError(
        f"{len(team_selection_failures)} of {num_iterations} team selection "
        f"meetings failed: {team_selection_failure_report}"
    )

Rounds (+ Final Round):   0%|          | 0/1 [00:00<?, ?it/s]
[A

[A[A


[A[A[A



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







Team:   0%|          | 0/2 [00:11<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:11<00:00, 11.59s/it]


Input token count: 274
Output token count: 456
Tool token count: 0
Max token length: 730
Cost: $0.00
Time: 0:13


Team:   0%|          | 0/2 [00:13<?, ?it/s]







Rounds (+ Final Round): 100%|██████████| 1/1 [00:13<00:00, 13.62s/it]


Input token count: 274
Output token count: 550
Tool token count: 0
Max token length: 824
Cost: $0.00
Time: 0:15


Team:   0%|          | 0/2 [00:15<?, ?it/s]





Rounds (+ Final Round): 100%|██████████| 1/1 [00:15<00:00, 15.32s/it]


Input token count: 274
Output token count: 409
Tool token count: 0
Max token length: 683
Cost: $0.00
Time: 0:17


Team:   0%|          | 0/2 [00:16<?, ?it/s]



Rounds (+ Final Round): 100%|██████████| 1/1 [00:16<00:00, 16.67s/it]
Team:   0%|          | 0/2 [00:17<?, ?it/s]

Rounds (+ Final Round): 100%|██████████| 1/1 [00:17<00:00, 17.13s/it]

Input token count: 274
Output token count: 463
Tool token count: 0
Max token length: 737
Cost: $0.00
Time: 0:18
Input token count: 274
Output token count: 396
Tool token count: 0
Max token length: 670
Cost: $0.00
Time: 0:18





In [4]:
# Team selection - merge
# Load every saved team-selection discussion and have the principal
# investigator merge them into a single answer saved as "merged".
# The paths are sorted because glob order depends on the file system; sorting
# keeps the order of summaries in the merge prompt stable between runs.
team_selection_summaries = load_summaries(
    discussion_paths=sorted(discussions_phase_to_dir["team_selection"].glob("discussion_*.json")))
print(f"Number of summaries: {len(team_selection_summaries)}")

# Merging nothing would still launch (and pay for) a meeting with no input.
if not team_selection_summaries:
    raise FileNotFoundError(
        "No team selection discussions found to merge; run the discussion cell first."
    )

team_selection_merge_prompt = create_merge_prompt(agenda=team_selection_agenda)

run_meeting(
    meeting_type="individual",
    team_member=principal_investigator,
    summaries=team_selection_summaries,
    agenda=team_selection_merge_prompt,
    save_dir=discussions_phase_to_dir["team_selection"],
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
)

Number of summaries: 5


Team:   0%|          | 0/2 [00:21<?, ?it/s]1 [00:00<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:21<00:00, 21.45s/it]


Input token count: 2,678
Output token count: 794
Tool token count: 0
Max token length: 3,472
Cost: $0.01
Time: 0:22


## Project specification

In [4]:
# Project specification - prompts (Optimized)
# Agenda for the project-specification phase: the shared `background_prompt`
# followed by the task, the three required output sections and the constraints.

project_specification_agenda = f"""{background_prompt}

TASK: Define the Technical Specification for AlzKB.
The team must agree on the architectural foundation before implementation.

OUTPUT REQUIREMENTS:
1. SCHEMA CORE.
2. DATA STRATEGY.
3. SUCCESS METRICS.

CONSTRAINTS: Be specific. No fluff. Prioritize AD-specific nuances (e.g., Tau isoforms).
"""

# No extra agenda questions for this phase. This must be an empty tuple:
# `("")` is just the empty string, not a tuple, whereas the other phases pass a
# tuple of question strings as `agenda_questions`.
project_specification_questions = ()

In [5]:
# Project specification - discussion
# Launch `num_iterations` independent team meetings in parallel. Each meeting
# gets the save name discussion_<n> (1-based) in the project-specification
# directory so the merge cell can pick them up with a glob.
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Map each future to its 1-based discussion number for error reporting.
    project_specification_futures = {
        executor.submit(
            run_meeting,
            meeting_type="team",
            team_lead=principal_investigator,
            team_members=team_members,
            agenda=project_specification_agenda,
            agenda_questions=project_specification_questions,
            save_dir=discussions_phase_to_dir["project_specification"],
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
            num_rounds=num_rounds,
        ): iteration_num + 1
        for iteration_num in range(num_iterations)
    }
    concurrent.futures.wait(project_specification_futures)

# `concurrent.futures.wait` only blocks; it never re-raises exceptions from the
# workers. Without this check a failed meeting disappears silently and the
# merge step runs on fewer discussions than requested.
project_specification_failures = {
    discussion_num: future.exception()
    for future, discussion_num in project_specification_futures.items()
    if future.exception() is not None
}
if project_specification_failures:
    project_specification_failure_report = "; ".join(
        f"discussion_{discussion_num}: {error!r}"
        for discussion_num, error in sorted(project_specification_failures.items())
    )
    raise RuntimeError(
        f"{len(project_specification_failures)} of {num_iterations} project specification "
        f"meetings failed: {project_specification_failure_report}"
    )

Rounds (+ Final Round):   0%|          | 0/3 [00:00<?, ?it/s]
[A

[A[A


[A[A[A



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A

[A[A



[A[A[A[A





[A[A[A[A[A[A







[A[A[A[A[A[A[A[A

[A[A



[A[A[A[A





[A[A[A[A[A[A







[A[A[A[A[A[A[A[A





[A[A[A[A[A[A



[A[A[A[A

[A[A





[A[A[A[A[A[A

[A[A







[A[A[A[A[A[A[A[A



Team: 100%|██████████| 5/5 [01:22<00:00, 16.52s/it]
Rounds (+ Final Round):  33%|███▎      | 1/3 [01:22<02:45, 82.62s/it]





Team: 100%|██████████| 5/5 [01:24<00:00, 16.92s/it]





[A[A[A[A[A





[A[A[A[A[A[A



Team: 100%|██████████| 5/5 [01:32<00:00, 18.43s/it]



[A[A[A



[A[A[A[A

Team: 100%|██████████| 5/5 [01:35<00:00, 19.07s/it]

[A

[A[A





Team:   0%|          | 0/5 [00:06<?, ?it/s]
Rounds (+ Final Round):  33%|███▎      | 1/3 [01:41<03:23, 101.75s/it]








Team:  20%|██    

Input token count: 52,515
Output token count: 9,184
Tool token count: 0
Max token length: 10,454
Cost: $0.18
Time: 4:37


In [4]:
# Project specification - merge
# Load every saved project-specification discussion and have the principal
# investigator merge them into a single answer saved as "merged".
# The paths are sorted because glob order depends on the file system; sorting
# keeps the order of summaries in the merge prompt stable between runs.
project_specification_summaries = load_summaries(
    discussion_paths=sorted(discussions_phase_to_dir["project_specification"].glob("discussion_*.json")))
print(f"Number of summaries: {len(project_specification_summaries)}")

# Merging nothing would still launch (and pay for) a meeting with no input.
if not project_specification_summaries:
    raise FileNotFoundError(
        "No project specification discussions found to merge; run the discussion cell first."
    )

project_specification_merge_prompt = create_merge_prompt(
    agenda=project_specification_agenda,
    agenda_questions=project_specification_questions,
)

# NOTE(review): unlike the team-selection merge, this merge passes
# `num_rounds`; confirm that extra rounds are intended for a merge meeting.
run_meeting(
    meeting_type="individual",
    team_member=principal_investigator,
    summaries=project_specification_summaries,
    agenda=project_specification_merge_prompt,
    save_dir=discussions_phase_to_dir["project_specification"],
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
    num_rounds=num_rounds,
)

Number of summaries: 1


Team: 100%|██████████| 2/2 [00:47<00:00, 23.76s/it]<?, ?it/s]
Team: 100%|██████████| 2/2 [00:37<00:00, 18.59s/it]<01:35, 47.52s/it]
Team:   0%|          | 0/2 [00:48<?, ?it/s]3 [01:24<00:41, 41.44s/it]
Rounds (+ Final Round): 100%|██████████| 3/3 [02:13<00:00, 44.47s/it]


Input token count: 23,745
Output token count: 7,601
Tool token count: 0
Max token length: 9,672
Cost: $0.11
Time: 2:15


## Tools selection

In [2]:
# Tools selection - prompts
# Agenda for the tools-selection phase: the shared `background_prompt` followed
# by the task, the three decisions the team must make and the constraint.
# NOTE(review): unlike the other agendas in this notebook, "TASK:" follows
# `{background_prompt}` on the same line, separated only by a space — confirm
# this is intended (or that `background_prompt` ends with a newline).
tools_selection_agenda = f"""{background_prompt} TASK: Select the Technology Stack for AlzKB Implementation.
Based on the Technical Specification, decide on the best tools to handle high-precision graph data.

DECISIONS REQUIRED:
1. GRAPH DATABASE: Select 1 DB (e.g., Neo4j, ArangoDB, Neptune). Justify based on support for "Edge Properties" (essential for evidence scoring).
2. AGENT FRAMEWORK: Select 1 Framework (e.g., LangGraph, AutoGen, CrewAI) that supports "Human-in-the-loop" (for Critic) and "Stateful Memory".
3. ETL & ONTOLOGY: Select libraries for handling OBO/OWL parsing (e.g., Owlready2, rdflib).

CONSTRAINT: Choose open-source/standard tools where possible to ensure reproducibility.
"""

# One question per required decision (database, ETL stack, orchestration);
# passed as `agenda_questions` to both the discussion and the merge prompt.
tools_selection_questions = (
    "DATABASE: Select ONE primary Graph Database (e.g., Neo4j, ArangoDB, Neptune). Justify the choice based on our requirement for extensive 'Edge Properties' (Evidence Score, Provenance) and 'Vector Search' capabilities.",
    "ETL STACK: Which specific Python libraries will be used for (a) Parsing Bio-Ontologies (e.g., Owlready2, rdflib) and (b) High-performance Triple Ingestion? Avoid generic answers.",
    "ORCHESTRATION: Choose an Agent Framework (e.g., LangGraph, AutoGen, CrewAI). Specifically, how does it support 'Stateful Memory' (to track design iterations) and 'Human-in-the-loop' (for Gold Standard auditing)?",
)

# The merged project specification is the only prior context given to this
# phase; it requires the project-specification merge cell to have been run.
tools_selection_prior_summaries = load_summaries(
    discussion_paths=[discussions_phase_to_dir["project_specification"] / "merged.json"])
print(f"Number of prior summaries: {len(tools_selection_prior_summaries)}")

Number of prior summaries: 1


In [3]:
# Tools selection - discussion
# Launch `num_iterations` independent team meetings in parallel, each seeded
# with the merged project specification. Each meeting gets the save name
# discussion_<n> (1-based) in the tools-selection directory.
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Map each future to its 1-based discussion number for error reporting.
    tools_selection_futures = {
        executor.submit(
            run_meeting,
            meeting_type="team",
            team_lead=principal_investigator,
            team_members=team_members,
            summaries=tools_selection_prior_summaries,
            agenda=tools_selection_agenda,
            agenda_questions=tools_selection_questions,
            save_dir=discussions_phase_to_dir["tools_selection"],
            save_name=f"discussion_{iteration_num + 1}",
            # NOTE(review): the other discussion cells use CREATIVE_TEMPERATURE;
            # confirm this literal 0.5 is deliberate and consider naming it.
            temperature=0.5,
            num_rounds=num_rounds,
        ): iteration_num + 1
        for iteration_num in range(num_iterations)
    }
    concurrent.futures.wait(tools_selection_futures)

# `concurrent.futures.wait` only blocks; it never re-raises exceptions from the
# workers. Without this check a failed meeting disappears silently and the
# merge step runs on fewer discussions than requested.
tools_selection_failures = {
    discussion_num: future.exception()
    for future, discussion_num in tools_selection_futures.items()
    if future.exception() is not None
}
if tools_selection_failures:
    tools_selection_failure_report = "; ".join(
        f"discussion_{discussion_num}: {error!r}"
        for discussion_num, error in sorted(tools_selection_failures.items())
    )
    raise RuntimeError(
        f"{len(tools_selection_failures)} of {num_iterations} tools selection "
        f"meetings failed: {tools_selection_failure_report}"
    )

Rounds (+ Final Round):   0%|                                                                                                                                                                  | 0/2 [00:00<?, ?it/s]
Team:   0%|                                                                                                                                                                                    | 0/5 [00:00<?, ?it/s][A

Rounds (+ Final Round):   0%|                                                                                                                                                                  | 0/2 [00:00<?, ?it/s][A[A


Team:   0%|                                                                                                                                                                                    | 0/5 [00:08<?, ?it/s][A[A[A
Rounds (+ Final Round):   0%|                                                                                              

In [8]:
# Tools selection - merge
# Load every saved tools-selection discussion and have the principal
# investigator merge them into a single answer saved as "merged".
# The paths are sorted because glob order depends on the file system; sorting
# keeps the order of summaries in the merge prompt stable between runs.
tools_selection_summaries = load_summaries(
    discussion_paths=sorted(discussions_phase_to_dir["tools_selection"].glob("discussion_*.json")))
print(f"Number of summaries: {len(tools_selection_summaries)}")

# Merging nothing would still launch (and pay for) a meeting with no input.
if not tools_selection_summaries:
    raise FileNotFoundError(
        "No tools selection discussions found to merge; run the discussion cell first."
    )

tools_selection_merge_prompt = create_merge_prompt(
    agenda=tools_selection_agenda,
    agenda_questions=tools_selection_questions,
)

# NOTE(review): unlike the team-selection merge, this merge passes
# `num_rounds`; confirm that extra rounds are intended for a merge meeting.
run_meeting(
    meeting_type="individual",
    team_member=principal_investigator,
    summaries=tools_selection_summaries,
    agenda=tools_selection_merge_prompt,
    save_dir=discussions_phase_to_dir["tools_selection"],
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
    num_rounds=num_rounds,
)

Number of summaries: 1


Team: 100%|██████████| 2/2 [00:57<00:00, 28.84s/it]<?, ?it/s]
Team: 100%|██████████| 2/2 [01:06<00:00, 33.09s/it]<01:55, 57.68s/it]
Team:   0%|          | 0/2 [00:18<?, ?it/s]3 [02:03<01:02, 62.68s/it]
Rounds (+ Final Round): 100%|██████████| 3/3 [02:22<00:00, 47.40s/it]


Input token count: 22,378
Output token count: 7,231
Tool token count: 0
Max token length: 9,236
Cost: $0.10
Time: 2:23


## Data Access

In [None]:
# Tools selection - prompts
# NOTE(review): this cell sits under the "Data Access" heading but is a
# verbatim copy of the "Tools selection - prompts" cell earlier in the
# notebook. It re-assigns `tools_selection_agenda`, `tools_selection_questions`
# and `tools_selection_prior_summaries` to the same values and defines no
# data-access agenda. TODO: replace with the real data-access prompts or
# delete the cell.
tools_selection_agenda = f"""{background_prompt} TASK: Select the Technology Stack for AlzKB Implementation.
Based on the Technical Specification, decide on the best tools to handle high-precision graph data.

DECISIONS REQUIRED:
1. GRAPH DATABASE: Select 1 DB (e.g., Neo4j, ArangoDB, Neptune). Justify based on support for "Edge Properties" (essential for evidence scoring).
2. AGENT FRAMEWORK: Select 1 Framework (e.g., LangGraph, AutoGen, CrewAI) that supports "Human-in-the-loop" (for Critic) and "Stateful Memory".
3. ETL & ONTOLOGY: Select libraries for handling OBO/OWL parsing (e.g., Owlready2, rdflib).

CONSTRAINT: Choose open-source/standard tools where possible to ensure reproducibility.
"""

# Same three questions as the earlier tools-selection cell.
tools_selection_questions = (
    "DATABASE: Select ONE primary Graph Database (e.g., Neo4j, ArangoDB, Neptune). Justify the choice based on our requirement for extensive 'Edge Properties' (Evidence Score, Provenance) and 'Vector Search' capabilities.",
    "ETL STACK: Which specific Python libraries will be used for (a) Parsing Bio-Ontologies (e.g., Owlready2, rdflib) and (b) High-performance Triple Ingestion? Avoid generic answers.",
    "ORCHESTRATION: Choose an Agent Framework (e.g., LangGraph, AutoGen, CrewAI). Specifically, how does it support 'Stateful Memory' (to track design iterations) and 'Human-in-the-loop' (for Gold Standard auditing)?",
)

# Reloads the merged project specification as prior context, exactly as the
# earlier tools-selection cell does.
tools_selection_prior_summaries = load_summaries(
    discussion_paths=[discussions_phase_to_dir["project_specification"] / "merged.json"])
print(f"Number of prior summaries: {len(tools_selection_prior_summaries)}")

Number of prior summaries: 1
