In [1]:
import concurrent.futures
import json
from pathlib import Path

from virtual_lab.constants import CONSISTENT_TEMPERATURE, CREATIVE_TEMPERATURE
from virtual_lab.prompts import (
    CODING_RULES,
    REWRITE_PROMPT,
    create_merge_prompt,
)
from virtual_lab.run_meeting import run_meeting
from virtual_lab.utils import load_summaries

from knowledge_base_constants import (
    background_prompt,
    # nanobody_prompt,
    num_iterations,
    num_rounds,
    discussions_phase_to_dir,
    principal_investigator,
    team_members,
    chief_knowledge_architect,
    data_integration_scientist,
    knowledge_validation_specialist,
    scientific_critic
)

## Team selection

In [2]:
# Team selection - prompts
# Agenda for the team-selection phase. It starts with the shared
# `background_prompt` and then asks for exactly three team members, written
# as `Agent(...)` entries with title, expertise, goal and role fields.
# The same string is passed as `agenda` to the discussion meetings and to
# `create_merge_prompt` in the merge cell.
# NOTE: this is runtime prompt text; editing any character changes what the
# model is asked.
team_selection_agenda = f"""{background_prompt}
You must now assemble a team of three scientists to design a structured knowledge base for this project.
These team members will later participate in meetings to define a schema, identify sources of information, and decide how knowledge will be updated and validated.

Please list exactly three team members in the following format.
Do not include yourself.

Agent(
title="Chief Knowledge Architect",
expertise=(
"knowledge graph construction, ontology development, information extraction, and schema governance"
),
goal=(
"design a structure that enables scalable knowledge updates and supports retrieval-based reasoning"
),
role=(
"propose schemas, specify entity/relationship definitions, design attribute-level constraints, and map knowledge sources to the schema"
),
)
"""

In [3]:
# Team selection - discussion
# Launch `num_iterations` independent individual meetings in parallel. Each
# call gets its own `save_name` (discussion_1, discussion_2, ...) inside the
# team-selection directory, which the merge cell later globs.
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Map each future to its 1-based discussion number for error reporting.
    team_selection_futures = {
        executor.submit(
            run_meeting,
            meeting_type="individual",
            team_member=principal_investigator,
            agenda=team_selection_agenda,
            save_dir=discussions_phase_to_dir["team_selection"],
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
        ): iteration_num + 1
        for iteration_num in range(num_iterations)
    }
    concurrent.futures.wait(team_selection_futures)

# `concurrent.futures.wait` never re-raises exceptions from worker threads, so
# a failed meeting would otherwise vanish silently. Report failures without
# raising, so that discussions which did succeed can still be merged.
for team_selection_future, discussion_num in team_selection_futures.items():
    meeting_error = team_selection_future.exception()
    if meeting_error is not None:
        print(f"Team selection discussion_{discussion_num} failed: {meeting_error!r}")

Rounds (+ Final Round):   0%|                                                                                                                                                                  | 0/1 [00:00<?, ?it/s]
Team:   0%|                                                                                                                                                                                    | 0/2 [00:00<?, ?it/s][A

Rounds (+ Final Round):   0%|                                                                                                                                                                  | 0/1 [00:00<?, ?it/s][A[A


Team:   0%|                                                                                                                                                                                    | 0/2 [00:00<?, ?it/s][A[A[A



Rounds (+ Final Round):   0%|                                                                                           

Input token count: 212
Output token count: 230
Tool token count: 0
Max token length: 442
Time: 0:08


Team:   0%|                                                                                                                                                                                    | 0/2 [00:07<?, ?it/s]




Rounds (+ Final Round): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.91s/it][A[A[A[A


Input token count: 212
Output token count: 236
Tool token count: 0
Max token length: 448
Time: 0:09


Team:   0%|                                                                                                                                                                                    | 0/2 [00:09<?, ?it/s]








Rounds (+ Final Round): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.79s/it][A[A[A[A[A[A[A[A


Input token count: 212
Output token count: 235
Tool token count: 0
Max token length: 447
Time: 0:11


Team:   0%|                                                                                                                                                                                    | 0/2 [00:15<?, ?it/s]


Rounds (+ Final Round): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:15<00:00, 15.35s/it][A[A


Input token count: 212
Output token count: 231
Tool token count: 0
Max token length: 443
Time: 0:17


Team:   0%|                                                                                                                                                                                    | 0/2 [00:42<?, ?it/s]
Rounds (+ Final Round): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:42<00:00, 42.59s/it]

Input token count: 212
Output token count: 227
Tool token count: 0
Max token length: 439
Time: 0:44





In [4]:
# Team selection - merge
# Load every saved discussion from the team-selection directory and ask the
# principal investigator to merge them in a single individual meeting.
# The paths are sorted because `Path.glob` yields files in a
# filesystem-dependent order; sorting keeps the summary order (and therefore
# the merge prompt) identical across re-runs and machines.
team_selection_summaries = load_summaries(
    discussion_paths=sorted(discussions_phase_to_dir["team_selection"].glob("discussion_*.json")))
print(f"Number of summaries: {len(team_selection_summaries)}")

# Build the merge agenda from the same agenda the discussions used.
team_selection_merge_prompt = create_merge_prompt(agenda=team_selection_agenda)

run_meeting(
    meeting_type="individual",
    team_member=principal_investigator,
    summaries=team_selection_summaries,
    agenda=team_selection_merge_prompt,
    save_dir=discussions_phase_to_dir["team_selection"],
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
)

Number of summaries: 5


Rounds (+ Final Round):   0%|                                                                                                                                                                  | 0/1 [00:00<?, ?it/s]
Team:   0%|                                                                                                                                                                                    | 0/2 [00:29<?, ?it/s][A
Rounds (+ Final Round): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:29<00:00, 29.21s/it]


Input token count: 1,504
Output token count: 994
Tool token count: 0
Max token length: 2,498
Time: 0:31


## Project specification

In [2]:
# Project specification - prompts
# Agenda for the project-specification phase: the shared `background_prompt`
# followed by the ALZ-KB task description. It is passed as `agenda` to the
# team discussions and to `create_merge_prompt` in the merge cell.
# NOTE: this is runtime prompt text (including the typographic apostrophes);
# editing any character changes what the model is asked.
project_specification_agenda = f"""
{background_prompt} You are working on a research project to design ALZ-KB, a structured knowledge base that integrates and retrieves validated information related to Alzheimer’s disease. The knowledge base should connect molecular biology, genetics, clinical phenotypes, experimental assays, preclinical findings, and therapeutic development pipelines. Your design should specifically enable temporal tracking of evidence, support reasoning across heterogeneous data sources, and facilitate prioritization of hypotheses, targets, interventions, and biomarkers.
Please now define a concrete approach for how ALZ-KB will be constructed and used. You must decide what existing knowledge and artifacts should be imported, how to structure entities and relationships, how knowledge will be updated over time, and how completeness and consistency will be evaluated. At the end of this exercise, produce a structured approach that can be evaluated by a scientific committee and implemented by a technical team.
"""

# Seven agenda questions, passed as `agenda_questions` alongside the agenda.
# In order they cover: entity types, relationship classes, evidence
# quality/versioning, external sources to import, schema rules and
# constraints, the evaluation framework, and the research workflows enabled.
project_specification_questions = (
    "What major entity types should ALZ-KB contain (e.g., genes, variants, biomarkers, phenotypes, mechanistic pathways, drugs, clinical cohorts, experimental protocols)? Please list no more than 8 and justify each.",
    "Which relationship classes between entities are essential to support inference and hypothesis generation (e.g., gene-impacts-phenotype, drug-targets-pathway, biomarker-validated-in-assay)? Provide 6–10 precise types.",
    "How will evidence quality be encoded and versioned, considering that Alzheimer’s research is rapidly evolving and contradictory results often arise?",
    "What validated external databases, knowledge sources, publications, knowledge graphs, or assay repositories should be imported initially? Provide at least 6 specific ones and their expected contribution.",
    "What schema-level rules, constraints, or ontological commitments are needed to ensure interoperability and temporal consistency?",
    "What measurable evaluation framework should be defined to assess (a) coverage, (b) correctness, and (c) reproducibility of knowledge, in a way suitable for scientific auditing?",
    "Which types of research workflows will ALZ-KB enable—such as biomarker discovery, cross-cohort stratification analysis, therapeutic candidate prioritization—and how will this influence schema scope?",
)

In [3]:
# Project specification - discussion
# Launch `num_iterations` independent team meetings in parallel, led by the
# principal investigator. Each call gets its own `save_name` (discussion_1,
# discussion_2, ...) inside the project-specification directory, which the
# merge cell later globs.
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Map each future to its 1-based discussion number for error reporting.
    project_specification_futures = {
        executor.submit(
            run_meeting,
            meeting_type="team",
            team_lead=principal_investigator,
            team_members=team_members,
            agenda=project_specification_agenda,
            agenda_questions=project_specification_questions,
            save_dir=discussions_phase_to_dir["project_specification"],
            save_name=f"discussion_{iteration_num + 1}",
            temperature=CREATIVE_TEMPERATURE,
            num_rounds=num_rounds,
        ): iteration_num + 1
        for iteration_num in range(num_iterations)
    }
    concurrent.futures.wait(project_specification_futures)

# `concurrent.futures.wait` never re-raises exceptions from worker threads, so
# a failed meeting would otherwise vanish silently. Report failures without
# raising, so that discussions which did succeed can still be merged.
for project_specification_future, discussion_num in project_specification_futures.items():
    meeting_error = project_specification_future.exception()
    if meeting_error is not None:
        print(f"Project specification discussion_{discussion_num} failed: {meeting_error!r}")

Rounds (+ Final Round):   0%|                                                                                                                                                                  | 0/4 [00:00<?, ?it/s]
Team:   0%|                                                                                                                                                                                    | 0/5 [00:00<?, ?it/s][A

Rounds (+ Final Round):   0%|                                                                                                                                                                  | 0/4 [00:00<?, ?it/s][A[A


Team:   0%|                                                                                                                                                                                    | 0/5 [00:00<?, ?it/s][A[A[A



Rounds (+ Final Round):   0%|                                                                                           

Input token count: 181,079
Output token count: 21,779
Tool token count: 0
Max token length: 24,101
Time: 16:53


In [4]:
# Project specification - merge
# Load every saved discussion from the project-specification directory and
# ask the principal investigator to merge them in an individual meeting.
# The paths are sorted because `Path.glob` yields files in a
# filesystem-dependent order; sorting keeps the summary order (and therefore
# the merge prompt) identical across re-runs and machines.
project_specification_summaries = load_summaries(
    discussion_paths=sorted(discussions_phase_to_dir["project_specification"].glob("discussion_*.json")))
print(f"Number of summaries: {len(project_specification_summaries)}")

# Build the merge agenda from the same agenda and questions the discussions used.
project_specification_merge_prompt = create_merge_prompt(
    agenda=project_specification_agenda,
    agenda_questions=project_specification_questions,
)

# NOTE(review): the recorded run of this cell ended with
# "ValueError: Run failed: failed"; the cause is not visible from this
# notebook. Confirm the merge succeeds before relying on merged.json.
run_meeting(
    meeting_type="individual",
    team_member=principal_investigator,
    summaries=project_specification_summaries,
    agenda=project_specification_merge_prompt,
    save_dir=discussions_phase_to_dir["project_specification"],
    save_name="merged",
    temperature=CONSISTENT_TEMPERATURE,
    num_rounds=num_rounds,
)

Number of summaries: 5


Rounds (+ Final Round):   0%|                                                                                                                                                                  | 0/4 [00:00<?, ?it/s]
Team:   0%|                                                                                                                                                                                    | 0/2 [00:00<?, ?it/s][A
Team:  50%|██████████████████████████████████████████████████████████████████████████████████████                                                                                      | 1/2 [00:28<00:28, 28.05s/it][A
Team: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:56<00:00, 28.40s/it][A
Rounds (+ Final Round):  25%|██████████████████████████████████████▌                                                                   

ValueError: Run failed: failed

## Tools selection

In [None]:
# Tools selection - prompts
# The agenda no longer interpolates `nanobody_prompt`: that name is not
# imported in this notebook (its import is commented out in the first cell),
# so referencing it would raise NameError on a fresh kernel. The wording is
# also changed from nanobody design to the ALZ-KB knowledge base project.
# NOTE(review): the reworded prompt text is a proposal; confirm it matches
# the intended tools-selection task.
tools_selection_agenda = f"{background_prompt} Now you need to select machine learning and/or computational tools to implement this knowledge base design approach. Please list several tools (5-10) that would be relevant to this knowledge base design approach and how they could be used in the context of this project. If selecting machine learning tools, please prioritize pre-trained models (e.g., pre-trained biomedical language models or named entity recognition models) for simplicity."

tools_selection_questions = (
    "What machine learning and/or computational tools could be used for this knowledge base design approach (list 5-10)?",
    "For each tool, how could it be used for building and maintaining the knowledge base?",
)

# Only the merged project specification is carried forward as prior context.
tools_selection_prior_summaries = load_summaries(
    discussion_paths=[discussions_phase_to_dir["project_specification"] / "merged.json"])
print(f"Number of prior summaries: {len(tools_selection_prior_summaries)}")