In [1]:
from pydantic import BaseModel, Field
import json
from pathlib import Path

from dotenv import load_dotenv
import dspy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root folder of the simulated backend; generated artifacts are written beneath it.
SIMULATED_BACKEND_PATH = "simulated_backend/"

# Create the generated-documents folder (and any missing parents) up front so
# later cells can write into it; re-running the cell is a no-op.
output_dir = Path(SIMULATED_BACKEND_PATH, "generated_documents")
output_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Load environment variables from the parent directory's .env file
# (presumably the LM provider API key — confirm against the .env contents).
load_dotenv("../.env")

# Generation options kept together so they are easy to tune in one place.
lm_options = {
    "temperature": 0.5,
    "cache": True,
    "max_tokens": 25000,
}
lm = dspy.LM("gemini/gemini-2.5-pro", **lm_options)

# Register the LM as the default for every dspy.Predict call below and track token usage.
dspy.settings.configure(lm=lm, track_usage=True)

In [4]:
# Organization the synthetic data is modeled on. Both values are interpolated
# into ORG_INFO_PROMPT; ORG_NAME is also passed to the DSPy predictors below.
ORG_NAME = "Denali Therapeutics"
ORG_TYPE = "Sponsor"

# Staff member whose inbox is being simulated. STAFF_NAME is passed to the
# predictors as `staff_name` / `name`, STAFF_TYPE as `staff_role` / `role`.
STAFF_NAME = "Alex Johnson"
STAFF_TYPE = "Clinical Research Coordinator"

In [5]:
# Research prompt to paste into ChatGPT by hand. The answer is expected to be
# copied back into the notebook as ORG_INFORMATION (see the next cell).
# Fixed typo in the prompt text: "cites/countries" -> "sites/countries".

ORG_INFO_PROMPT = f"""Given the name and type (sponsor, CRO, or site organization) of the organization running clinical research below, do research and think hard to give information about the organization and their clinical research that might be relevant to their clinical research operations. 

Describe
- therapeutic focus (or the many)
- any ongoing clinical trials
- location of the clinical research (or many sites/countries)
- other information relevant to their clinical trial operations

Give a concise but complete bullet point for each of these.

Organization Name: {ORG_NAME}
Organization Type: {ORG_TYPE}"""

# print() (rather than a bare expression) so the newlines render instead of showing as "\n".
print(ORG_INFO_PROMPT)

Given the name and type (sponsor, CRO, or site organization) of the organization running clinical research below, do research and think hard to give information about the organization and their clinical research that might be relevant to their clinical research operations. 

Describe
- therapeutic focus (or the many)
- any ongoing clinical trials
- location of the clinical research (or many cites/countries)
- other information relevant to their clinical trial operations

Give a concise but complete bullet point for each of these.

Organization Name: Denali Therapeutics
Organization Type: Sponsor


In [6]:
# Background on the organization, passed verbatim as `org_information` to the
# DSPy signatures below (presumably the pasted answer to ORG_INFO_PROMPT — confirm).
# The stray U+FFFC object-replacement characters (and the two spaces before them)
# that trailed each bullet were copy/paste artifacts and have been removed so
# they no longer leak into the LM prompt.
ORG_INFORMATION = """
- Therapeutic focus: Biotech focused on neurodegenerative and lysosomal storage diseases, especially disorders with high unmet need like Parkinson’s, Alzheimer’s, ALS, frontotemporal dementia (FTD-GRN), Hunter syndrome (MPS II), and Sanfilippo syndrome (MPS IIIA), using its blood–brain barrier “TransportVehicle™” platform for antibodies, enzymes and oligonucleotides to deliver drugs into the CNS.
- Ongoing clinical trials: Key active programs include: tividenofusp alfa (DNL310) for Hunter syndrome, with a Phase 1/2 dataset supporting an FDA BLA under priority review and long-term extension studies; DNL126 for MPS IIIA in a Phase 1 pediatric trial; BIIB122/DNL151 (with Biogen) in Phase 2b LUMA and Phase 3 LIGHTHOUSE studies for Parkinson’s disease; TAK-594/DNL593 (with Takeda) for FTD-GRN; DNL919 for Alzheimer’s disease; and DNL343 for ALS, which showed a negative 6-month efficacy readout but remained safe and is still under further data analysis.
- Clinical trial geographies & sites: Headquarters is in South San Francisco, with a new biologics manufacturing and development facility in Salt Lake City, Utah; interventional trials are run across multi-center networks, including dozens of sites in the United States (e.g., California, Wisconsin) plus additional locations in Europe and other regions through EU-registered trials and global genetics/observational studies (e.g., ROPAD Parkinson’s genetics study at ~48 sites worldwide that can funnel into Denali interventional trials).
- Other clinical-operations-relevant details: Clinical development is tightly coupled to Denali’s BBB-crossing platform (Antibody and Oligonucleotide Transport Vehicles) and run in partnership with large pharmas (Biogen for BIIB122/DNL151, Takeda for TAK-594/DNL593, Sanofi for SAR443122/DNL758 in ulcerative colitis), supported by strong financing with cash runway projected into 2028; they also run grants/sponsorships and patient-advocacy programs and maintain an in-house clinical trials contact group (clinical-trials@dnli.com) to coordinate multi-site studies.
"""

In [7]:
# class Document(BaseModel):
#     """A document relevant to clinical trial operations.

#     Fields:
#     - id: Unique identifier for the document (e.g., doc_001)
#     - name: Filename of the document (e.g., Protocol_DNLI-001_Amendment_v3.2.pdf)
#     - description: Brief description of the document's content and purpose
#     - modified: Date when the document was last modified (ISO 8601 format)
#     - version: Version number of the document
#     - type: File type/extension (e.g., pdf, docx)
#     """

#     id: str
#     name: str
#     description: str
#     modified: str
#     version: str
#     type: str


# class GenerateOrgDocuments(dspy.Signature):
#     """Generate a list of documents that might be relevant to the operations of running clinical trials.

#     Based on the organization's information, create realistic documents such as protocols, amendments, vendor guides, etc that are routinely used in clinical trial operations.
#     Vary the types of documents to cover different aspects of clinical trial operations.
#     """

#     org_name: str = dspy.InputField(desc="Name of the organization running clinical trials")
#     org_type: str = dspy.InputField(
#         desc="Type of the organization (e.g., Sponsor, CRO, Site Organization)"
#     )
#     org_information: str = dspy.InputField(
#         desc="Detailed information about the organization running clinical trials"
#     )
#     num_documents: int = dspy.InputField(desc="Number of documents to generate")

#     documents: list[Document] = dspy.OutputField(
#         desc="List of relevant documents for the organization's operations"
#     )


# documents_predict = dspy.Predict(GenerateOrgDocuments)
# documents = documents_predict(
#     org_name=ORG_NAME,
#     org_type=ORG_TYPE,
#     org_information=ORG_INFORMATION,
#     num_documents=10,
# ).documents

# documents_data = [doc.model_dump() for doc in documents]

# with open(output_dir / "docs_info.json", "w") as f:
#     json.dump(documents_data, f, indent=2)

# documents_data

In [None]:
# class Action(BaseModel):
#     """An action within a workflow for clinical trial automation.

#     Fields:
#     - actionNumber: Sequential number of the action in the workflow
#     - action: Name or title of the action
#     - input: System of input (e.g., Email, EDC, CTMS)
#     - output: System of output (e.g., Email, EDC, Calendly)
#     - description: Detailed description of what the action does
#     - approval: Whether this action requires approval (Yes/No)
#     """

#     actionNumber: int
#     action: str
#     input: str
#     output: str
#     description: str
#     approval: str


# class Workflow(BaseModel):
#     """A workflow for automating clinical trial communications and processes.

#     Fields:
#     - id: Unique identifier for the workflow (e.g., workflow_001)
#     - name: Name of the workflow
#     - description: Brief description of the workflow's purpose and functionality
#     - modified: Date when the workflow was last modified (ISO 8601 format)
#     - approval: Whether the workflow requires approval to run (Yes/No)
#     - actions: List of actions that make up the workflow
#     """

#     id: str
#     name: str
#     description: str
#     modified: str
#     approval: str
#     actions: list[Action]


# class GenerateOrgWorkflows(dspy.Signature):
#     """Generate a list of workflows that might be relevant to the operations of a company running clinical trials.

#     Based on the organization's information, create realistic workflows for automating clinical trial communications and processes, such as handling SAE reports, vendor results, monitoring visits, etc.
#     Vary the workflows to cover different aspects of clinical trial operations.
#     Workflows should be concise, with only 1-3 actions.
#     Descriptions of action steps should be concise and active tense, for example: Forward SAE details to Jim Collins (Medical Director) for urgent review
#     At least one of input or output of each action should be "email", but don't say an input/output is related to email if another system is most likely used.
#     Think hard and prioritize workflows that would be most beneficial for the organization and their clinical research operations.

#     Example workflows:
#     [
#       {
#         "id": "workflow_001",
#         "name": "Brain Scan Vendor Results",
#         "description": "Processes brain scan results from Dupliplex vendor, notifies PI of abnormal values, and logs normal results in EDC automatically",
#         "modified": "2024-11-24",
#         "approval": "Yes",
#         "actions": [
#           {
#             "actionNumber": 1,
#             "action": "Abnormal Values Notification",
#             "input": "email",
#             "output": "email",
#             "description": "Check if values are abnormal, if so send email to notify PI",
#             "approval": "Yes"
#           },
#           {
#             "actionNumber": 2,
#             "action": "Normal Value Logging",
#             "input": "email",
#             "output": "EDC",
#             "description": "Check if values are normal, if so log results in EDC",
#             "approval": "Yes"
#           }
#         ]
#       },
#       {
#         "id": "workflow_002",
#         "name": "SAE Auto-Forward to Medical Director",
#         "description": "Automatically forwards serious adverse events to medical director and sends acknowledgment to reporting site",
#         "modified": "2024-11-24",
#         "approval": "No",
#         "actions": [
#           {
#             "actionNumber": 1,
#             "action": "Send Site Acknowledgment",
#             "input": "email",
#             "output": "email",
#             "description": "Thank site for SAE report and confirm follow-up is in progress",
#             "approval": "No"
#           },
#           {
#             "actionNumber": 2,
#             "action": "Urgent Notification to Medical Director",
#             "input": "email",
#             "output": "email",
#             "description": "Forward SAE details to Jim Collins (Medical Director) for urgent review",
#             "approval": "No"
#           }
#         ]
#       }
#     ]
#     """

#     org_name: str = dspy.InputField(desc="Name of the organization running clinical trials")
#     org_type: str = dspy.InputField(
#         desc="Type of the organization (e.g., Sponsor, CRO, Site Organization)"
#     )
#     org_information: str = dspy.InputField(
#         desc="Detailed information about the organization running clinical trials"
#     )
#     num_workflows: int = dspy.InputField(desc="Number of workflows to generate")

#     workflows: list[Workflow] = dspy.OutputField(
#         desc="List of relevant workflows for the organization's operations"
#     )


# workflows_predict = dspy.Predict(GenerateOrgWorkflows)
# workflows = workflows_predict(
#     org_name=ORG_NAME,
#     org_type=ORG_TYPE,
#     org_information=ORG_INFORMATION,
#     num_workflows=5,
# ).workflows

# workflows_data = [workflow.model_dump() for workflow in workflows]

# with open(Path(SIMULATED_BACKEND_PATH) / "workflow_settings.json", "w") as f:
#     json.dump(workflows_data, f, indent=2)

# workflows_data

In [9]:
# Load the workflow definitions from the simulated backend.
# NOTE: this file is written by the (currently commented-out) workflow
# generation cell above, so it must already exist on disk.
workflow_settings_path = Path(SIMULATED_BACKEND_PATH, "workflow_settings.json")
with workflow_settings_path.open("r") as settings_file:
    workflows_data = json.load(settings_file)

# Condense each workflow to a "- name: description" bullet for use as LM input.
summary_lines = []
for workflow in workflows_data:
    summary_lines.append(f"- {workflow['name']}: {workflow['description']}")
workflow_summary = "\n".join(summary_lines)


# DSPy signature: org details + staff role + workflow summary -> list of
# email-thread descriptions. The class docstring and every `desc` string are
# sent to the LM as prompt text, so edit them as prompts, not as documentation.
class EmailDescriptions(dspy.Signature):
    """Given organization details, staff role, and workflow settings, create descriptions of email threads that the staff member would be expected to handle, and that could potentially be automated by the workflows.

    Vary the email threads to cover scenarios that match the workflow settings, such as vendor reports, SAE notifications, monitoring updates, etc.
    Vary the urgency of the email threads (e.g., routine, urgent, critical).
    Vary the complexity of the email threads (e.g., single email, multi-email back and forth).
    Vary the source of the email thread (site to sponsor, CRO to sponsor, vendor to sponsor, etc).

    Examples based on workflows:
    - For vendor result workflows: Email from vendor with lab results or scan reports.
    - For SAE workflows: Email reporting serious adverse events.
    - For monitoring workflows: Emails about visit scheduling or data queries.
    """

    # Inputs
    org_name: str = dspy.InputField(desc="Name of the organization")
    staff_name: str = dspy.InputField(desc="Name of the staff member")
    staff_role: str = dspy.InputField(desc="Role of the staff member")
    org_information: str = dspy.InputField(desc="Detailed information about the organization")
    workflow_summary: str = dspy.InputField(
        desc="Summary of available workflow settings for automation"
    )
    num_threads: int = dspy.InputField(desc="Number of email threads to create descriptions for")

    # Output: one free-text description per requested thread.
    email_descriptions: list[str] = dspy.OutputField(
        desc="A list of descriptions of the email threads that the staff member would handle. Don't number threads."
    )


# Ask the LM for thread descriptions tailored to the configured org, staff member and workflows.
email_descriptions_predict = dspy.Predict(EmailDescriptions)

description_inputs = dict(
    org_name=ORG_NAME,
    staff_name=STAFF_NAME,
    staff_role=STAFF_TYPE,
    org_information=ORG_INFORMATION,
    workflow_summary=workflow_summary,
    num_threads=3,
)
description_prediction = email_descriptions_predict(**description_inputs)
email_descriptions = description_prediction.email_descriptions

# Display the generated descriptions as the cell output.
email_descriptions

["An urgent email chain initiated by the study coordinator at a LUMA study site (BIIB122/DNL151 for Parkinson's disease) reporting a new Serious Adverse Event. The event is described as a 'sudden onset of severe confusion,' requiring Alex to immediately acknowledge receipt and request the de-identified source documents for safety reporting and escalation to the partner, Biogen.",
 'A routine notification from the central imaging vendor for the TAK-594/DNL593 FTD-GRN study. The email contains a link to newly uploaded PET scan results for a specific patient, which are ready for review to assess BBB Transport Vehicle target engagement. Alex needs to ensure this is filed correctly and that the automated triage to the Clinical Scientist is triggered.',
 'A back-and-forth email thread regarding the DNL310 Hunter syndrome study. It begins with a site coordinator inquiring about the status of their next Investigational Medicinal Product (IMP) shipment, mentioning their current stock is running

In [None]:
# Pydantic schema for one message in a synthetic thread; used as the element
# type of EmailThread.email_thread. The docstring is left untouched because it
# is part of the model's description and may be surfaced to the LM.
class Email(BaseModel):
    """An email message including metadata for clinical trial communications.

    Metadata fields:
    - from_address: Email address of the sender (e.g., site coordinator, CRA, medical monitor)
    - to_addresses: List of primary recipient email addresses who need to take action
    - cc_addresses: List of email addresses copied for awareness (e.g., project managers, medical monitors)
    - subject: Email subject line, often includes study ID, urgency indicators, and topic
    - timestamp: ISO 8601 formatted timestamp indicating when the email was sent
    - body: Full text content of the email message, including greetings, content, and signature
    - attachments: List of attachment filenames (e.g., SAE forms, source documents, meeting agendas)
    - message_id: Identifier for the email message in the thread, starting at 0 and incrementing by 1 for each subsequent email
    - in_reply_to: Message ID of the email being replied to, or None if this starts a new thread
    """

    from_address: str
    to_addresses: list[str]
    cc_addresses: list[str]
    subject: str
    timestamp: str  # plain string; the ISO 8601 format is requested in the docstring, not validated here
    body: str
    attachments: list[str]
    message_id: int
    in_reply_to: int | None  # required field (no default); None marks the first message of a thread


# DSPy signature: thread description + target person -> full synthetic thread
# (list of Email) plus a per-address role listing. Docstring and `desc`
# strings are prompt text sent to the LM.
class EmailThread(dspy.Signature):
    """Given a description of an email thread and a role of a person to include, draft a synthetic email thread that includes the person with the given role (this person is always at the sponsor organization).
    Come up with names for organizations (sponsor, CRO, site) people, attachments in the email thread as needed to make the email thread realistic.
    The email thread should include realistic email metadata (from, to, cc, subject, timestamp, attachments, message IDs, in-reply-to fields).
    The email thread should end before the specified person takes action (reply to email, redirect email, confirm follow up, schedule meeting, etc), such that the next reply would be from this person.
    """

    # Inputs
    email_description: str = dspy.InputField(desc="The description of the email thread to draft")
    name: str = dspy.InputField(
        desc="The name of the person (at the sponsor level) that must take action (reply to email, redirect email, confirm follow up, schedule meeting, etc) in the email thread."
    )
    role: str = dspy.InputField(
        desc="The role of the person (at the sponsor level) that must take action (reply to email, redirect email, confirm follow up, schedule meeting, etc) in the email thread. This person is always at the sponsor organization."
    )
    organization: str = dspy.InputField(
        desc="The organization of the person with the given role (eg sponsor, CRO, site) that must take action in the email thread."
    )

    # Outputs
    email_thread: list[Email] = dspy.OutputField(desc="The synthetic email thread")
    # NOTE(review): the desc calls this a "map", but the declared type is a list
    # of "address: organization, role" strings.
    role_descriptions: list[str] = dspy.OutputField(
        desc="""Map of "email addresses: organization, role" in the email thread. 
        Organization can be sponsor, CRO or site. 
        Given role is always at sponsor organization and must be included in mapping.
        Ex: jane.doe@denalitx.com: Denali Therapeutics (Sponsor), Clinical Research Associate"""
    )


# Smoke test: draft a thread for the FIRST description only, to inspect the
# output shape before paying for a full batch.
email_thread_predict = dspy.Predict(EmailThread)

# Take the first description explicitly instead of a `for ... break` loop, and
# fail with a clear message (rather than a NameError on `email_thread_result`)
# when the previous cell produced no descriptions.
idx = 0
description = next(iter(email_descriptions), None)
if description is None:
    raise ValueError("email_descriptions is empty; re-run the description-generation cell first.")

print("Generating email thread for description: ", description)
email_thread_result = email_thread_predict(
    email_description=description, name=STAFF_NAME, role=STAFF_TYPE, organization=ORG_NAME
)

# Display the raw prediction (email_thread + role_descriptions) as the cell output.
email_thread_result

In [None]:
# DSPy signature: study overview + target person -> list of email-thread
# descriptions. The docstring and `desc` strings are prompt text sent to the LM.
# Two fixes to the instruction text: it now says "study overview" (matching the
# `study_overview` input; it used to say "study ID"), and the grammar slip
# "expected to be help resolve" is corrected.
class DraftSyntheticEmailDescription(dspy.Signature):
    """Given a ClinicalTrials.gov study overview and a role (eg CRA, CTM), create descriptions of email threads that the person with the role (at the sponsor level) would be expected to help resolve.
    Vary the email threads to cover different types of common clinical trial communications (eg SAE reporting, monitoring visit scheduling, data queries, protocol deviations, etc).
    Vary the urgency of the email threads (eg routine, urgent, critical).
    Vary the complexity of the email threads (eg single email, multi-email back and forth).
    Vary the source of the email thread (site to sponsor, CRO to sponsor, sponsor to site, etc).

    Ex for a CRA, make one email thread about scheduling a monitoring visit, one about following up on overdue data queries, one about reporting an SAE, etc.
    """

    # Inputs
    study_overview: str = dspy.InputField(desc="The ClinicalTrials.gov study overview")
    name: str = dspy.InputField(
        desc="The name of the person who would be expected to take action in response to the email thread"
    )
    role: str = dspy.InputField(
        desc="The role of the person who would be expected to take action in response to the email thread"
    )
    organization: str = dspy.InputField(
        desc="The organization of the person with the given role (eg sponsor, CRO, site)"
    )
    num_threads: int = dspy.InputField(
        desc="The number of email threads to create descriptions for"
    )

    # Outputs
    email_descriptions: list[str] = dspy.OutputField(
        desc="A list of descriptions of the email threads that the role would be expected to help resolve. Don't number threads."
    )


# NOTE: this re-declares the `Email` model already defined in an earlier cell
# and rebinds the name. It is kept so this cell still runs on its own, but the
# `message_id` guidance is aligned with the earlier definition (0-based,
# sequential) so both models describe the same contract and `in_reply_to`
# references stay predictable.
class Email(BaseModel):
    """An email message including metadata for clinical trial communications.

    Metadata fields:
    - from_address: Email address of the sender (e.g., site coordinator, CRA, medical monitor)
    - to_addresses: List of primary recipient email addresses who need to take action
    - cc_addresses: List of email addresses copied for awareness (e.g., project managers, medical monitors)
    - subject: Email subject line, often includes study ID, urgency indicators, and topic
    - timestamp: ISO 8601 formatted timestamp indicating when the email was sent
    - body: Full text content of the email message, including greetings, content, and signature
    - attachments: List of attachment filenames (e.g., SAE forms, source documents, meeting agendas)
    - message_id: Identifier for the email message in the thread, starting at 0 and incrementing by 1 for each subsequent email
    - in_reply_to: Message ID of the email being replied to, or None if this starts a new thread
    """

    from_address: str
    to_addresses: list[str]
    cc_addresses: list[str]
    subject: str
    timestamp: str  # plain string; the ISO 8601 format is requested in the docstring, not validated here
    body: str
    attachments: list[str]
    message_id: int
    in_reply_to: int | None  # required field (no default); None marks the first message of a thread


# DSPy signature: thread description + target person -> full synthetic thread
# (list of Email) plus a per-address role listing. Its instructions and fields
# are the same as `EmailThread` defined in an earlier cell; only the class name
# differs. Docstring and `desc` strings are prompt text sent to the LM.
class DraftSyntheticEmailThread(dspy.Signature):
    """Given a description of an email thread and a role of a person to include, draft a synthetic email thread that includes the person with the given role (this person is always at the sponsor organization).
    Come up with names for organizations (sponsor, CRO, site) people, attachments in the email thread as needed to make the email thread realistic.
    The email thread should include realistic email metadata (from, to, cc, subject, timestamp, attachments, message IDs, in-reply-to fields).
    The email thread should end before the specified person takes action (reply to email, redirect email, confirm follow up, schedule meeting, etc), such that the next reply would be from this person.
    """

    # Inputs
    email_description: str = dspy.InputField(desc="The description of the email thread to draft")
    name: str = dspy.InputField(
        desc="The name of the person (at the sponsor level) that must take action (reply to email, redirect email, confirm follow up, schedule meeting, etc) in the email thread."
    )
    role: str = dspy.InputField(
        desc="The role of the person (at the sponsor level) that must take action (reply to email, redirect email, confirm follow up, schedule meeting, etc) in the email thread. This person is always at the sponsor organization."
    )
    organization: str = dspy.InputField(
        desc="The organization of the person with the given role (eg sponsor, CRO, site) that must take action in the email thread."
    )

    # Outputs
    email_thread: list[Email] = dspy.OutputField(desc="The synthetic email thread")
    # NOTE(review): the desc calls this a "map", but the declared type is a list
    # of "address: organization, role" strings.
    role_descriptions: list[str] = dspy.OutputField(
        desc="""Map of "email addresses: organization, role" in the email thread. 
        Organization can be sponsor, CRO or site. 
        Given role is always at sponsor organization and must be included in mapping.
        Ex: jane.doe@denalitx.com: Denali Therapeutics (Sponsor), Clinical Research Associate"""
    )

In [None]:
# Study overview text passed as `study_overview` to DraftSyntheticEmailDescription
# and written into all_threads.json. The leading indentation inside the string
# is part of the value and is sent to the LM as-is.
STUDY_OVERVIEW = """
            Brief Summary

            This is a Phase 1/2, multicenter, randomized, placebo-controlled, double-blind study to evaluate the safety, tolerability, pharmacokinetics (PK), and pharmacodynamics (PD) of single and multiple doses of DNL593 in two parts followed by an optional open-label extension (OLE) period.

            Part A will evaluate the safety, tolerability, PK, and PD of single doses of DNL593 in healthy male and healthy female participants of nonchildbearing potential. Part B will evaluate the safety, tolerability, PK, and PD of multiple doses of DNL593 in participants with frontotemporal dementia (FTD) over 25 weeks. Part B will be followed by Part C, an optional 18-month OLE period available for all participants who complete Part B.
            Official Title
            A Phase 1/2, Multicenter, Randomized, Placebo-Controlled, Double Blind Single Dose and Multiple Dose Study to Evaluate the Safety, Tolerability, Pharmacokinetics, and Pharmacodynamics of DNL593 in Healthy Participants and Participants With Frontotemporal Dementia Followed by an Open-Label Extension
            Conditions
            Frontotemporal Dementia
            Intervention / Treatment

                Drug: DNL593
                Drug: Placebo

            Sponsor: Denali Therapeutics
          """

# Person the generated threads are addressed to (same value as STAFF_NAME above).
NAME = "Alex Johnson"

# NOTE(review): differs from STAFF_TYPE ("Clinical Research Coordinator") used
# by the earlier cells — confirm the two pipelines are meant to use different roles.
ROLE = "Clinical Research Associate"

# Passed as `organization`; unlike ORG_NAME it includes the organization type.
ORGANIZATION = "Denali Therapeutics (Sponsor)"

# Number of thread descriptions to request from the LM.
NUM_THREADS = 10

In [None]:
# Generate thread descriptions from the study overview.
# NOTE: this rebinds `email_descriptions_predict` and `email_descriptions`
# from the earlier workflow-based cell.
email_descriptions_predict = dspy.Predict(DraftSyntheticEmailDescription)

study_description_inputs = dict(
    study_overview=STUDY_OVERVIEW,
    name=NAME,
    role=ROLE,
    organization=ORGANIZATION,
    num_threads=NUM_THREADS,
)
study_description_prediction = email_descriptions_predict(**study_description_inputs)
email_descriptions = study_description_prediction.email_descriptions

# Display the generated descriptions as the cell output.
email_descriptions

In [None]:
# Draft one synthetic thread per description, saving each thread to its own
# JSON file plus a combined summary file.
# NOTE: `output_dir` is rebound here; earlier cells use the same name for the
# simulated backend's generated_documents folder.
output_dir = Path("generated_email_threads_2")
output_dir.mkdir(exist_ok=True, parents=True)

# Build the predictor once; it does not depend on the loop variable.
email_thread_predict = dspy.Predict(DraftSyntheticEmailThread)

threads = []
thread_data = None  # keeps the final display expression valid when there are no descriptions

for idx, description in enumerate(email_descriptions):
    print("Generating email thread for description: ", description)

    email_thread_result = email_thread_predict(
        email_description=description, name=NAME, role=ROLE, organization=ORGANIZATION
    )

    # model_dump() emits every Email field in declaration order, which is the
    # same key set and order the hand-written dict used to produce.
    emails_data = [email.model_dump() for email in email_thread_result.email_thread]

    thread_data = {
        "emails": emails_data,
        "role_descriptions": email_thread_result.role_descriptions,
        "description": description,
    }
    threads.append(thread_data)

    # Persist each thread as soon as it is generated so a failure part-way
    # through the batch does not lose the threads already produced.
    thread_file = output_dir / f"thread_{idx:03d}.json"
    with open(thread_file, "w") as f:
        json.dump(thread_data, f, indent=2)
    print(f"Saved thread {idx} to {thread_file}")

# Combined file with the study context and every generated thread.
summary_file = output_dir / "all_threads.json"
with open(summary_file, "w") as f:
    json.dump({"study_overview": STUDY_OVERVIEW, "role": ROLE, "threads": threads}, f, indent=2)
print(f"Saved summary to {summary_file}")

# Display the last generated thread (None if nothing was generated).
thread_data