# Notebook Setup
- Import Dependencies
- Load Environment Variables
- Set Up Logging

In [1]:
import os
import string
import time
import pickle

from tqdm import tqdm
from dotenv import load_dotenv
from datetime import datetime, timedelta
from langchain_openai import ChatOpenAI
from pathlib import Path
from services import clinicaltrials_retriever, logging_config, mesh_mapper
from langchain_core.messages import SystemMessage, HumanMessage
from enum import Enum
from pydantic import BaseModel, Field
from typing import List, Optional
import pandas as pd
from falkordb import FalkorDB


[2025-11-09 16:30:04] INFO     - root - Logging initialized
[2025-11-09 16:30:04] INFO     - root - Log level: INFO
[2025-11-09 16:30:04] INFO     - root - Log directory: /Users/joshziel/Documents/Coding/glp-1-landscape/logs
[2025-11-09 16:30:04] INFO     - root - Console logging: True


In [2]:
# Load environment variables
# (runs before the os.getenv lookup below)
load_dotenv()

# Initialize logger
logger = logging_config.get_logger(__name__)

#Set data location
# NOTE(review): os.getenv returns None when DATA_LOC is unset; the f-string paths
# built from DATA_STORAGE in later cells would then start with "None/".
DATA_STORAGE = os.getenv("DATA_LOC")

# Define Core Clinical Trials Searches

In [3]:
# Maps an upper-case drug label (later written to the `drug_name` column) to the
# intervention search term passed to the clinical-trials retriever.
drug_queries = {
    "EFPEGLENATIDE": "efpeglenatide",
    "LIRAGLUTIDE": "liraglutide",
    "SURVODUTIDE": "survodutide",
    "DULAGLUTIDE": "dulaglutide",
    "RETATRUTIDE": "retatrutide",
    "COTADUTIDE": "cotadutide",
    "LIXISENATIDE": "lixisenatide",
    "ALBIGLUTIDE": "albiglutide",
    # NOTE(review): the key is spelled PEGSEBRENATIDE but the query term is
    # "pegsembrenatide" (extra "m") — confirm which spelling is intended.
    "PEGSEBRENATIDE": "pegsembrenatide",
    "EXENATIDE": "exenatide",
    "PEGAPAMODUTIDE": "pegapamodutide",
    "AVEXITIDE": "avexitide",
    "TIRZEPATIDE": "tirzepatide",
    "DANUGLIPRON": "danuglipron",
    "EFINOPEGDUTIDE": "efinopegdutide",
    "TASPOGLUTIDE": "taspoglutide",
    "SEMAGLUTIDE": "semaglutide",
}

In [4]:
# Query the retriever once per drug and tag every returned frame with its drug label.
drug_queries_results = []
for key, drug_query in drug_queries.items():
    logger.info(f"Processing drug: {key} with query term: {drug_query}")

    # Retrieve clinical trials for this intervention term
    results = clinicaltrials_retriever.retrieve_batched_studies(query_intervention=drug_query)
    logger.info(f"Retrieved {len(results)} clinical trials for drug: {key}")
    results['drug_name'] = key
    drug_queries_results.append(results)

# Stack the per-drug frames into one table and persist the raw result.
all_trials = pd.concat(drug_queries_results, axis=0, ignore_index=True)
all_trials.to_csv(f"{DATA_STORAGE}/all_raw_trials.csv")
logger.info(f"Retrieved a total of {len(all_trials)} and saved to csv:{DATA_STORAGE}/all_raw_trials.csv")


[2025-11-09 16:30:09] INFO     - __main__ - Processing drug: EFPEGLENATIDE with query term: efpeglenatide
[2025-11-09 16:30:09] INFO     - services.clinicaltrials_retriever - Starting ClinicalTrials.gov study retrieval
[2025-11-09 16:30:09] INFO     - services.clinicaltrials_retriever - Starting ClinicalTrials.gov study retrieval
[2025-11-09 16:30:09] INFO     - utils.clinicaltrials - Retrieving study details from ClinicalTrials.gov
[2025-11-09 16:30:09] INFO     - utils.clinicaltrials - Total studies available: 5
[2025-11-09 16:30:09] INFO     - utils.clinicaltrials - No more pages available
[2025-11-09 16:30:09] INFO     - utils.clinicaltrials - Successfully retrieved 5 studies from ClinicalTrials.gov
[2025-11-09 16:30:09] INFO     - services.clinicaltrials_retriever - Successfully processed 5 studies
[2025-11-09 16:30:09] INFO     - __main__ - Retrieved 5 clinical trials for drug: EFPEGLENATIDE
[2025-11-09 16:30:09] INFO     - __main__ - Processing drug: LIRAGLUTIDE with query term:

In [5]:
def deduplicate_trials(df):
    """Collapse rows that share an ``nct_id`` into a single row.

    For each ``nct_id`` group the first row is kept and its ``drug_name`` value is
    replaced by the list of distinct, non-null drug names found in the group.

    Args:
        df: DataFrame with at least ``nct_id`` and ``drug_name`` columns.

    Returns:
        DataFrame with one row per ``nct_id`` (in groupby order) whose
        ``drug_name`` cells are lists.
    """

    def _collapse(nct_id, rows):
        # Only announce groups that actually contain duplicates.
        if len(rows) > 1:
            logger.info(f"Deduplicating {len(rows)} entries for NCT ID: {nct_id}")
        merged = pd.Series(rows.iloc[0].copy())
        merged['drug_name'] = rows['drug_name'].dropna().unique().tolist()
        return merged

    # Build every merged row first, then create the frame in one go.
    return pd.DataFrame([_collapse(nct_id, rows) for nct_id, rows in df.groupby('nct_id')])

def remove_punctuation(text):
    """Return ``str(text)`` with ASCII punctuation removed and upper-cased.

    Null input (``None`` or anything ``pd.isna`` reports as missing) yields "".
    """
    if text is None or pd.isna(text):
        return ""
    # Mapping every punctuation character to None makes translate() delete it.
    strip_table = str.maketrans(dict.fromkeys(string.punctuation))
    stripped = str(text).translate(strip_table)
    return stripped.upper()

def convert_date(text):
    """Parse a partial ISO date string into a ``datetime``.

    Accepted forms are ``"YYYY-MM"`` (the day defaults to 1) and ``"YYYY-MM-DD"``.

    Args:
        text: The date string, or a null / non-string value.

    Returns:
        The parsed ``datetime``, or ``None`` when ``text`` is null, is not a
        string, contains no "-" (for example a year-only value), has a number of
        parts other than two or three, or holds non-numeric / out-of-range parts.
    """
    # Null values (None, NaN) and any other non-string input cannot be parsed.
    if not isinstance(text, str):
        return None

    parts = text.split("-")
    if len(parts) not in (2, 3):
        return None

    try:
        numbers = [int(part) for part in parts]
        day = numbers[2] if len(numbers) == 3 else 1
        return datetime(year=numbers[0], month=numbers[1], day=day)
    except ValueError:
        # Non-numeric pieces or impossible dates (month 13, day 32, ...) are
        # treated like any other unparseable value instead of aborting the run.
        return None

def calculate_duration(row):
    """Return the span between a row's cleaned start and completion dates in years.

    Reads ``row['cln_start_date']`` and ``row['cln_completion_date']``. When either
    is missing (None, NaN/NaT or an empty string) the result is ``None``; otherwise
    the difference is converted via ``total_seconds()`` using a 365.25-day year and
    rounded to one decimal place.
    """
    endpoints = (row['cln_start_date'], row['cln_completion_date'])

    # Bail out as soon as one endpoint is unusable.
    for value in endpoints:
        if value is None or pd.isna(value) or value == '':
            return None

    began, finished = endpoints
    elapsed_seconds = (finished - began).total_seconds()
    return round(elapsed_seconds / (365.25 * 24 * 60 * 60), 1)

def generate_mesh_term_map(df):
    """Look up a MeSH term for every distinct condition in ``df.conditions``.

    Each non-null, non-empty condition is passed to
    ``mesh_mapper.search_mesh_term`` with ``filter_diseases_only=True``. When a
    result comes back, the condition is mapped to the string
    ``"<mesh_term> (MeSH ID:<mesh_id>)"``.

    Args:
        df: DataFrame whose ``conditions`` column is exploded to individual values.

    Returns:
        dict mapping condition -> formatted MeSH string, for matched conditions only.
    """
    unique_conditions = df.conditions.explode().unique()
    term_mappings = {}
    for condition in tqdm(unique_conditions, desc="Matching conditions to MeSH terms"):
        # Skip null/empty conditions
        if pd.isna(condition) or not condition:
            continue
        result = mesh_mapper.search_mesh_term(condition, filter_diseases_only=True)
        if result:
            # Single quotes inside the f-string keep this valid on Python < 3.12,
            # where reusing the enclosing quote character is a SyntaxError.
            term_mappings[condition] = f"{result['mesh_term']} (MeSH ID:{result['mesh_id']})"
        # Add delay after each search to respect rate limits
        time.sleep(0.35)
    logger.info(f"\nSuccessfully matched {len(term_mappings)} conditions to MeSH terms")
    return term_mappings

def add_mesh_mappings(conditions, term_map):
    """Translate a trial's conditions into their mapped MeSH strings.

    Args:
        conditions: Iterable of condition names for one trial. ``None``, an empty
            container, or a scalar without a length (e.g. a float NaN cell) is
            treated as "no conditions".
        term_map: dict mapping condition name -> MeSH string.

    Returns:
        list of distinct mapped strings in first-seen order; conditions without a
        mapping are logged and skipped.
    """
    try:
        if conditions is None or len(conditions) == 0:
            return []
    except TypeError:
        # A scalar such as NaN has no len(); there is nothing to map.
        return []

    # A dict keeps insertion order, so de-duplicating through it yields a
    # reproducible result (a set would return the terms in arbitrary order).
    matched_conditions = {}
    for condition in conditions:
        if condition in term_map:
            logger.info(f"Found a MeSH term mapping for {condition}")
            matched_conditions[term_map[condition]] = None
        else:
            logger.info(f"No existing MeSH term mapping for {condition}")
    return list(matched_conditions)

#Deduplication
logger.info(f"Deduplicating {all_trials.shape[0]} trials")
deduped_trials = deduplicate_trials(all_trials)
logger.info(f"Removed {all_trials.shape[0]-deduped_trials.shape[0]} trials during de-duplication")

#Data Cleaning
logger.info(f"Initiating data clean-up on {deduped_trials.shape[0]} trials.")
cleaned_trials = deduped_trials[
    # Keep trials whose intervention_types list contains DRUG or BIOLOGICAL.
    # Each membership test is spelled out: ('DRUG' or 'BIOLOGICAL') evaluates to
    # just 'DRUG', which silently dropped BIOLOGICAL-only trials.
    (deduped_trials['intervention_types'].apply(
        lambda x: isinstance(x, list) and ('DRUG' in x or 'BIOLOGICAL' in x)
    ))
    &
    # Check healthy_volunteers (False or not reported)
    ((deduped_trials['healthy_volunteers'] == False) |
     (deduped_trials['healthy_volunteers'].isnull()))
    &
    # Check study type
    (deduped_trials['study_type'] == 'INTERVENTIONAL')
    &
    # Check if primary_outcomes is not empty/null
    (deduped_trials['primary_outcomes'].notna() &
     (deduped_trials['primary_outcomes'].astype(str) != '[]'))
    &
    # Check if interventions is not empty/null
    (deduped_trials['interventions'].notna() &
     (deduped_trials['interventions'].astype(str) != '[]'))
].copy()

# Build the condition -> MeSH lookup once and keep a CSV copy of it.
mesh_term_map = generate_mesh_term_map(cleaned_trials)
pd.DataFrame.from_dict(mesh_term_map, orient = 'index').to_csv(f"{DATA_STORAGE}/mesh_term_mappings.csv")

# Derived columns: normalised sponsor name, parsed dates, duration in years, MeSH matches.
cleaned_trials['cleaned_sponsor'] = cleaned_trials['lead_sponsor'].apply(remove_punctuation)
cleaned_trials['cln_start_date']= cleaned_trials['start_date'].apply(convert_date)
cleaned_trials['cln_completion_date']= cleaned_trials['completion_date'].apply(convert_date)
cleaned_trials['duration'] = cleaned_trials.apply(calculate_duration, axis = 1)
cleaned_trials["matched_conditions"] = cleaned_trials["conditions"].apply(add_mesh_mappings, term_map=mesh_term_map)

# Persist both a CSV copy and a pickle of the cleaned frame.
cleaned_trials.to_csv(f"{DATA_STORAGE}/cleaned_trials.csv")
with open(f"{DATA_STORAGE}/cleaned_trials.pkl", 'wb') as f:
    pickle.dump(cleaned_trials, f)
logger.info(f"Saved {cleaned_trials.shape[0]} trials after data cleaning in {DATA_STORAGE}")

[2025-11-09 16:30:34] INFO     - __main__ - Deduplicating 2057 trials
[2025-11-09 16:30:34] INFO     - __main__ - Deduplicating 2 entries for NCT ID: NCT00518115
[2025-11-09 16:30:34] INFO     - __main__ - Deduplicating 2 entries for NCT ID: NCT00518882
[2025-11-09 16:30:34] INFO     - __main__ - Deduplicating 2 entries for NCT ID: NCT00696657
[2025-11-09 16:30:34] INFO     - __main__ - Deduplicating 2 entries for NCT ID: NCT00707031
[2025-11-09 16:30:34] INFO     - __main__ - Deduplicating 2 entries for NCT ID: NCT00717457
[2025-11-09 16:30:34] INFO     - __main__ - Deduplicating 2 entries for NCT ID: NCT01029886
[2025-11-09 16:30:34] INFO     - __main__ - Deduplicating 2 entries for NCT ID: NCT01064687
[2025-11-09 16:30:34] INFO     - __main__ - Deduplicating 2 entries for NCT ID: NCT01128894
[2025-11-09 16:30:34] INFO     - __main__ - Deduplicating 2 entries for NCT ID: NCT01175473
[2025-11-09 16:30:34] INFO     - __main__ - Deduplicating 2 entries for NCT ID: NCT01181986
[2025-11-0

In [6]:
# Reload the pickled cleaned trials when the file is present.
# NOTE(review): this relative path is independent of the DATA_STORAGE location the
# pickle is written to in the cleaning cell — confirm both refer to the same file.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
cleaned_trials_loc = "data/cleaned_trials.pkl"
if Path(cleaned_trials_loc).exists():
    cleaned_trials = pickle.loads(Path(cleaned_trials_loc).read_bytes())

In [None]:

class ConfidenceLevel(Enum):
    # Confidence labels for an annotation; used as the type of
    # TrialOrganSystemAnnotation.confidence. Listed from most to least confident.
    # (Kept as comments rather than a docstring: a class docstring may be picked up
    # as a schema description by the structured-output machinery — TODO confirm.)
    VERY_HIGH = "very_high"
    HIGH = "high"
    MODERATE = "moderate"
    LOW = "low"
    VERY_LOW = "very_low"
    
class TherapeuticCategory(Enum):
    # Allowed values for TrialOrganSystemAnnotation.tx_category.
    # NOTE(review): system_prompt tells the model to choose among organ-system
    # labels (CARDIOVASCULAR, DIGESTIVE, ENDOCRINE, ..., UNSURE) that are not
    # members of this enum, and there is no "unsure" member here — confirm which
    # vocabulary is intended and align the two.
    ONCOLOGY = "oncology"
    CARDIOVASCULAR = "cardiovascular"
    CENTRAL_NERVOUS_SYSTEM = "central_nervous_system"
    RESPIRATORY = "respiratory"
    GASTROINTESTINAL = "gastrointestinal"
    ENDOCRINOLOGY = "endocrinology"
    INFECTIOUS_DISEASES = "infectious_diseases"
    IMMUNOLOGY = "immunology"
    DERMATOLOGY = "dermatology"
    OPHTHALMOLOGY = "ophthalmology"
    RHEUMATOLOGY = "rheumatology"
    NEPHROLOGY = "nephrology"
    HEMATOLOGY = "hematology"
    PAIN_MANAGEMENT = "pain_management"
    WOMENS_HEALTH = "womens_health"
    MENS_HEALTH = "mens_health"
    PEDIATRICS = "pediatrics"
    GERIATRICS = "geriatrics"
    RARE_DISEASES = "rare_diseases"
    VACCINES = "vaccines"
    MEDICAL_DEVICES = "medical_devices"
    DIAGNOSTICS = "diagnostics"

class TrialOrganSystemAnnotation(BaseModel):
    # Structured result expected from the classifier for one trial; passed to
    # llm.with_structured_output below. All three fields are required (Field(...)).
    # tx_category: one TherapeuticCategory member.
    tx_category: TherapeuticCategory = Field(..., description="The primary organ system affected by the disease or condition being studied in the trial.")
    # confidence: one ConfidenceLevel member.
    confidence: ConfidenceLevel = Field(..., description="Your confidence in the organ system selection rated as very_high, high, moderate, low, or very_low.")
    # explanation: free-text justification.
    explanation: str = Field(..., description="A brief explanation justifying the organ system selection and inclusion decision and any uncertainty.")

# Chat model reached through an OpenAI-compatible endpoint on this machine.
llm = ChatOpenAI(
    base_url="http://127.0.0.1:1234/v1",
    model="ibm/granite-3.2-8b",
    temperature=0,
)
# Classifier whose responses are returned as TrialOrganSystemAnnotation objects.
classifier_agent = llm.with_structured_output(TrialOrganSystemAnnotation)

In [None]:
# System message text used for every classification request.
# NOTE(review): the labels under "Organ System Classification" (CARDIOVASCULAR,
# DIGESTIVE, ..., UNSURE) differ from the members of TherapeuticCategory, the type
# of the structured output's tx_category field — confirm which vocabulary is intended.
# NOTE(review): the numbered lists in this text skip numbers (1, 4, 5 and 1, 2, 5).
system_prompt = """You are a clinical trial annotation specialist tasked with analyzing trials from clinicaltrials.gov to identify GLP-1 receptor-targeting drug studies. Your role is to systematically evaluate each trial and provide structured annotations.  

## Your Tasks

For each clinical trial, you must:
1. **Determine the primary organ system** affected by the disease or condition being studied
4. **Assess your confidence level** in the organ system classification
5. **Provide a brief explanation** justifying your decisions and any uncertainties

## Input Information

You will receive:
- Trial title
- Brief description of the study
- List of interventions being tested
- Primary outcome measures 

## Classification Guidelines

### Organ System Classification
Identify the PRIMARY organ system most relevant to the condition/disease being studied:

- **CARDIOVASCULAR**: Heart disease, hypertension, atherosclerosis, stroke, arrhythmias
- **DIGESTIVE**: Gastrointestinal disorders, liver disease, inflammatory bowel disease, GERD
- **ENDOCRINE**: Diabetes, thyroid disorders, adrenal conditions, hormone-related diseases
- **EXOCRINE**: Pancreatic exocrine function, digestive enzyme disorders
- **GENITOURINARY**: Kidney disease, bladder disorders, sexual dysfunction
- **HEMATOPOIETIC_AND_LYMPHATIC**: Blood disorders, anemia, lymphomas, clotting disorders
- **IMMUNE**: Autoimmune diseases, immunodeficiencies, allergies
- **INTEGUMENTARY**: Skin conditions, wound healing, dermatological disorders
- **MUSCULOSKELETAL**: Bone, joint, muscle disorders, arthritis, osteoporosis
- **NERVOUS**: Neurological conditions, mental health, cognitive disorders, neuropathy
- **REPRODUCTIVE**: Fertility, pregnancy-related conditions, reproductive health
- **RESPIRATORY**: Lung diseases, asthma, COPD, respiratory infections
- **URINARY**: Kidney function, urological conditions, fluid balance
- **UNSURE**: When the primary organ system cannot be determined from available information

### Confidence Assessment
- **VERY_HIGH** Clear, unambiguous information allowing definitive classification
- **HIGH**: Clear information allowing classification
- **MODERATE**: Some ambiguity exists but reasonable classification possible based on available data
- **LOW**: Uncertainty impairing accurate classification
- **VERY_LOW**: Significant uncertainty preventing accurate classification

### Explanation
- Provide a concise rationale for the classification 
- Highlight any uncertainties or ambiguities encountered during evaluation

## Analysis Approach
1. **Read carefully**: Review all provided information thoroughly
2. **Identify key terms**: Look for condition descriptions, and study objectives
5. **Assess certainty**: Honestly evaluate the quality and clarity of available information

## Important Notes
- Focus on the PRIMARY condition being studied
- GLP-1 receptor agonists may be studied for conditions beyond diabetes (obesity, cardiovascular protection, etc.)
- Be conservative with confidence levels - acknowledge uncertainty when information is incomplete

Provide your analysis in the structured format specified, ensuring each field is completed based on your systematic evaluation of the trial information."""

# Per-trial user message template, filled with str.format using the keyword fields
# trial_title, brief_title, brief_description, detailed_description, interventions
# and primary_outcomes.
# The "\n" after brief_title sits outside the braces: inside them it becomes part
# of the field name and .format(brief_title=...) raises KeyError.
classify_prompt = """Please analyze the following clinical trial and provide your structured annotation:

**Official Title:** {trial_title}\n

**Brief Title** {brief_title}\n

**Brief Description:** \n
{brief_description}

**Detailed Description (if available):** \n
{detailed_description}

**Interventions:** \n
{interventions}

**Primary Outcome Measures:** \n
{primary_outcomes}

---

Analyze this trial according to your guidelines and provide:

1. The primary organ system affected by the disease or condition being studied
5. Your confidence level in this annotation
6. A brief explanation for your inclusion decision, including any uncertainties

Consider:
- Are any of the interventions directly targeting the GLP-1 receptor?
- Is the study objective to evaluate safety or efficacy of such drugs?
- How clear and complete is the provided information?

Provide your response using the structured output format.
"""

In [None]:
# Classify every cleaned trial with the LLM and collect one
# [category, confidence, explanation] triple per row, in row order.
annotations_list = []
for row in cleaned_trials.itertuples():
    trial_title = getattr(row, 'official_title', "Not Available")
    brief_title = getattr(row, 'brief_title', "Not Available")
    brief_description = row.brief_summary
    detailed_description = getattr(row, 'detailed_description', "Not Available")
    interventions = ", ".join(row.interventions) if isinstance(row.interventions, list) else row.interventions
    primary_outcome_measures = ", ".join(row.primary_outcomes) if isinstance(row.primary_outcomes, list) else row.primary_outcomes

    # Log before the model call so a slow or failing request can be tied to its trial.
    logger.info(f"Now Evaluating Trial: {trial_title}")

    response = classifier_agent.invoke(
        input=[
            SystemMessage(content=system_prompt),
            HumanMessage(content=classify_prompt.format(
                trial_title=trial_title,
                brief_title=brief_title,
                brief_description=brief_description,
                detailed_description=detailed_description,
                interventions=interventions,
                primary_outcomes=primary_outcome_measures
            ))
        ]
    )
    # The model's category field is named tx_category; reading
    # response.organ_system raised AttributeError. Missing values become "".
    annotation = [
        response.tx_category.value if getattr(response, "tx_category", None) else "",
        response.confidence.value if getattr(response, "confidence", None) else "",
        response.explanation if getattr(response, "explanation", None) else ""
    ]
    annotations_list.append(annotation)

    logger.info(f"Annotation: {response}")


In [None]:
# One annotation row per trial, in the same order as cleaned_trials was iterated.
annotations_frame = pd.DataFrame(data=annotations_list, columns=["organ_system", "confidence", "explanation"])
# Reset the index so the positional concat lines each trial up with its annotation
# (the left-hand frame was missing here, which made the line a SyntaxError).
annotated_trials = pd.concat([cleaned_trials.reset_index(drop=True), annotations_frame], axis=1)
# Written next to the other outputs under DATA_STORAGE instead of a hard-coded,
# user-specific absolute path.
annotated_trials.to_csv(f"{DATA_STORAGE}/glp1_trial_annotations.csv", index=False)

In [None]:
# Connect to FalkorDB on localhost:6379 and select the graph named by the
# GRAPH_NAME environment variable (os.getenv yields None when it is unset).
db = FalkorDB(host='localhost', port=6379)
g = db.select_graph(os.getenv("GRAPH_NAME"))
# NOTE(review): destructive step — presumably removes the selected graph so the
# load below starts from empty; confirm this is intended before re-running.
g.delete()

In [None]:

def _field_or_default(row, attr, default="Not Available"):
    """Return ``row.<attr>``, or ``default`` when it is absent, None or a scalar NaN/NaT."""
    value = getattr(row, attr, None)
    if value is None:
        return default
    if isinstance(value, (list, tuple, dict, set)):
        return value
    try:
        if pd.isna(value):
            return default
    except (TypeError, ValueError):
        # Array-like values have no single truth value; keep them untouched.
        pass
    return value


def _as_name_list(value):
    """Normalise a cell to a list of names: None/NaN -> [], str -> [str], iterable -> list."""
    if value is None:
        return []
    if isinstance(value, str):
        return [value]
    try:
        return list(value)
    except TypeError:
        # Non-iterable scalar such as a float NaN.
        return []


def add_trial_to_graph(g, row):
    """Merge one trial and its sponsor, organ system, conditions and drugs into the graph.

    Args:
        g: Graph handle exposing ``query(query, params)``.
        row: Trial record with attribute access (e.g. an ``itertuples`` row).
            ``nct_id`` and ``drug_name`` are required; every other attribute is
            optional and replaced by a placeholder when absent, None or NaN.

    Returns:
        None. The single parameterized Cypher query is sent through ``g.query``.
    """
    # Single parameterized query for all nodes and relationships
    query = """
    MERGE (t:ClinicalTrial {nct_id: $nct_id})
    SET t.title = $title,
        t.acronym = $acronym,
        t.trial_status = $status,
        t.start_date = $start_date,
        t.completion_date = $completion_date,
        t.approximate_duration_years = $approximate_duration_years,
        t.brief_description = $brief_description,
        t.detailed_description = $detailed_description,
        t.sponsor_class = $sponsor_class,
        t.annotation_confidence = $trial_annotation_confidence
    MERGE (s:Sponsor {name: $sponsor_name})
    MERGE (o:OrganSystem {name: $organ_system})

    WITH t, s, o
    UNWIND $mapped_conditions as condition_name
    MERGE (c:Condition {name: condition_name})
    MERGE (s)-[:SPONSORED]->(t)
    MERGE (t)-[:INCLUDED]->(c)
    MERGE (t)-[:RELEVANT_FOR]->(o)

    WITH t, c
    UNWIND $drug as drug_name
    MERGE (d:Drug {name: drug_name})
    MERGE (t)-[:INVESTIGATED]->(d)
    MERGE (d)-[:INVESTIGATED_IN]->(c)

    """

    start_date = _field_or_default(row, 'cln_start_date', None)
    completion_date = _field_or_default(row, 'cln_completion_date', None)
    duration = _field_or_default(row, 'duration', None)

    params = {
        'nct_id': row.nct_id,
        # UNWIND needs a list; a bare string is wrapped rather than passed through.
        'drug': _as_name_list(row.drug_name),
        'title': _field_or_default(row, 'official_title'),
        # Guarded on brief_title itself (the old check looked at official_title).
        'brief_title': _field_or_default(row, 'brief_title'),
        'enrollment': str(_field_or_default(row, 'enrollment')),
        'acronym': _field_or_default(row, 'acronym'),
        'status': _field_or_default(row, 'overall_status'),
        'start_date': start_date.strftime("%Y-%m") if start_date is not None else "Not Available",
        'completion_date': completion_date.strftime("%Y-%m") if completion_date is not None else "Not Available",
        'approximate_duration_years': str(duration) if duration is not None else "Not Available",
        'brief_description': _field_or_default(row, 'brief_summary'),
        'detailed_description': _field_or_default(row, 'detailed_description'),
        'sponsor_class': _field_or_default(row, 'sponsor_class'),
        'sponsor_name': _field_or_default(row, 'cleaned_sponsor', "Unknown"),
        # An empty or missing condition list still needs one placeholder entry,
        # otherwise the UNWIND produces no rows.
        'mapped_conditions': _as_name_list(getattr(row, 'matched_conditions', None)) or ["Not Available"],
        'organ_system': _field_or_default(row, 'organ_system'),
        'trial_annotation_confidence': _field_or_default(row, 'confidence')
    }

    g.query(query, params)


# add_trial_to_graph reads organ_system / confidence, which exist only on the
# annotated frame; iterating cleaned_trials stored "Not Available" for every trial.
# Fall back to cleaned_trials when the annotation cells have not been run.
trials_for_graph = globals().get("annotated_trials", cleaned_trials)
for row in trials_for_graph.itertuples():
    add_trial_to_graph(g, row)

In [None]:
from agents.annotator import annotator