In [None]:
import os
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from falkordb import FalkorDB
from langchain_openai import ChatOpenAI

from services import logging_config, mesh_mapper, annotator


In [None]:
# Load variables from a .env file (if one is found) into the process environment.
load_dotenv(find_dotenv())

logger = logging_config.get_logger(__name__)

# Directory holding the pickled trial data; taken from the DATA_LOC environment variable.
DATA_STORAGE = os.getenv("DATA_LOC", None)

if DATA_STORAGE and Path(DATA_STORAGE).exists():
    logger.info(f"Data will be saved at:{DATA_STORAGE}")
else:
    # `__file__` is not defined inside a notebook kernel (it raises NameError),
    # and it would point at a file rather than a directory anyway, so fall back
    # to the kernel's current working directory.
    DATA_STORAGE = Path.cwd()
    logger.warning(f"Warning: Data storage path in environment does not exist or was not set, saving data here: {DATA_STORAGE}")

cleaned_trials_loc = f"{DATA_STORAGE}/cleaned_trials.pkl"
if Path(cleaned_trials_loc).exists():
    # NOTE(security): pickle.load can execute arbitrary code; only load pickles
    # that were produced by this project's own workflow.
    with open(cleaned_trials_loc, "rb") as f:
        cleaned_trials = pickle.load(f)
else:
    # `cleaned_trials` stays undefined in this branch, so cells that use it
    # will fail until the pickle has been produced.
    logger.warning("No pkl file found: You must run the data retriever workflow before executing this noteboook")

# Chat client used for annotation; endpoint and model name come from the environment.
annotator_llm = ChatOpenAI(base_url=os.getenv("LOCAL_LLM_URL"), model=os.getenv("LOCAL_LLM"))


[2025-11-13 08:30:03] INFO     - __main__ - Data will be saved at:/Users/joshuaziel/Documents/Coding/glp-1_landscape/data


In [None]:
# Build the annotation workflow from the cleaned trials, the LLM client and the
# storage location, run it, and keep an independent copy of its annotated data.
session = annotator.AnnotatorWorkflow(
    df=cleaned_trials,
    llm=annotator_llm,
    data_loc=DATA_STORAGE,
)
session.run_annotation_workflow()
annotated_trials = session.annotated_data.copy()

[2025-11-13 08:30:06] INFO     - services.annotator - Loaded existing MeSH map containing 298 mappings as a <class 'dict'>


In [None]:
# Optional: reload previously annotated trials from a pickle.
# Replace the placeholder below with the pickle's filename before running.
annotated_trials_loc = f"{DATA_STORAGE}/<fill in pkl filename to load>"
# Check the same path that is opened below, and store the result under
# `annotated_trials` so the cleaned input data is not overwritten.
if Path(annotated_trials_loc).exists():
    # NOTE(security): pickle.load can execute arbitrary code; only load pickles
    # that were produced by this project's own workflow.
    with open(annotated_trials_loc, "rb") as f:
        annotated_trials = pickle.load(f)
else:
    logger.error("Check the path to the pickle!")

In [None]:
# Connect to FalkorDB and start from an empty graph.
# Host and port default to the original localhost:6379 but can be overridden
# through the FALKORDB_HOST / FALKORDB_PORT environment variables.
graph_name = os.getenv("GRAPH_NAME")
if not graph_name:
    # Fail early with a clear message instead of selecting a graph named None.
    raise ValueError("The GRAPH_NAME environment variable must be set before building the graph")

db = FalkorDB(
    host=os.getenv("FALKORDB_HOST", "localhost"),
    port=int(os.getenv("FALKORDB_PORT", "6379")),
)
g = db.select_graph(graph_name)
# WARNING: destructive - removes the existing graph so the load below starts clean.
# NOTE(review): delete() may raise if the graph does not exist yet - confirm
# against the FalkorDB client and guard if needed.
g.delete()

In [None]:
def _is_present(value):
    """Return True unless ``value`` is a null scalar (None, NaN, NaT, pd.NA)."""
    # Only scalars are null-checked: pd.isna on a list/array returns an
    # element-wise result that cannot be used as a single truth value.
    return not (pd.api.types.is_scalar(value) and pd.isna(value))


def _year_month(date_value):
    """Format a date-like value as ``YYYY-MM``."""
    return date_value.strftime("%Y-%m")


def _field(row, name, default="Not Available", convert=None):
    """Read ``row.<name>``, returning ``default`` when it is absent or null."""
    value = getattr(row, name, None)
    if not _is_present(value):
        return default
    return convert(value) if convert is not None else value


def _name_list(row, name, default):
    """Read ``row.<name>`` as a list of non-null names, or a copy of ``default``."""
    value = getattr(row, name, None)
    if not _is_present(value):
        return list(default)
    if pd.api.types.is_scalar(value):
        # A single name (e.g. a plain string) becomes a one-element list for UNWIND.
        return [value]
    names = [item for item in value if _is_present(item)]
    return names if names else list(default)


def add_trial_to_graph(g, row):
    """Upsert one clinical trial and its related nodes into the graph.

    Builds a single parameterized Cypher query that MERGEs the trial, its
    sponsor, organ system, conditions and drugs, plus the relationships
    between them, and executes it with ``g.query``.

    Args:
        g: Graph handle exposing ``query(query, params)``.
        row: One trial record with attribute access (e.g. a namedtuple from
            ``DataFrame.itertuples()``). ``nct_id`` is required; every other
            attribute falls back to a default when missing or null.

    Returns:
        None. The graph is modified as a side effect.
    """
    # Single parameterized query for all nodes and relationships
    query = """
    MERGE (t:ClinicalTrial {nct_id: $nct_id})
    SET t.title = $title,
        t.brief_title = $brief_title,
        t.enrollment = $enrollment,
        t.acronym = $acronym,
        t.trial_status = $status,
        t.start_date = $start_date,
        t.completion_date = $completion_date,
        t.approximate_duration_years = $approximate_duration_years,
        t.brief_description = $brief_description,
        t.detailed_description = $detailed_description,
        t.sponsor_class = $sponsor_class,
        t.annotation_confidence = $trial_annotation_confidence
    MERGE (s:Sponsor {name: $sponsor_name})
    MERGE (o:OrganSystem {name: $organ_system})

    WITH t, s, o
    UNWIND $mapped_conditions as condition_name
    MERGE (c:Condition {name: condition_name})
    MERGE (s)-[:SPONSORED]->(t)
    MERGE (t)-[:INCLUDED]->(c)
    MERGE (t)-[:RELEVANT_FOR]->(o)

    WITH t, c
    UNWIND $drug as drug_name
    MERGE (d:Drug {name: drug_name})
    MERGE (t)-[:INVESTIGATED]->(d)
    MERGE (d)-[:INVESTIGATED_IN]->(c)

    """

    # hasattr() is always true for itertuples() rows, so null values (NaN/None)
    # are detected explicitly and replaced by the defaults instead.
    params = {
        'nct_id': row.nct_id,
        # An absent or null drug list becomes [], so UNWIND creates no Drug nodes.
        'drug': _name_list(row, 'drug_name', default=[]),
        'title': _field(row, 'official_title'),
        'brief_title': _field(row, 'brief_title'),
        'enrollment': _field(row, 'enrollment', convert=str),
        'acronym': _field(row, 'acronym'),
        'status': _field(row, 'overall_status'),
        'start_date': _field(row, 'cln_start_date', convert=_year_month),
        'completion_date': _field(row, 'cln_completion_date', convert=_year_month),
        'approximate_duration_years': _field(row, 'duration', convert=str),
        'brief_description': _field(row, 'brief_summary'),
        'detailed_description': _field(row, 'detailed_description'),
        'sponsor_class': _field(row, 'sponsor_class'),
        'sponsor_name': _field(row, 'cleaned_sponsor', default="Unknown"),
        'mapped_conditions': _name_list(row, 'matched_conditions', default=["Not Available"]),
        'organ_system': _field(row, 'organ_system'),
        # The query references $trial_annotation_confidence, so it must be supplied.
        # NOTE(review): assumes the column is named `trial_annotation_confidence` - confirm.
        'trial_annotation_confidence': _field(row, 'trial_annotation_confidence'),
    }

    g.query(query, params)


# Write every annotated trial into the graph, one query per row.
for row in annotated_trials.itertuples():
    add_trial_to_graph(g=g, row=row)