# Notebook Setup
- Import Dependencies
- Load Environment Variables
- Set Up Logging

In [5]:
import os
import string
import time
import pickle

from tqdm import tqdm
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
from datetime import datetime
from pathlib import Path
from services import clinicaltrials_retriever, logging_config, mesh_mapper
import pandas as pd


In [None]:
# Load environment variables from the nearest .env file (if one is found)
load_dotenv(find_dotenv())

# Initialize logger
logger = logging_config.get_logger(__name__)

# Set data location: prefer the directory named by the DATA_LOC environment variable
DATA_STORAGE = os.getenv("DATA_LOC", None)

# Require a directory (not merely an existing path): every later cell writes
# files *inside* DATA_STORAGE via f"{DATA_STORAGE}/<file name>".
if DATA_STORAGE and Path(DATA_STORAGE).is_dir():
    logger.info(f"Data will be saved at:{DATA_STORAGE}")
else:
    try:
        # Running as a script: fall back to the directory that holds this file.
        DATA_STORAGE = Path(__file__).resolve().parent
    except NameError:
        # Running inside a notebook kernel: __file__ is not defined there,
        # so fall back to the current working directory instead.
        DATA_STORAGE = Path.cwd()
    logger.warning(f"Warning: Data storage path in environment does not exist or was not set, saving data here: {DATA_STORAGE}")

#Define Core Clinical Trials Searches

In [None]:
# Intervention searches to run against the clinical trials retriever.
# Each key is the label later written to the `drug_name` column of the
# retrieved trials; each value is the term passed as `query_intervention`.
drug_queries = {
    "EFPEGLENATIDE": "efpeglenatide",
    "LIRAGLUTIDE": "liraglutide",
    "SURVODUTIDE": "survodutide",
    "DULAGLUTIDE": "dulaglutide",
    "RETATRUTIDE": "retatrutide",
    "COTADUTIDE": "cotadutide",
    "LIXISENATIDE": "lixisenatide",
    "ALBIGLUTIDE": "albiglutide",
    # NOTE(review): the key and the query term are spelled differently here
    # ("PEGSEBRENATIDE" vs "pegsembrenatide") - confirm which spelling is intended.
    "PEGSEBRENATIDE": "pegsembrenatide",
    "EXENATIDE": "exenatide",
    "PEGAPAMODUTIDE": "pegapamodutide",
    "AVEXITIDE": "avexitide",
    "TIRZEPATIDE": "tirzepatide",
    "DANUGLIPRON": "danuglipron",
    "EFINOPEGDUTIDE": "efinopegdutide",
    "TASPOGLUTIDE": "taspoglutide",
    "SEMAGLUTIDE": "semaglutide",
}

#Retrieve all clinical trials associated with intervention queries

In [None]:
# Run one retrieval per configured drug and collect the per-drug frames.
drug_queries_results = []
for key, drug_query in drug_queries.items():
    logger.info(f"Processing drug: {key} with query term: {drug_query}")

    # Fetch every study whose intervention matches the query term
    results = clinicaltrials_retriever.retrieve_batched_studies(query_intervention=drug_query)
    logger.info(f"Retrieved {results.shape[0]} clinical trials for drug: {key}")

    # Tag each row with the drug label so the source query survives the concat
    results['drug_name'] = key
    drug_queries_results.append(results)

# Stack the per-drug frames row-wise under a fresh 0..n-1 index and persist the raw pull
all_trials = pd.concat(drug_queries_results, ignore_index=True)
all_trials.to_csv(f"{DATA_STORAGE}/all_raw_trials.csv")
logger.info(f"Retrieved a total of {all_trials.shape[0]} and saved to csv:{DATA_STORAGE}/all_raw_trials.csv")


#Trial filtering, data cleaning, and export

In [None]:
def deduplicate_trials(df):
    """Collapse rows that share an ``nct_id`` into a single row per trial.

    For every ``nct_id`` the first row of the group is kept as the template,
    and its ``drug_name`` value is replaced by the list of distinct, non-null
    drug names found across the whole group.

    Args:
        df: DataFrame with at least the columns ``nct_id`` and ``drug_name``.

    Returns:
        A new DataFrame with one row per ``nct_id`` (groups are visited in
        the order produced by ``groupby``) whose ``drug_name`` column holds
        lists. When no groups are produced (e.g. empty input) an empty frame
        with the same columns as ``df`` is returned, so callers can still
        select columns from the result.

    Note:
        ``groupby`` is called with its defaults, so rows whose ``nct_id`` is
        missing are not part of any group and do not appear in the result.
    """
    deduped_rows = []

    # Group by nct_id to handle duplicates
    for nct_id, group in df.groupby('nct_id'):
        if len(group) > 1:
            logger.info(f"Deduplicating {len(group)} entries for NCT ID: {nct_id}")

        # First row of the group is the template; copy it so the input frame
        # is never modified (iloc[0] already yields a Series).
        template = group.iloc[0].copy()

        # Replace the scalar drug name with every distinct drug for this trial
        template['drug_name'] = group['drug_name'].dropna().unique().tolist()

        deduped_rows.append(template)

    if not deduped_rows:
        # pd.DataFrame([]) would drop every column; keep the input schema instead.
        return df.iloc[0:0].copy()

    # Build the frame once from the collected rows rather than appending row by row
    return pd.DataFrame(deduped_rows)

def remove_punctuation(text):
    """Return *text* upper-cased with all ``string.punctuation`` characters removed.

    Missing values (``None`` / NaN) yield an empty string; any other value is
    converted with ``str()`` before being cleaned.
    """
    if text is None or pd.isna(text):
        return ""

    # Deletion-only translation table: every punctuation character maps to nothing
    strip_table = str.maketrans('', '', string.punctuation)
    without_punctuation = str(text).translate(strip_table)
    return without_punctuation.upper()

def convert_date(text):
    """Convert a ``YYYY-MM`` or ``YYYY-MM-DD`` string to a ``datetime``.

    Args:
        text: Date string, an existing ``datetime``, or a missing value.

    Returns:
        * ``datetime(year, month, 1)`` for ``YYYY-MM`` input (day defaults to 1),
        * ``datetime(year, month, day)`` for ``YYYY-MM-DD`` input,
        * the value itself when it is already a (non-null) ``datetime``,
        * ``None`` for missing values, non-string input, strings without
          exactly two or three ``-``-separated parts (this includes year-only
          strings such as ``"2021"``), and strings whose parts are not valid
          calendar numbers.

    The function never raises for bad input, so a single malformed value
    cannot abort a column-wide ``Series.apply``.
    """
    if isinstance(text, datetime):
        # Already a date object; NaT-like values still count as missing.
        return None if pd.isna(text) else text
    if not isinstance(text, str):
        # None, NaN, numbers, lists ... nothing that can be parsed as a date string
        return None

    date_info = text.strip().split("-")
    if len(date_info) not in (2, 3):
        return None

    try:
        year = int(date_info[0])
        month = int(date_info[1])
        day = int(date_info[2]) if len(date_info) == 3 else 1
        return datetime(year=year, month=month, day=day)
    except ValueError:
        # Non-numeric component or out-of-range month/day
        return None

def calculate_duration(row):
    """Return the trial length in years, rounded to one decimal place.

    Reads ``cln_start_date`` and ``cln_completion_date`` from *row* and
    returns ``None`` when either of them is NaN/NaT, ``None`` or an empty
    string. A year is counted as 365.25 days.
    """
    endpoints = (row['cln_start_date'], row['cln_completion_date'])

    # Any flavour of "missing" on either side means no duration can be computed
    is_missing = (
        any(pd.isna(value) for value in endpoints)
        or any(value is None for value in endpoints)
        or any(value == '' for value in endpoints)
    )
    if is_missing:
        return None

    began, finished = endpoints
    elapsed_seconds = (finished - began).total_seconds()
    return round(elapsed_seconds / (365.25 * 24 * 60 * 60), 1)

def generate_mesh_term_map(df):
    """Look up a MeSH term for every distinct condition in ``df.conditions``.

    The ``conditions`` column is exploded so each individual condition is
    searched once via ``mesh_mapper.search_mesh_term`` (called with
    ``filter_diseases_only=True``). Null or empty conditions are skipped.

    Args:
        df: DataFrame exposing a ``conditions`` column.

    Returns:
        dict mapping each condition that produced a truthy search result to
        the string ``"<mesh_term> (MeSH ID:<mesh_id>)"``, built from the
        result's ``mesh_term`` and ``mesh_id`` entries. Conditions without a
        result are absent from the dict.
    """
    unique_conditions = df.conditions.explode().unique()
    term_mappings = {}
    for condition in tqdm(unique_conditions, desc="Matching conditions to MeSH terms"):
        if pd.notna(condition) and condition:  # Skip null/empty conditions
            result = mesh_mapper.search_mesh_term(condition, filter_diseases_only=True)
            if result:
                # Single quotes inside the replacement fields: reusing the
                # outer double quotes is a SyntaxError before Python 3.12.
                mapping = f"{result['mesh_term']} (MeSH ID:{result['mesh_id']})"
                term_mappings[condition] = mapping
            # Pause after each search (hit or miss) to respect rate limits
            time.sleep(0.35)
    logger.info(f"\nSuccessfully matched {len(term_mappings)} conditions to MeSH terms")
    return term_mappings

def add_mesh_mappings(conditions, term_map):
    """Translate a trial's conditions into their mapped MeSH terms.

    Args:
        conditions: Iterable of condition names for one trial. A single
            string is treated as one condition; ``None``, NaN and any other
            non-iterable value are treated as "no conditions".
        term_map: dict of condition -> MeSH term string.

    Returns:
        List of the distinct MeSH terms found for the given conditions, in
        order of first appearance (so the output is reproducible between
        runs). Conditions missing from ``term_map`` are logged and skipped.
    """
    if isinstance(conditions, str):
        # Iterating a bare string would look up each character separately
        conditions = [conditions]
    elif conditions is None or not hasattr(conditions, "__iter__"):
        # Missing cells arrive as None or float NaN; neither can be iterated
        return []

    # dict keys give de-duplication while preserving insertion order
    matched_conditions = {}
    for condition in conditions:
        if condition in term_map:
            logger.info(f"Found a MeSH term mapping for {condition}")
            matched_conditions[term_map[condition]] = None
        else:
            logger.info(f"No existing MeSH term mapping for {condition}")
    return list(matched_conditions)

#Deduplication
logger.info(f"Deduplicating {all_trials.shape[0]} trials")
deduped_trials = deduplicate_trials(all_trials)
logger.info(f"Removed {all_trials.shape[0]-deduped_trials.shape[0]} trials during de-duplication")

#Data Cleaning
# Keep only trials that satisfy ALL of the conditions below; .copy() detaches the
# result from deduped_trials so new columns can be added without warnings.
logger.info(f"Initiating data clean-up on {deduped_trials.shape[0]} trials.")
cleaned_trials = deduped_trials[
    # Check if 'DRUG' or 'BIOLOGICAL' is in the intervention_types list.
    # Each membership test is spelled out: ('DRUG' or 'BIOLOGICAL') evaluates
    # to just 'DRUG', which silently dropped BIOLOGICAL-only trials.
    (deduped_trials['intervention_types'].apply(
        lambda x: isinstance(x, list) and ('DRUG' in x or 'BIOLOGICAL' in x)
    ))
    &
    # Check healthy_volunteers: explicitly False, or not reported at all
    ((deduped_trials['healthy_volunteers'] == False) |
     (deduped_trials['healthy_volunteers'].isnull()))
    &
    # Check study type
    (deduped_trials['study_type'] == 'INTERVENTIONAL')
    &
    # Check if primary_outcomes is not empty/null
    (deduped_trials['primary_outcomes'].notna() &
     (deduped_trials['primary_outcomes'].astype(str) != '[]'))
    &
    # Check if interventions is not empty/null
    (deduped_trials['interventions'].notna() &
     (deduped_trials['interventions'].astype(str) != '[]'))
].copy()
# Resolve every distinct condition to a MeSH term and keep the lookup table on disk
mesh_term_map = generate_mesh_term_map(cleaned_trials)
mesh_frame = pd.DataFrame.from_dict(mesh_term_map, orient='index')
mesh_frame.to_csv(f"{DATA_STORAGE}/mesh_term_mappings.csv")


# Derived columns, added in order: `duration` reads the two cln_* date columns
# created just before it, so the sequence below matters.
cleaned_trials = cleaned_trials.assign(
    cleaned_sponsor=lambda frame: frame['lead_sponsor'].apply(remove_punctuation),
    cln_start_date=lambda frame: frame['start_date'].apply(convert_date),
    cln_completion_date=lambda frame: frame['completion_date'].apply(convert_date),
    duration=lambda frame: frame.apply(calculate_duration, axis=1),
    matched_conditions=lambda frame: frame["conditions"].apply(add_mesh_mappings, term_map=mesh_term_map),
    # One independent empty list per row, to be filled in by the annotator
    llm_annotations=lambda frame: [[] for _ in range(len(frame))],
)
cleaned_trials.shape

#Trial saved as CSV for ease of review and as a pkl - the pkl should be used by the annotator notebook
cleaned_trials.to_csv(f"{DATA_STORAGE}/cleaned_trials.csv")
with open(f"{DATA_STORAGE}/cleaned_trials.pkl", 'wb') as pickle_handle:
    pickle.dump(cleaned_trials, pickle_handle)
logger.info(f"Saved {cleaned_trials.shape[0]} trials after data cleaning in {DATA_STORAGE}")

In [None]:
#Reload the DataFrame from pickle if necessary
# Look where the cleaning step writes the pickle ({DATA_STORAGE}/cleaned_trials.pkl)
# first, then fall back to the previously hard-coded relative location.
cleaned_trials_loc = f"{DATA_STORAGE}/cleaned_trials.pkl"
if not Path(cleaned_trials_loc).exists():
    cleaned_trials_loc = "data/cleaned_trials.pkl"

if Path(cleaned_trials_loc).exists():
    # NOTE: pickle can execute arbitrary code on load - only open files this notebook produced.
    with open(cleaned_trials_loc, "rb") as f:
        cleaned_trials = pickle.load(f)
    logger.info(f"Reloaded {cleaned_trials.shape[0]} trials from {cleaned_trials_loc}")
else:
    # Make the skip visible instead of silently leaving cleaned_trials untouched
    logger.warning(f"No pickled trials found at {DATA_STORAGE}/cleaned_trials.pkl or {cleaned_trials_loc}; nothing was reloaded")