In [4]:
# !pip3 install -r requirements.txt
# !pip install ipywidgets

## If widgets misbehave, run the commands below and restart Jupyter
# !pip install jupyter_contrib_nbextensions
# !jupyter contrib nbextension install --user
# !jupyter nbextension enable varInspector/main

### Notes on TODOs

- What % of trials have any associated papers at all?
- What AEs are unique, if any, to the papers (not mentioned in trial)
- How do authors prioritise which AEs they discuss in the paper? Are there any trends in this?
- What is the lag time between trial completion and papers being published?
- Extract target - drug - AEs
- Is there any other data in CT.gov which could help inform AE prioritisation?
- Can we define 'severity' of AEs with models? Check with Ines
- Compare examples from OTs paper (less severe indications) to cancer examples, intention to link to Ines' severity score

In [6]:
# ClinicalTrials.gov (NCT) identifier of the trial analysed in this notebook.
# Swap in the commented-out alternative below to analyse a different trial.
nct_id = "NCT01753193"
# nct_id = "NCT05034952" # Efficacy and Safety VX-548

In [8]:
import ipywidgets as widgets
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
from trial_to_paper_utils import *
from pprint import pprint

## Query AACT with the NCT ID of interest to collect the adverse events recorded for the trial

In [23]:
# Fetch the trial title, its adverse events (all / severe / other) and the
# patient groups for the trial identified by nct_id.
study_title, aes, severe_aes, other_aes, patient_groups = aact_data_gather(nct_id)
# TODO Get % affected / see relevance of 'other' vs 'serious'
if not study_title:
    # A falsy title is treated as "no trial found".
    print(f"No trials were found searching for ID: {nct_id}")
else:
    print(
        f"For CT.gov trial:\n\t'{study_title}' ({nct_id}),\n{len(aes)} unique AEs were recorded in {len(patient_groups)} patient group(s)\n"
    )
    # [print(f"- {a}\n") for a in aes]

For CT.gov trial:
	'An Open-label Study to Evaluate the Long-term Safety of MEDI-546, for the Treatment of SLE, in Adults' (NCT01753193),
339 unique AEs were recorded in 1 patient group(s)



In [10]:
def display_widget(events=None, trial_id=None):
    """Show a trial's adverse events as a bulleted list in a scrollable box.

    Args:
        events: Iterable of adverse-event terms to list. Defaults to the
            notebook-level ``aes``.
        trial_id: Identifier printed in the heading. Defaults to the
            notebook-level ``nct_id``.
    """
    # Function-scope import so the cell does not depend on an extra top-level import.
    from html import escape

    if events is None:
        events = aes
    if trial_id is None:
        trial_id = nct_id

    # The terms are plain text but are rendered by an HTML widget: escape them
    # so characters such as '<' or '&' appear literally instead of being
    # interpreted as markup.
    display_text = "<br>".join(f"- {escape(str(a), quote=False)}" for a in events)

    b = widgets.HTML(
        value=display_text,
        placeholder='AEs',
        description='Scroll',
        disabled=True
    )

    # Fixed-height container with vertical overflow, so long lists scroll.
    a = widgets.HBox([b], layout=widgets.Layout(height='150px', width='1000px', overflow_y='auto'))

    print(f'Adverse events recorded for trial: {trial_id}')
    display(a)

display_widget()

Adverse events recorded for trial: NCT01753193


HBox(children=(HTML(value="- Scar<br>- Hypertransaminasaemia<br>- Chikungunya virus infection<br>- Herpes simp…

## Search ePMC for papers mentioning the trial ID; failing this, search for papers relating to the compound name

In [15]:
# First look for papers that mention the trial ID itself.
trial_in_pmids = query_epmc(query=nct_id, page_size=25)
print(f"NCT ID referenced in {len(trial_in_pmids)} PubMed paper(s).\n{trial_in_pmids}")
if not trial_in_pmids:
    print('NCT ID not referenced in papers, searching for compound name instead')
    trial_in_pmids = query_epmc(query="VX-548", page_size=25) #TODO - Drug name as alternative to NCT ID, automate this

# Both searches can come back empty; fail with a clear message instead of an
# opaque IndexError from the indexing below.
if not trial_in_pmids:
    raise ValueError(f"No papers found for {nct_id} or for the fallback compound query")

# A single paper is used for testing. The preferred position (3) is clamped so
# that a result list with fewer than four entries still yields a paper.
test_pmid_index = min(3, len(trial_in_pmids) - 1)
test_pmid = trial_in_pmids[test_pmid_index] #TODO - remove test, do for all results & consider subject test groups
text = query_bioc(pmid=test_pmid)
print_text = False  # flip to True to dump the retrieved text for inspection
if print_text:
    print("\n".join(text))
    print(len(text))
    # Only show the sample section when the paper has that many sections.
    if len(text) > 3:
        print(text[3])

NCT ID referenced in 6 PubMed paper(s).
['39193183', '37148484', '31190735', '33225631', '34768756', '35383948']


## Testing HuggingFace AE models over literature text

In [16]:
# TODO Test open source models for AE detection in text, compare to those recorded in trial
pipe = pipeline(task="token-classification", model="MutazYoune/BiomedBERT-Adverse-Events-NER_pun", tokenizer="MutazYoune/BiomedBERT-Adverse-Events-NER_pun")
# pipe = pipeline(task="token-classification", model="MutazYoune/Medical-NER-Adverse-Events-NER", tokenizer="MutazYoune/Medical-NER-Adverse-Events-NER")

# Run the model over each section of the paper, keeping one DataFrame of
# predicted tokens per section that produced any.
all_sections = []
failed_sections = []  # (position in `text`, error) for sections that raised
for section_idx, section_text in enumerate(tqdm(text)):
    try:
        res = pipe(section_text)
        if res:
            all_sections.append(pd.DataFrame(res))
    except Exception as exc:
        # Stay best-effort (one bad section must not abort the run) but record
        # the failure instead of discarding it. Catching Exception rather than
        # using a bare except lets KeyboardInterrupt stop the loop.
        failed_sections.append((section_idx, repr(exc)))

if failed_sections:
    print(f"Skipped {len(failed_sections)} of {len(text)} section(s) due to errors; see `failed_sections`")

if all_sections:
    paper_aes = pd.concat(all_sections, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty
    # frame with the columns the pipeline output normally has.
    paper_aes = pd.DataFrame(columns=["entity", "score", "index", "word", "start", "end"])

# Keep only the part of the model identifier after the last '/'.
model_name = pipe.model.name_or_path
model_name = model_name.rpartition('/')[-1]
# paper_aes.to_csv(f'./output/{model_name}_AEs_{test_pmid}.csv')

Device set to use mps:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|█████████████████████████████████████████| 116/116 [00:05<00:00, 22.48it/s]


In [17]:
# Remove the cap on displayed columns so every field of the results is shown.
pd.options.display.max_columns = None
paper_aes

Unnamed: 0,entity,score,index,word,start,end
0,I-AE,0.730811,67,aes,290,293
1,I-AE,0.508139,150,adverse,703,710
2,I-AE,0.712941,151,events,711,717
3,B-AE,0.959111,174,herpes,831,837
4,I-AE,0.954907,175,zoster,838,844
...,...,...,...,...,...,...
64,B-AE,0.566466,286,herpes,1490,1496
65,I-AE,0.533914,4,organ,20,25
66,I-AE,0.566280,5,damage,26,32
67,B-AE,0.488388,53,antid,253,258


In [18]:
# Render the results table as HTML (row index omitted) inside a widget.
widgets.HTML(
    value=paper_aes.to_html(index=False),
)

HTML(value='<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th>en…