In [57]:
# Enable IPython's autoreload extension in mode 2, so that edited modules
# (e.g. the project's `src` and `questions` packages) are re-imported
# before each cell is executed.
%load_ext autoreload
%autoreload 2

In [127]:

# from src import Match, Icsr
from src.utils import get_matches
from questions import *

from datetime import datetime
import random
import datasets
from collections import defaultdict
import numpy as np
import tiktoken
from copy import deepcopy
import json

# import pandas as pd
# import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split

import logging

# Configure the root logger so that INFO-level messages and above are emitted
logging.basicConfig(level=logging.INFO)

# Root logger; get_questions reports per-question extraction failures through it
logger = logging.getLogger()

In [3]:
# Load the "FAERS-PubMed/raw_dataset" dataset and convert its 'train' split into
# match objects with the project helper get_matches. Later cells read `.reports`
# and `.article` on each match.
dataset = datasets.load_dataset("FAERS-PubMed/raw_dataset")
matches = get_matches(dataset['train'])
print(len(matches))



  0%|          | 0/1 [00:00<?, ?it/s]

65648


In [4]:
# process full text
def remove_front(text):
    """Strip the front matter that precedes the '==== Body' marker.

    Everything before the first marker is discarded. If the marker occurs more
    than once, the pieces after the first occurrence are re-joined with
    newlines. Text without the marker is only whitespace-trimmed.

    Args:
        text: full text of an article as a string.

    Returns:
        The remaining text with leading/trailing whitespace removed.
    """
    marker = '==== Body'
    pieces = text.split(marker)
    if len(pieces) > 1:
        # pieces[0] is the front matter; keep whatever followed the marker(s)
        text = '\n'.join(pieces[1:])
    return text.strip()

def remove_refs(text):
    """Strip the reference section that follows the '==== Refs' marker.

    Everything after the last marker is discarded. If the marker occurs more
    than once, the pieces before the last occurrence are re-joined with
    newlines. Text without the marker is only whitespace-trimmed.

    Args:
        text: full text of an article as a string.

    Returns:
        The remaining text with leading/trailing whitespace removed.
    """
    marker = '==== Refs'
    pieces = text.split(marker)
    if len(pieces) > 1:
        # pieces[-1] is the reference list; keep whatever preceded the last marker
        text = '\n'.join(pieces[:-1])
    return text.strip()

def get_processed_fulltext(article):
    """Build the text shown to the model for one article.

    The article's full text is cut down to its body (front matter and
    reference section removed) and appended to the title and abstract.

    Args:
        article: object exposing `fulltext`, `title` and `abstract` attributes.

    Returns:
        Title, abstract and cleaned body joined by newlines, whitespace-trimmed.
    """
    body = remove_front(article.fulltext)
    body = remove_refs(body)
    return '\n'.join((article.title, article.abstract, body)).strip()

## Filter

In [5]:
# Filtering arguments
report_cutoff = 10  # matches with more reports than this are dropped
fulltext_only = True  # when True, matches whose article has no full text are dropped
commercial_only = False  # NOTE(review): not referenced by any visible cell
test_cutoff = datetime(year=2021, month=1, day=1)  # NOTE(review): not referenced by any visible cell; presumably a date boundary for the test split — confirm

In [6]:
# Drop matches that have more than `report_cutoff` reports.
# This rebinds `matches`, so all later cells only see the filtered list.
matches = [m for m in matches if len(m.reports) <= report_cutoff]
print(f'Matches with <= {report_cutoff} reports: {len(matches):,}')

Matches with <= 10 reports: 62,168


In [7]:
# Keep only matches whose article has a non-empty full text (skipped entirely
# when `fulltext_only` is False). Rebinds `matches` again.
if fulltext_only:
    matches = [m for m in matches if m.article.fulltext]
    print(f'Matches with full text: {len(matches):,}')

Matches with full text: 18,678


## Questions

In [8]:
# questions = [
#     WeightQuestion(),
#     DrugsGivenReactionQuestion(),
#     DrugAdministrationRouteQuestion(),
#     DrugDosageQuestion(),
#     DrugDosageTextQuestion(),
#     ReactionOutcomeQuestion(),
#     PatientAgeGivenReactionQuestion(),
#     PatientAgeGivenDrugQuestion()
# ]
# conversation_flow = None

# Questions specifically focused on ICSR extraction. The question classes are
# presumably provided by `from questions import *` — confirm.
questions = [
    PatientWeightAndSexQuestion(),
    DrugsQuestion(),
    ReactionsQuestion()
]
# Fixed ordering of question types; when this list is non-empty,
# sample_conversation follows it instead of shuffling the available types.
# NOTE(review): `q.t` looks like each question's type identifier — confirm in the questions module.
conversation_flow = [q.t for q in questions]



In [9]:
# Markers of uninformative answers.
# NOTE(review): no visible cell reads this list; filter_questions carries its own copy of the same two markers.
bad_answers = ['Unknown', 'UNK']
def get_questions(report):
    """Instantiate every configured question for one report.

    Each entry of the module-level `questions` list is asked to build its
    instances via `from_report(report)`. A question that raises is logged and
    skipped, so one failing question type does not abort the whole report.

    Args:
        report: report object; must expose `safetyreportid` (used in the log
            message) plus whatever the individual questions read.

    Returns:
        A flat list with the instances produced by all questions that succeeded.
    """
    instances = []
    for question in questions:
        try:
            new_q = question.from_report(report)
        except Exception as ex:
            logger.warning(f'Error with question {question.t} on report {report.safetyreportid}: {type(ex).__name__}, {ex.args}')
            # Skip this question. Without the `continue`, `new_q` would either be
            # undefined (NameError on the first question) or still hold the previous
            # question's instances, which would then be added a second time.
            continue
        instances.extend(new_q)
    return instances

def filter_questions(questions, bad_answers=('Unknown', 'UNK')):
    """Drop question instances whose answer contains an uninformative marker.

    Args:
        questions: iterable of indexable question instances whose element at
            index 1 is the answer.
        bad_answers: markers that disqualify an answer. The check is
            `marker in answer`, i.e. a substring test when the answer is a
            string. Defaults to the two markers that were previously
            hard-coded, so existing callers are unaffected.

    Returns:
        A new list with the instances whose answer contains none of the
        markers, in their original order.
    """
    return [
        q for q in questions
        if not any(marker in q[1] for marker in bad_answers)
    ]

In [10]:
# Index every report by its safetyreportid: the report object itself, its
# filtered question instances, and the pmid of the article it was matched to.
report_to_question = {}
report_to_article = {}
reports = {}

for match in matches:
    for report in match.reports:
        report_id = report.safetyreportid
        # a report id is expected to appear under exactly one match
        if report_id in report_to_question:
            raise KeyError('safetyreportid already in set')
        reports[report_id] = report
        report_to_question[report_id] = filter_questions(get_questions(report))
        report_to_article[report_id] = match.article.pmid

# Look-up table from pmid to article object
articles = {match.article.pmid: match.article for match in matches}

## Conversation

In [11]:
def sample_conversation(questions, seed=4):
    """Sample one question instance per question type to form a conversation.

    Args:
        questions: iterable of indexable question instances whose element at
            index 2 is the question type.
        seed: value passed to `random.seed` so the sampling is reproducible.
            Note that this reseeds the global `random` generator.

    Returns:
        A list with one randomly chosen instance per type. The types follow the
        module-level `conversation_flow` when it is non-empty; otherwise all
        types present in `questions` are used in a random order.

    Raises:
        IndexError: if `conversation_flow` names a type for which `questions`
            holds no instance (raised by `random.choice` on an empty list).
    """
    random.seed(seed)
    # group questions per type
    questions_per_type = defaultdict(list)
    for q in questions:
        questions_per_type[q[2]].append(q)

    # if a conversation flow is not defined, uniformly shuffle the available types
    if not conversation_flow:
        # random.sample needs a sequence: passing the dict keys view directly is
        # deprecated since Python 3.9 and raises TypeError from Python 3.11 on.
        types = list(questions_per_type)
        types_sampled = random.sample(types, len(types))
    # else, follow the flow
    else:
        types_sampled = conversation_flow

    # pick one question per type, in order
    return [random.choice(questions_per_type[q_type]) for q_type in types_sampled]

In [12]:
def conversation_to_chatml(conversation, article):
    """Render a sampled conversation as a list of ChatML-style messages.

    The result starts with a fixed system message. The first user message
    carries the first question together with the processed article text; every
    later turn contains the bare question. Each user message is followed by an
    assistant message holding the answer.

    Args:
        conversation: non-empty sequence of (question, answer, type) triples.
        article: article object accepted by get_processed_fulltext.

    Returns:
        A list of {'role': ..., 'content': ...} dicts.
    """
    system_message = {
        'role': 'system',
        'content': 'You are a helpful assistant. You read biomedical texts and concisely answer user questions about adverse drug events. You give the most specific answer supported by the text.'
    }

    # Only the opening turn embeds the article text
    opening_question, opening_answer, _ = conversation[0]
    article_text = get_processed_fulltext(article)
    messages = [
        system_message,
        {
            'role': 'user',
            'content': f'Answer the question given the text. \n\nQuestion: {opening_question}\n\nText: {article_text}'
        },
        {'role': 'assistant', 'content': opening_answer},
    ]

    # Remaining turns: plain question/answer pairs
    for question, answer, _ in conversation[1:]:
        messages += [
            {'role': 'user', 'content': question},
            {'role': 'assistant', 'content': answer},
        ]
    return messages

In [13]:
# Build one ChatML conversation per report. The report id doubles as the
# sampling seed, so each report always gets the same conversation.
report_to_chatml = {}

for report_id in reports:
    article = articles[report_to_article[report_id]]
    conversation = sample_conversation(report_to_question[report_id], seed=report_id)
    chatml = conversation_to_chatml(conversation, article)
    report_to_chatml[report_id] = chatml

In [14]:
# Spot-check: display the generated conversation for one hard-coded report id
report_to_chatml[19956654]

[{'role': 'system',
  'content': 'You are a helpful assistant. You read biomedical texts and concisely answer user questions about adverse drug events. You give the most specific answer supported by the text.'},
 {'role': 'user',
  'content': "Answer the question given the text. \n\nQuestion: This text describes an adverse drug event with regard to a patient or cohort. What is the weight of the patient? What is the sex of the patient? Produce an answer in the following format: 'The patient weights {{weight}} kg and is {{male|female}}'. If no weight or sex values can be identified, fill in 'N/A'.\n\nText: Case Report: Perioperative Kounis Syndrome in an Adolescent With Congenital Glaucoma.\nA 12-year-old male patient suffering from congenital glaucoma developed bradycardia, left ventricular failure, and hypotension after induction of anesthesia. Electrocardiography and echocardiography revealed a complete normalization of ECG and a complete spontaneous recovery in the cardiac function 7

## Truncate text if needed

In [15]:
# Tokenizer and token budget read by truncate_chatml
enc = tiktoken.encoding_for_model('gpt-4')  # tiktoken encoding looked up for the 'gpt-4' model name
max_prompt = 8192  # total token budget a conversation has to fit into
max_generation = 256  # tokens reserved for the final message, whatever its actual length
buffer = 256 * 2  # additional safety margin counted against the budget

In [16]:
def truncate_chatml(chatml):
    """Return a copy of `chatml` shortened to fit the token budget.

    The token count is estimated as the encoded length of every message, with
    the final message counted as `max_generation` tokens, plus one token per
    message. If that estimate plus `buffer` exceeds `max_prompt`, the surplus
    tokens are cut from the end of the message at index 1 (the first user
    message, which carries the article text). The input is never modified.

    Args:
        chatml: list of message dicts with a 'content' key; needs at least two
            messages whenever truncation is required.

    Returns:
        A deep copy of `chatml`, with message 1 truncated if necessary.
    """
    result = deepcopy(chatml)

    # token ids and token counts per message
    encoded = [enc.encode(message['content']) for message in chatml]
    token_counts = [len(tokens) for tokens in encoded]
    # always reserve the maximum for the final message
    token_counts[-1] = max_generation

    # one special token is added for every message
    estimated_total = sum(token_counts) + len(chatml)
    overflow = buffer + estimated_total - max_prompt

    if overflow > 0:
        # drop the surplus from the tail of the article-bearing message
        result[1]['content'] = enc.decode(encoded[1][:-overflow])
    return result

In [17]:
# Apply the token-budget truncation to every conversation; the untruncated
# conversations stay available in report_to_chatml.
report_to_chatml_truncated = {k: truncate_chatml(v) for k,v in report_to_chatml.items()}

## Split and save
For now, keep a single report per article: the same report that was sampled for the ICSR-Extraction dataset (`BioDEX/BioDEX-ICSR`), so that both datasets share the same train/validation/test splits.

In [18]:
# Reference dataset whose 'train', 'validation' and 'test' splits (and their
# safetyreportid columns) decide which conversations go into which split.
icsr_dataset = datasets.load_dataset('BioDEX/BioDEX-ICSR')



  0%|          | 0/3 [00:00<?, ?it/s]

In [19]:
# Assign each conversation to the split that contains its report id in the
# reference ICSR dataset. The id column of every split is materialised once
# as a set: the previous version re-read the column and scanned it linearly
# for every single conversation.
split_report_ids = {
    split_name: set(icsr_dataset[split_name]['safetyreportid'])
    for split_name in ('train', 'validation', 'test')
}

train_chatml = {k: v for k, v in report_to_chatml_truncated.items() if k in split_report_ids['train']}
validation_chatml = {k: v for k, v in report_to_chatml_truncated.items() if k in split_report_ids['validation']}
test_chatml = {k: v for k, v in report_to_chatml_truncated.items() if k in split_report_ids['test']}

print(len(train_chatml))
print(len(validation_chatml))
print(len(test_chatml))

9624
2407
3628


In [20]:
# Sizes of the reference splits, for comparison with the conversation counts
# printed by the previous cell.
print(len(icsr_dataset['train']))
print(len(icsr_dataset['validation']))
print(len(icsr_dataset['test']))


9624
2407
3628


In [21]:
# Write each split to its own JSON file in the working directory.
# json.dump stores the integer report ids as string keys.
split_outputs = (
    ('pv_conv_train.json', train_chatml),
    ('pv_conv_validation.json', validation_chatml),
    ('pv_conv_test.json', test_chatml),
)
for output_path, split_chatml in split_outputs:
    with open(output_path, 'w') as fp:
        json.dump(split_chatml, fp)


In [29]:
# Spot-check one validation example: print its report id and display the
# abstract stored for it in the reference dataset.
print(icsr_dataset['validation']['safetyreportid'][1000])
icsr_dataset['validation']['abstract'][1000]

14160119


'Neurolymphomatosis is a rare entity defined as nerve infiltration by neurotropic abnormal lymphocytes which can lead to the development of neuropathy, with typical presentations including pain, hypoesthesia, paresthesis and palsy. We herein report two cases where critical bilateral vocal cord paralysis due to neurolymphomatosis in recurrent nerves occurred in refractory Burkitt lymphoma and adult T-cell lymphoma patients. High-dose methotrexate and intrathecal chemotherapy injection for the nervous lesions were ineffective, and the patients died. Neurolymphomatosis of the recurrent nerve is an emergent and difficult complication and should be suspected when sudden onset of aphasia, hoarseness or shortness of breath is found in refractory lymphoma patients.'

In [30]:
# Display the conversation generated for that same report id (hard-coded here).
validation_chatml[14160119]

[{'role': 'system',
  'content': 'You are a helpful assistant. You read biomedical texts and concisely answer user questions about adverse drug events. You give the most specific answer supported by the text.'},
 {'role': 'user',
  'content': "Answer the question given the text. \n\nQuestion: This text describes an adverse drug event with regard to a patient or cohort. What is the weight of the patient? What is the sex of the patient? Produce an answer in the following format: 'The patient weights {{weight}} kg and is {{male|female}}'. If no weight or sex values can be identified, fill in 'N/A'.\n\nText: Two Cases of Neurolymphomatosis with Fatal Bilateral Vocal Cord Paralysis that were Diagnosed with 18F-fluorodeoxyglucose Positron Emission Tomography (FDG PET)/CT.\nNeurolymphomatosis is a rare entity defined as nerve infiltration by neurotropic abnormal lymphocytes which can lead to the development of neuropathy, with typical presentations including pain, hypoesthesia, paresthesis an