In [20]:
from src import Match, Icsr
from src.utils import get_matches

from datetime import datetime
import random
import datasets
import pandas as pd


In [21]:
# Load the raw BioDEX dataset (served from the local Hugging Face cache when present)
# and build the match objects from its 'train' split with the project helper get_matches.
dataset = datasets.load_dataset("BioDEX/raw_dataset")
matches = get_matches(dataset['train'])
# sanity check: number of matches before any filtering
print(len(matches))

Using custom data configuration BioDEX--raw_dataset-e1a8735a3d189f31
Found cached dataset json (/Users/kldooste/.cache/huggingface/datasets/BioDEX___json/BioDEX--raw_dataset-e1a8735a3d189f31/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

65648


In [54]:
import re
# reactions cannot contain commas, otherwise the comma-separated format breaks
# replace ^ by '
def normalize_reactions(reaction):
    """Remove every comma from a reaction term and turn carets into apostrophes.

    Reaction terms are later joined with ``", "``, so a comma inside a term
    would be indistinguishable from a separator. The rules are applied in this
    order:

    1. ``^`` becomes ``'``. This runs first so that the protein-structure rule
       below also recognises primes that arrive encoded as carets
       (``2^,3^`` -> ``2'-3'``).
    2. A comma between digits is treated as a decimal separator (``1,5`` -> ``1.5``).
    3. ``", "`` (enumeration) becomes a single space.
    4. A comma between primed positions becomes a dash (``2',3'`` -> ``2'-3'``).
    5. Any remaining comma is dropped.

    Args:
        reaction: the raw reaction term (``str``).

    Returns:
        The normalized term, guaranteed to contain neither ``,`` nor ``^``.
    """
    # replace ^ by ' (must happen before the protein-structure rule)
    reaction = reaction.replace('^', "'")

    # commas in floats should be points
    reaction = re.sub(r'(\d+),(\d+)', r'\1.\2', reaction)

    # commas in enumerations should just be spaces
    reaction = reaction.replace(', ', ' ')

    # commas in protein structures should be dashes; the lookahead leaves the
    # following position unconsumed so chains such as 2',3',4' are fully handled
    reaction = re.sub(r"(\d+'),(?=\d+')", r'\1-', reaction)

    # remove all other commas
    reaction = reaction.replace(',', '')

    return reaction

In [55]:
# arguments
# maximum number of reports a match may have to be kept
report_cutoff = 10
# NOTE(review): not referenced in any of the visible cells — confirm whether it is still needed
commercial_only = False
# intended date boundary between the train and test splits
test_cutoff = datetime(year=2021, month=1, day=1)

In [56]:
# keep only the matches that have at most `report_cutoff` reports
matches = [
    match
    for match in matches
    if len(match.reports) <= report_cutoff
]
print(f'Matches with <= {report_cutoff} reports: {len(matches):,}')

Matches with <= 10 reports: 18,678


In [57]:
# drop matches whose article has no (or an empty) full text
matches = [
    match
    for match in matches
    if match.article.fulltext
]
print(f'Matches with full text: {len(matches):,}')

Matches with full text: 18,678


In [58]:
def split_data(matches, test_cutoff):
    """Partition matches into train and test sets by article publication year.

    Only the first four characters of ``m.article.pubdate`` (the year) are
    parsed, so every article is dated to January 1st of its publication year
    before being compared with ``test_cutoff``. Articles dated on or after the
    cutoff go to the test set, all others to the train set. Input order is
    preserved within each set.

    Args:
        matches: iterable of objects exposing ``article.pubdate`` as a string
            that starts with a four-digit year.
        test_cutoff: ``datetime`` marking the start of the test period.

    Returns:
        ``(train_matches, test_matches)`` as two lists. Both sizes are printed.
    """
    # index by the outcome of the comparison: True -> test, False -> train
    buckets = {True: [], False: []}
    for match in matches:
        year_start = datetime.strptime(match.article.pubdate[:4], '%Y')
        buckets[year_start >= test_cutoff].append(match)

    train_matches, test_matches = buckets[False], buckets[True]

    print(f'Train size: {len(train_matches):,}')
    print(f'Test size: {len(test_matches):,}')

    return train_matches, test_matches

In [59]:
# Split on the cutoff configured in the arguments cell instead of a second
# hard-coded date, so that changing `test_cutoff` actually changes the split.
cutoff = test_cutoff
train, test = split_data(matches, cutoff)

Train size: 14,429
Test size: 4,249


In [60]:
def reactions_from_report(report):
    """Return the sorted, de-duplicated, normalized reaction terms of one report.

    Every entry of ``report.patient.reaction`` with a truthy
    ``reactionmeddrapt`` contributes that term, passed through
    ``normalize_reactions``.

    Normalization happens *before* de-duplicating and sorting: two distinct raw
    terms can normalize to the same string, and normalization can change how
    terms compare, so doing it afterwards could leave duplicates or an
    unsorted list.

    Args:
        report: object exposing ``patient.reaction``, an iterable of items
            with a ``reactionmeddrapt`` attribute.

    Returns:
        A sorted list of unique normalized reaction strings.
    """
    normalized = {
        normalize_reactions(reaction.reactionmeddrapt)
        for reaction in report.patient.reaction
        # skip reactions without a term (None or empty string)
        if reaction.reactionmeddrapt
    }
    return sorted(normalized)

def reactions_from_match(match):
    """Merge the reaction terms of all reports in a match.

    Collects the output of ``reactions_from_report`` for every report in
    ``match.reports`` and returns the union as a sorted list without
    duplicates.
    """
    unique_terms = {
        term
        for report in match.reports
        for term in reactions_from_report(report)
    }
    return sorted(unique_terms)

In [61]:
def get_ds_from_split(split):
    """Convert a list of matches into a Hugging Face ``Dataset``.

    For each match the article fields (``match.article.dict()``) are extended with:

    * ``reactions``: the merged reaction terms of all reports, joined by ``", "``;
    * ``reactions_unmerged``: one ``", "``-joined string per report;
    * ``safetyreportids``: the ``safetyreportid`` of every report.

    The records are loaded into a DataFrame, restricted to and ordered by a
    fixed column list, missing values are replaced with ``''``, and the frame
    is converted with ``datasets.Dataset.from_pandas``.
    """
    # fixed column selection and order of the resulting dataset
    column_order = [
        'title', 'abstract', 'fulltext', 'reactions', 'reactions_unmerged',
        'pmid', 'fulltext_license', 'title_normalized', 'issue', 'pages',
        'journal', 'authors', 'pubdate', 'doi', 'affiliations', 'medline_ta',
        'nlm_unique_id', 'issn_linking', 'country', 'mesh_terms',
        'publication_types', 'chemical_list', 'keywords', 'references',
        'delete', 'pmc', 'other_id', 'safetyreportids',
    ]

    records = []
    for match in split:
        per_report_terms = [reactions_from_report(report) for report in match.reports]
        report_ids = [report.safetyreportid for report in match.reports]

        record = match.article.dict()
        record.update({
            'reactions': ", ".join(reactions_from_match(match)),
            'reactions_unmerged': [", ".join(terms) for terms in per_report_terms],
            'safetyreportids': report_ids
        })
        records.append(record)

    frame = pd.DataFrame(data=records)
    frame = frame[column_order]
    frame = frame.fillna('')
    return datasets.Dataset.from_pandas(frame)

In [62]:
# build one dataset per split and shuffle each with a fixed seed for reproducibility
train_ds = get_ds_from_split(train).shuffle(seed=42)
test_ds = get_ds_from_split(test).shuffle(seed=42)

In [63]:
# create a validation set
len_train = int(len(train_ds) * 0.8)

val_ds = train_ds.select(range(len_train,len(train_ds)))
train_ds = train_ds.select(range(len_train))

In [64]:
# report the final number of rows in every split
for label, split_ds in (
    ('train size: ', train_ds),
    ('val size: ', val_ds),
    ('test size: ', test_ds),
):
    print(label, len(split_ds))

train size:  11543
val size:  2886
test size:  4249


In [65]:
# preprocess fulltext
def remove_front(text):
    """Drop everything up to and including the first ``==== Body`` marker.

    If the marker occurs more than once, the later occurrences are replaced by
    a newline. Text without the marker is left as is. The result is stripped
    of leading and trailing whitespace.
    """
    pieces = text.split('==== Body')
    # more than one piece means the marker was present
    if len(pieces) > 1:
        text = '\n'.join(pieces[1:])
    return text.strip()

def remove_refs(text):
    """Drop the last ``==== Refs`` marker and everything after it.

    If the marker occurs more than once, the earlier occurrences are replaced
    by a newline. Text without the marker is left as is. The result is
    stripped of leading and trailing whitespace.
    """
    pieces = text.split('==== Refs')
    # more than one piece means the marker was present
    if len(pieces) > 1:
        text = '\n'.join(pieces[:-1])
    return text.strip()

def get_fulltext_input(row, fulltext_only=False):
    """Assemble the text input for one dataset row.

    The output always contains a ``TITLE:`` and an ``ABSTRACT:`` section built
    from ``row['title']`` and ``row['abstract']``. Despite its name,
    ``fulltext_only=True`` does not drop those sections: it *adds* a ``TEXT:``
    section holding ``row['fulltext']`` after ``remove_front`` and
    ``remove_refs`` have been applied.

    Args:
        row: mapping with the string fields ``'title'``, ``'abstract'`` and ``'fulltext'``.
        fulltext_only: when true, append the filtered full text.

    Returns:
        The sections joined by newlines, stripped of surrounding whitespace.
    """
    body = remove_refs(remove_front(row['fulltext']))
    sections = ['\nTITLE:', row['title'], '\nABSTRACT:', row['abstract']]
    if fulltext_only:
        sections += ['\nTEXT:', body]
    return '\n'.join(sections).strip()


In [66]:
# bundle the three splits into a single DatasetDict keyed by split name
ds = datasets.DatasetDict({
    'train': train_ds,
    'validation': val_ds,
    'test': test_ds
})

In [67]:
def _add_fulltext_processed(example):
    """Return the extra 'fulltext_processed' column value for one example."""
    return {'fulltext_processed': get_fulltext_input(example, fulltext_only=True)}


# add the processed input text (title, abstract and filtered full text) to every split
ds = ds.map(_add_fulltext_processed)

  0%|          | 0/11543 [00:00<?, ?ex/s]

  0%|          | 0/2886 [00:00<?, ?ex/s]

  0%|          | 0/4249 [00:00<?, ?ex/s]

In [37]:
# upload data
# NOTE(review): this publishes every split of `ds` to the Hub repository
# 'BioDEX/BioDEX-Reactions'; presumably it needs an authenticated Hugging Face
# session with write access — confirm before re-running.
ds.push_to_hub('BioDEX/BioDEX-Reactions')
