In [1]:
from src import Match, Icsr
from src.utils import get_matches

from datetime import datetime
import random
import datasets
import pandas as pd


In [2]:
# load matches
dataset = datasets.load_dataset("BioDEX/raw_dataset")
matches = get_matches(dataset['train'])
print(len(matches))

Using custom data configuration BioDEX--raw_dataset-e1a8735a3d189f31


Downloading and preparing dataset json/BioDEX--raw_dataset to /Users/kldooste/.cache/huggingface/datasets/BioDEX___json/BioDEX--raw_dataset-e1a8735a3d189f31/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /Users/kldooste/.cache/huggingface/datasets/BioDEX___json/BioDEX--raw_dataset-e1a8735a3d189f31/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

65648


In [38]:
# arguments
report_cutoff = 10
fulltext_only = True
# fulltext_only = False
commercial_only = False
test_cutoff = datetime(year=2021, month=1, day=1)

### Filter too many reports
A few articles have many reports. They are typically survey articles, drop them.

In [39]:
matches = [m for m in matches if len(m.reports) <= report_cutoff]
print(f'Matches with <= {report_cutoff} reports: {len(matches):,}')

Matches with <= 10 reports: 62,168


### Get articles with a full text

In [40]:
if fulltext_only:
    matches = [m for m in matches if m.article.fulltext]
    print(f'Matches with full text: {len(matches):,}')

Matches with full text: 18,678


## (optional) Get articles with commercial license

In [41]:
# noncommercial_licenses = {'CC BY-NC', 'CC BY-NC-SA', 'CC BY-NC-ND'}
commercial_licenses = {'CC0', 'CC BY', 'CC BY-SA', 'CC BY-ND'}

if commercial_only:
    matches = [m for m in matches if m.article.fulltext_license in commercial_licenses]
    print(f'Fulltext commercial dataset: {len(matches):,}')

## Split the data

In [42]:
def split_data(matches, test_cutoff):
    test_matches = []
    train_matches = []

    for m in matches:
        pubdate = datetime.strptime(m.article.pubdate[:4], '%Y')
        if  pubdate >= test_cutoff:
            test_matches.append(m)
        else:
            train_matches.append(m)

    print(f'Train size: {len(train_matches):,}')
    print(f'Test size: {len(test_matches):,}')

    return train_matches, test_matches

In [43]:
cutoff = datetime(year=2021, month=1, day=1)
train, test = split_data(matches, cutoff)

Train size: 14,429
Test size: 4,249


### Format the data
For every article, sample one report as target.

Drop some fields and upload to huggingface.

In [44]:

def get_icsrs_from_split(split):
    random.seed(42)

    icsrs = []
    reportids = []
    for m in split:
        # get all valid icsrs 
        new_icsrs = [(Icsr.from_report(r), r.safetyreportid) for r in m.reports]
        new_icsrs = [t for t in new_icsrs if t[0]]
        # sample one
        if new_icsrs:
            new_icsr, new_reportid = random.choice(new_icsrs)
            icsrs.append(new_icsr)
            reportids.append(new_reportid)
        else:
            icsrs.append(None)
            reportids.append(None)
    return icsrs, reportids

def get_ds_from_split(split):
    icsrs, reportids = get_icsrs_from_split(split)
    # only keep data with valid icsrs
    valid = [m for index,m in enumerate(split) if icsrs[index]]
    reportids = [reportids[i] for i in range(len(reportids)) if icsrs[i]]
    icsrs = [i for i in icsrs if i]
    print(f'Found samples with valid icsrs: {len(valid):,}')

    # format the data
    data = [m.article.dict() for m in valid]
    for d, i, reportid in zip(data, icsrs, reportids):
        d.update({
            'target': i.to_string(),
            'safetyreportid': reportid
        })

    df = pd.DataFrame(data=data)
    # reorder some of the columns
    df = df[['title', 'abstract','fulltext','target', 'pmid', 'fulltext_license', 'title_normalized','issue', 'pages', 'journal', 'authors', 'pubdate', 'doi', 'affiliations', 'medline_ta', 'nlm_unique_id', 'issn_linking', 'country', 'mesh_terms', 'publication_types', 'chemical_list', 'keywords', 'references', 'delete', 'pmc', 'other_id', 'safetyreportid']]
    df = df.fillna('')
    ds = datasets.Dataset.from_pandas(df)
    return ds

In [45]:
train_ds = get_ds_from_split(train)
test_ds = get_ds_from_split(test)

train_ds = train_ds.shuffle(42)
test_ds = test_ds.shuffle(42)

Found samples with valid icsrs: 12,031
Found samples with valid icsrs: 3,628


In [46]:
train_ds

Dataset({
    features: ['title', 'abstract', 'fulltext', 'target', 'pmid', 'fulltext_license', 'title_normalized', 'issue', 'pages', 'journal', 'authors', 'pubdate', 'doi', 'affiliations', 'medline_ta', 'nlm_unique_id', 'issn_linking', 'country', 'mesh_terms', 'publication_types', 'chemical_list', 'keywords', 'references', 'delete', 'pmc', 'other_id', 'safetyreportid'],
    num_rows: 12031
})

In [47]:
# create a validation set
len_train = int(len(train_ds) * 0.8)

val_ds = train_ds.select(range(len_train,len(train_ds)))
train_ds = train_ds.select(range(len_train))

In [48]:
print('train size: ', len(train_ds))
print('val size: ', len(val_ds))
print('test size: ', len(test_ds))

train size:  9624
val size:  2407
test size:  3628


### Preprocess fulltext

In [58]:
def remove_front(text):
    if '==== Body' in text:
        text = ('\n').join(text.split('==== Body')[1:])
    return text.strip()

def remove_refs(text):
    if '==== Refs' in text:
        text = ('\n').join(text.split('==== Refs')[:-1])
    return text.strip() 

def get_fulltext_input(row, fulltext_only=False):
    fulltext_filtered = remove_refs(remove_front(row['fulltext']))
    data = ['\nTITLE:', row['title'], '\nABSTRACT:', row['abstract']]
    if fulltext_only:
        data.extend(['\nTEXT:', fulltext_filtered])
    return ('\n').join(data).strip()

# numbered_titles_re = r'^(\d[\.\d]*) (.*)\n'
# capitalized_titles_re = r'^([A-Z ]+)\n'

In [59]:
ds = datasets.DatasetDict({
    'train': train_ds,
    'validation': val_ds,
    'test': test_ds
})

In [60]:
ds = ds.map(lambda example: {'fulltext_processed': get_fulltext_input(example, fulltext_only=fulltext_only)})

  0%|          | 0/9624 [00:00<?, ?ex/s]

  0%|          | 0/2407 [00:00<?, ?ex/s]

  0%|          | 0/3628 [00:00<?, ?ex/s]

In [61]:
ds

DatasetDict({
    train: Dataset({
        features: ['title', 'abstract', 'fulltext', 'target', 'pmid', 'fulltext_license', 'title_normalized', 'issue', 'pages', 'journal', 'authors', 'pubdate', 'doi', 'affiliations', 'medline_ta', 'nlm_unique_id', 'issn_linking', 'country', 'mesh_terms', 'publication_types', 'chemical_list', 'keywords', 'references', 'delete', 'pmc', 'other_id', 'safetyreportid', 'fulltext_processed'],
        num_rows: 9624
    })
    validation: Dataset({
        features: ['title', 'abstract', 'fulltext', 'target', 'pmid', 'fulltext_license', 'title_normalized', 'issue', 'pages', 'journal', 'authors', 'pubdate', 'doi', 'affiliations', 'medline_ta', 'nlm_unique_id', 'issn_linking', 'country', 'mesh_terms', 'publication_types', 'chemical_list', 'keywords', 'references', 'delete', 'pmc', 'other_id', 'safetyreportid', 'fulltext_processed'],
        num_rows: 2407
    })
    test: Dataset({
        features: ['title', 'abstract', 'fulltext', 'target', 'pmid', 'fulltext

### Upload the data

In [69]:
if fulltext_only:
    ds.push_to_hub('BioDEX/BioDEX-ICSR')
else:
    ds.push_to_hub('BioDEX/BioDEX-ICSR-Abstract')


Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

### Load the data

In [53]:
ds = datasets.load_dataset('BioDEX/BioDEX-ICSR')

Using custom data configuration BioDEX--BioDEX-ICSR-56e19d4a04830a55
Found cached dataset parquet (/Users/kldooste/.cache/huggingface/datasets/BioDEX___parquet/BioDEX--BioDEX-ICSR-56e19d4a04830a55/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# get the ranges
def get_ranges(dates):
    dates = [int(d[:4]) for d  in dates]
    return min(dates), max(dates)

print(get_ranges(ds['train']['pubdate']))
print(get_ranges(ds['validation']['pubdate']))
print(get_ranges(ds['test']['pubdate']))

(1990, 2020)
(1985, 2020)
(2021, 2022)


In [9]:
# average target length
target_lengths = [len(e['target']) for e in ds['train']]
print(sum(target_lengths) / len(target_lengths))
print(max(target_lengths))

# get percentile of a length
# Calculate percentile
import numpy as np
length = 128
percentile = np.percentile(target_lengths, (length / len(target_lengths)) * 100)

# Print percentile
print("The percentile of", length, "is", percentile)

163.05382377389859
1325
The percentile of 128 is 69.0
