
# Make dataset

This notebook contains the code to analyse the content of the PubMed Central (PMC) Author Manuscript Collection. \
See: https://www.ncbi.nlm.nih.gov/pmc/about/mscollection/

The files should be downloaded from https://ftp.ncbi.nlm.nih.gov/pub/pmc/manuscript/xml/ into the `~/pmc_dataset` folder.

The resulting tables will be created under the `~/review/dataset` folder (see `config.py`).

Please ensure that the `PYTHONPATH` environment variable includes the project folder, so that the `review.config` module can be imported.



In [None]:
% matplotlib inline
% config InlineBackend.figure_format='retina'

In [None]:
import logging
import os
import sys

import pandas as pd
from IPython.display import display

# Timestamped INFO-level logging for the progress messages emitted further down.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')

# Project configuration; the project folder has to be on PYTHONPATH for this import to work.
import review.config as cfg

# Folder holding the downloaded PMC manuscript XML files and their *filelist.txt indexes.
pmc_dataset_root = os.path.expanduser('~/pmc_dataset')
# Output folder for the generated tables, taken from the project configuration.
dataset_root = os.path.expanduser(cfg.dataset_path)

### Collecting articles

In [None]:
from glob import glob
from lxml import etree
from tqdm.auto import tqdm

# Maps PMID -> path of the article XML file (relative to pmc_dataset_root),
# as listed in the '*filelist.txt' index files of the downloaded collection.
dict_articles = {}

for filelist in tqdm(glob(os.path.join(pmc_dataset_root, '*filelist.txt'))):
    with open(filelist, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            # Skip the column-header line of the index file.
            if 'LastUpdated' in line:
                continue
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            if len(fields) != 6:
                # Report and skip a malformed record instead of aborting the whole scan.
                logging.warning('%s:%d: expected 6 fields, got %d - line skipped',
                                filelist, line_no, len(fields))
                continue
            # Only the file name and the PMID are needed; the other columns are ignored.
            filename, _pmcid, pmid, _mid, _date, _time = fields
            dict_articles[pmid] = filename

In [None]:
# Preview the first ten (PMID, file name) pairs of the collected index.
print([*dict_articles.items()][:10])

In [None]:
import nltk


def split_text(text):
    """Split text into sentences and re-join fragments the tokenizer cut apart.

    The text is tokenized with ``nltk.tokenize.sent_tokenize``. A fragment is glued
    to the sentence being built when that sentence ends with ``'Fig.'`` or when the
    fragment starts with a lowercase letter or a digit. Whitespace inside every
    resulting sentence is collapsed to single spaces.

    :param text: text to split
    :return: list of sentences
    """

    def starts_like_continuation(fragment):
        # A fragment opening with a lowercase letter or a digit is treated as the
        # continuation of the previous sentence.
        first_char = fragment.strip()[0]
        return first_char.islower() or first_char.isdigit()

    raw = nltk.tokenize.sent_tokenize(text)
    total = len(raw)
    merged = []
    pos = 0
    while pos < total:
        current = raw[pos]
        glue_next = pos + 1 < total and starts_like_continuation(raw[pos + 1])
        while pos + 1 < total and (current.endswith('Fig.') or glue_next):
            pos += 1
            current = current + " " + " ".join(raw[pos].strip().split())
            if pos + 1 < total:
                glue_next = starts_like_continuation(raw[pos + 1])
        merged.append(" ".join(current.strip().split()))
        pos += 1
    return merged


def get_sentences(node):
    """Extract the sentences and the abstract of a parsed article.

    The ``body`` subtree is flattened to text. Every ``xref`` element is replaced by a
    ``xref_<ref-type>_<rid>`` marker (its own text is dropped) and ``title`` elements
    contribute only their tail text. Text of an element whose direct ``title`` child
    contains the word "discussion" (case-insensitive), and of everything below such
    an element, is collected separately from the rest of the body.

    :param node: root element of the article (lxml / ElementTree element API)
    :return: tuple ``(general_sentences, discussion_sentences, abstract)`` - two lists
        produced by ``split_text`` and the whitespace-normalised abstract text, which
        is ``''`` when the article has no ``front/article-meta/abstract`` element
    :raises AttributeError: when the article has no ``body`` element
    """

    def helper(node, is_disc):
        # Returns (general_text, discussion_text) for the subtree rooted at `node`,
        # including the tail text that follows the node.
        tail = node.tail or ''
        if not isinstance(node.tag, str):
            # Comments and processing instructions have a non-string tag; their content
            # is not article text, so only the text following them is kept.
            return ('', tail) if is_disc else (tail, '')
        if node.tag == 'xref':
            # Replace the cross reference with a marker that can be searched for later.
            marker = f' xref_{node.get("ref-type")}_{node.get("rid")} ' + tail
            return ('', marker) if is_disc else (marker, '')
        if node.tag == 'title':
            # Section titles are not sentences: keep only what follows them.
            return ('', tail) if is_disc else (tail, '')
        if not is_disc:
            title = node.find('title')
            if title is not None and 'discussion' in "".join(title.itertext()).lower():
                is_disc = True
        gen_parts, disc_parts = [], []
        # The node's own text and tail go to the bucket selected by is_disc.
        own_parts = disc_parts if is_disc else gen_parts
        own_parts.append(node.text or '')
        for child in node:
            gen, disc = helper(child, is_disc)
            gen_parts.append(gen)
            disc_parts.append(disc)
        own_parts.append(tail)
        return "".join(gen_parts), "".join(disc_parts)

    gen_res, disc_res = helper(node.find('body'), False)
    gen_res = split_text(gen_res)
    disc_res = split_text(disc_res)

    # Walk front -> article-meta -> abstract; a missing level means "no abstract".
    abstract_node = node
    for tag in ('front', 'article-meta', 'abstract'):
        abstract_node = abstract_node.find(tag)
        if abstract_node is None:
            break

    abstract = ""
    if abstract_node is not None:
        abstract = " ".join("".join(abstract_node.itertext()).split())
    return gen_res, disc_res, abstract

In [None]:
# Smoke test of get_sentences on a single sample article.
tree = etree.parse(f"{pmc_dataset_root}/PMC004xxxxxx/PMC4239434.xml")
sents = get_sentences(tree.getroot())
print(sents)
# Sizes of the three returned parts: general sentences, discussion sentences, abstract.
print(*map(len, sents))

In [None]:
def get_all_refs(node):
    """Collect PMID citations, figure captions and table captions of an article.

    Citations are read from the ``ref`` children of ``back/ref-list``; a reference is
    kept only when its ``element-citation`` has a non-empty ``pub-id`` of type
    ``pmid`` (it does not have to be the first ``pub-id`` of the citation).
    Captions are read from the ``fig`` and ``table-wrap`` children of
    ``floats-group``. Missing ``back``, ``ref-list`` or ``floats-group`` elements
    yield empty dictionaries.

    :param node: root element of the article (lxml / ElementTree element API)
    :return: tuple ``(citations, figs, tables)`` where ``citations`` maps the ``id`` of
        a ``ref`` to ``{'publication-type': ..., 'pmid': ...}`` and ``figs`` / ``tables``
        map element ids to whitespace-normalised caption text
    """

    def get_pmid_citation(ref):
        # Returns the citation record of a <ref>, or None when it has no usable PMID.
        citation = ref.find('element-citation')
        if citation is None:
            return None
        pub_id = citation.find('pub-id[@pub-id-type="pmid"]')
        pmid = (pub_id.text or '').strip() if pub_id is not None else ''
        if not pmid:
            return None
        return {'publication-type': citation.get('publication-type'), 'pmid': pmid}

    def get_citation_info(ref_list):
        # Maps ref id -> citation record for the direct <ref> children of the list.
        if ref_list is None:
            return {}
        res = {}
        for ref in ref_list:
            if ref.tag != 'ref':
                continue
            info = get_pmid_citation(ref)
            if info is not None:
                res[ref.get('id')] = info
        return res

    def get_captions(floats, tag):
        # Maps element id -> caption text for the direct `tag` children having a caption.
        if floats is None:
            return {}
        res = {}
        for child in floats:
            if child.tag != tag:
                continue
            caption = child.find('caption')
            if caption is not None:
                res[child.get('id')] = " ".join("".join(caption.itertext()).split())
        return res

    back = node.find('back')
    citations = get_citation_info(back.find('ref-list') if back is not None else None)
    floats = node.find('floats-group')
    figs = get_captions(floats, 'fig')
    tables = get_captions(floats, 'table-wrap')
    return citations, figs, tables

In [None]:
# Smoke test: show the citations, figure captions and table captions of the sample article.
get_all_refs(tree.getroot())

In [None]:
import re

# Captures the reference id that follows an 'xref_bibr_' citation marker.
# Raw string: '\d' and '\w' are invalid escape sequences in a plain string literal.
pattern = re.compile(r"(?<=xref_bibr_)[\d\w]+")


def count_reverse(sents_gen, sents_disc, pmid):
    """List every bibliographic reference marker found in the given sentences.

    :param sents_gen: sentences of the general part of the article
    :param sents_disc: sentences of the discussion part of the article
    :param pmid: identifier of the article, copied into every row
    :return: list of ``(pmid, sentence_type, sentence_index, ref_id)`` tuples, general
        sentences first; ``sentence_type`` is ``'general'`` or ``'discussion'`` and
        ``sentence_index`` is the position of the sentence within its list, as a string
    """
    result = []
    for sent_type, sents in (('general', sents_gen), ('discussion', sents_disc)):
        for idx, sent in enumerate(sents):
            result.extend((pmid, sent_type, str(idx), ref_id) for ref_id in pattern.findall(sent))
    return result

In [None]:
# Smoke test: list the bibliographic reference markers found in the sample article.
gen_sents, disc_sents, abst = get_sentences(tree.getroot())
count_reverse(gen_sents, disc_sents, '2000292')

In [None]:
def is_review(tree):
    """Tell whether an article is categorised as a review.

    Looks at the ``subject`` of every ``subj-group`` found under
    ``front/article-meta/article-categories`` and checks whether its text contains
    ``'Review'``. Groups without a ``subject`` (or with an empty one) are skipped.

    :param tree: root element of the article (lxml / ElementTree element API)
    :return: ``True`` if any subject contains ``'Review'``, ``False`` otherwise,
        including when the category section is missing
    """
    categories = tree
    for tag in ('front', 'article-meta', 'article-categories'):
        if categories is None:
            return False
        categories = categories.find(tag)
    if categories is None:
        return False

    for group in categories.findall('subj-group'):
        subject = group.find('subject')
        if subject is not None and 'Review' in (subject.text or ''):
            return True
    return False


# Test: run is_review on a single sample article
is_review(etree.parse(f"{pmc_dataset_root}/PMC001xxxxxx/PMC1817751.xml").getroot())

## Create tables required for model learning

In [None]:
import csv
from contextlib import ExitStack

# Output tables (file stem -> header row); every table is a tab-separated file.
table_headers = {
    'review_files': ['pmid'],
    'citations': ['pmid', 'ref_id', 'pub_type', 'ref_pmid'],
    'sentences': ['pmid', 'sent_id', 'type', 'sentence'],
    'abstracts': ['pmid', 'abstract'],
    'figures': ['pmid', 'fig_id', 'caption'],
    'tables': ['pmid', 'tab_id', 'caption'],
    'reverse_ref': ['pmid', 'sent_type', 'sent_id', 'ref_id'],
}

os.makedirs(dataset_root, exist_ok=True)

with ExitStack() as stack:
    # Every table is opened once and kept open for the whole run instead of being
    # reopened for each article. csv.writer quotes fields containing quote or
    # delimiter characters, so pandas.read_csv can read the rows back unchanged,
    # and it writes missing values (None) as empty fields.
    print('Headers')
    writers = {}
    for table_name, header in table_headers.items():
        table_file = stack.enter_context(
            open(f'{dataset_root}/{table_name}.csv', 'w', newline='', encoding='utf-8'))
        writers[table_name] = csv.writer(table_file, delimiter='\t', lineterminator='\n')
        writers[table_name].writerow(header)

    print('Processing articles')
    for pmid, filename in tqdm(list(dict_articles.items())):
        try:
            root = etree.parse(pmc_dataset_root + "/" + filename).getroot()
            gen_sents, disc_sents, abstract = get_sentences(root)
            cits, figs, tables = get_all_refs(root)
        except Exception as e:
            # Best effort: report the article that could not be processed and go on.
            print("\rsomething went wrong", pmid, filename, e)
            continue
        if is_review(root):
            writers['review_files'].writerow([pmid])
        for ref_id, info in cits.items():
            writers['citations'].writerow([pmid, ref_id, info['publication-type'], info['pmid']])
        # Sentence ids restart from 0 for each of the two sentence types.
        for sent_type, sents in (('general', gen_sents), ('discussion', disc_sents)):
            for sent_id, sent in enumerate(sents):
                writers['sentences'].writerow([pmid, sent_id, sent_type, sent])
        if abstract != '':
            writers['abstracts'].writerow([pmid, abstract])
        for fig_id, caption in figs.items():
            writers['figures'].writerow([pmid, fig_id, caption])
        for tab_id, caption in tables.items():
            writers['tables'].writerow([pmid, tab_id, caption])
        writers['reverse_ref'].writerows(count_reverse(gen_sents, disc_sents, pmid))

## Check dataset loading

In [None]:
def sizeof_fmt(num, suffix='B'):
    """Format a size as a human-readable string with binary (1024-based) prefixes.

    :param num: size to format
    :param suffix: unit suffix appended after the prefix
    :return: string such as ``'1.5 KiB'``; sizes of 1024**8 and above are given in ``Yi``
    """
    step = 1024.0
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    idx = 0
    # Scale down until the value fits below one step or the prefixes run out.
    while idx < len(prefixes) and not abs(num) < step:
        num = num / step
        idx += 1
    if idx < len(prefixes):
        return "%3.1f %s%s" % (num, prefixes[idx], suffix)
    return "%.1f %s%s" % (num, 'Yi', suffix)

In [None]:
# Check that citations.csv loads as a tab-separated table and log its in-memory size.
logging.info('Loading citations_df')
citations_df = pd.read_csv(os.path.join(dataset_root, "citations.csv"), sep='\t')
logging.info(sizeof_fmt(sys.getsizeof(citations_df)))
display(citations_df.head())
# Drop the reference once the table has been inspected.
del citations_df

In [None]:
# Check that review_files.csv loads as a tab-separated table and log its in-memory size.
logging.info('Loading review_files_df')
review_files_df = pd.read_csv(os.path.join(dataset_root, "review_files.csv"), sep='\t')
logging.info(sizeof_fmt(sys.getsizeof(review_files_df)))
display(review_files_df.head())
# Drop the reference once the table has been inspected.
del review_files_df

In [None]:
# Check that reverse_ref.csv loads as a tab-separated table and log its in-memory size.
logging.info('Loading reverse_ref_df')
reverse_ref_df = pd.read_csv(os.path.join(dataset_root, "reverse_ref.csv"), sep='\t')
logging.info(sizeof_fmt(sys.getsizeof(reverse_ref_df)))
display(reverse_ref_df.head())
# Drop the reference once the table has been inspected.
del reverse_ref_df

In [None]:
# Check that abstracts.csv loads as a tab-separated table and log its in-memory size.
logging.info('Loading abstracts_df')
abstracts_df = pd.read_csv(os.path.join(dataset_root, "abstracts.csv"), sep='\t')
logging.info(sizeof_fmt(sys.getsizeof(abstracts_df)))
display(abstracts_df.head())
# Drop the reference once the table has been inspected.
del abstracts_df

In [None]:
# Check that figures.csv loads as a tab-separated table and log its in-memory size.
logging.info('Loading figures_df')
figures_df = pd.read_csv(os.path.join(dataset_root, "figures.csv"), sep='\t')
logging.info(sizeof_fmt(sys.getsizeof(figures_df)))
display(figures_df.head())
# Drop the reference once the table has been inspected.
del figures_df

In [None]:
# Check that tables.csv loads as a tab-separated table and log its in-memory size.
logging.info('Loading tables_df')
tables_df = pd.read_csv(os.path.join(dataset_root, "tables.csv"), sep='\t')
logging.info(sizeof_fmt(sys.getsizeof(tables_df)))
display(tables_df.head())
# Drop the reference once the table has been inspected.
del tables_df

In [None]:
# Check that sentences.csv loads as a tab-separated table and log its in-memory size.
logging.info('Loading sentences_df')
sentences_df = pd.read_csv(os.path.join(dataset_root, "sentences.csv"), sep='\t')
logging.info(sizeof_fmt(sys.getsizeof(sentences_df)))
display(sentences_df.head())
# Drop the reference once the table has been inspected.
del sentences_df