# Make dataset

This notebook contains the code to analyse the content of the PubMed Central (PMC) Author Manuscript Collection. \
See: https://www.ncbi.nlm.nih.gov/pmc/about/mscollection/

Files can be downloaded here: https://ftp.ncbi.nlm.nih.gov/pub/pmc/manuscript/ \
**Please ensure** that the files are downloaded into the `~/pmc_dataset` folder before proceeding.

The resulting tables will be created under the `~/review/dataset` folder (see `config.py`).

In [None]:
import os

import pysrc.review.config as cfg

# Folder holding the downloaded PMC Author Manuscript files (filelist.txt and the XML folders).
pmc_dataset_root = os.path.expanduser('~/pmc_dataset')

# Folder the generated tables are written to; the path itself comes from the project config.
dataset_root = os.path.expanduser(cfg.dataset_path)

In [None]:
# Maps each PMID to the path (relative to pmc_dataset_root) of its manuscript XML file.
dict_articles = {}

# These files should be downloaded from https://ftp.ncbi.nlm.nih.gov/pub/pmc/manuscript/
with open(f"{pmc_dataset_root}/filelist.txt", 'r') as f:
    for line in f:
        if not line.strip():
            # A blank line (e.g. at the end of the manifest) would break the 4-way unpack below.
            continue
        filename, pmcid, pmid, mid = line.split()
        if filename == 'File':
            # Header row of the manifest, not an article.
            continue
        dict_articles[pmid] = filename

In [None]:
# Peek at the first ten (PMID, filename) pairs to check that the manifest was parsed.
print(list(dict_articles.items())[:10])

In [None]:
from lxml import etree

# Parse the first article listed in the manifest.
first_filename = list(dict_articles.values())[0]
tree = etree.parse(f"{pmc_dataset_root}/{first_filename}")

In [None]:
# List the attributes and methods available on the root element of the parsed tree.
dir(tree.getroot())

In [None]:
import nltk

def split_text(text):
    """Split text into sentences, re-joining pieces the tokenizer split too eagerly.

    The text is first split with ``nltk.tokenize.sent_tokenize``. A piece is then glued back
    onto the sentence before it when either

    * the sentence built so far ends with ``'Fig.'``, or
    * the piece starts with a lowercase letter or a digit.

    Parameters
    ----------
    text : str
        Text to split.

    Returns
    -------
    list of str
        Sentences with every run of whitespace collapsed to a single space.
    """
    def continues_previous(sentence):
        # A sentence starting with a lowercase letter or a digit is treated as a continuation.
        first_char = sentence.strip()[0]
        return first_char.islower() or first_char.isdigit()

    # Drop empty / whitespace-only pieces up front: they carry no text and would make the
    # first-character lookup above raise IndexError.
    sents = [s for s in nltk.tokenize.sent_tokenize(text) if s.strip()]
    res_sents = []
    i = 0
    while i < len(sents):
        made = sents[i]
        # Keep absorbing following pieces while they look like part of the current sentence.
        while i + 1 < len(sents) and (made.endswith('Fig.') or continues_previous(sents[i + 1])):
            made += " " + " ".join(sents[i + 1].strip().split())
            i += 1
        res_sents.append(" ".join(made.strip().split()))
        i += 1
    return res_sents

def get_sentences(node):
    """Extract the sentences and the abstract of one article.

    The ``body`` child of ``node`` is walked recursively and its text is collected into two
    streams: "discussion" text (everything inside an element whose ``title`` child contains
    the word "discussion", case-insensitively) and "general" text (everything else).
    While walking:

    * an ``xref`` element is replaced by the token `` xref_<ref-type>_<rid> `` (its own text
      is dropped, its tail is kept);
    * the text of a ``title`` element is dropped (its tail is kept).

    Both streams are then split into sentences with ``split_text``.

    Parameters
    ----------
    node : element
        Root element of the article (an lxml / ElementTree-style element).

    Returns
    -------
    tuple (gen_res, disc_res, abstract)
        ``gen_res`` and ``disc_res`` are lists of sentences; ``abstract`` is the
        whitespace-normalised text of ``front/article-meta/abstract``, or ``''`` when the
        article has no such element.

    Raises
    ------
    AttributeError
        If ``node`` has no ``body`` child (unchanged from the previous behaviour; the
        dataset-building loop relies on an exception to skip such articles).
    """
    def route(text, is_disc):
        # Place text in the (general, discussion) slot that matches the current section.
        return ('', text) if is_disc else (text, '')

    def helper(node, is_disc):
        tail = node.tail or ''
        if not isinstance(node.tag, str):
            # lxml exposes comments and processing instructions as children whose tag is not
            # a string; their own content is not article text, only the tail after them is.
            return route(tail, is_disc)
        if node.tag == 'xref':
            return route(f' xref_{node.get("ref-type")}_{node.get("rid")} ' + tail, is_disc)
        if node.tag == 'title':
            return route(tail, is_disc)
        if not is_disc:
            title = node.find('title')
            if title is not None and 'discussion' in "".join(title.itertext()).lower():
                # From here on the whole subtree (and this node's tail) counts as discussion.
                is_disc = True
        gen_parts, disc_parts = [], []
        own_parts = disc_parts if is_disc else gen_parts
        own_parts.append(node.text or '')
        # Iterate the element directly: getchildren() is deprecated.
        for ch in node:
            gen, disc = helper(ch, is_disc)
            gen_parts.append(gen)
            disc_parts.append(disc)
        own_parts.append(tail)
        return "".join(gen_parts), "".join(disc_parts)

    gen_res, disc_res = helper(node.find('body'), False)
    gen_res = split_text(gen_res)
    disc_res = split_text(disc_res)

    # A missing abstract is expected for some articles; report it as an empty string.
    abstract = ""
    abstract_node = node.find('front/article-meta/abstract')
    if abstract_node is not None:
        abstract = " ".join("".join(abstract_node.itertext()).strip().split())
    return gen_res, disc_res, abstract

In [None]:
# Parse one specific article file, addressed by its hard-coded path under pmc_dataset_root.
tree = etree.parse(f"{pmc_dataset_root}/PMC0020XXXXX/PMC2000292.xml")

In [None]:
# get_sentences returns a 3-tuple: (general sentences, discussion sentences, abstract).
sents = get_sentences(tree.getroot())

In [None]:
# Display the extracted tuple for inspection.
sents

In [None]:
# Sizes of the three parts returned by get_sentences, in order.
print(*map(len, sents))

In [None]:
def get_all_refs(node):
    """Collect what an article's cross-references can point to.

    Parameters
    ----------
    node : element
        Root element of the article (an lxml / ElementTree-style element).

    Returns
    -------
    tuple (citations, figs, tables)
        ``citations`` maps the ``id`` of each ``ref`` directly under ``back/ref-list`` to
        ``{'publication-type': ..., 'pmid': ...}``; only references whose
        ``element-citation`` carries a ``pub-id`` of type ``pmid`` are included.
        ``figs`` / ``tables`` map the ``id`` of each ``fig`` / ``table-wrap`` directly under
        ``floats-group`` to its whitespace-normalised caption text; entries without a
        ``caption`` child are skipped.
        Any section that is missing from the article yields an empty dict.
    """
    def get_citation_info(ref_list):
        res = {}
        if ref_list is None:
            return res
        for ref in ref_list:
            if ref.tag != 'ref':
                continue
            citation = ref.find('element-citation')
            if citation is None:
                continue
            # Look for the PMID among all pub-ids, not just the first one: a reference may
            # list other identifiers (e.g. a DOI) before its PMID.
            pmid_node = citation.find("pub-id[@pub-id-type='pmid']")
            if pmid_node is None:
                continue
            res[ref.get('id')] = {
                'publication-type': citation.get('publication-type'),
                'pmid': pmid_node.text,
            }
        return res

    def get_captions(floats, tag):
        # Shared by figures and tables: both keep their text in a <caption> child.
        res = {}
        if floats is None:
            return res
        for ch in floats:
            caption = ch.find('caption') if ch.tag == tag else None
            if caption is not None:
                res[ch.get('id')] = " ".join(''.join(caption.itertext()).strip().split())
        return res

    # The path lookup returns None when either <back> or <ref-list> is absent, so an article
    # without a back matter gets empty citations instead of raising AttributeError.
    citations = get_citation_info(node.find('back/ref-list'))
    floats_group = node.find('floats-group')
    figs = get_captions(floats_group, 'fig')
    tables = get_captions(floats_group, 'table-wrap')
    return citations, figs, tables

In [None]:
# Show the (citations, figs, tables) dictionaries extracted from the parsed article.
get_all_refs(tree.getroot())

In [None]:
import re

# Matches the reference id that follows an "xref_bibr_" token. Written as a raw string so
# that \d and \w reach the regex engine as regex escapes rather than invalid string escapes.
pattern = re.compile(r"(?<=xref_bibr_)[\d\w]+")

def count_reverse(sents_gen, sents_disc, pmid):
    """List every bibliography cross-reference found in an article's sentences.

    Parameters
    ----------
    sents_gen : list of str
        Sentences of the general part of the article.
    sents_disc : list of str
        Sentences of the discussion part of the article.
    pmid : str
        Identifier of the article, copied into every output row.

    Returns
    -------
    list of tuple
        One ``(pmid, section, sentence_index, ref_id)`` tuple per ``xref_bibr_<ref_id>``
        token, where ``section`` is ``'general'`` or ``'discussion'`` and
        ``sentence_index`` is the index of the sentence within its section, as a string.
        General rows come first, then discussion rows.
    """
    result = []
    for section, sentences in (('general', sents_gen), ('discussion', sents_disc)):
        for i, sent in enumerate(sentences):
            result.extend((pmid, section, str(i), ref_id) for ref_id in pattern.findall(sent))
    return result

In [None]:
# Extract the sentences of the parsed article and list the xref_bibr references found in them.
gen_sents, disc_sents, abst = get_sentences(tree.getroot())
count_reverse(gen_sents, disc_sents, '2000292')

In [None]:
import os
import traceback
from contextlib import ExitStack

# Names of the tab-separated output tables; each is written to `{dataset_root}/{name}.csv`.
output_tables = ('sentences', 'abstracts', 'citations', 'figures', 'tables', 'reverse_ref')


def tsv_row(*fields):
    """Join fields into one tab-separated line, writing missing (None) values as ''."""
    return '\t'.join('' if field is None else str(field) for field in fields)


# The output folder may not exist yet.
os.makedirs(dataset_root, exist_ok=True)

with ExitStack() as stack:
    # Open each table once instead of once per article. Files are still opened in append
    # mode, so re-running this cell ADDS rows to existing tables rather than replacing them.
    out = {
        name: stack.enter_context(open(f'{dataset_root}/{name}.csv', 'a', encoding='utf-8'))
        for name in output_tables
    }
    for num_id, (pmid, filename) in enumerate(dict_articles.items()):
        print(f'\r{num_id} {filename}', end='')
        try:
            # Kept in its own name so the `tree` used by the example cells is not overwritten.
            article_root = etree.parse(pmc_dataset_root + "/" + filename).getroot()
            gen_sents, disc_sents, abstract = get_sentences(article_root)
            cits, figs, tables = get_all_refs(article_root)
        except Exception as e:
            # Best effort: report the article that could not be processed and move on.
            print("\rsomething went wrong", pmid, filename, e)
            continue
        for i, sent in enumerate(gen_sents):
            print(tsv_row(pmid, i, 'general', sent), file=out['sentences'])
        for i, sent in enumerate(disc_sents):
            print(tsv_row(pmid, i, 'discussion', sent), file=out['sentences'])
        if abstract != '':
            print(tsv_row(pmid, abstract), file=out['abstracts'])
        for ref_id, dic in cits.items():
            # publication-type / pmid may be absent in the XML; tsv_row writes them as ''.
            print(tsv_row(pmid, ref_id, dic['publication-type'], dic['pmid']), file=out['citations'])
        for fig_id, text in figs.items():
            print(tsv_row(pmid, fig_id, text), file=out['figures'])
        for table_id, text in tables.items():
            print(tsv_row(pmid, table_id, text), file=out['tables'])
        for row in count_reverse(gen_sents, disc_sents, pmid):
            print(tsv_row(*row), file=out['reverse_ref'])