# Analyze archive

This notebook contains the code to analyse the content of the PubMed Central (PMC) Author Manuscript Collection. \
See: https://www.ncbi.nlm.nih.gov/pmc/about/mscollection/

Files can be downloaded here: https://ftp.ncbi.nlm.nih.gov/pub/pmc/manuscript/xml/ \
**Please ensure** that the files are downloaded into the `~/pmc_dataset` folder before proceeding.

## Collecting files

In [None]:
% matplotlib inline
% config InlineBackend.figure_format='retina'

import functools
import os
from collections import Counter
from glob import glob

import matplotlib.pyplot as plt
from lxml import etree
from tqdm.auto import tqdm

# Maps each PMID to the file name listed for it in the *filelist.txt indexes.
dict_articles = {}
pmc_dataset_root = os.path.expanduser('~/pmc_dataset')


for filelist in tqdm(glob(os.path.join(pmc_dataset_root, '*filelist.txt'))):
    with open(filelist, 'r') as f:
        for line in f:
            # Skip the header row, and blank lines which would otherwise
            # break the six-field unpacking below.
            if not line.strip() or 'LastUpdated' in line:
                continue
            # Each data row is expected to hold six whitespace-separated
            # fields; only the file name and the PMID are kept.
            filename, pmcid, pmid, mid, date, time = line.split()
            dict_articles[pmid] = filename

In [None]:
# Number of distinct PMIDs collected from the file lists.
print(f'Total papers {len(dict_articles)}')

In [None]:
# Peek at the first ten collected file names.
[*dict_articles.values()][:10]

In [None]:
def count_tags(node):
    """Count how often each tag occurs in the subtree rooted at ``node``.

    :param node: root element of the subtree; the root itself is counted too.
    :return: ``Counter`` mapping tag -> number of occurrences.
    """
    # ``iter()`` walks the node and all of its descendants in document order,
    # which avoids both the deprecated ``getchildren()`` and Python recursion.
    return Counter(elem.tag for elem in node.iter())

In [None]:
def get_title(tree):
    """Return the serialized ``<article-title>`` element of a parsed article.

    :param tree: parsed element tree of one article.
    :return: ``etree.tostring`` output for the first
        ``front/article-meta/title-group/article-title`` element, or ``b""``
        when the article has no such element.
    """
    # A single path lookup returns None if any level is missing, instead of
    # raising AttributeError halfway through a chain of find() calls.
    title = tree.getroot().find("front/article-meta/title-group/article-title")
    if title is None:
        return b""
    return etree.tostring(title)

## Collecting review papers

In [None]:
# Collect the files whose serialized title mentions "review".
review_filenames = set()
for filename in tqdm(dict_articles.values()):
    tree = etree.parse(os.path.join(pmc_dataset_root, filename))
    # get_title returns bytes; decode them rather than calling str(), which
    # would produce the "b'...'" repr and could never be empty.
    title = get_title(tree).decode()
    if not title:
        # Report articles whose title came back empty.
        print(f"\r{filename}")
    if "review" in title.lower():
        review_filenames.add(filename)

In [None]:
# Number of files classified as review papers.
print(f'Review papers {len(review_filenames)}')

## Collecting tag statistics

In [None]:
# Accumulate tag counts separately for review and ordinary papers.
tag_stat = {'review': Counter(), 'ordinary': Counter()}
for filename in tqdm(dict_articles.values()):
    tree = etree.parse(os.path.join(pmc_dataset_root, filename))
    cur_stat = count_tags(tree.getroot())
    group = 'review' if filename in review_filenames else 'ordinary'
    tag_stat[group] += cur_stat

In [None]:
# Write the per-paper average count of every tag, least frequent first.
group_sizes = {
    'ordinary': len(dict_articles) - len(review_filenames),
    'review': len(review_filenames),
}
for s, cnt in group_sizes.items():
    with open(f'{s}_tag_stat.txt', 'w') as f:
        by_count = sorted(tag_stat[s].items(), key=lambda item: item[1])
        averaged = [(tag, total / cnt) for tag, total in by_count]
        f.write(f'Number: {cnt}\n')
        for tag, mean in averaged:
            f.write(f'{tag} {mean}\n')

In [None]:
def tag_depth(node):
    """Return the depth of the deepest element below ``node``.

    :param node: root element of the subtree.
    :return: number of levels in the subtree; a node without children has
        depth 1.
    """
    deepest = 0
    # Explicit stack instead of recursion; iterating an element yields its
    # direct children, replacing the deprecated ``getchildren()``.
    pending = [(node, 1)]
    while pending:
        elem, depth = pending.pop()
        deepest = max(deepest, depth)
        pending.extend((child, depth + 1) for child in elem)
    return deepest

In [None]:
# Tag depth of every article, split into review and ordinary papers.
d_stat = {'review': {}, 'ordinary': {}}
for filename in tqdm(dict_articles.values()):
    tree = etree.parse(os.path.join(pmc_dataset_root, filename))
    cur_stat = tag_depth(tree.getroot())
    group = 'review' if filename in review_filenames else 'ordinary'
    d_stat[group][filename] = cur_stat

In [None]:
# First ten (file name, depth) pairs of the review papers.
print([*d_stat['review'].items()][:10])

In [None]:
# Dump "<file name> <depth>" lines per group, shallowest article first.
for s in ('ordinary', 'review'):
    with open(f'{s}_tag_depth.txt', 'w') as f:
        by_depth = sorted(d_stat[s].items(), key=lambda item: item[1])
        for name, depth in by_depth:
            f.write(f'{name} {depth}\n')

In [None]:
# Histogram of tag depths for review papers (depths 5..19).
fig, ax = plt.subplots()
ax.set_title('Tag depths review papers')
ax.hist(list(d_stat['review'].values()), bins=range(5, 20))
plt.show()

In [None]:
# Histogram of tag depths for ordinary papers (depths 5..19).
fig, ax = plt.subplots()
ax.set_title('Tag depth ordinary papers')
ax.hist(list(d_stat['ordinary'].values()), bins=range(5, 20))
plt.show()

In [None]:
# Parse the first listed article for interactive inspection.
first_article_path = os.path.join(pmc_dataset_root, [*dict_articles.values()][0])
tree = etree.parse(first_article_path)

### Collecting section statistics

In [None]:
def get_paragraph_info(root):
    """Summarise the top-level sections of an article body.

    The direct ``<sec>`` children of the article's ``<body>`` are numbered
    from 1 in document order.

    :param root: root element of a parsed article.
    :return: tuple ``(num, sum_pos, disc_pos, lens)`` where ``num`` is the
        number of top-level sections, ``sum_pos`` / ``disc_pos`` are the
        positions of the last section whose title contains "summary" /
        "discussion" (``-1`` when there is none), and ``lens`` is a
        ``Counter`` mapping section position to the length of the serialized
        section.
    :raises AttributeError: if the article has no ``<body>`` element.
    """
    num = 0
    sum_pos = -1
    disc_pos = -1
    lens = Counter()
    # findall('sec') selects only the direct children named 'sec', replacing
    # the deprecated getchildren() plus a manual tag filter.
    for num, sec in enumerate(root.find('body').findall('sec'), start=1):
        lens[num] = len(etree.tostring(sec))
        title = sec.find('title')
        # Match against the title's text (including text nested in inline
        # markup) rather than against serialized markup. An untitled section
        # simply matches nothing instead of aborting the whole article.
        title_text = '' if title is None else ''.join(title.itertext()).lower()
        if 'summary' in title_text:
            sum_pos = num
        if 'discussion' in title_text:
            disc_pos = num
    return num, sum_pos, disc_pos, lens

In [None]:
# Rebuild the set of review file names from the depth file written earlier.
review_filenames = set()

with open('review_tag_depth.txt', 'r') as depth_file:
    for record in depth_file:
        # Each line holds exactly two fields: file name and depth.
        name, depth = record.split()
        review_filenames.add(name)

In [None]:
# Section statistics per article, split into review and ordinary papers.
para_stats = {'review': {}, 'ordinary': {}}

for filename in tqdm(dict_articles.values()):
    tree = etree.parse(os.path.join(pmc_dataset_root, filename))
    try:
        cur_stat = get_paragraph_info(tree.getroot())
    except Exception:
        # Log the file that could not be analysed and move on.
        print(f"\n{filename}")
    else:
        group = 'review' if filename in review_filenames else 'ordinary'
        para_stats[group][filename] = cur_stat

In [None]:
# First ten (file name, statistics) pairs of the review papers.
[*para_stats['review'].items()][:10]

#### Number of sections

In [None]:
# Distribution of the number of top-level sections in review papers.
para_nums = [stats[0] for stats in para_stats['review'].values()]
print(para_nums[:10])
fig, ax = plt.subplots()
ax.set_title('Number of sections in review papers')
ax.hist(para_nums, bins=range(1, 20))
plt.show()

In [None]:
# Distribution of the number of top-level sections in ordinary papers.
para_nums = [stats[0] for stats in para_stats['ordinary'].values()]
print(para_nums[:10])
fig, ax = plt.subplots()
ax.set_title('Number of sections in ordinary papers')
ax.hist(para_nums, bins=range(1, 20))
plt.show()

### Summary section position

In [None]:
# Index 1 of each statistics tuple is the summary-section position, so the
# plot is titled accordingly. Papers without a summary section (-1) fall
# outside the bins and are not shown.
sum_stat = [stats[1] for stats in para_stats['review'].values()]
print(sum_stat[:10])
plt.title('Position of summary section in review papers')
plt.hist(sum_stat, bins=range(1, 20))
plt.show()

In [None]:
# Index 1 of each statistics tuple is the summary-section position, so the
# plot is titled accordingly. Papers without a summary section (-1) fall
# outside the bins and are not shown.
sum_stat = [stats[1] for stats in para_stats['ordinary'].values()]
print(sum_stat[:10])
plt.title('Position of summary section in ordinary papers')
plt.hist(sum_stat, bins=range(1, 20))
plt.show()

### Discussion section position

In [None]:
# Discussion-section position in review papers; -1 means no such section.
sum_stat = [stats[2] for stats in para_stats['review'].values()]
print(sum_stat[:10])
fig, ax = plt.subplots()
ax.set_title('Position of discussion section in review papers')
ax.hist(sum_stat, bins=range(-1, 20))
plt.show()

In [None]:
# Discussion-section position in ordinary papers; -1 means no such section.
sum_stat = [stats[2] for stats in para_stats['ordinary'].values()]
print(sum_stat[:10])
fig, ax = plt.subplots()
ax.set_title('Position of discussion section in ordinary papers')
ax.hist(sum_stat, bins=range(-1, 20))
plt.show()

### Average section length by position

In [None]:
# Sum the per-position section lengths over all review papers. Accumulating
# into an initial Counter also works when no review paper was analysed,
# where reduce() without an initial value would raise.
len_stat = functools.reduce(
    lambda acc, lens: acc + lens,
    (stats[3] for stats in para_stats['review'].values()),
    Counter(),
)
n_review = len(para_stats['review'])
# The bars show serialized section length per position, averaged over all
# review papers, so the title names length rather than a section count.
plt.title('Average section length by position in review papers')
plt.bar(list(len_stat.keys()), [total / n_review for total in len_stat.values()])
plt.show()

In [None]:
# Sum the per-position section lengths over all ordinary papers. Accumulating
# into an initial Counter also works when no ordinary paper was analysed,
# where reduce() without an initial value would raise.
len_stat = functools.reduce(
    lambda acc, lens: acc + lens,
    (stats[3] for stats in para_stats['ordinary'].values()),
    Counter(),
)
n_ordinary = len(para_stats['ordinary'])
# The bars show serialized section length per position, averaged over all
# ordinary papers, so the title names length rather than a section count.
plt.title('Average section length by position in ordinary papers')
# Positions above 35 are drawn at x=35 to keep the x axis short.
plt.bar([min(35, pos) for pos in len_stat.keys()],
        [total / n_ordinary for total in len_stat.values()])
plt.show()

### Position of conclusion section

In [None]:
# Demo: itertext() yields the text pieces of an element, ignoring inline tags.
xml = '<tag>Some <a>example</a> text</tag>'
tree = etree.fromstring(xml)
print(*tree.itertext(), sep='')

In [None]:
# First ten ordinary papers that have exactly one top-level section.
[item for item in para_stats['ordinary'].items() if item[1][0] == 1][:10]

In [None]:
def get_conc_info(root):
    """Return the position of the conclusion section of an article.

    The direct ``<sec>`` children of the article's ``<body>`` are numbered
    from 1 in document order.

    :param root: root element of a parsed article.
    :return: position of the last section whose title contains "conclusion",
        or ``-1`` when there is none.
    :raises AttributeError: if the article has no ``<body>`` element.
    """
    conc_pos = -1
    # findall('sec') selects only the direct children named 'sec', replacing
    # the deprecated getchildren() plus a manual tag filter.
    for num, sec in enumerate(root.find('body').findall('sec'), start=1):
        title = sec.find('title')
        if title is None:
            # An untitled section cannot be a conclusion; skip it instead of
            # failing the whole article.
            continue
        # Match against the title's text, including text nested in inline
        # markup, rather than against serialized markup.
        if 'conclusion' in ''.join(title.itertext()).lower():
            conc_pos = num
    return conc_pos

In [None]:
# Conclusion-section position per article, split by paper type.
conc_stats = {'review': {}, 'ordinary': {}}

for filename in tqdm(dict_articles.values()):
    tree = etree.parse(os.path.join(pmc_dataset_root, filename))
    try:
        cur_stat = get_conc_info(tree.getroot())
    except Exception:
        # Log the file that could not be analysed and move on.
        print(f"\n{filename}")
    else:
        group = 'review' if filename in review_filenames else 'ordinary'
        conc_stats[group][filename] = cur_stat

In [None]:
# Conclusion-section position in review papers; -1 means no such section.
conc_stat = list(conc_stats['review'].values())
print(conc_stat[:10])
fig, ax = plt.subplots()
ax.set_title('Position of conclusion section in review papers')
ax.hist(conc_stat, bins=range(-1, 20))
plt.show()

In [None]:
# Conclusion-section position in ordinary papers; -1 means no such section.
conc_stat = list(conc_stats['ordinary'].values())
print(conc_stat[:10])
fig, ax = plt.subplots()
ax.set_title('Position of conclusion section in ordinary papers')
ax.hist(conc_stat, bins=range(-1, 20))
plt.show()