# Numbers Extraction

### Requirements

1. spaCy: `conda install -c conda-forge spacy`
2. English model for spaCy: `python -m spacy download en_core_web_sm`
3. text2num: `pip install text2num`


## Load Publication Data

In [None]:
from bokeh.plotting import output_notebook
import logging
import numpy as np
import pandas as pd

# Do not truncate long cell values (e.g. abstracts) when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

from pysrc.papers.config import PubtrendsConfig
from pysrc.papers.db.loaders import Loaders
from pysrc.papers.utils import SORT_MOST_CITED

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')
output_notebook()
% matplotlib inline

SEARCH_QUERY = 'human aging'
SEARCH_SORT = SORT_MOST_CITED
SEARCH_PAPERS = 1000

In [None]:
# Search Pubmed for SEARCH_QUERY and load the matching publications into pub_df.
config = PubtrendsConfig(test=False)
loader = Loaders.get_loader('Pubmed', config)
try:
    logging.info('Searching')
    ids = loader.search(SEARCH_QUERY, limit=SEARCH_PAPERS, sort=SEARCH_SORT, noreviews=True)
    pub_df = loader.load_publications(ids)
    # Report completion only on success, so a failed search is not logged as 'Done'.
    logging.info('Done')
finally:
    # Always release the connection, even if search or loading raised.
    loader.close_connection()

## Core Function for Metric Extraction

In [None]:
import re
import spacy

from collections import Counter

# Numeric literal: optional leading minus, digits, an optional fractional part
# separated by '.' or ',', and an optional exponent (e.g. -12, 3.5, 22,245, 1e-3).
NUMBER = re.compile(r'-?[\d]+([\.,][\d]+)?([eE][+-]?\d+)?')
# Small English spaCy pipeline (see Requirements at the top of the notebook).
spacy_en = spacy.load('en_core_web_sm')

### Demo `extract_metrics`

In [None]:
from pysrc.papers.analysis.numbers import extract_metrics

TEXT = """
No specific antiviral drug has been proven effective for treatment of patients with 
 severe coronavirus disease 2019 (COVID-19).

Over brief training periods of 3-24 min, four patients then used these signals 
 to master closed-loop control and to achieve success rates of 74-100% in a 
 one-dimensional binary task.

We performed a weighted multivariate analysis of urinary creatinine concentrations in 
 22,245 participants of the third National Health and Nutrition Examination Survey (1988-1994) 
 and established reference ranges (10th-90th percentiles) for each demographic and age category.

Longitudinal descriptive analyses of the 1032 participants in the 1991-2007 
 National Institute of Child Health and Human Development Study of Early Child Care and 
 Youth Development birth cohort from 10 study sites who had accelerometer-determined minutes of MVPA 
 at ages 9 (year 2000), 11 (2002), 12 (2003), and 15 (2006) years.
 
Hookworm infection occurs in almost half of ssa's poorest people, including 40-50 million school-aged 
children and 70 million pregnant women in whom it is a leading cause of anemia.

For the 2 most mutagenic regimens: 4 x 1 hr in 3 mm enu and 6 x 1', 5: 'hr in 3 mm enu'.
"""

# extract_metrics returns two values; only the first (a mapping from a word to its
# extracted occurrences) is needed for this demo, the second is discarded.
metrics, _ = extract_metrics(TEXT, visualize_dependencies=True)
# Print every word together with the occurrences found for it.
for word, occasions in metrics.items():
    print(f"{word}: {occasions}")

## Let's apply metric extraction to Pubmed papers!

### Exclude papers with unwanted terms

In [None]:
# Regex fragments; a paper is dropped when its abstract matches any of them (case-insensitive).
# Raw strings are required: in a normal string literal '\b' is a backspace character,
# not the regex word-boundary anchor, so such patterns would never match real text.
EXCLUDE = [r'\becog', 'electrocorticograph', r'\bhybrid', 'fNIRS', r'\bSSVEP']

# The guard matters: an empty pattern list would join to '' which matches every abstract.
if EXCLUDE:
    EXCLUDE_REGEX = '|'.join(EXCLUDE)
    EXCLUDE_MASK = np.logical_not(pub_df.abstract.str.contains(EXCLUDE_REGEX, flags=re.IGNORECASE, regex=True))
    processed_pub_df = pub_df[EXCLUDE_MASK]
else:
    processed_pub_df = pub_df

In [None]:
# IDs present before the exclusion step but missing afterwards.
original_ids = set(pub_df['id'])
processed_ids = set(processed_pub_df['id'])
diff = original_ids.difference(processed_ids)

See the list of excluded papers below.

In [None]:
# Rows of the excluded papers, restricted to the id and abstract columns.
pub_df.loc[pub_df['id'].isin(diff), ['id', 'abstract']]

## Run Metric Extraction

In [None]:
from tqdm.auto import tqdm

# Slow, currently moved out of the class to speed up fixing & rerunning the code of MetricExtractor
# Each record is [paper id, *extract_metrics(abstract)].
metrics_data = []
# iterrows() is a generator without a length, so pass the total explicitly
# to let the progress bar show completion and ETA.
for _, data in tqdm(processed_pub_df.iterrows(), total=len(processed_pub_df)):
    paper_metrics_data = [data['id'], *extract_metrics(data['abstract'])]
    metrics_data.append(paper_metrics_data)

In [None]:
class MetricExtractor:
    """Query helper over per-paper metric extraction results.

    Every input record is ``[paper id, metrics, sentences]`` where ``metrics`` maps
    a metric word to a list of ``(value, sentence_number)`` pairs and ``sentences``
    is indexed by ``sentence_number``.
    """

    def __init__(self, metrics_data):
        """
        :param metrics_data - list of [paper id, metrics dict, sentences] records
        """
        self.metrics_df = pd.DataFrame(metrics_data, columns=['ID', 'Metrics', 'Sentences'])

    @staticmethod
    def _in_range(value, min_value, max_value):
        """Check value against optional bounds; a bound of None means unbounded."""
        # Compare against None explicitly: a bound of 0 is a real bound, not "no bound".
        if min_value is not None and value < min_value:
            return False
        if max_value is not None and value > max_value:
            return False
        return True

    @staticmethod
    def _has_value_in_range(metric_dict, words, min_value, max_value):
        """Check whether any of the words has at least one value within the bounds."""
        return any(
            MetricExtractor._in_range(value, min_value, max_value)
            for word in words
            for value, _ in metric_dict.get(word, ())
        )

    def get_top_metrics(self, number=20):
        """
        :param number - how many metrics to return
        :return list of (metric, total number of mentions) sorted by decreasing count
        """
        metrics_counter = Counter()
        for metric_dict in self.metrics_df['Metrics']:
            for metric, occasions in metric_dict.items():
                metrics_counter[metric] += len(occasions)
        return metrics_counter.most_common(number)

    def get_metric_values(self, *metrics, min_value=None, max_value=None, detailed=False):
        """
        Collect the values of the given metric words over all papers.

        :param metrics - metric words whose values are pooled together
        :param min_value - inclusive lower bound, None to disable
        :param max_value - inclusive upper bound, None to disable
        :param detailed - if True, return a DataFrame with paper id, value and source sentence
        :return list of values, or a DataFrame with columns
                ['PMID', ', '.join(metrics), 'Sentence'] when detailed is True
        """
        values = []
        for _, data in self.metrics_df.iterrows():
            metric_dict = data['Metrics']
            sentences = data['Sentences']

            for metric in metrics:
                for value, sentence_number in metric_dict.get(metric, ()):
                    if not self._in_range(value, min_value, max_value):
                        continue
                    if detailed:
                        values.append([data['ID'], value, sentences[sentence_number]])
                    else:
                        values.append(value)
        if detailed:
            return pd.DataFrame(values, columns=['PMID', ', '.join(metrics), 'Sentence'])
        return values

    def filter_papers(self, metrics):
        """
        Select papers that satisfy every metric constraint; a constraint is satisfied
        when at least one of its keywords has a value within the bounds.

        :param metrics - list of tuples ([list of keywords], min_value, max_value)
               e.g. (['subjects', 'participants'], 5, None)
        :return list of PMIDs
        """
        selection = []
        for _, data in self.metrics_df.iterrows():
            metric_dict = data['Metrics']
            if all(self._has_value_in_range(metric_dict, words, min_value, max_value)
                   for words, min_value, max_value in metrics):
                selection.append(data['ID'])
        return selection

In [None]:
# Wrap the per-paper extraction records for querying.
me = MetricExtractor(metrics_data)

### See an example of extraction result below.

In [None]:
# Preview the first three rows (columns: ID, Metrics, Sentences).
me.metrics_df.head(3)

In [None]:
# Attach paper titles to the extracted metrics and save the table as CSV,
# with the file name derived from the search query.
titles = pub_df[['id', 'title']]
result = me.metrics_df.merge(titles, left_on='ID', right_on='id')
result = result[['ID', 'title', 'Metrics', 'Sentences']]
csv_name = f'metrics_{SEARCH_QUERY.replace(" ", "_").lower()}.csv'
result.to_csv(csv_name, sep=',', index=False)

### Most Frequent Metrics

In [None]:
# The 50 most frequent metric words with their total mention counts.
print(me.get_top_metrics(50))

### Select a metric to show distribution and abstract fragments where it was mentioned.

Currently synonyms are not processed, so feel free to use a tuple of words that correspond to the same metric.

In [None]:
# Words treated as one metric: their extracted values are pooled together.
METRIC = ('subject', 'patient', 'role', 'participant', 'volunteer', 'people', 'donor',
          'man', 'woman', 'male', 'female')
# Comma-separated label; equals the value column name that
# get_metric_values(*METRIC, detailed=True) produces.
METRIC_TEXT = ', '.join(METRIC)

### Distribution of selected `METRIC`

In [None]:
import matplotlib.pyplot as plt

# get_metric_values returns one value per extracted mention (a paper may contribute
# several), so the histogram bars count mentions rather than papers.
num_subjects = me.get_metric_values(*METRIC)
plt.hist(num_subjects)
plt.xlabel(METRIC_TEXT)
plt.ylabel('Number of mentions')

### Example Mentions 

In [None]:
# The 20 mentions with the largest values, with paper id and source sentence.
mentions = me.get_metric_values(*METRIC, detailed=True)
mentions.sort_values(METRIC_TEXT, ascending=False).head(20)

### Filter papers based on desired metric values

In [None]:
# Each entry is ([keywords], min_value, max_value); a paper is selected only when every
# entry is satisfied by at least one value of one of its keywords.
# NOTE(review): the keywords here are plural while METRIC uses singular forms
# ('subject', 'patient', ...) -- confirm which form extract_metrics emits as keys.
METRICS = [
    (['subjects', 'patients', 'participants', 'volunteers'], 10, None),
    (['accuracy'], 65, None),
]

selection = me.filter_papers(METRICS)

In [None]:
# Rows of the selected papers, restricted to the id and abstract columns.
pub_df.loc[pub_df.id.isin(selection), ['id', 'abstract']]

## Work with synonyms

In [None]:
from itertools import chain
from nltk.corpus import wordnet

# Gather the distinct lemma names over all WordNet synsets of 'patient'.
synonyms = wordnet.synsets('patient')
{lemma for synset in synonyms for lemma in synset.lemma_names()}

## Development

Other libraries for extraction of metrics/quantities: quantulum, grobid-quantities.

Dependencies:
  * `pip install quantulum3`

In [None]:
from quantulum3 import parser

# Run the quantulum3 parser on a sample sentence to inspect what it extracts.
parser.parse("""No specific antiviral drug has been proven effective for treatment of patients with
 severe coronavirus disease 2019 (COVID-19).""")