# Metric Extraction

### Requirements

1. spaCy: `conda install -c conda-forge spacy`
2. English model for spaCy: `python -m spacy download en_core_web_sm`
3. text2num: `pip install text2num`

### Outline

In [None]:
import logging
import json
import numpy as np
import pandas as pd
# Do not truncate long cell contents (e.g. abstracts) when DataFrames are displayed
pd.set_option('display.max_colwidth', None)

## Data Source Selection

### Option 1: Use Search Terms 

1. Set the `SEARCH_QUERY` variable in the cell below to a string of keywords (e.g. `'protein structure'`) that describe the field of science you are interested in.
2. Set `USE_PUBMED_IMPORT` to `False`.

### Option 2: Import Search Results from Pubmed 

1. Set `USE_PUBMED_IMPORT` to `True`.
2. Search for terms of interest on the PubMed website. Use Save with Selection=All Results and Format=PMID to obtain a .txt file with the PMIDs of all papers that were found.
3. Use the widget below to upload the file with PMIDs from PubMed.

In [None]:
# Option 1 - Use Search Terms
# Query string handed to the loader's search; only used when USE_PUBMED_IMPORT is False
SEARCH_QUERY = 'protein structure'

# Option 2 - Import Search Results from Pubmed
# When True, paper IDs are read from the uploaded file instead of running a search
USE_PUBMED_IMPORT = True

## Import PMIDs from Pubmed

In [None]:
import ipywidgets as widgets

from IPython.display import display

if USE_PUBMED_IMPORT:
    # Single-file upload widget restricted to .txt files; the uploaded content is
    # later read from `w.value` when publication data is loaded
    w = widgets.FileUpload(accept='.txt', multiple=False)
    display(w)

## Load Publication Data

In [None]:
from bokeh.plotting import show, output_notebook
from matplotlib import pyplot as plt

from pysrc.papers.config import PubtrendsConfig
from pysrc.papers.progress import Progress
from pysrc.papers.db.pm_loader import PubmedLoader
from pysrc.papers.db.ss_loader import SemanticScholarLoader
from pysrc.papers.analyzer_experimental import ExperimentalAnalyzer
from pysrc.papers.plotter_experimental import ExperimentalPlotter
from pysrc.papers.utils import SORT_MOST_CITED, SORT_MOST_RELEVANT, SORT_MOST_RECENT, cut_authors_list

# Log INFO and above, prefixing every message with a timestamp and the level name
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')
# Render bokeh plots inline in the notebook
output_notebook()
%matplotlib inline

# Sort order and maximum number of papers requested when searching by SEARCH_QUERY
SEARCH_SORT = SORT_MOST_CITED
SEARCH_PAPERS = 1000

In [None]:
# Load publication data either for the uploaded list of PMIDs or for the search query
config = PubtrendsConfig(test=False)
loader = PubmedLoader(config)
progress = Progress(1)
loader.set_progress(progress)
try:
    if USE_PUBMED_IMPORT:
        # The upload widget's value is keyed by file name; exactly one file is expected
        if not w.value:
            raise ValueError('No file uploaded: use the upload widget above to provide a file with PMIDs')
        filename = list(w.value.keys())[0]
        content = w.value[filename]['content'].decode('utf-8')
        # split() without arguments accepts both \n and \r\n line endings and drops the
        # empty chunk left by a trailing newline, which int() would otherwise reject
        ids = [int(chunk) for chunk in content.split()]
    else:
        ids = loader.search(SEARCH_QUERY, limit=SEARCH_PAPERS, sort=SEARCH_SORT)
    pub_df = loader.load_publications(ids)
finally:
    # Close the loader's connection even if searching or loading failed
    loader.close_connection()
    logging.info('Done')

## Core Function for Metric Extraction

In [None]:
import re
import spacy

from collections import Counter
from spacy import displacy
from text_to_num import alpha2digit

# Optionally negative integer or decimal number written with digits, e.g. 42, -7 or 3.14
REAL_NUMBER = re.compile(r'-?[\d]+(\.[\d]+)?')
# English spaCy pipeline (see Requirements: `python -m spacy download en_core_web_sm`)
spacy_en = spacy.load('en_core_web_sm')

In [None]:
# TODO: Test Grobid Quantities

def extract_metrics(abstract_text, visualize_dependencies=False):
    """
    Extract numbers from an abstract together with nouns that could name the measured metric.

    Textual numbers are first converted to digits (three -> 3), then the text is split into
    sentences. For every token that is a plain integer or decimal number, nouns are collected
    from the token's children, the children of its head and the head itself; if none is found
    there, the first noun among the token's ancestors is used instead. The '%' token is
    never treated as a noun.

    :param abstract_text: text of the abstract; a non-string value (e.g. None or NaN for a
           paper without an abstract) yields empty results
    :param visualize_dependencies: if True, render the dependency parse of all sentences
           with displacy in the notebook
    :return: tuple (metrics, sentences), where metrics is a dict
             {noun: [(value, sentence_number), ...]} and sentences is a dict
             {sentence_number: sentence text}
    """
    # Nothing to parse for papers without an abstract
    if not isinstance(abstract_text, str):
        return {}, {}

    metrics = {}
    sentences = {}
    # Convert textual numbers to digits (three -> 3)
    abstract_text = alpha2digit(abstract_text, 'en')
    # Split text into sentences and find numbers in sentences
    doc = spacy_en(abstract_text)
    for idx, sent in enumerate(doc.sents):
        sentences[idx] = sent.text
        for token in sent:
            if not REAL_NUMBER.fullmatch(token.text):
                continue
            value = float(token.text) if '.' in token.text else int(token.text)
            # Analyze children and siblings, then ancestors if first was not enough
            # TODO: is there a better way?
            # TODO: use close nouns as a fallback when it is hard to find a dependency?
            # TODO: expand nouns with adjectives or other nouns? (rate -> information transfer rate)
            candidates = list(token.children) + list(token.head.children) + [token.head]
            nouns = []
            seen_positions = set()
            for candidate in candidates:
                # A sentence root is its own head, so its children would be listed twice
                # above; visit every token once to avoid recording a value twice.
                if candidate.i in seen_positions:
                    continue
                seen_positions.add(candidate.i)
                # Explicitly ignore % (TODO: use as a unit of measurement)
                if candidate.text != '%' and candidate.pos_ == 'NOUN':
                    nouns.append(candidate.text)
            if not nouns:
                for ancestor in token.ancestors:
                    if ancestor.text != '%' and ancestor.pos_ == 'NOUN':
                        nouns = [ancestor.text]
                        break
            for noun in nouns:
                metrics.setdefault(noun, []).append((value, idx))
    if visualize_dependencies:
        displacy.render(list(doc.sents), style="dep", jupyter=True)
    return metrics, sentences

### Demo `extract_metrics`

In [None]:
SENTENCE = """Over brief training periods of 3-24 min, four patients then used these signals 
              to master closed-loop control and to achieve success rates of 74-100% in a 
              one-dimensional binary task."""

# Show the input, render its dependency parse and list every noun with the
# (value, sentence number) pairs extracted for it
print(SENTENCE)
metrics, _ = extract_metrics(SENTENCE, visualize_dependencies=True)
for word, occasions in metrics.items():
    print(f"{word}: {occasions}")

## Let's apply metric extraction to Pubmed papers!

### Exclude papers with unwanted terms

In [None]:
# Case-insensitive regex fragments; a paper is dropped if its abstract matches any of them.
# Raw strings are required so that \b reaches the regex engine as a word boundary
# instead of being turned into a backspace character by Python.
EXCLUDE = [r'\becog', 'electrocorticograph', r'\bhybrid', 'fNIRS', r'\bSSVEP']
EXCLUDE_REGEX = '|'.join(EXCLUDE)
# True for papers to keep, i.e. abstracts that match none of the fragments
EXCLUDE_MASK = np.logical_not(pub_df.abstract.str.contains(EXCLUDE_REGEX, flags=re.IGNORECASE, regex=True))

processed_pub_df = pub_df[EXCLUDE_MASK]

In [None]:
# IDs present in the loaded data but missing after the exclusion filter
original_ids = set(pub_df['id'])
processed_ids = set(processed_pub_df['id'])
diff = original_ids - processed_ids

See the list of excluded papers below.

In [None]:
# ID and abstract of every excluded paper
pub_df[pub_df['id'].isin(diff)][['id', 'abstract']]

## Run Metric Extraction

In [None]:
# Slow, currently moved out of the class to speed up fixing & rerunning the code of MetricExtractor

# One row per paper: [id, metrics dict, sentences dict]
metrics_data = [
    [paper['id'], *extract_metrics(paper['abstract'])]
    for _, paper in processed_pub_df.iterrows()
]

In [None]:
class MetricExtractor:
    """
    Queries over metrics extracted from a collection of papers.

    Holds a DataFrame `metrics_df` with columns 'ID', 'Metrics' and 'Sentences', one row per
    paper. 'Metrics' is a dict {word: [(value, sentence_number), ...]} and 'Sentences' is a
    dict {sentence_number: sentence text}.
    """

    def __init__(self, metrics_data):
        """
        :param metrics_data: rows of [ID, metrics dict, sentences dict], one per paper
        """
        self.metrics_df = pd.DataFrame(metrics_data, columns=['ID', 'Metrics', 'Sentences'])

    @staticmethod
    def _in_range(value, min_value, max_value):
        """Tell whether value lies within [min_value, max_value]; a bound of None is not applied."""
        # Compare with None explicitly: a truthiness check would silently ignore a bound of 0
        if min_value is not None and value < min_value:
            return False
        if max_value is not None and value > max_value:
            return False
        return True

    def get_top_metrics(self, number=20):
        """
        Count how often each word was extracted as a metric across all papers.

        :param number: how many of the most frequent words to return
        :return: list of (word, number of mentions) tuples, most frequent first
        """
        metrics_counter = Counter()
        for metric_dict in self.metrics_df['Metrics']:
            for metric, occasions in metric_dict.items():
                metrics_counter[metric] += len(occasions)
        return metrics_counter.most_common(number)

    def get_metric_values(self, *metrics, min_value=None, max_value=None, detailed=False):
        """
        Collect the values extracted for any of the given words.

        :param metrics: words treated as names of the same metric
        :param min_value: smallest value to keep, or None for no lower bound
        :param max_value: largest value to keep, or None for no upper bound
        :param detailed: if True, also return the paper ID and the sentence of every mention
        :return: list of values, or, if detailed, a DataFrame with columns
                 'PMID', the comma-separated words, and 'Sentence'
        """
        values = []
        for _, data in self.metrics_df.iterrows():
            metric_dict = data['Metrics']
            sentences = data['Sentences']

            for metric in metrics:
                for value, sentence_number in metric_dict.get(metric, ()):
                    if not self._in_range(value, min_value, max_value):
                        continue
                    if detailed:
                        values.append([data['ID'], value, sentences[sentence_number]])
                    else:
                        values.append(value)
        if detailed:
            return pd.DataFrame(values, columns=['PMID', ', '.join(metrics), 'Sentence'])
        return values

    def filter_papers(self, metrics):
        """
        Select papers that satisfy every given metric condition.

        A condition is satisfied when at least one value extracted for any of its keywords
        lies within the given bounds.

        :param metrics - list of tuples ([list of keywords], min_value, max_value)
               e.g. (['subjects', 'participants'], 5, None); a bound of None is not applied
        :return list of PMIDs
        """
        selection = []
        for _, data in self.metrics_df.iterrows():
            metric_dict = data['Metrics']
            suitable = all(
                any(
                    self._in_range(value, min_value, max_value)
                    for word in words
                    for value, _ in metric_dict.get(word, ())
                )
                for words, min_value, max_value in metrics
            )
            if suitable:
                selection.append(data['ID'])
        return selection

In [None]:
# Wrap the per-paper extraction results for querying
me = MetricExtractor(metrics_data)

### See an example of extraction result below.

In [None]:
# First three papers with their extracted metrics and sentences
me.metrics_df.head(3)

### Most Frequent Metrics

In [None]:
# 50 most frequently extracted words with their number of mentions
print(me.get_top_metrics(50))

### Select a metric to show distribution and abstract fragments where it was mentioned.

Synonyms are currently not merged automatically, so pass a tuple of words that refer to the same metric.

In [None]:
# Words treated as names of the same metric
METRIC = ('subjects', 'patients', 'participants', 'volunteers')
# Comma-separated form; matches the value column name of the detailed results table
METRIC_TEXT = ', '.join(METRIC)

### Distribution of selected `METRIC`

In [None]:
import matplotlib.pyplot as plt

# Histogram of all values extracted for the selected words (one entry per mention)
num_subjects = me.get_metric_values(*METRIC)
plt.hist(num_subjects)
plt.xlabel(METRIC_TEXT)
plt.ylabel('Number of papers')

### Example Mentions 

In [None]:
# 20 mentions with the largest values, together with the paper ID and the source sentence
me.get_metric_values(*METRIC, detailed=True).sort_values(METRIC_TEXT, ascending=False).head(20)

### Filter papers based on desired metric values

In [None]:
# Each entry is ([keywords], min_value, max_value); a paper is selected only if,
# for every entry, some value extracted for one of the keywords is within the bounds
METRICS = [
    (['subjects', 'patients', 'participants', 'volunteers'], 10, None),
    (['accuracy'], 65, None),
]

selection = me.filter_papers(METRICS)

In [None]:
# ID and abstract of every paper that passed the metric filter
pub_df[pub_df.id.isin(selection)][['id', 'abstract']]

## Development

Other libraries for extraction of metrics/quantities: quantulum, grobid-quantities.

Dependencies:
  * stemming `pip install stemming`

In [None]:
from quantulum3 import parser

# Sample sentence for trying out the quantulum3 parser
sent = 'One subject participated and reached 95.4 % mean online accuracy after six runs of 40 trials.'
parser.parse(sent)