In [None]:
!pip install pubmed-scraper pymed lxml progress
!pip install attrs certifi ipython pandas progress pytest
!pip install requests urllib3 pyaml

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the tab-separated file at /data/pubmed-data.tsv into a DataFrame.
pubmed_data = pd.read_csv("/data/pubmed-data.tsv", sep="\t")

In [None]:
"""
An implementation of the Token Bucket algorithm
Author Dan Davis
"""
# NOTE: This code was adapted from https://github.com/porterjamesj/tokenbucket/blob/master/tokenbucket.py,
#       but modified to add a backoff that appears to be needed on Windows.
import time
from threading import Lock
from requests import Session


# Names exported when this code is star-imported as a module.
__all__ = (
    'TokenBucket',
    'RateLimitedSession',
)


class TokenBucket(object):
    """
    A token bucket that refills continuously.

    The bucket gains `rate` tokens per second of monotonic clock time and
    never holds more than `capacity` tokens. All reads and updates of the
    token count happen while `lock` is held.
    """

    def __init__(self, rate=1, tokens=0, capacity=100):
        # fixed configuration
        self._rate = rate
        self._capacity = capacity
        # state that changes over time
        self._tokens = tokens
        self._time = time.monotonic()
        self.lock = Lock()

    @property
    def rate(self):
        """Tokens credited per second."""
        return self._rate

    @property
    def capacity(self):
        """Largest number of tokens the bucket can hold."""
        return self._capacity

    def _adjust(self):
        """
        Credit the tokens earned since the previous adjustment and move the
        reference time forward. Callers are expected to hold `lock`.
        """
        previous, self._time = self._time, time.monotonic()
        earned = (self._time - previous) * self._rate
        self._tokens = min(self._capacity, self._tokens + earned)

    @property
    def tokens(self):
        """
        Current token count, brought up to date before it is returned.
        """
        with self.lock:
            self._adjust()
            return self._tokens

    def consume(self, tokens):
        """
        Take `tokens` tokens out of the bucket. If that leaves the bucket in
        deficit, sleep (still holding the lock) until the deficit is repaid.
        """
        with self.lock:
            self._adjust()
            self._tokens -= tokens
            attempt = 0
            while self._tokens < 0:
                # Each pass waits a larger multiple of the time the current
                # deficit needs, in case the previous sleep returned early.
                attempt += 1
                deficit = -self._tokens
                time.sleep(attempt * (deficit / self._rate))
                self._adjust()


class RateLimitedSession(Session):
    """
    A `requests` session that takes one token from a TokenBucket before
    sending each request.
    """

    def __init__(self, session=None, tokenbucket=None, rate=1, tokens=0, capacity=100, backoff=2.0,
                 *args, **kwargs):
        """Set up the session and the token bucket that throttles it.

        Notes
        ~~~~~

        * `rate`, `tokens`, and `capacity` are only used to build a bucket when no `tokenbucket` is given.
        * When `session` is given, requests are sent through it instead of through this object.
        * `backoff` is the number of seconds slept before the single retry that follows an HTTP 429 response.
        """
        super(RateLimitedSession, self).__init__(*args, **kwargs)
        self.session = session
        self.backoff = backoff
        self.tokenbucket = (
            TokenBucket(rate=rate, tokens=tokens, capacity=capacity)
            if tokenbucket is None
            else tokenbucket
        )

    def request(self, *args, **kwargs):
        """Send a request with the same call signature as Session.request.

        Every higher-level helper (e.g. Session.get) ends up here, so this is
        the single place where the rate limit is applied.
        """
        if self.session:
            send = self.session.request
        else:
            send = super(RateLimitedSession, self).request
        self.tokenbucket.consume(1)
        response = send(*args, **kwargs)
        if response.status_code != 429:
            return response
        # The server asked us to slow down: wait, then retry exactly once.
        # The retry does not take another token from the bucket.
        time.sleep(self.backoff)
        return send(*args, **kwargs)

In [None]:
"""
Author Dan Davis
"""

from requests.adapters import HTTPAdapter
from urllib.parse import quote
from types import MethodType
import re
from collections import OrderedDict
from lxml import etree
from io import BytesIO


# Default URL prefix (scheme, host, and leading path) for eutils
EUTILS_PREFIX = 'https://eutils.ncbi.nlm.nih.gov/entrez'

# URL template for eutils: filled with the prefix and the utility script name (e.g. esearch.fcgi)
EUTILS_URL = '{}/eutils/{}'


class EUtils(object):
    """
    An abstraction that wraps the NCBI E-Utilities

    Each public method sends one HTTP GET through `self.session` and returns
    the response object. The response always gets an `xml` attribute: the
    parsed lxml tree when the server declares a `text/xml` body, else None.
    `esearch` and `epost` additionally set `webenv` and `query_key` (the text
    of the WebEnv / QueryKey elements, or None when absent).
    """
    def __init__(self, apikey=None, email=None, rate=3, prefix=None, session=None):
        """
        :param apikey: sent as `api_key` on every request when truthy
        :param email: stored on the instance; not added to requests by this class
        :param rate: requests per second for the session created when `session` is not given
        :param prefix: URL prefix; falls back to EUTILS_PREFIX when falsy
        :param session: session to send requests with; a RateLimitedSession is built when falsy
        """
        self.apikey = apikey
        self.email = email
        self.rate = rate
        self.prefix = prefix if prefix else EUTILS_PREFIX
        if not session:
            session = RateLimitedSession(rate=rate, tokens=rate, capacity=rate)
            session.mount('https://', HTTPAdapter(max_retries=3, pool_maxsize=10))
        self.session = session

    def params(self, db=None, **kwargs):
        """
        Build the URL query string for a request.

        Keyword arguments come first (in the order given), then `db` and
        `api_key` when set. Values are percent-encoded with `quote`; values
        that are neither str nor bytes are converted with `str()` first so
        callers may pass numbers directly. Keys are emitted as-is.
        """
        params = dict(kwargs)
        if db:
            params['db'] = db
        if self.apikey:
            params['api_key'] = self.apikey
        return '&'.join(
            '{}={}'.format(key, quote(value if isinstance(value, (str, bytes)) else str(value)))
            for key, value in params.items()
        )

    def _get(self, utility, params):
        """
        GET `<prefix>/eutils/<utility>?<params>`, raise on an HTTP error status,
        and attach the parsed body as `r.xml` (None when not `text/xml`).
        """
        url = EUTILS_URL.format(self.prefix, utility) + '?' + params
        r = self.session.get(url)
        r.raise_for_status()
        # A response without a Content-Type header is treated as non-XML
        # instead of raising KeyError.
        content_type = r.headers.get('Content-Type', '')
        r.xml = etree.parse(BytesIO(r.content)) if content_type.startswith('text/xml') else None
        return r

    @staticmethod
    def _attach_history(r, root):
        """
        Set `r.webenv` and `r.query_key` from the `/<root>/WebEnv` and
        `/<root>/QueryKey` elements of `r.xml`, or to None when unavailable.
        """
        for attr, tag in (('webenv', 'WebEnv'), ('query_key', 'QueryKey')):
            # lxml trees must be compared with `is not None`, not truthiness.
            found = r.xml.xpath('/{}/{}'.format(root, tag)) if r.xml is not None else []
            setattr(r, attr, found[0].text if found else None)

    def einfo(self, db=None, **kwargs):
        """Call einfo.fcgi; extra keyword arguments become query parameters."""
        return self._get('einfo.fcgi', self.params(db, retmode='xml', **kwargs))

    def esearch(self, db, history=True, webenv=None, query_key=None, retmax=20, **kwargs):
        """
        Call esearch.fcgi.

        `usehistory=y` is sent when `history` is true or a `webenv` is given;
        `WebEnv` and `query_key` are forwarded only in that case. Extra keyword
        arguments (e.g. `term`, `sort`) become query parameters.
        """
        if history or webenv:
            kwargs['usehistory'] = 'y'
            if webenv:
                kwargs['WebEnv'] = webenv
            if query_key:
                kwargs['query_key'] = query_key
        r = self._get('esearch.fcgi', self.params(db, retmode='xml', retmax=str(retmax), **kwargs))
        self._attach_history(r, 'eSearchResult')
        return r

    def efetch(self, db, *args, webenv=None, query_key=None, retmax=20, **kwargs):
        """
        Call efetch.fcgi.

        Positional arguments after `db` are joined with commas into the `id`
        parameter. When `webenv` is given it is forwarded as `WebEnv`, along
        with `query_key` when set.
        """
        if webenv:
            kwargs['usehistory'] = 'y'
            kwargs['WebEnv'] = webenv
            if query_key:
                kwargs['query_key'] = query_key
        if args:
            idlist = ','.join(str(arg) for arg in args)
            params = self.params(db, retmode='xml', retmax=str(retmax), id=idlist, **kwargs)
        else:
            params = self.params(db, retmode='xml', retmax=str(retmax), **kwargs)
        return self._get('efetch.fcgi', params)

    def epost(self, db, *args, webenv=None, **kwargs):
        """
        Call epost.fcgi with the positional arguments joined into `id`.

        NOTE(review): the id list travels in the URL of a GET request; NCBI's
        guidance for large UID lists should be checked before posting
        thousands of ids this way.
        """
        idlist = ','.join(str(arg) for arg in args)
        if webenv:
            kwargs['WebEnv'] = webenv
        r = self._get('epost.fcgi', self.params(db, id=idlist, retmode='xml', **kwargs))
        self._attach_history(r, 'ePostResult')
        return r

In [None]:
"""
Author Dan Davis
"""

import sys
from datetime import datetime
from pathlib import Path
from typing import Optional

import pandas as pd
from attrs import define, field
from numpy.random import RandomState
from progress.bar import Bar
from yaml import safe_load

# URL prefix of the E-utilities preview host; Pipeline passes it to EUtils as `prefix`.
PREVIEW_PREFIX = 'https://eutilspreview.ncbi.nlm.nih.gov/entrez'

@define
class Config:
    """
    Settings for a pipeline run.

    Build instances with `Config.load()`, which starts from `get_defaults()`
    and optionally overlays values read from a YAML file.
    """
    # Passed to EUtils as the API key; None sends no key.
    api_key: Optional[str]
    # Passed to EUtils as the contact email.
    email: Optional[str]
    # Passed to EUtils as the request rate.
    rate_limit: int
    # Number of rows sampled from the data file in stage 1.
    num_queries: int
    # `retmax` used for each search in stage 1.
    num_results: int
    # Location and field separator of the delimited data file.
    data_path: str
    data_sep: str
    # Directory under which each run creates its own result directory.
    result_path: str
    # Location of the hedges CSV.
    hedge_path: str
    # Seed for sampling; negative values mean "unseeded".
    seed: int

    @property
    def random_state(self):
        """
        A fresh RandomState seeded with `seed`, or None when `seed` is negative.

        Zero is a usable seed, so only values below zero disable seeding
        (matching the note on the default in `get_defaults`).
        """
        return RandomState(self.seed) if self.seed >= 0 else None

    @staticmethod
    def get_defaults():
        """Return a new dict holding the default value of every setting."""
        return {
            'num_queries': 1000,
            'num_results': 200,
            'seed': -1,                             # seed below 0 is ignored
            'data_path': '/data/pubmed-data.tsv',
            'data_sep': '\t',
            'result_path': '/data/team4/results',
            'hedge_path': '/data/team4/hedges.csv',
            'api_key': None,
            'email': None,
            'rate_limit': 3
        }

    @classmethod
    def load(cls, path=None):
        """
        Build a Config from the defaults, overridden by the YAML file at `path`.

        :param path: YAML file holding a mapping of setting names to values;
            None (or an empty file) means "defaults only".
        :raises ValueError: if the file is not a mapping or names a setting
            that does not exist.
        """
        cls_kwargs = cls.get_defaults()
        if path is not None:
            with open(str(path)) as f:
                overrides = safe_load(f)
            # An empty YAML document parses to None; treat it as "no overrides".
            if overrides is None:
                overrides = {}
            if not isinstance(overrides, dict):
                raise ValueError(f'{path}: expected a mapping of settings')
            unknown = sorted(str(key) for key in overrides if key not in cls_kwargs)
            if unknown:
                raise ValueError(f'{path}: invalid setting encountered: {", ".join(unknown)}')
            cls_kwargs.update(overrides)
        return cls(**cls_kwargs)


class Pipeline:
    """
    Runs the query pipeline described by a Config.

    Stage 1 samples queries from the data file and stores, for each one, the
    raw esearch responses for two sort orders on disk.
    """
    def __init__(self, config: "Config"):
        self.config = config
        self.hedge = None
        self.data = None
        self.eutils = EUtils(config.api_key, config.email, config.rate_limit, PREVIEW_PREFIX)

    def load_hedges(self, hedge_path: Optional[str] = None):
        """Read the hedges CSV (default: `config.hedge_path`), indexed by `BiasDimension`."""
        if hedge_path is None:
            hedge_path = self.config.hedge_path
        # TODO: validate expected columns and types?
        return pd.read_csv(hedge_path, index_col='BiasDimension')

    def load_data(self, data_path: Optional[str] = None):
        """
        Read the delimited data file (default: `config.data_path`) using
        `config.data_sep`, keeping only rows whose `query_term` is present.
        """
        if data_path is None:
            data_path = self.config.data_path
        df = pd.read_csv(data_path, sep=self.config.data_sep)
        # remove rows without query
        df = df[df.query_term.notnull()]
        return df

    def stage1(self, result_path=None):
        """
        Sample `config.num_queries` queries and save their search results.

        For every sampled row a directory named after the row's index is
        created below the run directory, holding `relevance.xml` and
        `datedesc.xml` (the raw response bodies for the two sort orders).

        :param result_path: name of the run directory below
            `config.result_path`; defaults to the current timestamp.
        :returns: 0 on completion.
        :raises FileExistsError: if the run directory already exists.
        """
        # prepare the run
        self.hedge = self.load_hedges()
        self.data = self.load_data()

        queries = self.data.sample(self.config.num_queries, random_state=self.config.random_state)
        # drop every column except these to make this tractable
        keep = {'search_id', 'query_term', 'result_count'}
        queries = queries.drop(columns=[column for column in queries.columns if column not in keep])

        # setup the result directory
        if result_path is None:
            # NOTE(review): ':' is not allowed in Windows file names - confirm
            # this default is acceptable on the platforms the pipeline runs on.
            result_path = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
        result_path = Path(self.config.result_path) / result_path
        # Create the parent results directory too on a first run; an existing
        # run directory is still an error so earlier results are not overwritten.
        result_path.mkdir(parents=True)

        # progress bar
        progress = Bar('Stage 1', max=len(queries))

        # (sort order sent to esearch, file the response is saved to)
        outputs = (
            ('relevance', 'relevance.xml'),
            ('date_desc', 'datedesc.xml'),
        )

        # for each query
        eutils = self.eutils
        num_results = self.config.num_results
        for index, row in queries.iterrows():
            query_term = row['query_term']

            # that query gets a directory
            query_result_path = result_path / str(index)
            query_result_path.mkdir()

            for sort, filename in outputs:
                r = eutils.esearch('pubmed', retmax=num_results, term=query_term, sort=sort)
                # r.content is bytes, so it must be written in binary mode
                (query_result_path / filename).write_bytes(r.content)

            progress.next()
        progress.finish()
        return 0

In [None]:
# Interactive client used by the cells below.
# NOTE(review): the API key is blank here, so no `api_key` parameter is sent;
# confirm that 10 calls per second is permitted without a key.
eutils = EUtils(
    apikey='',                                              # API key
    email='',                                               # Email address - unused
    rate=10,                                                # API calls per second
    prefix='https://eutilspreview.ncbi.nlm.nih.gov/entrez', # URL prefix for preview - normally not needed
)

In [None]:
# Show whether the response held in `r` declares an XML body.
# NOTE(review): no cell above this one assigns `r`; this relies on a cell
# further down having been run first.
r.headers['Content-Type'].startswith('text/xml')

In [None]:
# Relevance-sorted search limited to 200 ids; then show the WebEnv and
# QueryKey values returned with it, one per line.
r = eutils.esearch(
    'pubmed',
    term='African Americans',
    sort='relevance',
    retmax=200,
)
print(r.webenv, r.query_key, sep='\n')

In [None]:
# Same search without an explicit sort order, asking for up to 5,000,000 ids.
# NOTE(review): confirm the server honours a `retmax` this large.
r = eutils.esearch('pubmed', term='African Americans', retmax=5000000)
# NOTE(review): `print_element` is not defined in any cell shown here -
# confirm it exists in the kernel.
print_element(r.xml)

In [None]:
# Hard-coded list of 40 PMIDs (as strings). The alternative is to take them
# from the last search response:
#   pmids = [element.text for element in r.xml.xpath('//IdList/Id')]
pmids = [
    '18538731', '31761807', '28244479', '29949179', '27741350',
    '28574057', '27839715', '33024261', '24274180', '29770138',
    '33173229', '26111503', '29217838', '29397066', '30480571',
    '28351568', '16892035', '30593508', '29032295', '23266771',
    '35612878', '35612872', '35612856', '35612854', '35612853',
    '35612849', '35612841', '35612839', '35612832', '35612829',
    '35612824', '35612815', '35612791', '35612789', '35612777',
    '35612774', '35612771', '35612766', '35612761', '35612745',
]

In [None]:
# Relevance-sorted search limited to 5000 ids; then show the WebEnv and
# QueryKey values returned with it, one per line.
r = eutils.esearch(
    'pubmed',
    term='African Americans',
    sort='relevance',
    retmax=5000,
)
print(r.webenv, r.query_key, sep='\n')

In [None]:
# Collect the text of every IdList/Id element in the response held in `r`.
pmids = [element.text for element in r.xml.xpath('//IdList/Id')]

# Send those ids to epost, reusing the WebEnv of the previous response.
# `r.webenv` is read before `r` is rebound to the epost response.
# NOTE(review): EUtils.epost puts the whole id list in a GET URL - confirm
# that is acceptable for a list this long.
r = eutils.epost('pubmed', *pmids, webenv=r.webenv)
# NOTE(review): `print_element` is not defined in any cell shown here -
# confirm it exists in the kernel.
print_element(r.xml)

In [None]:
# Search for 'cancer' sorted by date, passing an existing WebEnv / query key.
# NOTE(review): in file order `r3` is first assigned in a later cell, and `r2`
# is read on the right-hand side of its own first assignment, so this cell
# only runs if both names already exist in the kernel - confirm the intended
# execution order.
r2 = eutils.esearch('pubmed', term='cancer', sort='date', webenv=r3.webenv, query_key=r2.query_key, retmax=500)

# NOTE(review): `print_element` is not defined in any cell shown here.
print_element(r2.xml)

In [None]:
# Send the ids in `pmids2` to epost, reusing the WebEnv stored on `r3`.
# NOTE(review): `pmids2` is not assigned in any cell shown here, and `r3` is
# read on the right-hand side of its own first visible assignment - both must
# already exist in the kernel for this cell to run.
r3 = eutils.epost('pubmed', *pmids2, webenv=r3.webenv)

# NOTE(review): `print_element` is not defined in any cell shown here.
print_element(r3.xml)

In [None]:
# NOTE(review): the `ET` alias is not used by any cell shown here.
import lxml.etree as ET

# Write the XML tree attached to `r3` to a file in the working directory.
# NOTE(review): the file name suggests search results, but the only visible
# assignment of `r3` is an epost response - confirm which response is meant.
filename = 'term_cancer_sort_date.xml'
r3.xml.write(filename)

In [None]:
# Render a link to the XML file in the notebook output, prefixed with "download:".
from IPython.display import display, FileLink
local_file = FileLink('./term_cancer_sort_date.xml', result_html_prefix="download:")
display(local_file)