info: <br />
https://pypi.org/project/pubmed-lookup/ <br />
https://pypi.org/project/metapub/

In [65]:
from pubmed_lookup import PubMedLookup, Publication
import sys
import re
from time import perf_counter
from importlib import reload
import logging

In [66]:
# Maximum number of articles that will be processed in one run.
# The processing loop stops as soon as this many input lines have been read.
# NOTE(review): the trailing 1_000_000 looks like the value for a full run - confirm.
LIMIT = 50 #1_000_000

# Path of the input file with ids from PubMed, one id per line
# (the first line of the file is treated as a header and ignored).
FILE_IN = 'pmids.csv'

# Path of the tab-separated output file that receives the results
# (columns: Id, Title, Date, Abstract). It is overwritten on every run.
FILE_OUT = 'cash.tsv'

# Path of the log file where the errors will be written.
# It is overwritten on every run.
FILE_LOG = 'pmids-errors.log'

# E-mail address passed to PubMedLookup together with the article URL.
# 'NCBI will contact user by email if excessive queries are detected'
EMAIL = ''

# Show short information about found results in the console
SHOW_IN_CONSOLE = True

# Counter of errors; incremented by process_id() for every id
# whose Publication object could not be retrieved.
counter_errors = 0

In [67]:
def get_publication_object_from_id(id_str):
    """
    Retrieve the PubMed record of one article and wrap it in a Publication.

    :param id_str: id of article at PubMed (as string)
    :return:       the Publication object built from the PubMedLookup result
    :raises TypeError: if id_str is not a string
    :raises Exception: if PubMedLookup or Publication fails; the message
                       starts with the id and names the failing step, and the
                       original error is chained as the cause
    """

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not isinstance(id_str, str):
        raise TypeError('id_str is not string')

    # Retrieve a PubMed record.
    url = 'http://www.ncbi.nlm.nih.gov/pubmed/' + id_str
    try:
        lookup = PubMedLookup(url, EMAIL)
    except Exception as exc:
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate, so the run can still be stopped by the user.
        raise Exception(
            f'{id_str:8} - error in PubMedLookup: {type(exc)}: {exc}'
        ) from exc

    # Create a Publication object.
    try:
        # attribute 'resolve_doi=False' allows eliminate 'ConnectionResetError'
        # that occurs in some articles
        publ = Publication(lookup, resolve_doi=False)
    except Exception as exc:
        raise Exception(
            f'{id_str:8} - error in Publication: {type(exc)}: {exc}'
        ) from exc

    return publ

In [68]:
def _two_digits(value):
    """Format a month/day value: pad single digits with '0', use '??' if missing."""
    if value is None or value == '':
        return '??'
    if isinstance(value, int) and 1 <= value <= 9:
        return '0' + str(value)
    if isinstance(value, str) and len(value) == 1:
        return '0' + value
    return value


def process_id(id_str, fh_in, fh_out):
    """
    Fetch one article from PubMed and append its data to the output file.

    On failure the error is printed, written to the log via ``logging.error``,
    the global ``counter_errors`` is increased, and nothing is written to
    ``fh_out``.

    :param id_str: id of article at PubMed (as string)
    :param fh_in:  file handler of the input file (not used by this function,
                   kept for compatibility with existing callers)
    :param fh_out: file handler of the output file; receives one
                   tab-separated line: id, title, date, abstract
    :return:       None
    """

    global counter_errors

    # Receive Publication object from the site.
    # If exception occurs then report to console and log-file, increase 'counter_errors'
    try:
        obj = get_publication_object_from_id(id_str)
    except Exception as ex:
        counter_errors += 1
        print('\n' + str(ex) + '\n')
        logging.error(ex)
        return

    # Extract valuable fields from the Publication object.
    # Title and abstract may be missing, so fall back to an empty string
    # before calling string methods on them.
    title = obj.title or ''
    abstract = re.sub(r'[\t\n\r]', '  ', (obj.abstract or '').strip())

    # Some formatting of data values
    year = obj.year
    if year == '' or year is None:
        year = '????'
    month = _two_digits(obj.month)
    day = _two_digits(obj.day)

    # A leading double quote is prefixed with '_' in both text fields.
    if title.startswith('"'):
        title = '_' + title

    if abstract == '':
        abstract = '-'
    elif abstract.startswith('"'):
        abstract = '_' + abstract

    # Write down results to output file (using the formatted title, not the
    # raw obj.title) and, optionally, show short message in the console
    fh_out.write(f'{id_str}\t{title}\t{year}-{month}-{day}\t{abstract}' + '\n')
    if SHOW_IN_CONSOLE:
        # The running number comes from the module-level loop variable 'count';
        # fall back to 0 when the function is called outside that loop.
        position = globals().get('count', 0)
        short_title = title if len(title) <= 50 else title[:49] + "…"
        print(f'{position:4}. {id_str:>8}\t{short_title:50}\t{year}-{month}-{day}')

In [69]:
# Number of processed ids; initialised so that the final report also works
# when the input file contains no data lines.
count = 0

# Open the file with ids from PubMed, create/rewrite output-file.
# The 'with' statement closes both files even if processing is interrupted.
with open(FILE_IN, 'r') as fh_in, open(FILE_OUT, 'w') as fh_out:

    # Configure the logging module
    reload(logging)  # without this operation logging in Jupyter notebook will not write to file
    logging.basicConfig(filename=FILE_LOG, filemode='w', format='%(message)s')

    # Ignore the first line in the file with ids (the line with header).
    fh_in.readline()

    # Write header to the output file.
    fh_out.write('Id\tTitle\tDate\tAbstract' + '\n')

    # In cycle read the file line by line.
    # For each line extract id and process the corresponding article in PubMed.
    # Results are placed in the output file and id is duplicated to the console.
    # NOTE: process_id() reads the loop variable 'count' for its console output,
    # so the variable must keep this name.
    t0 = perf_counter()
    for count, line in enumerate(fh_in, 1):
        id_str = line.strip()
        process_id(id_str, fh_in, fh_out)
        if count % 100 == 0:
            print(f'{count:4} articles processed, {(perf_counter() - t0):.1f} sec')
        if count >= LIMIT:
            break

# Report result (both files are already closed here).
print(f'\nDone ({count} ids processed, including {counter_errors} with errors)')
print('Time of execution: {:.1f} sec'.format(perf_counter() - t0))

   1. 28758339	Mitofusin 1 and optic atrophy 1 shift metabolism …	2017-??-??
   2. 28624737	Optical properties of the human lens constituents.	2017-08-??
   3. 22834969	Human serum metabolic profiles are age dependent. 	2012-12-??
   4.  3299702	Human aging: usual and successful.                	1987-07-10
   5. 29266515	Altered macromolecular pattern and content in the…	2018-??-??
   6. 26198148	Noncoding Transcriptional Landscape in Human Agin…	2016-??-??
   7. 27988482	Toward a More Comprehensive Concept of Successful…	2017-03-01
   8. 26511272	Aging in Precarious Circumstances: Do Positive Vi…	2017-??-01
   9. 25620575	A new hypothesis of aging.                        	2015-03-??
  10. 24155214	Motor function is associated with 1,25(OH)(2)D an…	2014-06-??
  11. 28391702	Aging With Purpose: Systematic Search and Review …	2017-??-??
  12. 30078097	The impact of cognitive impairment on the physica…	2018-11-??
  13. 25545014	Diadenosine polyphosphates in the tears of anirid…	2015-08-??

---

In [71]:
# This code helps to investigate the errors that occur while processing some ids.
# (e.g. id=28216301 - <class 'TypeError'>: 'string indices must be integers')
# List of problematic ids: 28216301, 30707652, 28585053, 23847372, 2046973, 342621,
# 7020870, 1956599, 1204687, 6504933, 7020870, 1956599, 1204687, 6504933, 30300464,
# 30300464, 28462727, 28474412, 566404, 29538798, 28762630

# url = 'http://www.ncbi.nlm.nih.gov/pubmed/28216301'
# lookup = PubMedLookup(url, '')              # 1st query - this works

# try:
#     Publication(lookup)  # error is here    # 2nd query - error
# except:
#     print(sys.exc_info()[0])
#     print(sys.exc_info()[1])
#     print(sys.exc_info())
#     print()
#     print(traceback.print_exc())

<class 'TypeError'>
string indices must be integers
(<class 'TypeError'>, TypeError('string indices must be integers'), <traceback object at 0x7fd3f35d38c8>)



NameError: name 'traceback' is not defined