Info: PubMed client libraries on PyPI (this notebook uses `pubmed-lookup`): <br />
https://pypi.org/project/pubmed-lookup/ <br />
https://pypi.org/project/metapub/

In [2]:
from pubmed_lookup import PubMedLookup, Publication
import sys
import re
from time import time

In [160]:
# --- What to look for --------------------------------------------------------

# Keywords that are searched for in every abstract.
search_words = [
    'longevity',
    'lifespan',
    'aging',
]

# Upper bound on the number of articles to process.
LIMIT = 50  # 1_000_000

# The 'abstract' column of the output file is cut to this many characters
# (paragraph breaks inside the abstract are not preserved).
CHARACTERS_IN_ABSTRACT = 1_000

# --- Files -------------------------------------------------------------------

# Input: file with PubMed ids (its first line is skipped).
FILE_IN = 'pmids.csv'

# Output: file that receives the filtering results.
FILE_OUT = 'pmids-found.tsv'

# Log: file that receives the error messages.
FILE_LOG = 'pmids-errors.log'

# --- NCBI --------------------------------------------------------------------

# 'NCBI will contact user by email if excessive queries are detected'
EMAIL = ''

In [161]:
def get_publication_object_from_id(id_str):
    """
    Fetch the PubMed record of an article and wrap it in a Publication object.

    :param id_str: id of article at PubMed (as string)
    :return:       the Publication object from the ncbi.nlm.nih.gov.
    :raises AssertionError: if id_str is not a string.
    :raises Exception: if PubMedLookup or Publication fails. The message holds
                       the id, the failing step and the original error; the
                       original error is chained as ``__cause__``.
    """

    assert isinstance(id_str, str), 'id_str is not string'

    # Retrieve a PubMed record.
    # Only Exception is caught (not a bare except), so KeyboardInterrupt and
    # SystemExit are not turned into an ordinary error and a long run can
    # still be stopped by the user.
    try:
        url = 'http://www.ncbi.nlm.nih.gov/pubmed/' + id_str
        lookup = PubMedLookup(url, EMAIL)
    except Exception as exc:
        raise Exception(
            f'{id_str:8} - error in PubMedLookup: {type(exc)}: {exc}'
        ) from exc

    # Create a Publication object.
    try:
        # attribute 'resolve_doi=False' allows eliminate 'ConnectionResetError'
        # that occurs in some articles
        publ = Publication(lookup, resolve_doi=False)
    except Exception as exc:
        raise Exception(
            f'{id_str} - error in Publication: {type(exc)}: {exc}'
        ) from exc

    return publ

In [162]:
def process_id(id_str, fh_in, fh_out, fh_log=None, count=None):
    """
    Fetch one PubMed article and record it if its abstract mentions a search word.

    :param id_str: id of article at PubMed (as string)
    :param fh_in:  file handler of the input file; not used by this function,
                   kept so that existing calls stay valid
    :param fh_out: file handler of the output file; receives one tab-separated
                   line (id, title, found words, beginning of abstract) per match
    :param fh_log: optional file handler of the log file; when None, errors
                   are only printed to the console
    :param count:  optional ordinal of the article for the console message;
                   when omitted, the module-level variable ``count`` is used
                   if it exists, otherwise the ordinal is left out
    :return:       None
    """

    # Receive Publication object from the site.
    try:
        obj = get_publication_object_from_id(id_str)
    except Exception as ex:
        print('\n' + str(ex) + '\n')
        # fh_log is optional, so only write when a log file was supplied.
        if fh_log is not None:
            fh_log.write(str(ex) + '\n')
        return

    # Check abstract for presence of search words (case-insensitive on both
    # sides). dict.fromkeys drops duplicate search words but keeps their order.
    # `or ''` guards against a missing abstract - assumption: the library may
    # return None here; harmless if it never does.
    abstract_full = (obj.abstract or '').strip()
    abstract_lower = abstract_full.lower()
    found_words = [
        word for word in dict.fromkeys(search_words)
        if word.lower() in abstract_lower
    ]
    if not found_words:
        return

    # Tabs and line breaks would break the TSV layout, so they are replaced
    # in both free-text columns.
    abstract = re.sub(r'[\t\n\r]', '  ', abstract_full)
    if len(abstract) > CHARACTERS_IN_ABSTRACT:
        abstract = abstract[0:CHARACTERS_IN_ABSTRACT] + '…'
    title = re.sub(r'[\t\n\r]', '  ', str(obj.title))
    fh_out.write(f'{id_str}\t{title}\t{", ".join(found_words)}\t{abstract}' + '\n')

    # Show message in the console. Fall back to the module-level loop counter
    # for callers that do not pass `count` explicitly.
    if count is None:
        count = globals().get('count')
    prefix = f'{count:4}. ' if count is not None else ''
    print(f'{prefix}{id_str:>8} - found')

In [163]:
# Read ids from FILE_IN, look each article up in PubMed and store the matches
# in FILE_OUT; errors go to FILE_LOG.

# `count` is a module-level name on purpose: process_id reads it for its
# console message. Initialising it keeps the final report working when the
# input file contains no id lines at all.
count = 0

# `with` closes all three files even if processing is interrupted or fails.
# Output and log are written as UTF-8 because abstracts (and the '…' marker)
# may contain characters the platform default encoding cannot represent.
with open(FILE_IN, 'r') as fh_in, \
        open(FILE_OUT, 'w', encoding='utf-8') as fh_out, \
        open(FILE_LOG, 'w', encoding='utf-8') as fh_log:

    # Ignore the first line in the file with ids (the line with header).
    fh_in.readline()

    # Write header to the output file.
    fh_out.write('Id\tTitle\tFound words\tAbstract (beginning)' + '\n')

    # In cycle read the file line by line.
    # For each line extract id and process the corresponding article in PubMed.
    # Results are placed in the output file and id is duplicated to the console.
    t0 = time()
    for count, line in enumerate(fh_in, 1):
        id_str = line.strip()
        # Skip blank lines instead of querying PubMed with an empty id.
        if id_str:
            process_id(id_str, fh_in, fh_out, fh_log)
        if count >= LIMIT:
            break

# Report about finishing of the program.
print(f'\nDone ({count} articles were processed)')
print('Time of execution: {:.1f} sec'.format(time() - t0))

   1. 28758339 - found
   2. 28624737 - found
   3. 22834969 - found
   4.  3299702 - found
   5. 29266515 - found
   6. 26198148 - found
   7. 27988482 - found
   8. 26511272 - found
   9. 25620575 - found
  10. 24155214 - found
  17. 26463762 - found
  21. 28111190 - found
  22. 26973114 - found
  23. 29244847 - found
  29. 28872043 - found

28216301 - error in Publication: <class 'TypeError'>: string indices must be integers

  34. 24457528 - found
  36. 30411727 - found
  38. 25468185 - found
  39. 28510785 - found
  43. 23273984 - found
  44. 24793646 - found

30707652 - error in Publication: <class 'TypeError'>: string indices must be integers

  48. 28982708 - found
  49. 29654977 - found

Done (50 articles were processed)
Time of execution: 78.0 sec


---

In [3]:
# # <class 'TypeError'>: 'string indices must be integers':

# url = 'http://www.ncbi.nlm.nih.gov/pubmed/28216301'
# lookup = PubMedLookup(url, '')              # 1st query - this works

# try:
#     Publication(lookup)  # error is here    # 2nd query - error
# except:
#     print(sys.exc_info()[0])
#     print(sys.exc_info()[1])
#     print(sys.exc_info())
#     print()
#     print(traceback.print_exc())  # NOTE: raises NameError unless `traceback` is imported first

<class 'TypeError'>
string indices must be integers
(<class 'TypeError'>, TypeError('string indices must be integers'), <traceback object at 0x7feacc05a908>)



NameError: name 'traceback' is not defined