In [228]:
import re
from time import perf_counter
import logging

In [229]:
# Lower-case substrings searched for in the title and the abstract of every
# article (process_line lower-cases the text before matching).
search_words = ['longevity', 'lifespan', 'aging']

# Maximum number of articles (data lines of the input file) that will be processed.
LIMIT = 50 #1_000_000

# Maximum number of characters of the abstract kept in the
# 'Abstract (beginning)' column of the output file; longer abstracts are cut
# and suffixed with '…' (paragraph breakdown in the abstract is ignored).
# WARNING: very long abstracts can cause problems when viewing the TSV file
# in some programs.
CHARACTERS_IN_ABSTRACT = 1000

# Path of the TSV file where the source information is cached
# (the file name keeps its original 'cash' spelling).
FILE_CASH = 'cash-combined.tsv'

# Path of the TSV file where the search results are written.
FILE_FOUND = 'found.tsv'

# If True, print a one-line summary of every match to the console.
SHOW_IN_CONSOLE = True

In [230]:
def process_line(line, fh_found):
    """
    Search one record of the source file for the configured search words.

    The record must consist of exactly four tab-separated fields:
    id, title, date and abstract. Title and abstract are lower-cased and
    scanned for every entry of the module-level ``search_words`` list.
    On a match the record is appended to ``fh_found`` as a TSV row
    (id, title, found words, date, abstract truncated to
    ``CHARACTERS_IN_ABSTRACT`` characters) and, when ``SHOW_IN_CONSOLE``
    is true, a short summary is printed.

    :param line:      one line of source data without the trailing newline
    :param fh_found:  writable text file handle that receives matching records
    :return:          True if at least one search word was found, otherwise
                      False (also returned for malformed or empty lines)
    """

    # Try to parse the received line.
    # A malformed record (wrong number of fields, which includes a blank line)
    # is reported through logging and skipped.
    try:
        id_str, title, date, abstract = line.split('\t')
    except ValueError as ex:
        # A blank line has no first token; use a placeholder instead of
        # raising IndexError from inside the error handler.
        tokens = line.split()
        bad_id = tokens[0] if tokens else '<empty>'
        logging.error(f'Exception during processing of id {bad_id:8}: {ex}')
        return False

    # Case-insensitive substring search over title and abstract.
    # NOTE(review): plain substring matching also hits longer words
    # (e.g. 'aging' inside 'imaging') -- confirm whether that is intended.
    search_text = title.lower() + ' ' + abstract.lower()
    found_words = []
    for word in search_words:
        # The membership check keeps the list free of duplicates in case
        # search_words itself contains repeated entries.
        if word in search_text and word not in found_words:
            found_words.append(word)

    if not found_words:
        return False

    # Fields starting with a double quote get a '_' prefix
    # (presumably so TSV viewers do not treat the quote as a field
    # delimiter -- TODO confirm).
    if title.startswith('"'):
        title = '_' + title

    if abstract.startswith('"'):
        abstract = '_' + abstract

    # Keep only the beginning of long abstracts and mark the cut with '…'.
    if len(abstract) > CHARACTERS_IN_ABSTRACT:
        abstract = abstract[0:CHARACTERS_IN_ABSTRACT] + '…'

    words_joined = ", ".join(found_words)
    fh_found.write(f'{id_str}\t{title}\t{words_joined}\t{date}\t{abstract}' + '\n')

    # Optional short report: id, found words, date, title cut to 50 characters.
    if SHOW_IN_CONSOLE:
        print(f'{id_str:>8}\t{words_joined:16}\t{date}\t' +
              f'{title if len(title)<=50 else title[:49]+"…"}')

    return True

In [231]:
# Configure the logging module.
logging.basicConfig(format='%(message)s')

# Counters for processed lines and for lines where matches are found.
# 'count' is initialised here so that the final report also works when the
# input file contains no data lines after the header.
count = 0
counter_found = 0

# Open files for reading source data and writing results.
# The 'with' statement guarantees both files are closed even if processing
# raises an exception.
# NOTE(review): no explicit encoding is given, so the platform default is
# used for both files -- confirm it matches how the cache file was written.
with open(FILE_CASH, 'r') as fh_cash, open(FILE_FOUND, 'w') as fh_found:
    # Ignore the first line in the file for reading (the line with header).
    fh_cash.readline()

    # Write header to the output file.
    fh_found.write('Id\tTitle\tFound words\tDate\tAbstract (beginning)' + '\n')

    # Read and process the input file line by line, measuring time of work.
    t0 = perf_counter()
    for count, line in enumerate(fh_cash, 1):
        # Remove only the line terminator: a plain strip() would also eat the
        # trailing tab of a record with an empty abstract and make that
        # record look malformed (three fields instead of four).
        line = line.rstrip('\r\n')
        if process_line(line, fh_found):
            counter_found += 1
        if count >= LIMIT:
            break

# Report result (files are already closed at this point).
print(f'\nDone ({count} articles processed, {counter_found} articles found)')
print('Time of execution: {:.3f} sec'.format(perf_counter() - t0))

28758339	lifespan, aging 	2017-??-??	Mitofusin 1 and optic atrophy 1 shift metabolism …
28624737	aging           	2017-08-??	Optical properties of the human lens constituents.
22834969	aging           	2012-12-??	Human serum metabolic profiles are age dependent.
 3299702	aging           	1987-07-10	Human aging: usual and successful.
29266515	aging           	2018-??-??	Altered macromolecular pattern and content in the…
26198148	aging           	2016-??-??	Noncoding Transcriptional Landscape in Human Agin…
27988482	aging           	2017-03-01	Toward a More Comprehensive Concept of Successful…
26511272	aging           	2017-??-01	Aging in Precarious Circumstances: Do Positive Vi…
25620575	longevity, lifespan, aging	2015-03-??	A new hypothesis of aging.
24155214	aging           	2014-06-??	Motor function is associated with 1,25(OH)(2)D an…
28391702	aging           	2017-??-??	Aging With Purpose: Systematic Search and Review …
26463762	aging           	2016-??-??	Energetics of Aging and Fr