In [0]:
import re
import string
import collections
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import numpy as np 
import string
import random
import pickle

from collections import Counter
from collections import defaultdict

from nltk.corpus import brown
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer

from wordcloud import WordCloud

In [0]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
porter=PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [0]:
# Read Excel data as Data Frame
def readExcelToDataFrame(path):
    research_dataframe = pd.read_csv(path,index_col=False)
    research_dataframe.drop(research_dataframe.columns[research_dataframe.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
    return research_dataframe

In [0]:
research_dataframe = readExcelToDataFrame('/content/drive/My Drive/Colab Notebooks/kaggle_covid_19.csv')
research_dataframe.head()

Unnamed: 0,doc_id,source,title,abstract,text_body
0,,,,,
1,850dc8add6efb76a6dc341cb7a5a236c1dd3da7c,BIORXIV,Expanded skin virome in DOCK8-deficient patients,,"recurrent cutaneous and systemic infections, a..."
2,dd6c1a719bd75ba7fb7c62291a676b39c99bb0bc,BIORXIV,Veterinary Science Molecular characterization ...,The cDNA nucleotide sequence of genome segment...,Members of the family Birnaviridae have 2-segm...
3,e88710cd7b238794bd75f21d308bc5442ad9713f,BIORXIV,,,The common cold is one of the most prevalent a...
4,d46d2f01a570bfb6e3b2e65ec50972d1649b6a3d,BIORXIV,Severe Acute Respiratory Syndrome: Lessons fro...,An outbreak of severe acute respiratory syndro...,A n outbreak of severe acute respiratory syndr...


In [0]:
meta_data = readExcelToDataFrame('/content/drive/My Drive/Colab Notebooks/metadata.csv')

In [0]:
def cleanEmptyData(columnName,df):
    return df[df[columnName].notnull()]

In [0]:
research_dataframe = cleanEmptyData('text_body',research_dataframe)

In [0]:
meta_data["doc_id"] = meta_data["sha"]

In [0]:
meta_data = cleanEmptyData('doc_id', meta_data)
meta_data = cleanEmptyData('publish_time', meta_data)
meta_data = meta_data[meta_data['publish_time'].str.contains('2019') | meta_data['publish_time'].str.contains('2020')]

In [0]:
research_dataframe = cleanEmptyData('doc_id', research_dataframe)
research_dataframe = cleanEmptyData('text_body', research_dataframe)
df  = research_dataframe.merge(meta_data, on='doc_id', how='right')
df = df.drop(["source", "abstract_x", "cord_uid", "abstract_x","sha","source_x","title_y","pmcid","pubmed_id","license","abstract_y","journal","Microsoft Academic Paper ID","WHO #Covidence","has_pdf_parse","has_pmc_xml_parse","full_text_file"], axis=1)

df = cleanEmptyData('text_body', df)
df = df.rename(columns={'title_x': 'title'})
columns = ["doc_id","doi", "publish_time", "authors","url","title", "text_body"]
df = df.reindex(columns=columns)

In [0]:
f = open("/content/drive/My Drive/Colab Notebooks/df_24042020.pkl", "wb")
pickle.dump(df, f)
f.close()

In [0]:
target_text = df['text_body']

In [0]:
target_text = target_text.replace(np.nan, '', regex=True)

In [0]:
def prepare_sentence(data):
  data = data.replace(np.nan, '', regex=True)
  ## ! Check without removing the '\n' Keep original sentences and tokenize
  return [sent_tokenize(target_text[i]) for i in range(len(target_text)) if len(target_text[i]) != 0]

In [0]:
sentences = prepare_sentence(target_text)

In [0]:
## Save sentences file
f = open("/content/drive/My Drive/Colab Notebooks/sentences_24042020.pkl","wb")
pickle.dump(sentences, f)
f.close()

In [0]:
target_text = [x.replace('\n', ' ') for x in target_text]

In [0]:
target_text = [re.sub(r'[^a-zA-Z]', ' ', str(x)) for x in target_text]

In [0]:
target_text = [re.sub(r'\b[a-zA-Z]\b', '', str(x)) for x in target_text]

In [0]:
target_text = ["".join(j for j in i if j not in string.punctuation) for i in target_text]

In [0]:
target_text = [word_tokenize(doc) for doc in set(target_text)]

In [0]:
target_text = [[word.lower() for word in doc] for doc in target_text]

In [0]:
stop_words = set(stopwords.words('english'))

In [0]:
target_text = [[word for word in doc if word not in stop_words] for doc in target_text]

In [0]:
lemmatizer = WordNetLemmatizer() 
target_text = [[lemmatizer.lemmatize(word) for word in doc] for doc in target_text]

In [0]:
num_of_words = Counter(word for doc in target_text for word in set(doc))

In [0]:
 num_of_words_sorted = [(l,k) for k,l in sorted([(j,i) for i,j in num_of_words.items()], reverse=True)]

In [0]:
words_low_freq = [word[0] for word in num_of_words_sorted if word[1] == 1]

In [0]:
words_low_freq = set(words_low_freq)

In [0]:
target_text = [[word for word in doc if word not in words_low_freq] for doc in target_text]

In [0]:
def data_cleaning(target_text):
  ## Clean text
  ## Replace '\n' by ' '
  target_text = [x.replace('\n', ' ') for x in target_text]
  # Keep characters only
  target_text = [re.sub(r'[^a-zA-Z]', ' ', str(x)) for x in target_text]
  # Remove single characters
  target_text = [re.sub(r'\b[a-zA-Z]\b', '', str(x)) for x in target_text]
  # Remove punctuation (!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)
  target_text = ["".join(j for j in i if j not in string.punctuation) for i in target_text]
  ## Tokenize words
  target_text = [word_tokenize(doc) for doc in set(target_text)]
  ## Lower case words for all docs
  target_text = [[word.lower() for word in doc] for doc in target_text]
  ## Remove stop words from all docs
  stop_words = set(stopwords.words('english'))
  target_text = [[word for word in doc if word not in stop_words] for doc in target_text]
  ## lemmatization
  lemmatizer = WordNetLemmatizer() 
  target_text = [[lemmatizer.lemmatize(word) for word in doc] for doc in target_text]
  ## Remove words that occur only once in all documents
  # Check frequency of words and sort from high to low
  num_of_words = Counter(word for doc in target_text for word in set(doc))
  # num_of_words_sorted = OrderedDict(num_of_words.most_common())
  num_of_words_sorted = [(l,k) for k,l in sorted([(j,i) for i,j in num_of_words.items()], reverse=True)]
  # All words with a frequency of 1 (word[0] is a word and word[1] the frequency)
  words_low_freq = [word[0] for word in num_of_words_sorted if word[1] == 1]
  # Set to increase speed
  words_low_freq = set(words_low_freq)
  # Remove words with a frequency of 1 (this takes a while) = this takes too much time
  target_text = [[word for word in doc if word not in words_low_freq] for doc in target_text]
  return target_text

In [0]:
target_text = data_cleaning(target_text)

In [0]:
## Save target_text file
f = open("/content/drive/My Drive/Colab Notebooks/target_data_24042020.pkl", "wb")
pickle.dump(target_text, f)
f.close()

In [0]:
# =============================================================================
# PART IV: Vectorize (and calculate TF-IDF)
# ============================================================================
texts_flattened = [" ".join(x) for x in target_text]

In [0]:
# Include with token_pattern also single characters
vectorizer = TfidfVectorizer(lowercase=False, stop_words=None, token_pattern=r"(?u)\b\w+\b")

In [0]:
vectors = vectorizer.fit_transform(texts_flattened)

In [0]:
feature_names = vectorizer.get_feature_names()

In [0]:
dense = vectors.todense()

In [0]:
## Dictionary of unique words as values
word2idx = dict(zip(feature_names, range(len(feature_names))))

In [0]:
# Save word2idx file
f = open("/content/drive/My Drive/Colab Notebooks/word2idx_24042020.pkl", "wb")
pickle.dump(word2idx, f)
f.close()

In [0]:
# Dictionary with the unique words as keys
idx2word = {v:k for k,v in word2idx.items()}

## Save idx2word file
f = open("/content/drive/My Drive/Colab Notebooks/idx2word_24042020.pkl", "wb")
pickle.dump(idx2word, f)
f.close()

In [0]:
## word2idx all feature_names 
features_names_num = [word2idx[feature] for feature in feature_names]

In [0]:

## Calculate tfidf
# df_tfidf = pd.DataFrame(dense, columns=feature_names)
df_tfidf = pd.DataFrame(dense, columns=features_names_num)

In [0]:
## word2idx for all words in plot_data
target_text = [[word2idx.get(word) for word in line] for line in target_text]


In [0]:
# Save plot_data_num file
f = open("/content/drive/My Drive/Colab Notebooks/plot_data_24042020.pkl", "wb")
pickle.dump(target_text, f)
f.close()

In [0]:
# Create dictionary with a list as values
worddic = defaultdict(list)

In [0]:
import time
start = time.time()
[worddic[word].append([target_text.index(doc), 
                        [index for index, w in enumerate(doc) if w == word], 
                        df_tfidf.loc[i, word]]) 
                        for i,doc in enumerate(target_text) for word in set(doc)]
end = time.time()
print(end - start)

471.6404128074646


In [0]:
## Save pickle file for worddic_all
f = open("/content/drive/My Drive/Colab Notebooks/worddic_all_24042020.pkl","wb")
pickle.dump(worddic,f)
f.close()

**Data Loading **

In [0]:
## Load pickle file sentences
pickle_in = open('/content/drive/My Drive/Colab Notebooks/sentences_24042020.pkl', 'rb')
sentences = pickle.load(pickle_in)

In [0]:
## Load pickle file idx2word
pickle_in = open('/content/drive/My Drive/Colab Notebooks/idx2word_24042020.pkl', 'rb')
idx2word = pickle.load(pickle_in)

In [0]:
## Load pickle file word2idx
pickle_in = open('/content/drive/My Drive/Colab Notebooks/word2idx_24042020.pkl', 'rb')
word2idx = pickle.load(pickle_in)

In [0]:
## Load pickle file worddic (numeric version)
pickle_in = open('/content/drive/My Drive/Colab Notebooks/worddic_all_24042020.pkl', 'rb')
worddic = pickle.load(pickle_in)

In [0]:
## data set
pickle_in = open('/content/drive/My Drive/Colab Notebooks/df_24042020.pkl', 'rb')
df = pickle.load(pickle_in)

In [0]:
df.head()

Unnamed: 0,doc_id,doi,publish_time,authors,url,title,text_body
0,43d2d4072d40a3964ea330342b9222376d700531,10.3201/eid2509.181937,2019-09-10,"Jing, Shuping; Zhang, Jing; Cao, Mengchan; Liu...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Lassa fever in travelers from West Africa,We identified a case of fatal acute respirator...
1,cf692fd14d468eaca2c460132802463f0747411e,10.3201/eid2503.171702,2019-03-10,"Farag, Elmoubasher Abu Baker; Nour, Mohamed; E...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Survey on Implementation of One Health Approac...,H uman infections with Middle East respiratory...
2,4f2102dfde801cb0cab691992bbc61cbd98b1ca5,10.3201/eid2502.180798,2019-02-10,"Rampling, Tommy; Page, Mark; Horby, Peter",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,,Although several hundred centers contribute to...
3,45e91042ac20319b849f63a7dd00dda7d11de638,10.3201/eid2601.ac2601,2020-01-10,"Breedlove, Byron",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Hunters Searching among Starry Nights and at t...,Director of Astrobiology at Columbia Universit...
4,d84f1c9036052ca0d9f8216cac4c5118ad8cea19,10.3201/eid2512.191002,2019-12-10,"Stoian, Ana M.M.; Zimmerman, Jeff; Ji, Ju; Hef...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,,"However, half-life calculations were based on ..."


In [0]:
# Create word search which takes multiple words (search sentence) 
# and finds documents that contain these words along with metrics for ranking:

# Output: 
# searchsentence, words, fullcount_order, combocount_order, fullidf_order, fdic_order
# (1) searchsentence: original sentence to be searched
# (2) words: words of the search sentence that are found in the dictionary (worddic)
# (3) fullcount_order: number of occurences of search words
# (4) combocount_order: percentage of search terms
# (5) fullidf_order: sum of TD-IDF scores for search words (in ascending order)
# (6) fdic_order: exact match bonus: word ordering score

# >>> example on limited dataset (first three docs of biorxiv))
# search('Full-genome phylogenetic analysis')
# (1) ('full-genome phylogenetic analysis',  
# searchsentence: original search sentence
# (2) ['phylogenetic', 'analysis'], 
# words: two of the search words are in the dictionary worddic
# (3) [(1, 7), (0, 1)], 
# fullcount_order: the search words (as found in dict) 
# occur in total 7 times in doc 1 and 1 time in doc 0
# (4) [(1, 1.0), (0, 0.5)], 
# combocount_order: max value is 1, 
# in doc 1 all searchwords (as in dict) are present (1), 
# in doc 0 only 1 of the 2 search words are present (0.5)
# (5) [(1, 0.0025220519886750533), (0, 0.0005167452472220973)], 
# fullidf_order: 
# doc 1 has a total (sum) tf-idf of 0.0025220519886750533, 
# doc 0 a total tf-idf of 0.0005167452472220973
# (6) [(1, 1)]) 
# fdic_order: doc 1 has once two search words next to each other
# <<<

def search(searchsentence, must_have_word=None):
    # split sentence into individual words
    searchsentence = searchsentence.lower()
    # split sentence in words
    words = word_tokenize(searchsentence)
    # remove duplicates in search words
    words = list(set(words))

    # lemmatize search words and must_have_word
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    if must_have_word != None:
        must_have_word = lemmatizer.lemmatize(must_have_word)

    # add must_have_word (first position) to search words 
    # if not yet in search words and if in dictionary:
    if must_have_word != None and (must_have_word not in words)\
            and (must_have_word in word2idx):     
            words.insert(0, must_have_word)

    # keep characters as in worddic
    words = [re.sub(r'[^a-zA-Z]', ' ', str(w)) for w in words]
    
    # lemmatize search words
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # remove words if not in worddic 
    # keep only the words that are in the dictionary
    words = [word for word in words if word in word2idx.keys()]
    numwords = len(words)
    
    # word2idx all words
    words = [word2idx[word] for word in words]

    # Subset of worddic with only search words
    worddic_sub = {key: worddic[key] for key in words}
    
    # Subset of worddic with only search words
    worddic_sub = {key: worddic[key] for key in words}
    
    # temp dictionaries
    enddic = {}
    idfdic = {}
    closedic = {}


    ## metrics fullcount_order and fullidf_order: 
    # sum of number of occurences of all words in each doc (fullcount_order) 
    # and sum of TF-IDF score (fullidf_order)
    for word in words:
        # print(word)
        for indpos in worddic[word]:
            # print(indpos)
            index = indpos[0]
            amount = len(indpos[1])
            idfscore = indpos[2]
            # check if the index is already in the dictionary: add values to the keys
            if index in enddic.keys():
                enddic[index] += amount
                idfdic[index] += idfscore
            # if not, just make a two new keys and store the values
            else:
                enddic[index] = amount
                idfdic[index] = idfscore
    fullcount_order = sorted(enddic.items(), key=lambda x: x[1], reverse=True)
    fullidf_order = sorted(idfdic.items(), key=lambda x: x[1], reverse=True)


    ## metric combocount_order: 
    # percentage of search words (as in dict) that appear in each doc
    words_docs = defaultdict(list)
    # get for each word the docs which it is in
    for k in worddic_sub.keys():
        for i in range(len(worddic_sub[k])):
            words_docs[k].append(worddic_sub[k][i][0])
    # keep onlt the unique docs per word      
    for k in words_docs:
        words_docs[k] = set(words_docs[k])
    # combination of all docs
    comboindex = []
    for k in words_docs:
        comboindex += words_docs[k]
    # count the number of each doc (from 0 to max number of search words)
    combocount = Counter(comboindex) 
    # divide by number of search words (to get in range from [0,1])
    for key in combocount:
        combocount[key] = combocount[key] / numwords
    # sort from highest to lowest
    combocount_order = sorted(combocount.items(), key=lambda x: x[1], reverse=True)


    ## metric closedic: 
    # check on words appearing in same order as in search
    fdic_order = 0 # initialization (in case of a single search word)
    if len(words) > 1:
        # list with docs with a search word        
        x = [index[0] for record in [worddic[z] for z in words] for index in record]
        # list with docs with more than one search word
        # y = sorted(list(set([i for i in x if x.count(i) > 1])))
        counts = np.bincount(x)
        y = list(np.where([counts>1])[1])

        # dictionary of documents and all positions 
        # (for docs with more than one search word in it)
        closedic = {}
        y = set(y) # speed up processing
        for wordbig in [worddic[x] for x in words]:
            for record in wordbig:
                if record[0] in y:
                    index = record[0]
                    positions = record[1]
                    try:
                        closedic[index].append(positions)
                    except:
                        closedic[index] = []
                        closedic[index].append(positions)
    
        ## metric fdic: 
        # number of times search words appear in a doc in descending order
        fdic = {}
        # fdic_order = []
        for index in y: # list with docs with more than one search word
            x = 0 
            csum = []            
            for seqlist in closedic[index]:
                while x > 0:
                    secondlist = seqlist # second word positions
                    x = 0
                    # first and second word next to each other (in same order)
                    sol = [1 for i in firstlist if i + 1 in secondlist]
                    csum.append(sol)
                    fsum = [item for sublist in csum for item in sublist] 
                    fsum = sum(fsum) 
                    fdic[index] = fsum

                while x == 0:
                    firstlist = seqlist # first word positions 
                    x += 1 
        fdic_order = sorted(fdic.items(), key=lambda x: x[1], reverse=True)
    
    ## keep only docs that contains all must_search_words
    if must_have_word != None and numwords > 1:
        # check if word is in dictionary
        if must_have_word.lower() not in word2idx:
            print("\nMust-have-word not found in dictionary")
        else:
            # lower case must_have_word
            must_have_word = word2idx[must_have_word.lower()] 
            # get list of all docs with a must have word 
            must_have_docs = set([doc[0] for doc in worddic_sub[must_have_word]])
            # update the score metrics containing only docs with the must have search word
            fullcount_order = [list_of_list for list_of_list in fullcount_order\
                               if list_of_list[0] in must_have_docs]
            combocount_order = [list_of_list for list_of_list in combocount_order\
                                if list_of_list[0] in must_have_docs]    
            fullidf_order = [list_of_list for list_of_list in fullidf_order\
                             if list_of_list[0] in must_have_docs]
            fdic_order = [list_of_list for list_of_list in fdic_order\
                          if list_of_list[0] in must_have_docs]
    
    
    ## idx2word all words (transform words again in characters instead of numbers)
    words = [idx2word[word] for word in words]

    return (searchsentence, words, fullcount_order, combocount_order, fullidf_order, fdic_order)

In [0]:

def rank(term, must_have_word=None):

    # get results from search
    results = search(term, must_have_word)
    # get metrics
    # search words found in dictionary:
    search_words = results[1] 
    # number of search words found in dictionary:
    num_search_words = len(results[1]) 
    # number of search words (as in dict) in each doc (in descending order):
    num_score = results[2] 
    # percentage of search words (as in dict) in each doc (in descending order):
    per_score = results[3]
    # sum of tfidf of search words in each doc (in ascending order):
    tfscore = results[4] 
    # fidc order:
    order_score = results[5] 

    # list of documents in order of relevance
    final_candidates = []


    ## no search term(s) not found
    if num_search_words == 0:
        print('Search term(s) not found')


    ## single term searched (as in dict): return the following 5 scores
    if num_search_words == 1:
        # document numbers:
        num_score_list = [l[0] for l in num_score] 
        # take max 3 documents from num_score:
        num_score_list = num_score_list[:3] 
        # add the best percentage score:
        num_score_list.append(per_score[0][0]) 
        # add the best tfidf score
        num_score_list.append(tfscore[0][0]) 
        # remove duplicate document numbers
        final_candidates = list(set(num_score_list)) 


    ## more than one search word (and found in dictionary)
    # ranking is based on an intelligent commbination of the scores
    if num_search_words > 1:

        ## set up a dataframe with scores of size all documents (initalized with 0)
        total_number_of_docs = len(df)
        doc_score_columns = ['num_score', 'per_score', 'tf_score', 'order_score']
        doc_score = pd.DataFrame(0, index=np.arange(total_number_of_docs), columns=doc_score_columns)
        
        # plot a score in a daraframe (index is doc number)
        def doc_plot(type_score):
            score_doc = [0]*total_number_of_docs
            for i in range(len(type_score)):
                x = type_score[i][0] # document number
                score_doc[x] = float(type_score[i][1]) # score
            return score_doc
        
        # Fill-in for each doc the score
        doc_score.num_score = doc_plot(num_score)
        doc_score.per_score = doc_plot(per_score)
        doc_score.tf_score = doc_plot(tfscore)
        doc_score.order_score = doc_plot(order_score)
        
        # Normalize (to the max)
        maximum = max(doc_score.num_score)
        doc_score.num_score = [float(i)/maximum for i in doc_score.num_score]
        
        # keep per_score (percentage of search words in document) as it is (between 0 and 1)
        # doc_score.per_score = [float(i)/sum(doc_score.per_score) for i in doc_score.per_score]
        
        # TODO check tf score keep it as it is (not normalizing)
        maximum = max(doc_score.tf_score)        
        doc_score.tf_score = [float(i)/maximum for i in doc_score.tf_score]
        
        maximum = max(doc_score.order_score)   
        doc_score.order_score = [float(i)/maximum for i in doc_score.order_score]
        
        # sum all scores to get a Grand Score
        doc_score['sum'] = (doc_score.num_score +\
                            doc_score.per_score +\
                            doc_score.tf_score +\
                            doc_score.order_score)
        # keep only the values with a sum > 0
        doc_score = doc_score[doc_score['sum'] > 0]
        
        # get the docs (i.e index) of the hightest raknkimng from hgh to low
        final_candidates = doc_score.sort_values('sum', ascending=False).index
    
        # keep 15 top candidates
        final_candidates = final_candidates[:15]        

    # print final candidates
    print('\nFound search words:', results[1])

    # top results: sentences with search words, paper ID (and document number), authors and abstract
    df_results = pd.DataFrame(columns=\
              ["doc_id","doi", "publish_time","Document_no","authors","url","title","Sentences", "Search_words"])
              # ['doc_id', 'Paper_id', 'Document_no', 'Authors', 'Abstract', 'Sentences', 'Search_words'])
    for index, results in enumerate(final_candidates):
        df_results.loc[index, 'doc_id'] = df.doc_id[results]
        df_results.loc[index, 'doi'] = df.doi[results]
        df_results.loc[index, 'publish_time'] = df.publish_time[results]
        df_results.loc[index, 'Document_no'] = results
        df_results.loc[index, 'authors'] = df.authors[results]
        df_results.loc[index, 'url'] = df.url[results]
        df_results.loc[index, 'title'] = df.title[results]

        # get sentences with search words and all search words in the specific document
        sentence_index, search_words_found = search_sentence(results, search_words)
        # all sentences with search words
        df_results.loc[index, 'Sentences'] = sentence_index
        # all search words (also multiple instances) 
        df_results.loc[index, 'Search_words'] = search_words_found
          
    return final_candidates, df_results

In [0]:
def search_sentence(doc_number, search_words):
    sentence_index = [] # all sentences with search words 
    search_words_found = [] # all found search words
    
    for sentence in sentences[doc_number]:
        # keep characters as in worddic, lowercase, split in words and lemmatize
        sentence_temp = re.sub(r'[^a-zA-Z]', ' ', sentence)
        sentence_temp = sentence_temp.lower() # lowercase
        sentence_temp = sentence_temp.split() # split in different words
        
        # all sentences with search words
        for search_word in search_words:
            if search_word in sentence_temp:
                sentence_index.append(sentence)
                break
        # all search words (also multiple instances)
        for search_word in search_words:
            if search_word in sentence_temp:
                search_words_found.append(search_word)
        # [search_words_found.append(search_word) for search_word in search_words if search_word in sentence_temp]
            
    return sentence_index, search_words_found 

In [0]:
# return this text with the highlighted words
def highlight_words(text, words, color):
    
    # color set
    color_set = {'red': '\033[31m', 'green': '\033[32m','blue': '\033[34m','reset': '\033[39m'}

    lemmatizer = WordNetLemmatizer() 
    
    # wrap words in color
    for word in words:
        # text_lower = text.lower() # lowercase words
        text_temp = [re.sub(r'[^a-zA-Z]', '', word) for word in text]
        # lemmatize
        text_temp = [lemmatizer.lemmatize(word) for word in text_temp] 
        # idxs = [i for i, x in enumerate(text) if x.lower() == word]
        idxs = [i for i, x in enumerate(text_temp) if x.lower() == word]
        for i in idxs:
            text[i] = color_set[color] + text[i] + color_set['reset']
            
    # join the list back into a string and print
    text_highlighted = ' '.join(text)
    return(text_highlighted)
         

## Main function of printing the papers in ranked order
# Select per document: 
# - top_n: number of top n papers to be displayed
# - show_sentences: display the sentences which contains search words
# - show_wordcloud: dsiplay a cloud word of these sentences
def print_ranked_papers(ranked_result, top_n=3, show_abstract=True, show_sentences=True):

    # Print top n result (with max number of documents from ranked_result)
    for index in range(min(top_n, len(ranked_result))):    
       
        ## Preparation
        # join all sentences and seperate by a return
        text_sentences = '\n'.join(ranked_result.Sentences[index])
        # spit in seperate words 
        text_sentences_split = text_sentences.split()
        # list of search words
        search_words = list(set(ranked_result.Search_words[index]))

        
        ## Print most important items per document (paper)
        # and in case of 'nan' write 'not available'
        
        # ranking number and title
        if pd.isnull(ranked_result.title[index]):
            print('\n\nRESULT {}:'. format(index+1), 'Title not available')
        else: 
                # Print Result from 1 and not 0
                print('\n\nRESULT {}:'. format(index+1), ranked_result.title[index]) 
    
        # generate cloud word
        # if len(text_sentences_split)>0:
        #   wordcloud = WordCloud()
        #   img = wordcloud.generate_from_text(' '.join(text_sentences_split))
        #   plt.imshow(img)
        #   plt.axis('off')
        #   plt.show()   

       # ["doc_id","doi", "publish_time","Document_no","authors","url","title","Sentences", "Search_words"])
        
        # count all search word in document and present them from highest to lowest  
        dict_search_words =\
            dict(Counter(ranked_result.Search_words.iloc[index]).most_common())
        print('\nI Number of search words in paper:')
        for k,v in dict_search_words.items():
            print('- {}:'.format(k), v)
            
        # paper id and document number
        print('\nII Paper ID:', ranked_result.doc_id[index], 
              '(Document no: {})'. format(ranked_result.Document_no[index]))
        
        # authors
        if pd.isnull(ranked_result.authors[index]):
            print('\nIII Authors:', 'Authors not available')
        else:
            print('\nIII Authors:', ranked_result.authors[index])
        print('\n')
        
        # abstract
        # if show_abstract == True:
        #     if pd.isnull(ranked_result.Abstract[index]):
        #         print('Abstract not available')
        #     else: 
        #         # split abstract in seperate words
        #         abstract_sentences_split = ranked_result.Abstract[index].split()
        #         # highlight the search words in red
        #         print(highlight_words(abstract_sentences_split, search_words, 'red'))
              
        ## show sentences with search words in green
        if show_sentences == True:
            print('\nIV Sentences in paper containing search words:\n')
            print(highlight_words(text_sentences_split, search_words,'green'))

In [0]:
must_have_word = 'incubation period, day'
search_example = 'What is incubation period'
papers, rank_result = rank(search_example,must_have_word)
# papers, rank_result = rank(search_example)

# Print final candidates
print('Top 10 papers (document numbers):', papers)

# Print results
print_ranked_papers(rank_result, top_n=10, show_abstract=False, show_sentences=True)


Must-have-word not found in dictionary

Found search words: ['period', 'incubation']
Top 10 papers (document numbers): Int64Index([2198,  366, 1164, 1560,  840,  545,  629, 1990, 1961,  986, 1745,
            2070, 1177, 2274, 2182],
           dtype='int64')


RESULT 1: Citation: Development of an Immunochromatographic Strip for Rapid Detection of Canine Adenovirus

I Number of search words in paper:

II Paper ID: 1d105041da2d3a6e6e16cd0dce1f0a672066d407 (Document no: 2198)

III Authors: Wang, Shujie; Wen, Yongjun; An, Tongqing; Duan, Guixin; Sun, MingXia; Ge, Jinying; Li, Xi; Yang, Kongbin; Cai, Xuehui



IV Sentences in paper containing search words:




RESULT 2: Mucin 4 Protects Female Mice from Coronavirus Pathogenesis

I Number of search words in paper:

II Paper ID: 70c7260fc0702bd4a554421ac68972c991313e56 (Document no: 366)

III Authors: Jessica A. Plante; Kenneth S. Plante; Lisa E. Gralinski; Anne Beall; Martin T. Ferris; Daniel Bottomly; Richard Green; Shannon K. McWeeney; 

In [0]:
df.head(5)

Unnamed: 0,doc_id,doi,publish_time,authors,url,title,text_body
0,43d2d4072d40a3964ea330342b9222376d700531,10.3201/eid2509.181937,2019-09-10,"Jing, Shuping; Zhang, Jing; Cao, Mengchan; Liu...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Lassa fever in travelers from West Africa,We identified a case of fatal acute respirator...
1,cf692fd14d468eaca2c460132802463f0747411e,10.3201/eid2503.171702,2019-03-10,"Farag, Elmoubasher Abu Baker; Nour, Mohamed; E...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Survey on Implementation of One Health Approac...,H uman infections with Middle East respiratory...
2,4f2102dfde801cb0cab691992bbc61cbd98b1ca5,10.3201/eid2502.180798,2019-02-10,"Rampling, Tommy; Page, Mark; Horby, Peter",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,,Although several hundred centers contribute to...
3,45e91042ac20319b849f63a7dd00dda7d11de638,10.3201/eid2601.ac2601,2020-01-10,"Breedlove, Byron",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Hunters Searching among Starry Nights and at t...,Director of Astrobiology at Columbia Universit...
4,d84f1c9036052ca0d9f8216cac4c5118ad8cea19,10.3201/eid2512.191002,2019-12-10,"Stoian, Ana M.M.; Zimmerman, Jeff; Ji, Ju; Hef...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,,"However, half-life calculations were based on ..."
