# 1. Import the dataset and libraries

In [36]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfTransformer

import warnings
warnings.filterwarnings('ignore')

In [8]:
# Reading metadata.csv file into pandas dataframe
covid_df = pd.read_csv("metadata\metadata.csv",low_memory=False)
covid_df.head(2)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,


In [79]:
columnnames = covid_df.columns
print(columnnames)

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
       'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
       'url', 's2_id'],
      dtype='object')


In [81]:
print("shape of the dataframe : ", covid_df.shape)

shape of the dataframe :  (467521, 19)


In [455]:
covid_df.abstract[1]

'Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO• -dependent oxidative stress. Although NO• is known to have anti-microbial, anti-inflammatory and anti-oxidant properties, various lines of evidence support the contribution of NO• to lung injury in several disease models. On the basis of biochemical evidence, it is often presumed that such NO• -dependent oxidations are due to the formation of the oxidant peroxynitrite, although alternative mechanisms involving the phagocyte-derived heme proteins myeloperoxidase and eosinophil peroxidase might be operative during conditions of inflammation. Because of the overwhelming literature on NO• generation and activities in the respiratory tract, it would be beyond the scope of this commentary to review this area comprehensively. Instead, it focuses on recent evidence and concepts of the presumed contribution of NO• to inflammatory diseases of the lun

In [456]:
covid_df['authors'][0]

'Madani, Tariq A; Al-Ghamdi, Aisha A'

# 2. Selecting Documents for indexing

##  Checking for missing values

In [457]:
missing = covid_df.isnull().sum()
missing

cord_uid                 0
sha                 313313
source_x                 0
title                  229
doi                 209216
pmcid               303497
pubmed_id           241784
license                  0
abstract            129532
publish_time           219
authors              13303
journal              31440
mag_id              467521
who_covidence_id    276110
arxiv_id            461368
pdf_json_files      313313
pmc_json_files      343643
url                 189789
s2_id                41690
dtype: int64

In [458]:
print("Total number of documents with abstract : ",len(covid_df)-missing['abstract'])

Total number of documents with abstract :  337989


In [459]:
print("Out of {} docments, {} documents have titles".format(len(covid_df),(len(covid_df)-missing['title'])))
print("{} documents donot have titles".format(missing['title']))

Out of 467521 docments, 467292 documents have titles
229 documents donot have titles


In [83]:
## Selecting 1300 docs to start with
docs = covid_df.head(1300)
len(docs)

1300

In [84]:
docs.head(2)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,


In [85]:
docs.isnull().sum()

cord_uid               0
sha                   51
source_x               0
title                  0
doi                    0
pmcid                  0
pubmed_id              0
license                0
abstract              59
publish_time           0
authors               20
journal                0
mag_id              1300
who_covidence_id    1300
arxiv_id            1300
pdf_json_files        51
pmc_json_files        46
url                    0
s2_id               1300
dtype: int64

In [86]:
docs = docs.dropna(axis=1, how='all') # dropping the columns with all missing values
docs.columns

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal',
       'pdf_json_files', 'pmc_json_files', 'url'],
      dtype='object')

In [87]:
# mag_id,who_covidence_id,arxiv_id,s2_id theses columns have all missing values so those 4 columns are dropped   

In [88]:
docs_with_abstract = np.where(docs.abstract.notnull()) # selecting rows with no abstract missing
docs = docs.iloc[docs_with_abstract]

In [89]:
docs_with_sha = np.where(docs.sha.notnull()) # selecting rows with no sha missing
docs = docs.iloc[docs_with_sha]

In [90]:
docs_with_authors = np.where(docs.authors.notnull()) # selecting rows with no author names missing
docs = docs.iloc[docs_with_authors]

In [91]:
docs_with_pdf_json_files = np.where(docs.pdf_json_files.notnull()) # selecting rows with no pdf_json_files missing
docs_with_pmf_json_files = np.where(docs.pmc_json_files.notnull()) # selecting rows with no pdf_json_files missing
docs = docs.iloc[docs_with_pdf_json_files]
docs = docs.iloc[docs_with_pmf_json_files]

In [92]:
print("Number of documents without any missing values : ",len(docs))

Number of documents without any missing values :  1186


### Seleting first 1000 documents without any missing values for indexing

In [24]:
docs = docs.head(1000) # These 1000 documents are indexed using Logstash
docs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 1092
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   cord_uid        1000 non-null   object
 1   sha             1000 non-null   object
 2   source_x        1000 non-null   object
 3   title           1000 non-null   object
 4   doi             1000 non-null   object
 5   pmcid           1000 non-null   object
 6   pubmed_id       1000 non-null   object
 7   license         1000 non-null   object
 8   abstract        1000 non-null   object
 9   publish_time    1000 non-null   object
 10  authors         1000 non-null   object
 11  journal         1000 non-null   object
 12  pdf_json_files  1000 non-null   object
 13  pmc_json_files  1000 non-null   object
 14  url             1000 non-null   object
dtypes: object(15)
memory usage: 125.0+ KB


In [471]:
docs.tail(2)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,pdf_json_files,pmc_json_files,url
1091,d0bk9gu5,8437870dfb10809764da7204fd758ff3e9ee85db,PMC,Dangerous liaisons: molecular basis for a synd...,10.3389/fmicb.2013.00035,PMC3594938,23487416,cc-by,The most severe manifestations of malaria (cau...,2013-03-12,"Conant, Katelyn L.; Kaleeba, Johnan A. R.",Front Microbiol,document_parses/pdf_json/8437870dfb10809764da7...,document_parses/pmc_json/PMC3594938.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
1092,0jx6mwiw,df783d511b145a10e7f609a87392eb50799d2b2b,PMC,NOA36 Protein Contains a Highly Conserved Nucl...,10.1371/journal.pone.0059065,PMC3596294,23516598,cc-by,NOA36/ZNF330 is an evolutionarily well-preserv...,2013-03-13,"de Melo, Ivan S.; Jimenez-Nuñez, Maria D.; Igl...",PLoS One,document_parses/pdf_json/df783d511b145a10e7f60...,document_parses/pmc_json/PMC3596294.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...


In [472]:
docs.to_csv("CORD-19_1000docs.csv",index=False)

# This csv file with selected 1000 documents is indexed using elasticsearch

In [473]:
pd.read_csv("CORD_19_indexed.csv").head(3) # This is how the csv file looks like after indexing using Elasticseach

Unnamed: 0,@timestamp,@version,_id,_index,_score,_type,abstract,authors,cord_uid,doi,...,path,pdf_json_files,pmc_json_files,pmcid,publish_time,pubmed_id,sha,source_x,title,url
0,2021-03-02T00:37:43.421Z,1,4Mhe8HcBhQmyCbDQi6-I,cord-19,,_doc,The most severe manifestations of malaria (cau...,"Conant, Katelyn L.; Kaleeba, Johnan A. R.",d0bk9gu5,10.3389/fmicb.2013.00035,...,C:/Users/jhans/Information Rereieval/CORD-19_1...,document_parses/pdf_json/8437870dfb10809764da7...,document_parses/pmc_json/PMC3594938.xml.json,PMC3594938,2013-03-12T00:00:00.000Z,23487416,8437870dfb10809764da7204fd758ff3e9ee85db,PMC,Dangerous liaisons: molecular basis for a synd...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
1,2021-03-02T00:37:43.421Z,1,z8he8HcBhQmyCbDQjrLY,cord-19,,_doc,NOA36/ZNF330 is an evolutionarily well-preserv...,"de Melo, Ivan S.; Jimenez-Nuñez, Maria D.; Igl...",0jx6mwiw,10.1371/journal.pone.0059065,...,C:/Users/jhans/Information Rereieval/CORD-19_1...,document_parses/pdf_json/df783d511b145a10e7f60...,document_parses/pmc_json/PMC3596294.xml.json,PMC3596294,2013-03-13T00:00:00.000Z,23516598,df783d511b145a10e7f609a87392eb50799d2b2b,PMC,NOA36 Protein Contains a Highly Conserved Nucl...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
2,2021-03-02T00:37:43.420Z,1,V8he8HcBhQmyCbDQi7GN,cord-19,,_doc,Global climate change is expected to affect th...,"CANN, K. F.; THOMAS, D. Rh.; SALMON, R. L.; WY...",exqza1kg,10.1017/s0950268812001653,...,C:/Users/jhans/Information Rereieval/CORD-19_1...,document_parses/pdf_json/e008bb9bd16411df2029b...,document_parses/pmc_json/PMC3594835.xml.json,PMC3594835,2012-08-09T00:00:00.000Z,22877498,e008bb9bd16411df2029bfbfd2df3fef72a7e575,PMC,Extreme water-related weather events and water...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...


# 3. Data cleaning and text preprocessing

Text preprocessing is the process of preparing or cleaning text data before encoding. <br>
Encoding : Covert text data into numeric vector using techniques such as BagofWords, TF-IDF, Word2Vec etc.

Preprocessing the raw text:

- Removing all irrelevant characters (Numbers and punctuations)
- Converting all characters into lowercase
- remove tags
- Tokenization
- Removing stopwords
- Stemming and Lemmetization
- Removing the words having length <=2
- Converting the list of tokens back to the string

In [480]:
text_before_preprocess = covid_df.title[0] + covid_df.abstract[0]
print("Text before preprocessing : ","\n")
print(text_before_preprocess,'\n')
print("length of text before preprocessing : ",len(text_before_preprocess))

Text before preprocessing :  

Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi ArabiaOBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were ass

## Function for data preprocessing

In [25]:
lemma= WordNetLemmatizer() # initializing object of WordNetLemmatizer for lemmatization
stop_words = set(stopwords.words('english')) # taking all the unique English stopwords from nltk corpus

def preprocess(text):
    text = re.sub('[^a-zA-Z]',' ',text) # removing numbers and punctuations
    text = str(text).lower()            # convert all characters into lowercase
    text = word_tokenize(text)          # tokenization 
    text = [item for item in text if item not in stop_words] # removing stopwords
    text = [lemma.lemmatize(word=w,pos='v') for w in text]   # lemmatization
    text = [i for i in text if len(i) > 2]   # removing token of length <=2
    text = ' '.join(text)                    # joining the tokens with space in between to form sentence
    
    return text

docs['text'] = docs['title'] + docs['abstract'] # combining the title and abstract column into a single column called text
docs['text'] = docs['text'].apply(lambda x: preprocess(x)) # applying the 'preprocess' function on 'text' column
docs['text'][0] # visualizing the processed text

'clinical feature culture prove mycoplasma pneumoniae infections king abdulaziz university hospital jeddah saudi arabiaobjective retrospective chart review describe epidemiology clinical feature patients culture prove mycoplasma pneumoniae infections king abdulaziz university hospital jeddah saudi arabia methods patients positive pneumoniae culture respiratory specimens january december identify microbiology record chart patients review result patients identify require admission infections community acquire infection affect age group common infants pre school children occur year round common fall spring three quarter patients comorbidities twenty four isolate associate pneumonia upper respiratory tract infections bronchiolitis cough fever malaise common symptoms crepitations wheeze common sign patients pneumonia crepitations bronchial breathe immunocompromised patients likely non immunocompromised patients present pneumonia versus patients pneumonia uneventful recovery recover follow c

In [481]:
print("Text after preprocessing : ","\n")
print(docs['text'][0],'\n')
print("length of text after preprocessing : ",len(docs['text'][0]))

Text after preprocessing :  

clinical feature culture prove mycoplasma pneumoniae infections king abdulaziz university hospital jeddah saudi arabiaobjective retrospective chart review describe epidemiology clinical feature patients culture prove mycoplasma pneumoniae infections king abdulaziz university hospital jeddah saudi arabia methods patients positive pneumoniae culture respiratory specimens january december identify microbiology record chart patients review result patients identify require admission infections community acquire infection affect age group common infants pre school children occur year round common fall spring three quarter patients comorbidities twenty four isolate associate pneumonia upper respiratory tract infections bronchiolitis cough fever malaise common symptoms crepitations wheeze common sign patients pneumonia crepitations bronchial breathe immunocompromised patients likely non immunocompromised patients present pneumonia versus patients pneumonia unevent

In [495]:
docs.head(2)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,pdf_json_files,pmc_json_files,url,text
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,clinical feature culture prove mycoplasma pneu...
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,nitric oxide pro inflammatory mediator lung di...


# Building Vocabulary and selecting keywords

In [26]:
documents = docs['text'].tolist() # getting the text column and converting it into a list

In [44]:
# creating a vocabulary of words, ignoring the word that appear in 85% of documents
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_df=0.85)
token_count_vector = vectorizer.fit_transform(documents) 
# creates the vocabulary, the result is a sparse representation of count of each word

In [45]:
token_count_vector.shape
# The shape is (1000,12295) since we have 1000 documents and vocabulary size is 12295

(1000, 12295)

In [51]:
token_count_vector.toarray()[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [46]:
list(vectorizer.vocabulary_.keys())[:5] # looking at first 5 words of vocabulary

['clinical', 'feature', 'culture', 'prove', 'mycoplasma']

# Extracting keywords

In [None]:
# First compute the IDF(InverseDocumentFrequency) values. For this, we take  the sparse matrix generated from CountVectorizer 
# (token_count_vector) to calculate the IDF by invoking tfidf_transformer.fit(...)

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
    tfidf_transformer.fit(word_count_vector)

In [75]:
feature_names=vectorizer.get_feature_names() # get all the words from vocabulary

doc = docs['text'] # This is the column for which we want to generate tf-idf
results = [] # list to store the extracted keywords for each document
for d in doc:
     #Transform a count matrix to a normalized tf or tf-idf representation
    tf_idf_vector = tfidf_transformer.transform(vectorizer.transform([d]))
   
  
    #sorting the tf-idf vectors by descending order of scores by calling the function "sort_tf_idf_vector()"
    sorted_items=sort_tf_idf_vector(tf_idf_vector.tocoo())
       
    #extract only the top n; n here is top 20% words with high tf-idf values by calling the function "extract_topn_from_vector()"
    keywords=extract_topn_from_vector(feature_names,sorted_items,int(len(sorted_items)*0.2))
    print(keywords)
    results.append(' '.join(keywords))
    

['pneumoniae', 'patients', 'comorbidities', 'pneumonia', 'common', 'infections', 'die', 'king', 'jeddah', 'crepitations', 'abdulaziz', 'saudi', 'mycoplasma', 'chart', 'culture', 'immunocompromised', 'university', 'infants']
['inflammatory', 'oxidant', 'presume', 'nitric', 'oxide', 'contribution', 'lung', 'anti', 'evidence', 'tract', 'phagocyte', 'peroxynitrite', 'oxidations', 'comprehensively']
['surfactant', 'deficient', 'glycoconjugates', 'organic', 'lipids', 'microorganisms', 'surface', 'contribute', 'abnormal', 'lung', 'express', 'macrophages', 'epithelial', 'variety', 'antigens', 'phosphatidylinositol', 'oligomerized', 'mixtures', 'genitourinary', 'exocrine']
['lung', 'diseases', 'role', 'inflammatory', 'tone', 'endothelin']
['pneumovirus', 'epithelial', 'gene', 'infectionrespiratory', 'pvm', 'chemoattractant', 'incompletely', 'responses', 'subfamily', 'paramyxoviridae', 'unravel', 'rsv', 'rodents']
['trs', 'discontinuous', 'strand', 'subgenomic', 'leader', 'body', 'rna', 'transfe

['pleural', 'mesothelioma', 'malignant', 'exposure', 'asbestos', 'pain', 'occupational', 'chest', 'involvement', 'thicken', 'peritoneal', 'spontaneous', 'history', 'life', 'vaginalis', 'tunica', 'sweat', 'rind', 'presentations', 'pericardium', 'palliation', 'multidisciplinary', 'mesothelium', 'mesotheliomas', 'mesotheliomamalignant']
['stockpile', 'drug', 'antiviral', 'treatment', 'resistance', 'pandemic', 'strain', 'outbreak', 'run', 'timely', 'level', 'population', 'onset', 'sensitive', 'resistant', 'plan', 'use', 'scale', 'size', 'wide', 'strategy', 'apply']
['dog', 'rabies', 'domestic', 'populations', 'canine', 'elimination', 'global', 'transmission', 'africa', 'throughout', 'dynamics', 'rabiesrabies', 'rabid', 'historic', 'trajectories', 'tanzania', 'realistic', 'carnivore', 'dangerous', 'uniquely', 'obstacle']
['gmo', 'detection', 'amplification', 'lamp', 'isothermal', 'dna', 'transgene', 'template', 'amplify', 'use', 'background', 'sensitivity', 'apply']
['hpai', 'vaccines', 'ch

['pav', 'adhu', 'vector', 'vaccine', 'adenovirus', 'alternative', 'neutralize', 'responses', 'efficacy', 'vaccination', 'porcine', 'immune', 'day', 'promise', 'protection', 'antibodies', 'antibody']
['apcs', 'couple', 'antigens', 'liposome', 'fatty', 'liposomes', 'unsaturated', 'antigen', 'dma', 'surface', 'present', 'class', 'pinocytosis', 'cells', 'saturate', 'cytochalasin', 'acid', 'confocal']
['therapeutic', 'hbv', 'vaccines', 'woodchuck', 'nucleos', 'ide', 'analogues', 'preclinical', 'innovative', 'chronic', 'vaccination', 'approach', 'hepatitis', 'model', 'woodchucks']
['hla', 'peptides', 'supertype', 'transgenic', 'parasite', 'gla', 'elicit', 'adjuvants', 'administer', 'foundation', 'mice', 'ifn', 'toxoplasmosis', 'spa', 'sag', 'padre', 'gra', 'burden', 'immunization', 'cys', 'toxoplasma', 'pam', 'bind', 'vaccines', 'bioinformatic', 'novel', 'use', 'cell', 'responses']
['influenza', 'protective', 'vaccine', 'epitopes', 'conserve', 'ctl', 'cross', 'universal', 'contribution', 'vi

['transmembrane', 'ceacam', 'tyrosine', 'domain', 'anchorage', 'growth', 'dimerization', 'valine', 'isoforms', 'terminal', 'residues', 'glycine', 'mutations', 'leucine', 'independent', 'helix', 'functionality', 'determinant', 'mutants']
['ski', 'hcv', 'lds', 'master', 'host', 'liver', 'regulator', 'infection', 'lipid', 'act', 'lifecycle', 'cholesterol', 'indirect', 'novel', 'assembly', 'enzymes', 'inhibitor', 'spectrum', 'block', 'reduction', 'target', 'broad', 'pathogenesis', 'site', 'direct', 'antiviral', 'key', 'sterol']
['dcs', 'hmgb', 'hiv', 'dissemination', 'alarmin', 'innate', 'lymphoid', 'cells', 'initiate', 'immune', 'tissue', 'antiviral', 'synapse', 'prototypic', 'polarization', 'migrate', 'cellsdendritic', 'dependent', 'transmission']
['meningitis', 'kit', 'detection', 'seeplex', 'pathogens', 'csf', 'ace', 'bacterial', 'sample', 'acute', 'identification', 'clinical', 'analytical', 'validation', 'reference', 'copy', 'high', 'specificity', 'sensitivity']
['color', 'taxonomic',

['hcmv', 'sumoylation', 'sumo', 'processivity', 'dna', 'polymerase', 'cytomegalovirus', 'replication', 'cellular', 'subunit', 'viral', 'cycle', 'modify', 'life', 'protein', 'ubc', 'sumoylated', 'modifier', 'mannerduring', 'intranuclear', 'holoenzyme']
['fas', 'mice', 'lpr', 'fasl', 'lung', 'lps', 'pro', 'neutrophilic', 'injury', 'mechanically', 'ventilate', 'tracheal', 'complexes', 'system', 'intra', 'deficient', 'function', 'ventilation', 'mechanical', 'response']
['ifitm', 'vertebrate', 'family', 'genes', 'sub', 'evolutionary', 'aquatic', 'gene', 'selection', 'positive', 'duplication', 'divide', 'divergence', 'locus', 'primates', 'evolution', 'conservation', 'transmembrane', 'functional', 'two', 'imply', 'group']
['gabon', 'ebola', 'africa', 'research', 'equatorial', 'cirmf', 'emerge', 'center', 'international', 'epidemics', 'unit', 'program', 'since', 'reemergence']
['method', 'estimate', 'south', 'initial', 'africa', 'case', 'field', 'confirm', 'transmissibility', 'mean', 'registry

In [63]:
def sort_tf_idf_vector(coo_matrix):
    """Sorting the tf-idf vector"""
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items,topn):
    """get the feature names and tf-idf score of top 20% words"""
    
    sorted_items = sorted_items[:topn]  #collect only topn items from vector

    topnwords = []
    
    # index of word, respective tf-idf score
    for idx, score in sorted_items:
        
        #append the sorted words
        topnwords.append(feature_names[idx])
   
    return topnwords

In [42]:
results

['pneumoniae patients comorbidities pneumonia common infections die king jeddah crepitations abdulaziz saudi mycoplasma chart culture immunocompromised university infants',
 'inflammatory oxidant presume nitric oxide contribution lung anti evidence tract phagocyte peroxynitrite oxidations comprehensively',
 'surfactant deficient glycoconjugates organic lipids microorganisms surface contribute abnormal lung express macrophages epithelial variety antigens phosphatidylinositol oligomerized mixtures genitourinary exocrine',
 'lung diseases role inflammatory tone endothelin',
 'pneumovirus epithelial gene infectionrespiratory pvm chemoattractant incompletely responses subfamily paramyxoviridae unravel rsv rodents',
 'trs discontinuous strand subgenomic leader body rna transfer synthesis sequence trss minus template end derive',
 'transfuse haemoglobin improve ill critically normal level blood rbcs outcomerecent care rct',
 'symposium medicine intensive care emergency international tutorials

In [503]:
docs['keywords'] = results # Adding keywords column to the dataframe

In [504]:
docs.head(3)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,pdf_json_files,pmc_json_files,url,text,keywords
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,clinical feature culture prove mycoplasma pneu...,pneumoniae patients comorbidities pneumonia co...
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,nitric oxide pro inflammatory mediator lung di...,inflammatory oxidant presume nitric oxide cont...
2,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,PMC,Surfactant protein-D and pulmonary host defense,10.1186/rr19,PMC59549,11667972,no-cc,Surfactant protein-D (SP-D) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,document_parses/pdf_json/06ced00a5fc04215949aa...,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,surfactant protein pulmonary host defensesurfa...,surfactant deficient glycoconjugates organic l...


In [505]:
docs.drop(columns='text',inplace=True) 
# Dropping text column from dataframe as we already selected keywords and added them to datframe

In [506]:
docs.columns

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal',
       'pdf_json_files', 'pmc_json_files', 'url', 'keywords'],
      dtype='object')

In [507]:
# Converting docs dataframe to csv file for indexing
docs.to_csv("keywords_added_docs.csv",index=False)

In [None]:
# This file is indexed using Logstash 