### IMPORTING LIBRARIES

In [9]:
import os
import re
import numpy as np
import pandas as pd
import json
from pprint import pprint
import random
import string
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.corpora import Dictionary
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from gensim.models import LdaModel
from gensim.models.ldamulticore import LdaMulticore
import glob
from tqdm import tqdm
!pip install langdetect
from langdetect import detect
from spacy.lang.en.stop_words import STOP_WORDS
from gensim.models import TfidfModel
from gensim.similarities import MatrixSimilarity
from gensim.matutils import cossim
import pyLDAvis
import pyLDAvis.gensim_models
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)



### READING DOCUMENTS

In [19]:
base_dir = '/kaggle/input/CORD-19-research-challenge'
documents_dir=glob.glob(f'{base_dir}/document_parses/**/*.json')
# filenames = os.listdir(documents_dir)
print("Number of documents :", len(documents_dir))

Number of documents : 454197


In [20]:
class parseFiles():
    
    def __init__(self, file):
        with open(file) as f:
            doc = json.load(f)
            self.id = doc['paper_id']
            self.title = doc['metadata']['title']
            self.abstract = ""
            self.body_text = ""
            self.authors = ""
            
            for author in doc['metadata']['authors']:
                self.authors += author['first'] +" "+ author['last'] + ' ,'
            
            self.authors = self.authors[:-2]
            
            for abstr in doc['abstract']:
                self.abstract += abstr['text']+'\n'
            
            for body in doc['body_text']:
                self.body_text += body['text']+'\n'
                
    def __repr__(self):
        return f'{self.id} : {self.title} :: {self.abstract[:250]} ++++ {self.body_text[:250]}'
        

In [21]:
def read_data(documents):
    dicts = {'paper_id': [], 'title':[], 'abstract': [], 'body_text': [], 'authors':[]}
    for idx, document in enumerate(tqdm(documents)):
        content = parseFiles(document)
        dicts['paper_id'].append(content.id)
        dicts['title'].append(content.title)
        dicts['abstract'].append(content.abstract)
        dicts['body_text'].append(content.body_text)
        dicts['authors'].append(content.authors)
    dataset = pd.DataFrame(dicts, columns=['paper_id', 'title', 'abstract', 'body_text', 'authors'])
    return dataset        

In [22]:
df_data = read_data(documents_dir[:15000])
df_data.head()

100%|██████████| 15000/15000 [02:01<00:00, 123.33it/s]


Unnamed: 0,paper_id,title,abstract,body_text,authors
0,8187ea360c53a56ca2c579d758a5d6aa67716836,,,Research Letter to the Editor:\nWe are writing...,
1,a0d063dca746b135afe0451ce0b3bb1e06cf15ae,Ethnic and regional variations in hospital mor...,Background Brazil ranks second worldwide in to...,The COVID-19 pandemic has created an unprecede...,"Núcleo De Astrofísica E Cosmologia ,P Baqui ,V..."
2,edb294108440787c9f074483fd3c953a83e53622,Corona! Die Krise der Verschlankung und ihre F...,,Die Corona-Pandemie ist eine Gefahr für die Ge...,"Z Arb , Wiss ,Irene Raehlmann"
3,ee5af71875f2e77135974c75980ce22fff03e4f8,What Is Critical About the Crisis of Expertise...,,"Particularly in these pandemic times, appeals ...",Riccardo Chesta
4,a0bc6bc5b8547b98a2d77b81ca81cb18fa1b7ee9,Journal Pre-proofs Letter to the Editor Retina...,,"To the editor, We read with great interest the...","Noemi Guemes-Villahoz ,Barbara Burgos-Blasco ,..."


### PRE-PROCESSING DOCUMENTS

In [23]:
def clean_dataset(df_data):
    
    df_data['clean_body_text'] = df_data['body_text'].copy()
    
    ## remove content in square brackets
    df_data['clean_body_text'] = df_data['clean_body_text'].apply(lambda x : re.sub('\[.*?\]', '', x))
    print("removed all the contents between []")
    
    ## remove content in rond brackets
    df_data['clean_body_text'] = df_data['clean_body_text'].apply(lambda x : re.sub('\(.*?\)', '', x))
    print("removed all the cotents between ()")
    
    ## remove all whitespaces
    df_data['clean_body_text'] = df_data['clean_body_text'].apply(lambda x : re.sub('\s+', ' ', x))
    print("removed all whitespaces")
    
    ## remove all puntuations
    df_data['clean_body_text'] = df_data['clean_body_text'].apply(lambda x : re.sub(f"[{string.punctuation}]", "", x))
    print("removed all punctuations")
    
    ## remove all numbers
    df_data['clean_body_text'] = df_data['clean_body_text'].apply(lambda x : re.sub("\d+", "", x))
    print("removed all numbers")
    
    ## convert to lower case
    df_data['clean_body_text'] = df_data['clean_body_text'].apply(lambda x : x.lower())
    print("converted to lower case")
    
    df_data.drop_duplicates(['abstract', 'clean_body_text'], inplace=True)
    df_data.dropna(inplace=True)
    print("duplicates and rows with NA values removed")
    
    return df_data

In [24]:
def filter_english_documents(df_data, show_non_english = False):
    
    non_english_docs_idxs = []
    
    for i in tqdm(range(df_data.shape[0])):
        
        try:
            if not detect(df_data.iloc[i]['clean_body_text']) == 'en':
                non_english_docs_idxs.append(i)
        except:
            non_english_docs_idxs.append(i)
            
    if show_non_english:
        print("NON ENGLISH DOCUMENTS................. \n\n")
        for i in non_english_docs_idxs[:5]:
            print (df_data.iloc[i]['clean_body_text'][:250]+"......")
            print("="*100) 
            
            
    df_data.drop(df_data.index[non_english_docs_idxs], inplace = True)
    return df_data

In [25]:
def remove_small_papers(df_data, show_small_papers=False):
    
    small_paper_idxs = []
    for i in tqdm(range(df_data.shape[0])):
        if len(df_data.iloc[i]['clean_body_text']) < 500:
                small_paper_idxs.append(i)
                
    if show_small_papers:
        print("SMALL PAPERS..................\n\n")
        count = 0
        
        for i in small_paper_idxs:
            
            if(len(df_data.iloc[i]['clean_body_text']) < 5):
                continue
                
            count += 1
            print (df_data.iloc[i]['clean_body_text'][:250]+"......")
            print("="*100)
            
            if count == 5:
                break
            
    df_data.drop(df_data.index[small_paper_idxs], inplace=True)
    return df_data

In [26]:
df_dataset = df_data.copy()

In [27]:
df_dataset_clean = clean_dataset(df_dataset)

removed all the contents between []
removed all the cotents between ()
removed all whitespaces
removed all punctuations
removed all numbers
converted to lower case
duplicates and rows with NA values removed


In [28]:
df_dataset_without_small_papers = remove_small_papers(df_dataset_clean, True)

100%|██████████| 14989/14989 [00:01<00:00, 9436.40it/s] 

SMALL PAPERS..................


age  ......
the difficulty in obtaining meaningful alignments among the alphavirus utrs  suggests that they evolve quickly probably due to different evolutionary pressures in ......
 compared to baseline if the ram variable was significantly responsive at that time point  a  is shown otherwise a  is shown for the other variables the median value at each time point was compared to baseline medians if the variable had exhibited a ......
mit der inputoutputrechnung ......
the authors report no conflict of interest on march   the new york times reported that a manhattan woman was the subjects were diagnosed with sarscov on admission during the hospital stay  or postpartum the irb determined that this study did not meet......





In [29]:
df_dataset_eng = filter_english_documents(df_dataset_without_small_papers, True)

100%|██████████| 14895/14895 [11:13<00:00, 22.11it/s]

NON ENGLISH DOCUMENTS................. 


die coronapandemie ist eine gefahr für die gesundheit und das leben der menschheit deren problematische folgen sich in allen gesellschaftlichen bereichen und im alltag niederschlagen die rasante weltweite ausbreitung des coronavirus ist allem anschei......
patiënt a was een volledig tegen bof gevaccineerde jarige vrouw zij bezocht haar huisarts met een sinds een dag bestaande zwelling van de linkerzijde van haar gezicht  ze had verhoging maar voelde zich niet ziek de huisarts voelde een opgezette gland......
− wg a the visual inspection of stator windings and cores of large turbogenerators ein erster fragebogen wurde verteilt f claassens steht als convener nicht mehr zur verfügung die wg sucht einen neuen convener um die arbeit fortsetzen zu können der a......
hintergrund die behandlung von patienten mit allergien und atopieassoziierten erkrankungen wurde seit beginn der covidpandemie vor große herausforderungen gestellt empfehlungen zum social




In [30]:
df_data_final = df_dataset_eng.copy()

In [31]:
df_data_final.head()

Unnamed: 0,paper_id,title,abstract,body_text,authors,clean_body_text
0,8187ea360c53a56ca2c579d758a5d6aa67716836,,,Research Letter to the Editor:\nWe are writing...,,research letter to the editor we are writing t...
1,a0d063dca746b135afe0451ce0b3bb1e06cf15ae,Ethnic and regional variations in hospital mor...,Background Brazil ranks second worldwide in to...,The COVID-19 pandemic has created an unprecede...,"Núcleo De Astrofísica E Cosmologia ,P Baqui ,V...",the covid pandemic has created an unprecedente...
3,ee5af71875f2e77135974c75980ce22fff03e4f8,What Is Critical About the Crisis of Expertise...,,"Particularly in these pandemic times, appeals ...",Riccardo Chesta,particularly in these pandemic times appeals t...
4,a0bc6bc5b8547b98a2d77b81ca81cb18fa1b7ee9,Journal Pre-proofs Letter to the Editor Retina...,,"To the editor, We read with great interest the...","Noemi Guemes-Villahoz ,Barbara Burgos-Blasco ,...",to the editor we read with great interest the ...
5,af289740ecabb11a55c08f7a4b99ca2c8c9e746e,Clinical Medicine Review Multi-Organ Involveme...,,Coronavirus disease 2019 (COVID-19) is a novel...,"Vikram Thakur ,Radha Ratho ,Pradeep Kumar ,Sha...",coronavirus disease is a novel emerging human...


In [32]:
customize_stop_words = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure', 
    'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'elsevier', 'pmc', 'czi',
    '-pron-'
]
final_stop_words = set(stopwords.words('english')).union(STOP_WORDS).union(set(customize_stop_words))

In [33]:
def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    #remove stop words
    tokens = list(filter(lambda x: x not in final_stop_words, tokens))
    #lemmatize the words so that other forms of same word becomes a single word
    tokens = list(map(lambda x: nltk.WordNetLemmatizer().lemmatize(x), tokens))
    # remove tokens with length < 2
    tokens = list(filter(lambda x: len(x) > 2, tokens))
    return ' '.join(list(tokens))

In [34]:
df_data_final["tokenized_text"] = df_data_final["clean_body_text"].apply(tokenize_text)

In [35]:
df_data_final["tokens"] = df_data_final["tokenized_text"].apply(lambda x: x.split())

In [36]:
df_data_final.head()

Unnamed: 0,paper_id,title,abstract,body_text,authors,clean_body_text,tokenized_text,tokens
0,8187ea360c53a56ca2c579d758a5d6aa67716836,,,Research Letter to the Editor:\nWe are writing...,,research letter to the editor we are writing t...,research letter editor writing attention profo...,"[research, letter, editor, writing, attention,..."
1,a0d063dca746b135afe0451ce0b3bb1e06cf15ae,Ethnic and regional variations in hospital mor...,Background Brazil ranks second worldwide in to...,The COVID-19 pandemic has created an unprecede...,"Núcleo De Astrofísica E Cosmologia ,P Baqui ,V...",the covid pandemic has created an unprecedente...,covid pandemic created unprecedented worldwide...,"[covid, pandemic, created, unprecedented, worl..."
3,ee5af71875f2e77135974c75980ce22fff03e4f8,What Is Critical About the Crisis of Expertise...,,"Particularly in these pandemic times, appeals ...",Riccardo Chesta,particularly in these pandemic times appeals t...,particularly pandemic time appeal state crisis...,"[particularly, pandemic, time, appeal, state, ..."
4,a0bc6bc5b8547b98a2d77b81ca81cb18fa1b7ee9,Journal Pre-proofs Letter to the Editor Retina...,,"To the editor, We read with great interest the...","Noemi Guemes-Villahoz ,Barbara Burgos-Blasco ,...",to the editor we read with great interest the ...,editor read great interest correspondence raon...,"[editor, read, great, interest, correspondence..."
5,af289740ecabb11a55c08f7a4b99ca2c8c9e746e,Clinical Medicine Review Multi-Organ Involveme...,,Coronavirus disease 2019 (COVID-19) is a novel...,"Vikram Thakur ,Radha Ratho ,Pradeep Kumar ,Sha...",coronavirus disease is a novel emerging human...,coronavirus disease novel emerging human infec...,"[coronavirus, disease, novel, emerging, human,..."


In [37]:
df_data_final.to_csv(r"/kaggle/output")

### PRE-PROCESSING QUERY

In [38]:
Query = "What do we know about COVID19 risk factors?  What have we learned from epidemiological"+\
" studies? Data on potential risks factors Smoking, pre-existing pulmonary disease Co-infections"+\
"(determine whether co-existing respiratory/viral infections make the virus more transmissible" +\
" or virulent) and other co-morbidities Neonates and pregnant women, Socio-economic and behavioral"+\
" factors to understand the economic impact of the virus and whether there were differences."+\
" Transmission dynamics of the virus, including the basic reproductive number, incubation period,"+\
" serial interval, modes of transmission and environmental factors. Severity of disease, including"+\
" risk of fatality among symptomatic hospitalized patients, and high-risk patient groups."+\
" Susceptibility of populations Public health mitigation measures that could be effective for control."

In [39]:
def clean_query(query):
    
    query = re.sub('\[.*?\]', '', query)
    query = re.sub('\(.*?\)', '', query)
    query = re.sub('\s+', ' ', query)
    query = re.sub(f"[{string.punctuation}]", "", query)
    query = query.lower()
    
    return query

In [40]:
cl_query = clean_query(Query)

In [41]:
cl_query

'what do we know about covid19 risk factors what have we learned from epidemiological studies data on potential risks factors smoking preexisting pulmonary disease coinfections and other comorbidities neonates and pregnant women socioeconomic and behavioral factors to understand the economic impact of the virus and whether there were differences transmission dynamics of the virus including the basic reproductive number incubation period serial interval modes of transmission and environmental factors severity of disease including risk of fatality among symptomatic hospitalized patients and highrisk patient groups susceptibility of populations public health mitigation measures that could be effective for control'

In [42]:
tokenize_query = tokenize_text(cl_query)

In [43]:
tokenize_query

'know covid19 risk factor learned epidemiological study data potential risk factor smoking preexisting pulmonary disease coinfections comorbidities neonate pregnant woman socioeconomic behavioral factor understand economic impact virus difference transmission dynamic virus including basic reproductive number incubation period serial interval mode transmission environmental factor severity disease including risk fatality symptomatic hospitalized patient highrisk patient group susceptibility population public health mitigation measure effective control'

In [44]:
query_tokens = tokenize_query.split()

### APPLYING TF-IDF

In [46]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(df_data_final["tokens"])

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [47]:
print(len(dictionary))

24728


In [48]:
corpus = df_data_final["tokens"].apply(lambda x: dictionary.doc2bow(x))

In [49]:
corpus

0        [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...
1        [(0, 1), (1, 11), (7, 6), (18, 2), (20, 1), (2...
3        [(1, 2), (9, 1), (10, 3), (15, 1), (18, 1), (2...
4        [(65, 1), (183, 1), (260, 1), (329, 1), (417, ...
5        [(0, 1), (24, 1), (26, 2), (30, 1), (37, 5), (...
                               ...                        
14994    [(4, 1), (7, 1), (9, 1), (11, 3), (24, 1), (26...
14995    [(7, 3), (37, 2), (63, 3), (71, 1), (75, 1), (...
14996    [(7, 4), (18, 1), (26, 5), (27, 2), (30, 1), (...
14997    [(0, 1), (4, 1), (7, 1), (9, 2), (11, 2), (22,...
14999    [(9, 1), (24, 1), (30, 1), (37, 4), (44, 1), (...
Name: tokens, Length: 14507, dtype: object

In [107]:
model = TfidfModel(dictionary=dictionary, corpus=[corpus]) 

In [108]:
tfidf_docs = [model[x] for x in corpus]

In [109]:
tfidf_docs[0][:10]

[(0, 0.018693994385859763),
 (1, 0.016478567567750593),
 (2, 0.029307710557728846),
 (3, 0.04231082835933714),
 (4, 0.02829781370526277),
 (5, 0.03544943757138879),
 (6, 0.05034792618204534),
 (7, 0.0482346250560622),
 (8, 0.059504644420199496),
 (9, 0.017773840590671858)]

In [110]:
query_bow = dictionary.doc2bow(query_tokens)

In [111]:
tfidf_query = model[query_bow]

In [112]:
sim_matrix = MatrixSimilarity(tfidf_docs, num_features=len(dictionary))
cosine_similarities = sim_matrix[tfidf_query]

In [113]:
res = sorted(range(len(cosine_similarities)), key = lambda sub: cosine_similarities[sub])[-5:]

In [114]:
for idx in reversed(res):
    print("TITLE: " + df_data_final["title"].iloc[idx]+'\n')
    print("BODY: " + df_data_final["body_text"].iloc[idx][:500]+'\n')

TITLE: Prevalence, clinical features, and outcomes of SARS-CoV-2 infection in pregnant women with or without mild/moderate symptoms: Results from universal screening in a tertiary care center in Mexico City, Mexico

BODY: Coronavirus Disease 2019 , caused by the novel severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), has now reached more than 87,736,000 confirmed cases and 1,892,256 deaths. As of April 2020, Mexico has the fourth-highest number of COVID-19 associated deaths worldwide (https://coronavirus.jhu.edu/map), and at the time of this study also one of the highest positivity rates; more than 40% at the national level and 32.7% for Mexico City. National epidemiological reports have shown an elev

TITLE: World Journal of Clinical Cases Healthy neonate born to a SARS-CoV-2 infected woman: A case report and review of literature Informed consent statement: Conflict-of-interest statement: The

BODY: Since the first reports of pneumonia cases caused by a new coronavirus wer

### LDA Model

In [115]:
bow_lda = LdaMulticore(corpus, id2word=dictionary, num_topics=10, workers = 4, chunksize=200, passes=10, random_state = 100)

In [116]:
bow_lda.print_topics()[:10]

[(0,
  '0.007*"network" + 0.005*"set" + 0.004*"image" + 0.004*"function" + 0.004*"approach" + 0.004*"feature" + 0.004*"performance" + 0.004*"example" + 0.003*"state" + 0.003*"algorithm"'),
 (1,
  '0.008*"animal" + 0.007*"specie" + 0.006*"concentration" + 0.005*"dog" + 0.005*"activity" + 0.004*"cat" + 0.004*"water" + 0.004*"bacterial" + 0.004*"surface" + 0.004*"compound"'),
 (2,
  '0.011*"air" + 0.011*"mask" + 0.007*"particle" + 0.006*"droplet" + 0.005*"temperature" + 0.005*"room" + 0.005*"area" + 0.005*"procedure" + 0.005*"transmission" + 0.005*"water"'),
 (3,
  '0.019*"vaccine" + 0.018*"sarscov" + 0.009*"influenza" + 0.009*"transmission" + 0.009*"outbreak" + 0.009*"infected" + 0.009*"viral" + 0.008*"antibody" + 0.008*"symptom" + 0.007*"testing"'),
 (4,
  '0.014*"participant" + 0.011*"social" + 0.008*"people" + 0.008*"child" + 0.007*"mental" + 0.006*"survey" + 0.006*"anxiety" + 0.006*"score" + 0.005*"age" + 0.005*"symptom"'),
 (5,
  '0.023*"protein" + 0.011*"sequence" + 0.010*"sarscov"

In [117]:
'''
Check which topic our query belongs to using the LDA BOW model.
'''
for index, score in sorted(bow_lda[query_bow], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, bow_lda.print_topic(index, 10)))


Score: 0.41806554794311523	 
Topic: 0.019*"vaccine" + 0.018*"sarscov" + 0.009*"influenza" + 0.009*"transmission" + 0.009*"outbreak" + 0.009*"infected" + 0.009*"viral" + 0.008*"antibody" + 0.008*"symptom" + 0.007*"testing"

Score: 0.24647000432014465	 
Topic: 0.011*"death" + 0.008*"display" + 0.008*"version" + 0.008*"granted" + 0.008*"authorfunder" + 0.008*"estimate" + 0.008*"period" + 0.007*"holder" + 0.007*"perpetuity" + 0.007*"country"

Score: 0.16321209073066711	 
Topic: 0.011*"air" + 0.011*"mask" + 0.007*"particle" + 0.006*"droplet" + 0.005*"temperature" + 0.005*"room" + 0.005*"area" + 0.005*"procedure" + 0.005*"transmission" + 0.005*"water"

Score: 0.12082767486572266	 
Topic: 0.014*"participant" + 0.011*"social" + 0.008*"people" + 0.008*"child" + 0.007*"mental" + 0.006*"survey" + 0.006*"anxiety" + 0.006*"score" + 0.005*"age" + 0.005*"symptom"

Score: 0.03825583681464195	 
Topic: 0.006*"therapy" + 0.006*"blood" + 0.006*"acute" + 0.005*"lung" + 0.005*"diagnosis" + 0.004*"outcome" 

In [118]:
pyLDAvis.enable_notebook()
vis_0 = pyLDAvis.gensim_models.prepare(bow_lda, corpus, dictionary)
vis_0

In [119]:
def get_similar_doc(q_bow, doc_bow, model_lda):
    
    lda_vec1 = model_lda[q_bow]
    cosine_sim = []
    i = 0
    for doc in doc_bow:
        lda_vec2 = model_lda[doc]
        similarity = cossim(lda_vec1, lda_vec2)
        cosine_sim.append(similarity)
        i= i+1
    
    res = sorted(range(len(cosine_sim)), key = lambda sub: cosine_sim[sub])[-5:]
    
    return res

In [120]:
result_bow = get_similar_doc(query_bow, corpus, bow_lda)

In [121]:
for idx in reversed(result_bow):
    print("TITLE: " + df_data_final["title"].iloc[idx]+'\n')
    print("BODY: " + df_data_final["body_text"].iloc[idx][:500]+'\n')

TITLE: Routine screening for SARS CoV-2 in unselected pregnant women at delivery

BODY: a1111111111 a1111111111 a1111111111 a1111111111 a1111111111
Coronavirus disease 2019 (COVID-19), caused by Severe Acute Respiratory Syndrome Coronavirus 2 (SARS-CoV-2), has been defined as a global public health emergency [1] . Six months after the emergence of this novel virus, South America has become the epicenter of COVID-19 pandemic.
It has been proposed that pregnant women should be considered a high-risk population, since gestation itself could be related with several pregnancy-related co

TITLE: A Randomised Controlled Trial of Face Masks and Hand Hygiene in Reducing Influenza Transmission in Households

BODY: A randomised controlled trial of face masks and hand hygiene in reducing influenza transmission in households.
Relevant definitions: Index case: the first subject to be infected with influenza in a household. Household contact: any person living in the same household as the index case.

In [122]:
tfidf_lda = LdaMulticore(tfidf_docs, id2word=dictionary, num_topics=5, workers = 4, chunksize=200, passes=10, random_state = 100)

In [123]:
tfidf_lda.print_topics()[:5]

[(0,
  '0.002*"participant" + 0.002*"student" + 0.002*"social" + 0.001*"authorfunder" + 0.001*"care" + 0.001*"perpetuity" + 0.001*"country" + 0.001*"granted" + 0.001*"mental" + 0.001*"healthcare"'),
 (1,
  '0.001*"und" + 0.000*"der" + 0.000*"ist" + 0.000*"mit" + 0.000*"für" + 0.000*"lcs" + 0.000*"den" + 0.000*"bei" + 0.000*"von" + 0.000*"das"'),
 (2,
  '0.001*"usc" + 0.000*"zikv" + 0.000*"rebounded" + 0.000*"demonstrable" + 0.000*"conspicuously" + 0.000*"domainspecific" + 0.000*"love" + 0.000*"revisiting" + 0.000*"underused" + 0.000*"comprehensiveness"'),
 (3,
  '0.004*"cell" + 0.003*"sarscov" + 0.003*"protein" + 0.002*"antibody" + 0.002*"viral" + 0.002*"gene" + 0.002*"ace" + 0.002*"lung" + 0.002*"mouse" + 0.002*"rna"'),
 (4,
  '0.000*"grab" + 0.000*"ptc" + 0.000*"utp" + 0.000*"agc" + 0.000*"acg" + 0.000*"sevenpoint" + 0.000*"ttp" + 0.000*"interactivity" + 0.000*"rtpcrbased" + 0.000*"reiterates"')]

In [124]:
'''
Check which topic our query belongs to using the LDA TF-IDF model.
'''
for index, score in sorted(tfidf_lda[tfidf_query], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, tfidf_lda.print_topic(index, 10)))


Score: 0.6952779293060303	 
Topic: 0.002*"participant" + 0.002*"student" + 0.002*"social" + 0.001*"authorfunder" + 0.001*"care" + 0.001*"perpetuity" + 0.001*"country" + 0.001*"granted" + 0.001*"mental" + 0.001*"healthcare"

Score: 0.2136681228876114	 
Topic: 0.004*"cell" + 0.003*"sarscov" + 0.003*"protein" + 0.002*"antibody" + 0.002*"viral" + 0.002*"gene" + 0.002*"ace" + 0.002*"lung" + 0.002*"mouse" + 0.002*"rna"

Score: 0.030351314693689346	 
Topic: 0.000*"grab" + 0.000*"ptc" + 0.000*"utp" + 0.000*"agc" + 0.000*"acg" + 0.000*"sevenpoint" + 0.000*"ttp" + 0.000*"interactivity" + 0.000*"rtpcrbased" + 0.000*"reiterates"

Score: 0.0303513091057539	 
Topic: 0.001*"usc" + 0.000*"zikv" + 0.000*"rebounded" + 0.000*"demonstrable" + 0.000*"conspicuously" + 0.000*"domainspecific" + 0.000*"love" + 0.000*"revisiting" + 0.000*"underused" + 0.000*"comprehensiveness"

Score: 0.0303513053804636	 
Topic: 0.001*"und" + 0.000*"der" + 0.000*"ist" + 0.000*"mit" + 0.000*"für" + 0.000*"lcs" + 0.000*"den" + 0

In [125]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(tfidf_lda, tfidf_docs, dictionary)
vis

In [126]:
result_tfidf = get_similar_doc(query_bow, tfidf_docs, tfidf_lda)

In [127]:
for idx in reversed(result_tfidf):
    print("TITLE: " + df_data_final["title"].iloc[idx]+'\n')
    print("BODY: " + df_data_final["body_text"].iloc[idx][:500]+'\n')

TITLE: Rheumatology practice amidst the COVID-19 pandemic: a pragmatic view

BODY: In the last 3 months, societies worldwide have undergone major transformations to meet the challenges posed by the coronavirus disease 2019 (COVID- 19) pandemic. 1 2 Particular strain was put on healthcare systems, which had to adapt to the dramatic rise in demand, by increasing response capacity and prioritising assistance to patients with COVID-19. This was instrumental in reducing direct mortality, alongside with the implementation of travel restriction and social isolation measures. 3 4 Howe

TITLE: Journal Pre-proofs Keynote (green) The impact of COVID-19 on the cell and gene therapies industry: disruptions, opportunities, and future prospects The impact of COVID-19 on the cell and gene therapies industry: disruptions, opportunities, and future prospects Author biographies

BODY: In this review, we first discuss how the COVID-19 pandemic has caused disruption to the overall activities of CGT develop

### K-Nearest-Neighbours

In [13]:
def get_glove_vec(glove_path):
    
    with open(glove_path, "r") as fh:
        
        glove_word_to_vec = {}
        
        for line in tqdm(fh):
            line = line.strip().split()
            glove_word_to_vec[line[0]] = np.array(line[1:], dtype = np.float64)
            
    return glove_word_to_vec

In [14]:
base_dir = "/kaggle/input/glove6b100dtxt/glove.6B.100d.txt"
glove_word_to_vec = get_glove_vec(base_dir)

In [50]:
def get_doc_embeddings():
    
    doc_embeddings = np.zeros((len(corpus), 100))
    
    i = 0
    for doc in corpus:
        emb = np.zeros(glove_word_to_vec["a"].shape[0])
        
        for idx, count in doc:
            word = dictionary[idx]
            if word in glove_word_to_vec:
                emb += count*glove_word_to_vec[word]
                
        doc_embeddings[i:] = emb
        i += 1
        
    return doc_embeddings

In [51]:
doc_embeddings = get_doc_embeddings()

In [104]:
def get_querry_embeddings():
    
    emb = np.zeros(glove_word_to_vec["a"].shape[0])
    words = []
    for idx, count in query_bow:
        word = dictionary[idx]
        if word in glove_word_to_vec:
            words.append(word)
            emb += count*glove_word_to_vec[word]
    return [emb, words]

In [105]:
q_emb, words = get_querry_embeddings()

In [106]:
words

['basic',
 'comorbidities',
 'effective',
 'epidemiological',
 'impact',
 'period',
 'preexisting',
 'public',
 'socioeconomic',
 'economic',
 'interval',
 'measure',
 'severity',
 'susceptibility',
 'transmission',
 'understand',
 'woman',
 'dynamic',
 'fatality',
 'hospitalized',
 'pulmonary',
 'smoking',
 'symptomatic',
 'know',
 'mode',
 'incubation',
 'mitigation',
 'serial',
 'environmental',
 'behavioral',
 'learned',
 'pregnant',
 'neonate',
 'reproductive']

In [100]:
class KNN:
    
    def __init__(self, doc_emb, k):
        self.doc_embeddings = doc_emb
        self.k = k
    
    def get_nearest_neigh(self, query_emb):
        
        diff = self.doc_embeddings - query_emb
        dist = np.linalg.norm(diff, axis=1)
        numb_of_doc = self.doc_embeddings.shape[0]
        nearest_k = sorted(range(numb_of_doc), key = lambda x: dist[x])[-self.k:]
        return nearest_k
    

In [101]:
knn = KNN(doc_embeddings, 5)
nearest_neigh = knn.get_nearest_neigh(q_emb)
nearest_neigh

[560, 9055, 2859, 9206, 10978]

In [102]:
for idx in reversed(nearest_neigh):
    print("TITLE: " + df_data_final["title"].iloc[idx]+'\n')
    print("BODY: " + df_data_final["body_text"].iloc[idx][:500]+'\n')

TITLE: CHAP TER 1 Alimentary System

BODY: Examination of the oral cavity should be standard procedure during any postmortem examination. To obtain a clear view of the mucous membranes of the buccal and oral cavities, teeth, tongue, gums, and tonsils, it is essential to split the mandibular symphysis and separate the mandibles as far as possible. A thorough examination of all structures will reveal not only local lesions, but often those that may be due to systemic disease. Lesions may be associated with congenital anomalies (genetic and

TITLE: The 45th Annual Meeting of the European Society for Blood and Marrow Transplantation: Physicians -Poster Sessionannual-meeting

BODY: Background: Allogeneic hematopoietic stem cell transplantation is routinely offered to patients with high-risk or advanced ALL in the hopes of improving outcomes. Use of truly non-myeloablative (NMA) conditioning reduces toxicity in other contexts but outcome data for ALL patients after NMA transplants is lacking