**Dataset :** https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge

In [13]:
import string
import warnings
from collections import Counter
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from IPython.display import Image
from IPython.utils import io
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm.notebook import tqdm

# Silence every warning for the rest of the session (this also hides library deprecation notices).
warnings.filterwarnings('ignore')
# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

In [27]:
# Read the dataset CSV and give the frame a fresh 0..n-1 index in the same expression.
data = pd.read_csv('/content/data.csv').reset_index(drop=True)

In [28]:
# Quick overview: the column labels first, then the (rows, columns) shape.
for caption, value in (("Columns in dataset : ", data.columns),
                       ("Shape of data : ", data.shape)):
    print(caption, value)

Columns in dataset :  Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
       'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
       'url', 's2_id'],
      dtype='object')
Shape of data :  (10000, 19)


In [29]:
# Preview the first three rows of the loaded frame.
data.head(3)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,5y88qbty,d90b44917af7303c32432725c373fb8c1e1c5a6c,PMC,Sindrome acuta da stress respiratorio (ARDS),10.1007/978-88-470-0590-7_21,PMC7122671,,no-cc,"L’esatta incidenza dell’ARDS non è nota, poich...",2007,,La respirazione artificiale,,,,document_parses/pdf_json/d90b44917af7303c32432...,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,
1,0gg9fm7x,8f2a3a40cc6a4617295b8883a2008f80d2799b4e,PMC,Reflecting on the EU: the Good and the Bad Tim...,10.1007/s42439-020-00017-y,PMC7271638,,no-cc,,2020-06-04,"Corradetti, Claudio",Jus Cogens,,,,document_parses/pdf_json/8f2a3a40cc6a4617295b8...,document_parses/pmc_json/PMC7271638.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,
2,ppqopxzo,,PMC,PCR: Protocols for diagnosis of human and anim...,10.1016/0168-1702(96)01363-9,PMC7173250,,no-cc,,1999-03-19,"Desselberger, U.",Virus Res,,,,,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,


In [30]:
# Per-column summary: non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   cord_uid          10000 non-null  object 
 1   sha               7656 non-null   object 
 2   source_x          10000 non-null  object 
 3   title             10000 non-null  object 
 4   doi               10000 non-null  object 
 5   pmcid             10000 non-null  object 
 6   pubmed_id         5840 non-null   object 
 7   license           10000 non-null  object 
 8   abstract          7000 non-null   object 
 9   publish_time      10000 non-null  object 
 10  authors           9000 non-null   object 
 11  journal           9468 non-null   object 
 12  mag_id            0 non-null      float64
 13  who_covidence_id  0 non-null      float64
 14  arxiv_id          0 non-null      float64
 15  pdf_json_files    7656 non-null   object 
 16  pmc_json_files    6934 non-null   object 

In [31]:
# Summary statistics for every column, non-numeric ones included (include='all').
data.describe(include='all')

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
count,10000,7656,10000,10000,10000,10000,5840.0,10000,7000,10000,9000,9468,0.0,0.0,0.0,7656,6934,10000,0.0
unique,10000,7656,1,9860,10000,10000,5840.0,12,6996,2976,8783,3109,,,,7656,6934,10000,
top,5y88qbty,d90b44917af7303c32432725c373fb8c1e1c5a6c,PMC,Panorama,10.1007/978-88-470-0590-7_21,PMC7122671,34601681.0,no-cc,This article is one of ten reviews selected fr...,2020-12-31,"Larsen, Reinhard",PLoS One,,,,document_parses/pdf_json/d90b44917af7303c32432...,document_parses/pmc_json/PMC7271638.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,
freq,1,1,10000,7,1,1,1.0,5431,3,62,11,224,,,,1,1,1,
mean,,,,,,,,,,,,,,,,,,,
std,,,,,,,,,,,,,,,,,,,
min,,,,,,,,,,,,,,,,,,,
25%,,,,,,,,,,,,,,,,,,,
50%,,,,,,,,,,,,,,,,,,,
75%,,,,,,,,,,,,,,,,,,,


**Approach**

For each incoming query, calculate the semantic similarity with all documents in the dataset and pick top N publications from the dataset.

Before we start building this out, we have to clean the abstracts and represent them as numeric vectors (Word2Vec word embeddings and TF-IDF features).

In [32]:
# Keep only the fields used downstream: identifier, title, abstract text and DOI.
data_column = data[['cord_uid', 'title', 'abstract', 'doi']].copy()

data_column.head(3)

Unnamed: 0,cord_uid,title,abstract,doi
0,5y88qbty,Sindrome acuta da stress respiratorio (ARDS),"L’esatta incidenza dell’ARDS non è nota, poich...",10.1007/978-88-470-0590-7_21
1,0gg9fm7x,Reflecting on the EU: the Good and the Bad Tim...,,10.1007/s42439-020-00017-y
2,ppqopxzo,PCR: Protocols for diagnosis of human and anim...,,10.1016/0168-1702(96)01363-9


In [33]:
# Collapse repeated abstracts to their first occurrence, then discard every
# row that still has a missing value in any of the kept columns.
data_column = data_column.drop_duplicates(subset=['abstract']).dropna()
data_column.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6996 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   cord_uid  6996 non-null   object
 1   title     6996 non-null   object
 2   abstract  6996 non-null   object
 3   doi       6996 non-null   object
dtypes: object(4)
memory usage: 273.3+ KB


We will focus only on publications written in English and drop everything else.

In [34]:
!pip install langdetect


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m942.1/981.5 kB[0m [31m28.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993221 sha256=a7685f89f2ab230efcabcd32a27abbbf8b72f754dfef04b38824df99ebd34507
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [36]:
from langdetect import detect, DetectorFactory

# Set the seed for reproducibility
DetectorFactory.seed = 0

# Only this many leading words of an abstract are handed to the detector.
MAX_DETECTION_WORDS = 50


def detect_language(abstract_text, max_words=MAX_DETECTION_WORDS):
    """Return the language code detected for one abstract.

    Parameters
    ----------
    abstract_text : str
        Abstract text to classify.
    max_words : int, optional
        Number of leading space-separated words passed to the detector.

    Returns
    -------
    str
        The code returned by ``langdetect.detect``, or ``"unknown"`` when
        both the primary attempt and the fallback attempt raise.
    """
    words = abstract_text.split(" ")
    try:
        # Slicing also covers abstracts shorter than ``max_words``.
        return detect(" ".join(words[:max_words]))
    except Exception:
        # Fallback: retry on the de-duplicated words of the whole abstract.
        # dict.fromkeys keeps first-occurrence order, so the detector sees the
        # same input on every run (set iteration order for strings can vary
        # between interpreter runs, which would defeat the seed above).
        unique_words = dict.fromkeys(words)
        try:
            return detect(" ".join(unique_words))
        except Exception:
            return "unknown"


# Hold labels - languages (one entry per row of data_column, in row order)
languages = [
    detect_language(abstract_text)
    for abstract_text in tqdm(data_column['abstract'], total=len(data_column))
]

  0%|          | 0/6996 [00:00<?, ?it/s]

Let's look at the number of articles for each language.

In [37]:
# Tally the articles per detected language in a single pass over the labels.
# Counter keeps keys in first-seen order; dict() gives a plain dict for printing.
languages_dict = dict(Counter(languages))

print(languages_dict)
print(len(languages))

{'it': 7, 'en': 6833, 'nl': 33, 'de': 120, 'fr': 2, 'pt': 1}
6996


In [40]:
# Attach the detected language of every abstract as a new column.
data_column['languages'] = languages

# Boolean mask marking the English rows; only those are carried over into df.
english_rows = data_column['languages'].eq('en')
df = data_column[english_rows]

# Summarise what is left after the language filter.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6833 entries, 4 to 9999
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   cord_uid   6833 non-null   object
 1   title      6833 non-null   object
 2   abstract   6833 non-null   object
 3   doi        6833 non-null   object
 4   languages  6833 non-null   object
dtypes: object(5)
memory usage: 320.3+ KB


In [41]:
# The language label was only needed for filtering; remove it from the working frame.
df = df.drop(columns='languages')
df.head(3)

Unnamed: 0,cord_uid,title,abstract,doi
4,oc65n0lr,"Evaluation of bio-aerosols type, density, and ...",Exposure to bioaerosols in the air of hospital...,10.1007/s11356-021-16733-x
5,lzw1q285,Genome Sequence of Torovirus Identified from a...,Porcine torovirus (PToV) strain PToV-NPL/2013 ...,10.1128/genomea.01291-14
7,n0tozedi,Successful implementation of parenting support...,Although emotional and behavioural problems am...,10.1371/journal.pone.0265589


Creating custom stop words

In [42]:
# Corpus-specific boilerplate tokens (publisher names, citation fragments, URL
# pieces) to filter out in addition to spaCy's stop words.
# Entries are written in lower case because text_preprocessing compares the
# lower-cased form of each word against STOP_WORDS, so a capitalised entry
# could never match. The dotted entries ('fig.', 'al.') cannot match there
# either, since punctuation is stripped first; they are kept for any consumer
# of STOP_WORDS that does not strip punctuation.
custom_stop_words = [
'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure',
'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.',
'al.', 'elsevier', 'pmc', 'czi', 'www']

# STOP_WORDS is a set: update() adds the new entries and ignores ones already present.
STOP_WORDS.update(custom_stop_words)

Cleaning Abstract

In [47]:
import re

def text_preprocessing(text):
    """Strip punctuation and stop words from ``text`` and lower-case what is left.

    Every character in ``string.punctuation`` is replaced by a space, the
    result is split on whitespace, words whose lower-case form is in
    ``STOP_WORDS`` are dropped, and the surviving lower-cased words are joined
    with single spaces.
    """
    # Punctuation becomes whitespace, so e.g. "PToV-NPL/2013" yields three separate words.
    without_punctuation = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)

    kept = []
    for token in without_punctuation.split():
        lowered = token.lower()
        if lowered not in STOP_WORDS:
            kept.append(lowered)
    return ' '.join(kept)

# Clean every abstract and store the result next to the original text.
df['preprocessed_abstract'] = df['abstract'].map(text_preprocessing)
df.head(3)

Unnamed: 0,cord_uid,title,abstract,doi,preprocessed_abstract
4,oc65n0lr,"Evaluation of bio-aerosols type, density, and ...",Exposure to bioaerosols in the air of hospital...,10.1007/s11356-021-16733-x,exposure bioaerosols air hospitals associated ...
5,lzw1q285,Genome Sequence of Torovirus Identified from a...,Porcine torovirus (PToV) strain PToV-NPL/2013 ...,10.1128/genomea.01291-14,porcine torovirus ptov strain ptov npl 2013 id...
7,n0tozedi,Successful implementation of parenting support...,Although emotional and behavioural problems am...,10.1371/journal.pone.0265589,emotional behavioural problems young children ...


To train the Word2Vec model, we convert each abstract into a list of sentences, where each sentence is a list of words.

In [49]:
# Array of the cleaned abstracts.
abstract = df['preprocessed_abstract'].values

# Use spaCy for tokenization: load the small English pipeline with the tagger
# and NER components disabled, then append a sentencizer component.
# NOTE(review): the cleaned abstracts have already had punctuation replaced by
# spaces in text_preprocessing, so a punctuation-based sentence splitter
# presumably has little to work with — confirm the sentence boundaries
# produced here are the ones intended.
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'ner'])
nlp.add_pipe('sentencizer')

def tokenize_sentence(sentence):
  """Run ``sentence`` through ``nlp`` and return one word list per detected sentence.

  The text of each detected sentence is stripped of surrounding whitespace and
  split on single spaces, giving a list of lists of strings.
  """
  return [span.text.strip().split(" ") for span in nlp(sentence).sents]

# One list of tokenized sentences per abstract, kept alongside the cleaned text.
df['tokenized_abstract'] = df['preprocessed_abstract'].map(tokenize_sentence)

# Plain Python list (one entry per abstract) and a shallow copy of it that
# serves as the starting point for the Word2Vec input.
corpus_data = df['tokenized_abstract'].tolist()
word2vec_corpus = list(corpus_data)

In [56]:
# Flatten one level (abstracts -> sentences) so every item is a list of words.
# The result is built from corpus_data rather than from word2vec_corpus itself,
# so re-running this cell cannot flatten a second time (which would turn the
# sentences into bare words).
word2vec_corpus = [sentence for abstract_sentences in corpus_data for sentence in abstract_sentences]


In [57]:
from gensim.models import Word2Vec

# Training settings: skip-gram (sg=1) with 5 negative samples, 150-dimensional
# vectors, a context window of 5, no frequency cut-off (min_count=1), 4 workers.
word2vec_params = dict(vector_size=150, window=5, min_count=1, sg=1, negative=5, workers=4)
model = Word2Vec(word2vec_corpus, **word2vec_params)

In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cleaned abstracts, one document per row of df.
text = df['preprocessed_abstract'].values

# Sub-linear TF scaling, scikit-learn's built-in English stop list, and a
# vocabulary capped at 1500 terms.
tfidf_options = {'sublinear_tf': True, 'stop_words': 'english', 'max_features': 1500}
word_vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and build the document-term matrix in one step.
WordFeatures = word_vectorizer.fit_transform(text)

In [77]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

query = "origin of corona virus"

# Clean the query with the same routine that was applied to the abstracts, so
# both sides of the comparison are normalised identically before vectorising.
query_tfidf = word_vectorizer.transform([text_preprocessing(query)])

# One similarity score per abstract, in the row order of df / WordFeatures.
cosine_similarities = cosine_similarity(query_tfidf, WordFeatures).flatten()

if query_tfidf.nnz == 0:
    # No query term is part of the fitted vocabulary: every score is 0 and any
    # "top 5" would just be arbitrary rows, so report that instead.
    print("No term of the query is in the TF-IDF vocabulary; no results to rank.")
    top_5_indices = np.array([], dtype=int)
else:
    # Positions of the five highest scores, best first.
    top_5_indices = np.argsort(cosine_similarities)[::-1][:5]

# NOTE(review): a score of 1.0000 does not necessarily mean an exact match.
# With the vocabulary capped by max_features, an abstract whose only
# in-vocabulary term equals the query's only in-vocabulary term presumably
# also scores 1.0 — confirm when interpreting the ranking.
for idx in top_5_indices:
    print(f"Abstract {idx+1}: Cosine Similarity = {cosine_similarities[idx]:.4f}")
    print(f"Title: {df['title'].iloc[idx]}\n")




Abstract 106: Cosine Similarity = 1.0000
Title: Measles Virus

Abstract 6698: Cosine Similarity = 1.0000
Title: Parvovirus

Abstract 1398: Cosine Similarity = 0.3924
Title: Increasing the number of available ranks in virus taxonomy from five to ten and adopting the Baltimore classes as taxa at the basal rank

Abstract 5133: Cosine Similarity = 0.3801
Title: Pathogenicity of Hemagglutinating Encephalomyelitis (Vomiting and Wasting Disease) Virus of Pigs, using Different Routes of Inoculation

Abstract 6121: Cosine Similarity = 0.3688
Title: Pathogenesis of Theiler's Murine Encephalomyelitis Virus

