In [23]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import re

pd.pandas.set_option('display.max_columns', None)

In [24]:
dataset = pd.read_csv("preprocessed_data.csv")

In [25]:
dataset.head()

Unnamed: 0,pmid,title,authors,citation,first_author,journal/book,publication_year,create_date,pmcid,nihms_id,doi,abstract
0,39435445,Editorial: The operationalization of cognitive...,"Winter M, Probst T, Tallon M, Schobel J, Pryss R.",Front Neurosci. 2024 Oct 7;18:1501636. doi: 10...,Winter M,Front Neurosci,2024,2024/10/22,PMC11491427,Unavailable,10.3389/fnins.2024.1501636,Unavailable
1,39398866,Characterization of arteriosclerosis based on ...,"Zhou J, Li X, Demeke D, Dinh TA, Yang Y, Janow...",J Med Imaging (Bellingham). 2024 Sep;11(5):057...,Zhou J,J Med Imaging (Bellingham),2024,2024/10/14,PMC11466048,Unavailable,10.1117/1.JMI.11.5.057501,PURPOSE: Our purpose is to develop a computer ...
2,39390053,Multi-scale input layers and dense decoder agg...,"Lan X, Jin W.",Sci Rep. 2024 Oct 10;14(1):23729. doi: 10.1038...,Lan X,Sci Rep,2024,2024/10/10,PMC11467340,Unavailable,10.1038/s41598-024-74701-0,Accurate segmentation of COVID-19 lesions from...
3,39367648,An initial game-theoretic assessment of enhanc...,"Fatemi MY, Lu Y, Diallo AB, Srinivasan G, Azhe...",Brief Bioinform. 2024 Sep 23;25(6):bbae476. do...,Fatemi MY,Brief Bioinform,2024,2024/10/05,PMC11452536,Unavailable,10.1093/bib/bbae476,The application of deep learning to spatial tr...
4,39363262,Truncated M13 phage for smart detection of E. ...,"Yuan J, Zhu H, Li S, Thierry B, Yang CT, Zhang...",J Nanobiotechnology. 2024 Oct 3;22(1):599. doi...,Yuan J,J Nanobiotechnology,2024,2024/10/04,PMC11451008,Unavailable,10.1186/s12951-024-02881-y,BACKGROUND: The urgent need for affordable and...


# Task 1

The pretrained sentence transformer "all-MiniLM-L6-v2" is used as a semantic natural language processing technique to filter out papers that do not meet the criteria of utilizing deep learning approaches in virology/epidemiology. The "all-MiniLM-L6-v2" model is lightweight because it is a smaller transformer-based model optimized for speed and efficiency, designed to work well for generating embeddings in a resource-friendly manner.

Approach used to filter the relevant papers:

1.  Generate the embeddings of keywords (used while collecting the original dataset from PubMed Central (PMC) database).
2.  Generate the embeddings of titles and abstract (combined) for each row.
3.  Compare the embeddings of papers and keywords using cosine similarity.
4.  If the similarity score is greater than the threshold, then that specific paper is deemed as relevant.

In [26]:
# Initialize the SentenceTransformer model

model = SentenceTransformer('all-MiniLM-L6-v2')

In [27]:
# Create embeddings for deep learning-related keywords

keywords = ["virology", "epidemiology", "neural network", "artificial neural network", "machine learning model", "feedforward neural network", "neural net algorithm", "multilayer perceptron", "convolutional neural network", "recurrent neural network", "long short-term memory network", "cnn", "grnn", "rnn", "lstm", "deep learning", "deep neural networks", "computer vision", "vision model", "image processing", "vision algorithms", "computer graphics and vision", "object recognition", "scene understanding", "natural language processing", "text mining", "nlp", "computational linguistics", "language processing", "text analytics", "textual data analysis", "text data analysis", "text analysis", "speech and language technology", "language modeling", "computational semantics", "generative artificial intelligence", "generative ai", "generative deep learning", "generative models", "transformer models", "self-attention models", "transformer architecture", "transformer", "attention-based neural networks", "transformer networks", "sequence-to-sequence models", "large language model", "llm", "transformer-based model", "pretrained language model", "generative language model", "foundation model", "state-of-the-art language model", "multimodal model", "multimodal neural network", "vision transformer", "diffusion model", "generative diffusion model", "diffusion-based generative model", "continuous diffusion model"]

In [28]:
# Generate embeddings for each keyword

keyword_embeddings = model.encode(keywords)

In [29]:
keyword_embeddings

array([[-1.40852574e-02,  1.73932258e-02, -1.00297565e-02, ...,
        -8.54644999e-02,  8.96995589e-02,  1.44412806e-02],
       [ 2.80762706e-02,  4.93878275e-02, -4.09113690e-02, ...,
         5.30148318e-05,  8.49230739e-04,  4.48480190e-04],
       [-6.44818321e-02, -2.96157189e-02,  3.31570841e-02, ...,
         7.31250942e-02,  1.21323708e-02, -6.29288629e-02],
       ...,
       [ 1.83021219e-03, -1.16974704e-01,  8.30989182e-02, ...,
         1.82120893e-02,  7.14613171e-03,  3.86925563e-02],
       [-9.83968191e-03, -1.07192129e-01,  7.44549036e-02, ...,
         2.45842785e-02,  2.08438430e-02,  3.23861949e-02],
       [ 5.45306802e-02, -9.36757103e-02,  1.18466029e-02, ...,
         1.08546913e-02,  9.82687902e-03,  9.06553790e-02]], dtype=float32)

In [30]:
# Function to compute cosine similarity between two embeddings

def compute_similarity(paper_embedding, keyword_embeddings):
    similarities = cosine_similarity([paper_embedding], keyword_embeddings)
    return similarities.max()  # Return the maximum similarity

Major disadvantage of SBERT model "all-MiniLM-L6-v2":

The sentence transformer model "all-MiniLM-L6-v2" can handle only 256 tokens and the rest of the text input will be truncated. This truncation leads to loss of data and the transformer will not be able to make proper classification of relevant papers. 

Solution:

To overcome this disadvantage, the input text will be divided into several chunks of 200 tokens each and they will be embedded. The final text embedding will be the mean of the embeddings of these chunks.

In [31]:
# Function to get combined embedding of mean, min, max and then take mean of these

def get_long_text_embedding(text, model, chunk_size=200):
    # Split text into chunks
    words = text.split()
    chunks = [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
    
    # Generate embeddings for each chunk
    chunk_embeddings = [model.encode(chunk) for chunk in chunks]
    chunk_embeddings = np.vstack(chunk_embeddings)
    
    # Calculate mean, max, and min embeddings
    mean_embedding = np.mean(chunk_embeddings, axis=0)
    max_embedding = np.max(chunk_embeddings, axis=0)
    min_embedding = np.min(chunk_embeddings, axis=0)
    
    # Take the mean of [mean_embedding, max_embedding, min_embedding]
    combined_embedding = np.mean([mean_embedding, max_embedding, min_embedding], axis=0)
    return combined_embedding

Inorder to achieve the filtering and classification of research papers that utilize deep learning approaches in virology/epidemiology, the columns "title" and "abstract" are very important.

Every research paper (each row in the dataset) has a 'title' in the dataset as it has no missing values.
 
If the "abstract" is not present in a row, then only title is being considered for classification of papers.

In [32]:
# Function to filter papers based on cosine similarity
def filter_paper(row, keyword_embeddings, model, threshold=0.35):
    # Combine Title and Abstract for processing
    #print(row['title'] + " " + row['abstract'])
    # print(row['title'])
    # print(row['abstract'])
    text = row['title'].lower() + " " + row['abstract'].lower()

    # Removing newline escape sequences
    text = re.sub(r'[\n]', ' ', text)
    
    # Get embedding for the combined text (handling long abstracts by chunking)
    paper_embedding = get_long_text_embedding(text, model)
    
    # Compute similarity with all keyword embeddings
    similarity = compute_similarity(paper_embedding, keyword_embeddings)
    
    # If similarity exceeds threshold, keep the paper
    return similarity > threshold

In [33]:
# Apply filtering to the dataframe

dataset['relevant_paper'] = dataset.apply(filter_paper, axis=1, keyword_embeddings=keyword_embeddings, model=model)

In [34]:
dataset['relevant_paper'].value_counts()

relevant_paper
False    7794
True     3656
Name: count, dtype: int64

In [35]:
# Filter the relevant papers

relevant_papers = dataset[dataset['relevant_paper'] == True].reset_index(drop=True)

In [37]:
relevant_papers.head()

Unnamed: 0,pmid,title,authors,citation,first_author,journal/book,publication_year,create_date,pmcid,nihms_id,doi,abstract,relevant_paper
0,39435445,Editorial: The operationalization of cognitive...,"Winter M, Probst T, Tallon M, Schobel J, Pryss R.",Front Neurosci. 2024 Oct 7;18:1501636. doi: 10...,Winter M,Front Neurosci,2024,2024/10/22,PMC11491427,Unavailable,10.3389/fnins.2024.1501636,Unavailable,True
1,39367648,An initial game-theoretic assessment of enhanc...,"Fatemi MY, Lu Y, Diallo AB, Srinivasan G, Azhe...",Brief Bioinform. 2024 Sep 23;25(6):bbae476. do...,Fatemi MY,Brief Bioinform,2024,2024/10/05,PMC11452536,Unavailable,10.1093/bib/bbae476,The application of deep learning to spatial tr...,True
2,39181806,Cross-modal Transfer Learning Based on an Impr...,"Guo S, Chen H, Sheng X, Xiong Y, Wu M, Fischer...",Ultrasound Med Biol. 2024 Nov;50(11):1638-1645...,Guo S,Ultrasound Med Biol,2024,2024/08/24,Unavailable,Unavailable,10.1016/j.ultrasmedbio.2024.06.009,OBJECTIVE: Deep-learning algorithms have been ...,True
3,39112796,A generalist vision-language foundation model ...,"Zhang K, Zhou R, Adhikarla E, Yan Z, Liu Y, Yu...",Nat Med. 2024 Aug 7. doi: 10.1038/s41591-024-0...,Zhang K,Nat Med,2024,2024/08/07,Unavailable,Unavailable,10.1038/s41591-024-03185-2,Traditional biomedical artificial intelligence...,True
4,39056477,"DeepComBat: A statistically motivated, hyperpa...","Hu F, Lucas A, Chen AA, Coleman K, Horng H, Ng...",Hum Brain Mapp. 2024 Aug 1;45(11):e26708. doi:...,Hu F,Hum Brain Mapp,2024,2024/07/26,PMC11273293,Unavailable,10.1002/hbm.26708,Neuroimaging data acquired using multiple scan...,True


In [None]:
# Save the filtered papers

relevant_papers.to_csv('filtered_papers.csv', index=False)

In [39]:
print(f"Number of relevant papers: {len(relevant_papers)}")

Number of relevant papers: 3656
