In [19]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import re
import import_ipynb
import data_utils

# Show every column when a DataFrame is displayed (None removes the column limit).
# Uses the public pd.set_option entry point instead of reaching through pd.pandas.
pd.set_option('display.max_columns', None)

ModuleNotFoundError: No module named 'import_ipynb'

In [2]:
# Load the pre-filtered paper metadata; the classification cells below read its
# 'title' and 'abstract' columns.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a saved
# index from an earlier to_csv call - confirm whether it should be dropped.
dataset = pd.read_csv("filtered_papers.csv")

In [3]:
# Preview the first rows to sanity-check the loaded columns and placeholder values.
dataset.head()

Unnamed: 0,pmid,title,authors,citation,first_author,journal/book,publication_year,create_date,pmcid,nihms_id,doi,abstract,relevant_paper
0,39435445,Editorial: The operationalization of cognitive...,"Winter M, Probst T, Tallon M, Schobel J, Pryss R.",Front Neurosci. 2024 Oct 7;18:1501636. doi: 10...,Winter M,Front Neurosci,2024,2024/10/22,PMC11491427,Unavailable,10.3389/fnins.2024.1501636,Unavailable,True
1,39367648,An initial game-theoretic assessment of enhanc...,"Fatemi MY, Lu Y, Diallo AB, Srinivasan G, Azhe...",Brief Bioinform. 2024 Sep 23;25(6):bbae476. do...,Fatemi MY,Brief Bioinform,2024,2024/10/05,PMC11452536,Unavailable,10.1093/bib/bbae476,The application of deep learning to spatial tr...,True
2,39181806,Cross-modal Transfer Learning Based on an Impr...,"Guo S, Chen H, Sheng X, Xiong Y, Wu M, Fischer...",Ultrasound Med Biol. 2024 Nov;50(11):1638-1645...,Guo S,Ultrasound Med Biol,2024,2024/08/24,Unavailable,Unavailable,10.1016/j.ultrasmedbio.2024.06.009,OBJECTIVE: Deep-learning algorithms have been ...,True
3,39112796,A generalist vision-language foundation model ...,"Zhang K, Zhou R, Adhikarla E, Yan Z, Liu Y, Yu...",Nat Med. 2024 Aug 7. doi: 10.1038/s41591-024-0...,Zhang K,Nat Med,2024,2024/08/07,Unavailable,Unavailable,10.1038/s41591-024-03185-2,Traditional biomedical artificial intelligence...,True
4,39056477,"DeepComBat: A statistically motivated, hyperpa...","Hu F, Lucas A, Chen AA, Coleman K, Horng H, Ng...",Hum Brain Mapp. 2024 Aug 1;45(11):e26708. doi:...,Hu F,Hum Brain Mapp,2024,2024/07/26,PMC11273293,Unavailable,10.1002/hbm.26708,Neuroimaging data acquired using multiple scan...,True


In [4]:
# Define the keywords for Text mining, Computer Vision and Other

# Phrases describing NLP / language-model methods.
text_mining_keywords = [
    "natural language processing",
    "text mining",
    "nlp",
    "computational linguistics",
    "language processing",
    "text analytics",
    "textual data analysis",
    "text data analysis",
    "text analysis",
    "speech and language technology",
    "language modeling",
    "computational semantics",
    "transformer models",
    "self-attention models",
    "transformer architecture",
    "transformer",
    "attention-based neural networks",
    "transformer networks",
    "sequence-to-sequence models",
    "large language model",
    "llm",
    "transformer-based model",
    "pretrained language model",
    "generative language model",
    "foundation model",
    "state-of-the-art language model",
]

# Phrases describing vision / image-generation methods.
computer_vision_keywords = [
    "computer vision",
    "vision model",
    "image processing",
    "vision algorithms",
    "computer graphics and vision",
    "object recognition",
    "scene understanding",
    "vision transformer",
    "multimodal model",
    "multimodal neural network",
    "diffusion model",
    "generative diffusion model",
    "diffusion-based generative model",
    "continuous diffusion model",
]

# Everything else: domain terms plus generic machine-learning vocabulary.
other_keywords = [
    "virology",
    "epidemiology",
    "neural network",
    "artificial neural network",
    "machine learning model",
    "feedforward neural network",
    "neural net algorithm",
    "multilayer perceptron",
    "recurrent neural network",
    "rnn",
    "long short-term memory network",
    "lstm",
    "grnn",
    "deep learning",
    "deep neural networks",
    "generative artificial intelligence",
    "generative ai",
    "generative deep learning",
    "generative models",
]

In [5]:
# Load the sentence-embedding model once. The same instance encodes both the
# keyword lists and every paper's text, so all vectors come from one encoder.

model = SentenceTransformer('all-MiniLM-L6-v2')

In [7]:
# Function to compute cosine similarity between two embeddings

def compute_similarity(paper_embedding, keyword_embeddings):
    """Return the best cosine similarity between one paper and a keyword set.

    ``paper_embedding`` is a single vector; it is wrapped in a list so that
    ``cosine_similarity`` receives it as a one-row matrix. ``keyword_embeddings``
    holds one row per keyword. The result is the largest entry of the resulting
    similarity matrix.
    """
    paper_as_matrix = [paper_embedding]
    score_matrix = cosine_similarity(paper_as_matrix, keyword_embeddings)
    best_score = score_matrix.max()
    return best_score

In [8]:
# Function to get combined embedding of mean, min, max and then take mean of these

def get_long_text_embedding(text, model, chunk_size=200):
    """Embed arbitrarily long text by pooling per-chunk embeddings.

    The text is split on whitespace into consecutive chunks of at most
    ``chunk_size`` words. Each chunk is encoded separately with
    ``model.encode`` and the chunk vectors are pooled element-wise in three
    ways (mean, max, min). The returned vector is the element-wise mean of
    those three pooled vectors.

    Parameters
    ----------
    text : str
        Text to embed. Empty or whitespace-only text is encoded as a single
        empty chunk instead of failing.
    model : object
        Encoder exposing ``encode(str)`` that returns a 1-D vector
        (the notebook passes a SentenceTransformer).
    chunk_size : int, default 200
        Maximum number of whitespace-separated words per chunk.

    Returns
    -------
    numpy.ndarray
        1-D combined embedding with the same length as one chunk embedding.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Split text into chunks of at most chunk_size words
    words = text.split()
    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    if not chunks:
        # No words at all: np.vstack would raise on an empty list, so fall back
        # to encoding the empty string and still return a vector.
        chunks = [""]

    # One embedding row per chunk
    chunk_embeddings = np.vstack([model.encode(chunk) for chunk in chunks])

    # Element-wise pooling across chunks
    mean_embedding = np.mean(chunk_embeddings, axis=0)
    max_embedding = np.max(chunk_embeddings, axis=0)
    min_embedding = np.min(chunk_embeddings, axis=0)

    # Average the three pooled vectors; for a single chunk this equals that
    # chunk's own embedding.
    return np.mean([mean_embedding, max_embedding, min_embedding], axis=0)

In [9]:
# Generate embeddings for each set of keywords

# Encode the three keyword lists in the same way (one tensor row per keyword)
# and unpack the results in list order.
text_mining_embeddings, computer_vision_embeddings, other_embeddings = (
    model.encode(keyword_list, convert_to_tensor=True)
    for keyword_list in (text_mining_keywords, computer_vision_keywords, other_keywords)
)

In [10]:
# Function to classify each paper
def classify_paper(row, both_threshold=0.3, single_threshold=0.4, missing_marker="Unavailable"):
    """Label one paper as "text mining", "computer vision", "both" or "other".

    The title and abstract are joined, embedded with
    ``get_long_text_embedding`` (using the notebook-level ``model``) and
    compared by cosine similarity with each notebook-level keyword-embedding
    set. The best match within a set is that category's score.

    Decision rule, checked in this order:
      1. "both" when the text-mining and computer-vision scores both exceed
         ``both_threshold``.
      2. "text mining" or "computer vision" when that score is strictly the
         largest of the three and exceeds ``single_threshold``.
      3. "other" in every remaining case (low scores, ties, or "other" winning).

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'title' and 'abstract'.
    both_threshold : float, default 0.3
        Minimum score each of the two main categories needs for "both".
    single_threshold : float, default 0.4
        Minimum score a single winning category needs.
    missing_marker : str, default "Unavailable"
        Placeholder the dataset uses for absent fields. Fields equal to it,
        empty fields and non-string values (NaN/None) are left out of the
        embedded text rather than being embedded as if they were content.

    Returns
    -------
    str
        One of "both", "text mining", "computer vision", "other". A row with
        no usable text at all is labelled "other".
    """
    # Keep only fields that carry real text; concatenating NaN/None with a
    # string would raise TypeError, and the placeholder adds only noise.
    parts = []
    for field in ('title', 'abstract'):
        value = row[field]
        if isinstance(value, str) and value.strip() and value.strip() != missing_marker:
            parts.append(value)

    # Join title and abstract, replacing newlines with spaces
    text = " ".join(parts).replace('\n', ' ')
    if not text.strip():
        # Nothing to embed, so there is no evidence for any specific category.
        return "other"

    # Get embedding for long text with chunking
    paper_embedding = get_long_text_embedding(text, model)

    def best_score(keyword_embeddings):
        # Highest cosine similarity between the paper and any keyword in the set.
        return util.cos_sim(paper_embedding, keyword_embeddings).max().item()

    text_mining_similarity = best_score(text_mining_embeddings)
    computer_vision_similarity = best_score(computer_vision_embeddings)
    other_similarity = best_score(other_embeddings)

    # "both" is tested first and uses the lower threshold, so it takes
    # precedence over a single-category win.
    if text_mining_similarity > both_threshold and computer_vision_similarity > both_threshold:
        return "both"

    if (text_mining_similarity > single_threshold
            and text_mining_similarity > computer_vision_similarity
            and text_mining_similarity > other_similarity):
        return "text mining"

    if (computer_vision_similarity > single_threshold
            and computer_vision_similarity > text_mining_similarity
            and computer_vision_similarity > other_similarity):
        return "computer vision"

    # Default: "other" wins, scores tie, or nothing clears the threshold.
    return "other"

In [11]:
# Classify every paper row by row and store the label in a new 'Method_Type' column.
# Each row is embedded with the sentence-transformer model inside classify_paper,
# so this is likely the slowest cell in the notebook.

dataset['Method_Type'] = dataset.apply(classify_paper, axis=1)

In [12]:
# Distribution of the predicted labels across all papers.
dataset['Method_Type'].value_counts()

Method_Type
other              2801
text mining         549
both                179
computer vision     127
Name: count, dtype: int64

In [13]:
# Spot-check a few labelled papers. A bare last-line expression gets the
# notebook's rich table display instead of print's wrapped plain text.
dataset[['title', 'abstract', 'Method_Type']].head()

                                               title  \
0  Editorial: The operationalization of cognitive...   
1  An initial game-theoretic assessment of enhanc...   
2  Cross-modal Transfer Learning Based on an Impr...   
3  A generalist vision-language foundation model ...   
4  DeepComBat: A statistically motivated, hyperpa...   

                                            abstract      Method_Type  
0                                        Unavailable  computer vision  
1  The application of deep learning to spatial tr...            other  
2  OBJECTIVE: Deep-learning algorithms have been ...             both  
3  Traditional biomedical artificial intelligence...            other  
4  Neuroimaging data acquired using multiple scan...            other  


In [14]:
# Label distribution again: same query as the earlier counts cell, and the
# recorded output is identical.
dataset['Method_Type'].value_counts()

Method_Type
other              2801
text mining         549
both                179
computer vision     127
Name: count, dtype: int64

In [16]:
# (rows, columns) of the frame, including the added 'Method_Type' column.
dataset.shape

(3656, 14)

In [15]:
# Save all papers together with their predicted 'Method_Type' label.
# index=False keeps the DataFrame index out of the written CSV.

dataset.to_csv('methodType_classified_papers.csv', index=False)