In [56]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import re
import import_ipynb
import data_utils as du

pd.pandas.set_option('display.max_columns', None)

In [57]:
dataset = pd.read_csv("methodType_classified_papers.csv")

In [58]:
dataset.head()

Unnamed: 0,pmid,title,authors,citation,first_author,journal/book,publication_year,create_date,pmcid,nihms_id,doi,abstract,relevant_paper,Method_Type
0,39435445,Editorial: The operationalization of cognitive...,"Winter M, Probst T, Tallon M, Schobel J, Pryss R.",Front Neurosci. 2024 Oct 7;18:1501636. doi: 10...,Winter M,Front Neurosci,2024,2024/10/22,PMC11491427,Unavailable,10.3389/fnins.2024.1501636,Unavailable,True,computer vision
1,39367648,An initial game-theoretic assessment of enhanc...,"Fatemi MY, Lu Y, Diallo AB, Srinivasan G, Azhe...",Brief Bioinform. 2024 Sep 23;25(6):bbae476. do...,Fatemi MY,Brief Bioinform,2024,2024/10/05,PMC11452536,Unavailable,10.1093/bib/bbae476,The application of deep learning to spatial tr...,True,other
2,39181806,Cross-modal Transfer Learning Based on an Impr...,"Guo S, Chen H, Sheng X, Xiong Y, Wu M, Fischer...",Ultrasound Med Biol. 2024 Nov;50(11):1638-1645...,Guo S,Ultrasound Med Biol,2024,2024/08/24,Unavailable,Unavailable,10.1016/j.ultrasmedbio.2024.06.009,OBJECTIVE: Deep-learning algorithms have been ...,True,both
3,39112796,A generalist vision-language foundation model ...,"Zhang K, Zhou R, Adhikarla E, Yan Z, Liu Y, Yu...",Nat Med. 2024 Aug 7. doi: 10.1038/s41591-024-0...,Zhang K,Nat Med,2024,2024/08/07,Unavailable,Unavailable,10.1038/s41591-024-03185-2,Traditional biomedical artificial intelligence...,True,both
4,39056477,"DeepComBat: A statistically motivated, hyperpa...","Hu F, Lucas A, Chen AA, Coleman K, Horng H, Ng...",Hum Brain Mapp. 2024 Aug 1;45(11):e26708. doi:...,Hu F,Hum Brain Mapp,2024,2024/07/26,PMC11273293,Unavailable,10.1002/hbm.26708,Neuroimaging data acquired using multiple scan...,True,other


In [59]:
# Initialize the SentenceTransformer model

model = SentenceTransformer('all-MiniLM-L6-v2')

In [60]:
method_keywords = ["neural network", "artificial neural network", "machine learning model", "feedforward neural network", "neural net algorithm", "multilayer perceptron", "convolutional neural network", "cnn", "recurrent neural network", "rnn", "long short-term memory network", "lstm", "grnn", "deep neural networks", "transformer models", "self-attention models", "transformer architecture", "transformer", "attention-based neural networks", "transformer networks", "sequence-to-sequence models", "large language model", "llm", "transformer-based model", "pretrained language mode", "generative language model", "foundation model", "state-of-the-art language model", "vision transformer", "diffusion model", "generative diffusion model", "diffusion-based generative model", "continuous diffusion model"]

In [61]:
# Generate embeddings for each method keyword and store in a dictionary

method_embeddings = {keyword: model.encode(keyword, convert_to_tensor=True) for keyword in method_keywords}

In [62]:
method_embeddings

{'neural network': tensor([-6.4482e-02, -2.9616e-02,  3.3157e-02,  5.4728e-04, -3.7752e-02,
          7.5584e-02,  5.3797e-02, -7.6511e-03,  3.0332e-02, -8.1563e-02,
          1.8049e-02, -6.5760e-03,  5.2156e-03, -1.2081e-02, -1.5097e-01,
          3.0410e-02, -2.5747e-02,  7.7051e-03, -1.1151e-01, -2.8282e-02,
         -2.1990e-02,  4.8459e-03, -5.0901e-02, -1.1059e-02,  1.0984e-03,
          6.8441e-02,  2.1926e-02,  3.8624e-02, -5.9551e-03, -7.1150e-02,
          9.4967e-02, -1.6378e-02, -7.8110e-03,  3.9365e-02, -1.3549e-01,
         -2.1688e-02, -8.4803e-02,  5.9701e-03,  2.1777e-02,  2.7382e-02,
         -2.4941e-02, -5.7790e-02, -5.3466e-03,  1.7415e-02,  1.1889e-01,
          4.4646e-02,  8.7921e-04,  6.0592e-02,  2.5310e-02,  6.7536e-02,
         -7.0825e-02, -2.8347e-02, -5.4989e-02,  9.7299e-03,  7.2789e-02,
          5.1975e-02, -3.0907e-02,  6.0207e-03, -1.0442e-01, -3.3306e-03,
          6.6588e-02, -2.3577e-02, -7.0007e-04,  5.4123e-03,  1.1715e-01,
          3.7736e-02

In [63]:
# Define a similarity threshold

SIMILARITY_THRESHOLD = 0.25

In [64]:
# Function to extract method names used in the paper

def extract_methods(row):
    # Combine title and abstract into a single text
    text = row['title'] + " " + row['abstract']

    # Removing newline escape sequences
    text = re.sub(r'[\n]', ' ', text)
    
    # Get embedding for long text with chunking
    paper_embedding = du.get_long_text_embedding(text, model)

    # Initialize variables to track the highest similarity and corresponding method
    max_similarity = -1  # Start with a low similarity value
    selected_method = "None"  # Default to "None" if no method is found

    # Iterate over all methods and their embeddings
    for keyword, embedding in method_embeddings.items():
        similarity = util.cos_sim(paper_embedding, embedding).item()  # get scalar similarity
        if similarity > max_similarity:  # Update if a higher similarity is found
            max_similarity = similarity
            selected_method = keyword
    
    # Return the method with the highest similarity if it exceeds the threshold
    if max_similarity > SIMILARITY_THRESHOLD:
        return selected_method
    else:
        return "None"  # Return "None" if no method exceeds the threshold

In [65]:
# Apply the method extraction function to each paper

dataset['Methods_Used'] = dataset.apply(extract_methods, axis=1)

In [66]:
# Save or display the extracted methods

#relevant_papers.to_csv('papers_with_extracted_methods.csv', index=False)
print(dataset[['title', 'abstract', 'Methods_Used']].head())

                                               title  \
0  Editorial: The operationalization of cognitive...   
1  An initial game-theoretic assessment of enhanc...   
2  Cross-modal Transfer Learning Based on an Impr...   
3  A generalist vision-language foundation model ...   
4  DeepComBat: A statistically motivated, hyperpa...   

                                            abstract  \
0                                        Unavailable   
1  The application of deep learning to spatial tr...   
2  OBJECTIVE: Deep-learning algorithms have been ...   
3  Traditional biomedical artificial intelligence...   
4  Neuroimaging data acquired using multiple scan...   

                      Methods_Used  
0  attention-based neural networks  
1             deep neural networks  
2                              cnn  
3  attention-based neural networks  
4                              cnn  


In [67]:
dataset.shape

(3656, 15)

In [68]:
dataset['Methods_Used'].value_counts()

Methods_Used
machine learning model              1063
None                                 637
deep neural networks                 506
cnn                                  246
large language model                 196
state-of-the-art language model      176
artificial neural network            124
neural network                        88
lstm                                  76
vision transformer                    73
sequence-to-sequence models           73
long short-term memory network        65
convolutional neural network          60
diffusion model                       38
rnn                                   34
self-attention models                 28
neural net algorithm                  27
pretrained language mode              26
generative language model             24
feedforward neural network            19
diffusion-based generative model      19
recurrent neural network              12
attention-based neural networks       12
continuous diffusion model            12
tra

In [69]:
dataset.to_csv("research_papers_classification.csv", index=False)