In [None]:
# Reference notebook: https://www.kaggle.com/code/keitazoumana/scientific-document-similarity-search-with-scibert/notebook

In [None]:
import pandas as pd

# Load the raw paper table and report its size before any cleaning.
data = pd.read_csv("output2.csv")
print(f"Data Shape: {data.shape}")

# Keep one row per distinct abstract, discard rows that have no abstract,
# then renumber the remaining rows from 0.
data = (
    data
    .drop_duplicates(subset=['Abstract'])
    .dropna(subset=['Abstract'])
    .reset_index(drop=True)
)
print(f"Data Shape: {data.shape}")

In [None]:
def show_random_papers(size=10):
    """Print the title and abstract of randomly sampled papers.

    Reads the module-level ``data`` frame, which must have ``Title`` and
    ``Abstract`` columns.

    Args:
        size: Number of papers to show. Capped at the number of rows in
            ``data`` so that a small frame does not raise.
    """
    # Sampling without replacement cannot return more rows than exist.
    papers = data.sample(min(size, len(data)))

    # Read each field from the sampled row itself. The sampled index holds
    # labels, not positions, so feeding it to `data.iloc[...]` is only
    # correct while the index happens to be exactly 0..n-1.
    for index, paper in papers.iterrows():
        print(f"Paper #{index}")
        print(f"-Title: {paper['Title']}")
        print(f"-Abstract: {paper['Abstract']}")
        print("\n")

# Quick sanity check of the loaded data: print a random sample of papers
# using the function's default size of 10.
show_random_papers()

In [None]:
import torch
from transformers import BertTokenizer,  AutoModelForSequenceClassification

# Checkpoint name shared by the tokenizer and the model.
pretrained_model = 'allenai/scibert_scivocab_uncased'

# The checkpoint name says "uncased", so input text is lower-cased on tokenization.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model,
    do_lower_case=True,
)

# Request the per-layer hidden states (embed_text reads them) and skip the
# attention maps, which nothing in this notebook uses.
# NOTE(review): embed_text discards the classification logits this model class
# also returns; confirm whether a sequence-classification head is needed at all.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model,
    output_attentions=False,
    output_hidden_states=True,
)

In [None]:
from keras.preprocessing.sequence import pad_sequences

def embed_text(text, max_length=210):
    """Embed a piece of text as the last-layer hidden state of its first token.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: String to embed.
        max_length: Fixed number of token positions fed to the model. Longer
            inputs are cut at the end; shorter ones are right-padded with id 0.
            Defaults to 210, the length this notebook previously hard-coded.

    Returns:
        1-D NumPy array: the hidden state of token 0 (the [CLS] position when
        special tokens are added) taken from the last hidden layer.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError(f"max_length must be >= 1, got {max_length}")

    token_ids = list(tokenizer.encode(text, add_special_tokens=True))

    # NOTE(review): cutting at the end also removes the trailing special token
    # of over-long inputs. Kept as-is so that new vectors stay comparable with
    # embeddings that were already computed and cached by this function.
    token_ids = token_ids[:max_length]
    real_token_count = len(token_ids)
    pad_count = max_length - real_token_count

    # Pseudo-batch of one sequence, shape (1, max_length). The dtype is set
    # explicitly so it does not depend on the platform's default integer width.
    input_ids = torch.tensor([token_ids + [0] * pad_count], dtype=torch.long)

    # Mark real tokens by position rather than by `id > 0`, so the mask cannot
    # be confused by a genuine token that shares the padding id.
    attention_mask = torch.tensor(
        [[1] * real_token_count + [0] * pad_count],
        dtype=torch.long,
    )

    model.eval()

    # Run the text through the model and collect the hidden states of every
    # layer; gradients are not needed for inference.
    with torch.no_grad():
        logits, encoded_layers = model(
            input_ids=input_ids,
            token_type_ids=None,
            attention_mask=attention_mask,
            return_dict=False,
        )

    batch_i = 0  # Only one input in the batch.
    token_i = 0  # The first token, corresponding to [CLS].

    # The final entry is the last layer before the classifier (index 12 for
    # the 12-layer model this notebook loads).
    embedding = encoded_layers[-1][batch_i][token_i]
    return embedding.detach().cpu().numpy()

# Smoke test: embed the abstract of the first row and report the vector's shape.
embedding = embed_text(data["Abstract"].iloc[0])
print(f"Embedding Shape: {embedding.shape}")

In [None]:
# Embed every abstract in row order and attach the vectors as a new column.
embeds = [embed_text(abstract) for abstract in data.Abstract]
data["Embeddings"] = embeds

In [None]:
# Creating the embeddings took an extremely long time, so they are cached on
# disk: reuse the cache when it exists, otherwise write the frame that was
# just computed so the next run can load it.
import pickle  # imported here: the only other `import pickle` was commented out
from pathlib import Path

EMBEDDINGS_CACHE = Path("data-2.pkl")

if EMBEDDINGS_CACHE.exists():
    # SECURITY: unpickling can execute arbitrary code. Only load a cache file
    # that this notebook wrote itself.
    with EMBEDDINGS_CACHE.open("rb") as f:
        data = pickle.load(f)
else:
    with EMBEDDINGS_CACHE.open("wb") as f:
        pickle.dump(data, f)

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def process_text(text):
    """Embed `text` and return it as a single-row 2-D array.

    The shape is printed before and after the reshape. The (1, n) layout is
    what the cosine-similarity calls in this notebook are given as the query.
    """
    embedding = embed_text(text)
    print(f"Shape: {embedding.shape}")

    # Copy into a fresh array, then view it as one row with as many columns
    # as the embedding has values.
    row_vector = np.array(embedding).reshape(1, -1)
    print(f"Shape: {row_vector.shape}")

    return row_vector

def get_top_N_papers_cosine(text, N=5):
    """Return the N papers whose embeddings are most similar to `text`.

    Side effect: each paper's cosine similarity to `text` is written to the
    ``Similarity`` column of the module-level ``data`` frame.

    Args:
        text: Query text; embedded with `process_text`.
        N: Number of top rows to return.

    Returns:
        The first N rows of ``data`` after sorting by ``Similarity`` in
        descending order.
    """
    text_vect = process_text(text)

    # Score all papers with a single matrix call instead of one
    # cosine_similarity call per row.
    if len(data) == 0:
        similarities = np.empty(0, dtype=float)
    else:
        paper_matrix = np.vstack(data["Embeddings"].to_numpy())
        # Result has one row (the query) and one column per paper.
        similarities = cosine_similarity(text_vect, paper_matrix)[0]

    data["Similarity"] = similarities

    # Sort
    return data.sort_values(by='Similarity', ascending=False).iloc[:N]

# Query abstract, passed to the model verbatim (extraction artifacts included).
query_abstract = '''
    Current theory on trophic interactions in food webs assumes thatecologically
    similar species can be treated collectively as a single functional unit such as a guild or
    trophic level. This theory implies that all species within that unit transmit identical direct
    and indirect effects throughout the community. We evaluated this assumption by conducting
    experiments to compare the direct and indirect effects of three top-predator species, be
    longing to the same hunting spider guild, on the same species of grasshopper and on old
    field grasses and herbs. Observations under field conditions revealed that each spiderspecies
    exhibited different hunting behavior (i.e., sit-and-wait, sit-and-pursue, and active hunting)
    and occupied different locations within the vegetation canopy. These differences resulted
    in different direct effects on grasshopper prey. Grasshoppers demonstrated significant be
    havioral (diet) shifts in the presence of sit-and-wait and sit-and-pursue species but not when
    faced with actively hunting species. Grasshopper density was significantly reduced byspider
    species that occupied lower parts of the vegetation canopy (sit-and-pursue and actively
    hunting species), but it was not significantly reduced by the sit-and-wait spider species that
    occupied the upper parts of the canopy. These direct effects manifested themselves differ
    ently in the plant trophic level. The sit-and-wait spider caused indirect effects on plants
    by changing grasshopper foraging behavior (a trait-mediated effect). The sit-and-pursue
    spider caused indirect effects by reducing grasshopper density (density-mediated effects);
    the effects of changes in grasshopper behavior were thus not reflected in the plant trophic
    level. The actively hunting spiders had strictly density-mediated indirect effects on plants.
    The study offers mechanistic insight into how predator species within the same guild can
    have very different trophic effects in food webs. Thus classical modeling approaches that
    treat all predator species as a single functional unit may not adequately capture biologically
    relevant details that influence community dynamics.
'''

# Ask for as many results as there are rows, i.e. rank every paper.
top_papers = get_top_N_papers_cosine(query_abstract, data.shape[0])

In [121]:
def sort_papers_by_cosine(references):
    """Rank all papers by their mean cosine similarity to several reference texts.

    Side effects on the module-level ``data`` frame: one ``Similarity<i>``
    column is written per reference (in the order given), and ``Similarity``
    is set to the row-wise mean of those columns.

    Args:
        references: Non-empty sequence of reference texts.

    Returns:
        ``data`` sorted by ``Similarity`` in descending order.

    Raises:
        ValueError: If ``references`` is empty; a mean over zero references
            would otherwise produce an all-NaN ranking without any warning.
    """
    references = list(references)
    if not references:
        raise ValueError("references must contain at least one text")

    # process_text returns one (1, dim) row per reference; stack them so that
    # every reference is scored against every paper in a single call.
    reference_matrix = np.vstack([process_text(reference) for reference in references])

    if len(data) == 0:
        scores = np.empty((len(references), 0), dtype=float)
    else:
        paper_matrix = np.vstack(data["Embeddings"].to_numpy())
        # One row per reference, one column per paper.
        scores = cosine_similarity(reference_matrix, paper_matrix)

    # Similarity
    # Each reference gets its own column (e.g. Similarity0).
    similarity_columns = []
    for index, reference_scores in enumerate(scores):
        column = f"Similarity{index}"
        data[column] = reference_scores
        similarity_columns.append(column)

    # Average Similarity
    data['Similarity'] = data[similarity_columns].mean(axis=1)

    # Sort
    return data.sort_values(by='Similarity', ascending=False)

In [None]:
# Reference abstracts that are passed to sort_papers_by_cosine as queries.
# Each string is embedded exactly as written, so the text is kept verbatim,
# including artifacts such as missing spaces and words split across lines.
abstracts = [
    '''
        Using food web models that account for juvenile and adult
        individuals of species, I show that commonly observed differences between juveniles and adults in foraging capacity
        and predation risk result in larger, more complex communities than predicted by models without stage structure.
        Based on their species interaction networks these complex
        and diverse communities would be expected to be unstable, but these destabilizing effects of species interactions are
        overruled by stabilizing changes in juvenile–adult stage structure. Differences between juvenile and adult individuals hence
        offer a natural resolution to the diversity–stability enigma of
        ecological communities.
    ''',
    '''
        Current theory on trophic interactions in food webs assumes thatecologically
        similar species can be treated collectively as a single functional unit such as a guild or
        trophic level. This theory implies that all species within that unit transmit identical direct
        and indirect effects throughout the community. We evaluated this assumption by conducting
        experiments to compare the direct and indirect effects of three top-predator species, be
        longing to the same hunting spider guild, on the same species of grasshopper and on old
        field grasses and herbs. Observations under field conditions revealed that each spiderspecies
        exhibited different hunting behavior (i.e., sit-and-wait, sit-and-pursue, and active hunting)
        and occupied different locations within the vegetation canopy. These differences resulted
        in different direct effects on grasshopper prey. Grasshoppers demonstrated significant be
        havioral (diet) shifts in the presence of sit-and-wait and sit-and-pursue species but not when
        faced with actively hunting species. Grasshopper density was significantly reduced byspider
        species that occupied lower parts of the vegetation canopy (sit-and-pursue and actively
        hunting species), but it was not significantly reduced by the sit-and-wait spider species that
        occupied the upper parts of the canopy. These direct effects manifested themselves differ
        ently in the plant trophic level. The sit-and-wait spider caused indirect effects on plants
        by changing grasshopper foraging behavior (a trait-mediated effect). The sit-and-pursue
        spider caused indirect effects by reducing grasshopper density (density-mediated effects);
        the effects of changes in grasshopper behavior were thus not reflected in the plant trophic
        level. The actively hunting spiders had strictly density-mediated indirect effects on plants.
        The study offers mechanistic insight into how predator species within the same guild can
        have very different trophic effects in food webs. Thus classical modeling approaches that
        treat all predator species as a single functional unit may not adequately capture biologically
        relevant details that influence community dynamics.
    ''',
    '''
        The composition of an ecosystem is thought to be
        important for determining its resistance to invasion.
        Studiesofnaturalecosystems,fromplanttomicrobial communities, have found that more diverse communities are more resistant to invasion. In some
        cases, more diverse communities resist invasion by more completely consuming the resources necessary for the invader. We show that Escherichia
        coli cansuccessfullyinvadeculturesofthealgaChlamydomonas reinhardtii (phototroph) or the ciliate
        Tetrahymena thermophila (predator) but cannot invade a community where both are present. The invasion resistance of the algae-ciliate community
        arises from a higher-order interaction between species (interaction modification) that is unrelated to resource consumption. We show that the mode of
        this interaction is the algal inhibition of bacterial aggregation, whichleavesbacteriavulnerabletopredation. This moderequiresboththealgaeandtheciliate
        to be present and provides an example of invasion resistance through an interaction modification.
    ''',
    '''
        Investigating how prey density influences a
        prey’s combined predation risk from multiple predator
        species is critical for understanding the widespread
        importance of multiple predator effects. We conducted
        experiments that crossed six treatments consisting of
        zero, one, or two predator species (hellgrammites,
        greenside darters, and creek chubs) with three treat
        ments in which we varied the density of mayfly prey.
        None of the multiple predator effects in our system were
        independent, and instead, the presence of multiple
        predator species resulted in risk reduction for the prey
        across both multiple predator combinations and all
        three levels of prey density. Risk reduction is likely to
        have population-level consequences for the prey,
        resulting in larger prey populations than would be pre
        dicted if the effects of multiple predator species were
        independent. For one of the two multiple predator
        combinations, the magnitude of risk reduction margin
        ally increased with prey density. As a result, models
        predicting the combined risk from multiple predator
        species in this system will sometimes need to account for
        prey density as a factor influencing per-capita prey death
        rates.
    '''
]

In [122]:
# Score every paper against all four reference abstracts and sort by the
# mean similarity, best match first.
papers = sort_papers_by_cosine(abstracts)

Shape: (768,)
Shape: (1, 768)
Shape: (768,)
Shape: (1, 768)
Shape: (768,)
Shape: (1, 768)
Shape: (768,)
Shape: (1, 768)


In [124]:
# Export the ranked papers together with the mean score and the four
# per-reference scores.
export_columns = [
    "Title",
    "DOI",
    "Abstract",
    "Similarity",
    "Similarity0",
    "Similarity1",
    "Similarity2",
    "Similarity3",
]
papers[export_columns].to_csv('Papers.csv', index=False, encoding='utf-8')