In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the preprocessed dataset and pull the abstract texts out as a plain list
df = pd.read_csv('../data/processed_data.csv')
abstracts = df.loc[:, 'Abstract'].to_list()
# Preview the first three abstracts as a quick sanity check
print(abstracts[:3])


['The contribution of genomics and associated technologies to human health risk assessment for environmental chemicals has focused largely on elucidating mechanisms of toxicity, as discussed in other articles in this issue. However, there is interest in moving beyond hazard characterization to making more direct impacts on quantitative risk assessment (QRA)--i.e., the determination of toxicity values for setting exposure standards and cleanup values. We propose that the evolution of QRA of environmental chemicals in the post-genomic era will involve three, somewhat overlapping phases in which different types of approaches begin to mature. The initial focus (in Phase I) has been and continues to be on "augmentation" of weight of evidence--using genomic and related technologies qualitatively to increase the confidence in and scientific basis of the results of QRA. Efforts aimed towards "integration" of these data with traditional animal-based approaches, in particular quantitative predic

In [3]:
# Load BioBERT
# Single source of truth for the checkpoint id, so the tokenizer and the
# encoder can never be loaded from two different models by accident.
BIOBERT_CHECKPOINT = "dmis-lab/biobert-base-cased-v1.2"
tokenizer = AutoTokenizer.from_pretrained(BIOBERT_CHECKPOINT)
# The model is only used for inference in this notebook; .eval() returns the
# module itself, so chaining it keeps the assignment a single statement and
# makes the (already default) evaluation mode explicit.
model = AutoModel.from_pretrained(BIOBERT_CHECKPOINT).eval()


In [4]:
# Function to convert abstract to embedding
def abstract_to_embedding(abstract):
    """Embed one abstract as the mean of the model's last hidden states.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens, so anything beyond that limit does not contribute to the
    embedding) and passed through the module-level ``model`` with gradient
    tracking disabled.

    Args:
        abstract: Text of a single abstract.

    Returns:
        NumPy array obtained by averaging ``last_hidden_state`` over ``dim=1``
        and squeezing singleton dimensions.
        NOTE(review): presumably a 1-D vector of the model's hidden size,
        assuming ``last_hidden_state`` is (batch, tokens, hidden) - confirm.
    """
    encoded = tokenizer(
        abstract,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state
    pooled = token_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Embed every abstract, one forward pass per abstract
embeddings = list(map(abstract_to_embedding, abstracts))
# Show the first vector as a quick sanity check
print(embeddings[0])


[ 1.58904821e-01 -1.01016238e-01 -1.12258337e-01  2.28342861e-01
  1.58077553e-01 -1.95189670e-01 -9.46777612e-02  1.17288806e-01
  4.70631383e-02  2.68044304e-02  2.92304635e-01  1.61803946e-01
 -3.45313221e-01  3.18430334e-01 -2.95296669e-01 -3.27226758e-01
  3.70891131e-02 -4.10510637e-02  2.51066149e-03  5.06394282e-02
 -2.73296386e-01  3.59641574e-02 -3.49983536e-02  1.27345338e-01
 -4.29695565e-03 -2.25574017e-01 -1.34626672e-01  4.88225430e-01
  1.19844107e-02  3.03707987e-01 -1.14617281e-01  4.19144258e-02
 -1.77363977e-02 -7.46608526e-02  2.65904456e-01  1.86258536e-02
 -1.32324591e-01  3.33870083e-01 -2.92369455e-01  3.99936169e-01
  9.67998803e-02  2.52345383e-01 -2.62100577e-01  1.00676648e-01
  1.40506178e-01  8.76275003e-02 -3.54565173e-01 -6.93815872e-02
 -3.98033321e-01  1.28518909e-01  2.35870510e-01  2.17605531e-01
  4.22382839e-02 -7.68198352e-03  1.82250842e-01 -1.37360366e-02
  3.29071134e-02 -1.61566868e-01  1.63298130e-01  2.90409565e-01
 -3.49685885e-02 -1.33380

In [5]:
# Free-text questions used to spot-check retrieval quality
test_queries = [
    "Is there a correlation between intelligence and genetics?",
    "Is there a correlation between spiritual intelligence and the success of addiction treatment with methadone?",
]

def queries_to_embeddings(queries, tokenizer, model):
    """Embed each query as the mean of the model's last hidden states.

    Args:
        queries: Iterable of query strings.
        tokenizer: Callable that turns one string into keyword inputs for
            ``model`` (called with ``return_tensors="pt"``, truncation and
            ``max_length=512``).
        model: Callable whose result exposes ``last_hidden_state``.

    Returns:
        List with one NumPy array per query, in input order. Each array is
        ``last_hidden_state`` averaged over ``dim=1`` and squeezed.
    """
    def _embed(text):
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        # Inference only: no autograd graph is needed for the forward pass.
        with torch.no_grad():
            token_states = model(**encoded).last_hidden_state
        return token_states.mean(dim=1).squeeze().numpy()

    return [_embed(text) for text in queries]

# Embed the test queries using the tokenizer and model loaded above
query_embeddings = queries_to_embeddings(queries=test_queries, tokenizer=tokenizer, model=model)


# Function to find most similar abstracts for a given query embedding
def find_most_similar(query_emb, abstract_embs, top_n=3):
    """Rank abstracts by cosine similarity to a query embedding.

    Args:
        query_emb: Embedding of the query; it is wrapped in a list so that
            ``cosine_similarity`` sees it as a single-row matrix.
        abstract_embs: Abstract embeddings, one per row, comparable with
            ``query_emb``.
        top_n: Number of best matches to return (default 3).

    Returns:
        Tuple ``(top_indices, top_similarities)``: the positions in
        ``abstract_embs`` of the ``top_n`` most similar abstracts, ordered
        from most to least similar, and their similarity scores in the same
        order.
    """
    similarities = cosine_similarity([query_emb], abstract_embs)[0]
    # argsort is ascending, so reverse it before taking the first top_n.
    top_indices = np.argsort(similarities)[::-1][:top_n]
    # Read the scores through the ranking instead of sorting the whole vector
    # a second time: one sort less, and indices and scores are guaranteed to
    # refer to the same abstracts.
    top_similarities = similarities[top_indices]
    return top_indices, top_similarities


# Evaluate each query: print its closest abstracts together with their text
for query, query_vector in zip(test_queries, query_embeddings):
    print(f"Query: {query}")
    ranking = find_most_similar(query_vector, embeddings)
    print("Most similar abstracts:")
    # zip(*ranking) pairs every returned index with its similarity score
    for index, similarity in zip(*ranking):
        # The label is 1-based; the lookup uses the 0-based row position
        print(f"Abstract {index + 1} (Similarity: {similarity:.2f}):")
        print(df['Abstract'].iloc[index])
        print("\n")

# Manual or semi-automatic evaluation of relevance
# This part depends on how you wish to evaluate relevance.


Query: Is there a correlation between intelligence and genetics?
Most similar abstracts:
Abstract 8219 (Similarity: 0.89):
Does general intelligence exist across species, and has it been a target of natural selection? These questions can be addressed with genomic data, which can rule out artifacts by demonstrating that distinct cognitive abilities are genetically correlated and thus share a biological substrate. This work has begun with data from humans and can be extended to other species; it should focus not only on general intelligence but also specific capacities like language and spatial ability.


Abstract 8205 (Similarity: 0.89):
Burkart et al. consider that the relationship between general intelligence and socio-cognitive abilities is poorly understood in animals and humans. We examine this conclusion in the perspective of an already substantial evidence base on the relationship among general intelligence, theory of mind, and emotional intelligence. We propose a link between ge

In [8]:
def count_long_abstracts(df, max_length=512):
    """Count the abstracts whose token count exceeds ``max_length``.

    Each entry of ``df['Abstract']`` is encoded with the module-level
    ``tokenizer`` (special tokens included) and its token count is compared
    against ``max_length``.

    Args:
        df: DataFrame with an ``'Abstract'`` column.
        max_length: Token limit to compare against (default 512).

    Returns:
        Number of abstracts with strictly more than ``max_length`` tokens.
    """
    token_counts = (
        len(tokenizer.encode(text, add_special_tokens=True))
        for text in df['Abstract']
    )
    return sum(1 for n_tokens in token_counts if n_tokens > max_length)

# Tally the abstracts that exceed the 512-token limit
num_long_abstracts = count_long_abstracts(df, max_length=512)
print("Number of abstracts longer than 512 tokens:", num_long_abstracts)

def average_length_excess(df, max_length=512):
    """Average number of tokens by which over-long abstracts exceed the limit.

    Each entry of ``df['Abstract']`` is encoded with the module-level
    ``tokenizer`` (special tokens included). Only abstracts with more than
    ``max_length`` tokens contribute to the average.

    Args:
        df: DataFrame with an ``'Abstract'`` column.
        max_length: Token limit to compare against (default 512).

    Returns:
        Mean of ``token_count - max_length`` over the abstracts that exceed
        the limit, or ``0`` when no abstract exceeds it.
    """
    overshoots = []
    for text in df['Abstract']:
        n_tokens = len(tokenizer.encode(text, add_special_tokens=True))
        if n_tokens > max_length:
            overshoots.append(n_tokens - max_length)
    # Guard against dividing by zero when nothing is over the limit.
    if not overshoots:
        return 0
    return sum(overshoots) / len(overshoots)

# Mean overshoot, in tokens, among the abstracts that exceed the 512-token limit
average_excess_length = average_length_excess(df, max_length=512)
print("Average excess length of abstracts longer than 512 tokens:", average_excess_length)


Number of abstracts longer than 512 tokens: 1174
Average excess length of abstracts longer than 512 tokens: 128.27683134582622


In [9]:
# Persist the abstract embeddings so they do not have to be recomputed
np.save(file='embeddings.npy', arr=embeddings)
# To reuse a previous run instead of re-embedding, load them back with:
#embeddings = np.load('embeddings.npy')

In [7]:
# # Initialize Pinecone Index
# index = Index("your-index-name")

# # Upload embeddings to Pinecone
# for i, emb in enumerate(embeddings):
#     index.upsert(vectors=[(str(i), emb)])

# # Don't forget to save your index's state if needed