In [6]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [8]:
# Load the processed dataset and keep the abstract texts as a plain Python list
df = pd.read_csv('../data/processed_data.csv')
abstracts = df['Abstract'].to_list()
print(abstracts[0:3])


['This study utilized a sample of 50 college students to assess the possibility that responding to the Wechsler Adult Intelligence Scale-Fourth Edition (WAIS-IV) Symbol Search subtest items with an "x" instead of a "single slash mark" would affect performance. A second sample of 50 college students was used to assess the impact on WAIS-IV Block Design performance of presenting all the items with only red surfaces facing up. The modified Symbol Search and Block Design administrations yielded mean scaled scores and raw scores that did not differ significantly from mean scores obtained with standard administrations. Findings should not be generalized beyond healthy, well-educated young adults.', 'Interpretation of the Wechsler Memory Scale-Fourth Edition may involve examination of multiple memory index score contrasts and similar comparisons with Wechsler Adult Intelligence Scale-Fourth Edition ability indexes. Standardization sample data suggest that 15-point differences between any spec

In [4]:
# Load BioBERT: the tokenizer and the encoder are both fetched from the same
# Hugging Face checkpoint (tokenizer first, then the model weights).
tokenizer, model = (
    auto_cls.from_pretrained("dmis-lab/biobert-base-cased-v1.2")
    for auto_cls in (AutoTokenizer, AutoModel)
)


config.json: 100%|██████████| 1.11k/1.11k [00:00<00:00, 1.69MB/s]
vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 1.09MB/s]
pytorch_model.bin: 100%|██████████| 436M/436M [02:58<00:00, 2.44MB/s] 


In [5]:
# Function to convert abstract to embedding
def abstract_to_embedding(abstract):
    """Embed text as the attention-mask-weighted mean of the model's last hidden states.

    Uses the notebook-level ``tokenizer`` and ``model``. Input is truncated to
    512 tokens. Because the tokenizer is called with ``padding=True``, a batch
    of texts can contain padding positions; those are excluded from the
    average via the attention mask, so each embedding depends only on real
    tokens. For a single unpadded string the mask is all ones and the result
    is the plain mean over tokens.

    Args:
        abstract: Text to embed (a single string; a list of strings is also
            accepted by the tokenizer call).

    Returns:
        numpy.ndarray: The pooled embedding after ``squeeze()``, i.e. a 1-D
        vector for a single text, or one row per text for a batch.
    """
    inputs = tokenizer(abstract, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    # Broadcast the (batch, seq) mask over the hidden dimension so padded
    # positions contribute zero to the sum.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # clamp guards against division by zero for a fully masked row.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (summed / token_counts).squeeze().numpy()

# Embed every abstract (one call per abstract), preserving dataset order
embeddings = list(map(abstract_to_embedding, abstracts))
print(embeddings[0])


[ 1.05518319e-01 -7.27858394e-04  6.61616623e-02  1.99372530e-01
  1.51566893e-01 -5.29078357e-02 -8.65662768e-02 -1.84828565e-02
 -1.42010629e-01  8.08489621e-02  2.50732124e-01  2.37021297e-01
 -1.40783057e-01  1.91651389e-01 -2.37451628e-01 -1.68438315e-01
  2.29460254e-01 -2.45551839e-01 -7.49462470e-02  9.89590362e-02
 -8.65538493e-02  2.09155679e-02 -2.46501043e-01  2.26303726e-01
  1.12659875e-02 -1.48865700e-01 -2.21872721e-02  5.36195338e-01
  4.95551452e-02  4.06869501e-01 -4.64816317e-02 -7.64579326e-02
 -1.17997199e-01 -1.45644084e-01  2.78057873e-01  1.10940578e-04
  6.58589825e-02  3.98846030e-01 -1.21954851e-01  4.15806293e-01
  2.52137005e-01  1.32821649e-01 -6.07461296e-02  2.81009972e-02
 -1.28330320e-01 -2.70095412e-02 -1.33037761e-01 -1.98737830e-01
 -2.41225570e-01 -2.80349590e-02  1.84306264e-01  3.04313581e-02
  2.05818847e-01 -3.53929065e-02  1.41964495e-01 -6.76313937e-02
 -1.10725999e-01 -1.68497693e-02  8.94852653e-02  2.84274846e-01
  1.03190579e-01 -2.56582

In [15]:
# Sample natural-language queries used to sanity-check the retrieval results
test_queries = [
    "Is there a correlation between intelligence and genetics?",
    "Is there a correlation between spiritual intelligence and the success of addiction treatment with methadone?",
]

def queries_to_embeddings(queries, tokenizer, model):
    """Embed each query as the mean of the model's last hidden states.

    Every query is tokenized and encoded on its own (truncated to 512
    tokens), with gradient tracking disabled for the forward pass.

    Args:
        queries: Iterable of query strings.
        tokenizer: Callable that turns a string into model inputs when called
            with ``return_tensors="pt"``.
        model: Callable accepting those inputs as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.

    Returns:
        list: One numpy array per query, in input order.
    """
    def _embed(text):
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            hidden = model(**encoded).last_hidden_state
        # Average over the token axis, then drop the singleton batch axis.
        return hidden.mean(dim=1).squeeze().numpy()

    return [_embed(text) for text in queries]

# Embed the evaluation queries using the notebook's tokenizer and model
query_embeddings = queries_to_embeddings(
    queries=test_queries,
    tokenizer=tokenizer,
    model=model,
)


# Function to find most similar abstracts for a given query embedding
def find_most_similar(query_emb, abstract_embs, top_n=3):
    """Return the indices and cosine similarities of the closest abstracts.

    Args:
        query_emb: 1-D embedding vector of the query.
        abstract_embs: Sequence of abstract embedding vectors (one per row).
        top_n: Maximum number of matches to return. Defaults to 3.

    Returns:
        tuple: ``(top_indices, top_similarities)`` as numpy arrays ordered from
        most to least similar. ``top_similarities[i]`` is the score of the
        abstract at ``top_indices[i]``.
    """
    # cosine_similarity expects 2-D inputs, so wrap the query in a list and
    # take row 0 of the (1, n_abstracts) result.
    similarities = cosine_similarity([query_emb], abstract_embs)[0]
    top_indices = np.argsort(similarities)[::-1][:top_n]
    # Gather the scores through the selected indices instead of sorting the
    # array a second time: one sort, and indices/scores are paired by
    # construction.
    top_similarities = similarities[top_indices]
    return top_indices, top_similarities


# For every test query, print its closest abstracts with their similarity scores
for query, query_emb in zip(test_queries, query_embeddings):
    print(f"Query: {query}")
    top_indices, top_similarities = find_most_similar(query_emb, embeddings)
    print("Most similar abstracts:")
    for index, similarity in zip(top_indices, top_similarities):
        # Header line, abstract text, then blank lines as a visual separator;
        # index is a 0-based row position, shown 1-based in the header.
        print(
            f"Abstract {index + 1} (Similarity: {similarity:.2f}):",
            df['Abstract'].iloc[index],
            "\n",
            sep="\n",
        )

# Manual or semi-automatic evaluation of relevance
# This part depends on how you wish to evaluate relevance.


Query: Is there a correlation between intelligence and genetics?
Most similar abstracts:
Abstract 1191 (Similarity: 0.88):
I argue in this comment that Hunt's analysis of the intelligence of nations is smarter than it is wise. It is based on too narrow a conception of intelligence. It also conflates correlation and causation with regard to the relation between IQ and socially defined success, both individual and familial. I suggest that a better approach might be to compare nations as well for their creativity and wisdom.


Abstract 746 (Similarity: 0.88):
Several large-scale searches for genes that influence complex human traits, such as intelligence and personality, in the normal range of variation have failed to identify even one gene that makes a significant difference. All previously published claims for genetic influences of this kind now appear to have been false positives. For more serious psychiatric and medical disorders such as schizophrenia and autism, several genes have be

In [13]:
def count_long_abstracts(df, max_length=512):
    """Count the abstracts whose token count exceeds ``max_length``.

    Each entry of ``df['Abstract']`` is encoded with the notebook-level
    ``tokenizer`` (special tokens included) and its length is compared
    against the limit.

    Args:
        df: Table-like object with an ``'Abstract'`` column of texts.
        max_length: Token limit; only strictly longer abstracts are counted.

    Returns:
        int: Number of abstracts longer than ``max_length`` tokens.
    """
    token_counts = (
        len(tokenizer.encode(text, add_special_tokens=True))
        for text in df['Abstract']
    )
    return sum(1 for n_tokens in token_counts if n_tokens > max_length)

# Report how many abstracts exceed the 512-token limit
num_long_abstracts = count_long_abstracts(df, max_length=512)
print("Number of abstracts longer than 512 tokens:", num_long_abstracts)

def average_length_excess(df, max_length=512):
    """Average number of tokens by which over-long abstracts exceed ``max_length``.

    Each entry of ``df['Abstract']`` is encoded with the notebook-level
    ``tokenizer`` (special tokens included). Only abstracts strictly longer
    than ``max_length`` contribute to the average.

    Args:
        df: Table-like object with an ``'Abstract'`` column of texts.
        max_length: Token limit the lengths are measured against.

    Returns:
        Mean excess length over the over-long abstracts, or ``0`` when no
        abstract exceeds the limit.
    """
    overshoots = []
    for text in df['Abstract']:
        n_tokens = len(tokenizer.encode(text, add_special_tokens=True))
        if n_tokens > max_length:
            overshoots.append(n_tokens - max_length)

    if not overshoots:
        return 0
    return sum(overshoots) / len(overshoots)

# Report the mean overshoot, in tokens, among abstracts beyond the 512-token limit
average_excess_length = average_length_excess(df, max_length=512)
print("Average excess length of abstracts longer than 512 tokens:", average_excess_length)


Number of abstracts longer than 512 tokens: 65
Average excess length of abstracts longer than 512 tokens: 93.1076923076923


In [None]:
# # Initialize Pinecone Index
# index = Index("your-index-name")

# # Upload embeddings to Pinecone
# for i, emb in enumerate(embeddings):
#     index.upsert(vectors=[(str(i), emb)])

# # Don't forget to save your index's state if needed