In [9]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pprint

In [12]:
# Read the two CSV files
df_part1 = pd.read_csv('../data/processed_data_part1.csv')
df_part2 = pd.read_csv('../data/processed_data_part2.csv')

# Concatenate the two DataFrames
df = pd.concat([df_part1, df_part2], ignore_index=True)

# Read in abstracts
abstracts = df['Abstract'].tolist()
pprint.pprint(abstracts[:3])
#print(abstracts[:3])


['SUMMARY Several lines of evidence support the involvement of inflammatory '
 'and immunologic abnormalities in chronic fatigue syndrome CFS Since recent '
 'studies have shown that α1 antitrypsin AAT possesses antiinflammatory '
 'properties the potential therapeutic effect of AAT treatment on CFS has been '
 'investigated A 49yearold woman diagnosed with CFS was treated with '
 'intravenous infusions of a human plasmaderived AAT concentrate 60 mgkg body '
 'weight weekly for 8 consecutive weeks The patients monocyte elastase a '
 'regulator of inflammatory processes was 1170 Umg At completion of treatment '
 'improvement in maximal workload was observed 540717 of predicted '
 'Additionally amelioration in working memory scores 8394 and perceptual '
 'organization scores 7583 were detected on the Wechsler Adult Intelligence '
 'ScaleIII test Monocyte elastase decreased to a normal range 150 Umg '
 'Improvement in functional capacity allowed the patient to work in parttime '
 'employm

In [13]:
# Check for GPU availability and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load BioBERT
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.2")
model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.2").to(device)


In [14]:
# Function to convert abstract to embedding
def abstract_to_embedding(abstract):
    inputs = tokenizer(abstract, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Convert abstracts to embeddings
embeddings = [abstract_to_embedding(abstract) for abstract in abstracts]
print(embeddings[0])


ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
# Define some test queries for evaluation
test_queries = ["Is there a correlation between intelligence and genetics?", 
                "Is there a correlation between spiritual intelligence and the success of addiction treatment with methadone?"]

def queries_to_embeddings(queries, tokenizer, model):
    embeddings = []
    for query in queries:
        inputs = tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
        embeddings.append(embedding)
    return embeddings

# Convert queries to embeddings
query_embeddings = queries_to_embeddings(test_queries, tokenizer, model)


# Function to find most similar abstracts for a given query embedding
def find_most_similar(query_emb, abstract_embs, top_n=3):
    similarities = cosine_similarity([query_emb], abstract_embs)[0]
    top_indices = np.argsort(similarities)[::-1][:top_n]
    top_similarities = np.sort(similarities)[::-1][:top_n]
    return top_indices, top_similarities


# Evaluate each query
for query, query_emb in zip(test_queries, query_embeddings):
    print(f"Query: {query}")
    top_indices, top_similarities = find_most_similar(query_emb, embeddings)
    print("Most similar abstracts:")
    for index, similarity in zip(top_indices, top_similarities):
        print(f"Abstract {index + 1} (Similarity: {similarity:.2f}):")
        print(df.iloc[index]['Abstract'])
        print("\n")



Query: Is there a correlation between intelligence and genetics?
Most similar abstracts:
Abstract 8219 (Similarity: 0.89):
Does general intelligence exist across species, and has it been a target of natural selection? These questions can be addressed with genomic data, which can rule out artifacts by demonstrating that distinct cognitive abilities are genetically correlated and thus share a biological substrate. This work has begun with data from humans and can be extended to other species; it should focus not only on general intelligence but also specific capacities like language and spatial ability.


Abstract 8205 (Similarity: 0.89):
Burkart et al. consider that the relationship between general intelligence and socio-cognitive abilities is poorly understood in animals and humans. We examine this conclusion in the perspective of an already substantial evidence base on the relationship among general intelligence, theory of mind, and emotional intelligence. We propose a link between ge

In [None]:
def count_long_abstracts(df, max_length=512):
    long_abstract_count = 0
    for abstract in df['Abstract']:
        tokens = tokenizer.encode(abstract, add_special_tokens=True)
        if len(tokens) > max_length:
            long_abstract_count += 1
    return long_abstract_count

# Count how many abstracts are longer than 512 tokens
num_long_abstracts = count_long_abstracts(df)
print("Number of abstracts longer than 512 tokens:", num_long_abstracts)

def average_length_excess(df, max_length=512):
    total_excess_length = 0
    long_abstract_count = 0
    for abstract in df['Abstract']:
        tokens = tokenizer.encode(abstract, add_special_tokens=True)
        if len(tokens) > max_length:
            long_abstract_count += 1
            total_excess_length += len(tokens) - max_length
    return total_excess_length / long_abstract_count if long_abstract_count > 0 else 0

# Calculate the average length by which abstracts exceed 512 tokens
average_excess_length = average_length_excess(df)
print("Average excess length of abstracts longer than 512 tokens:", average_excess_length)


Number of abstracts longer than 512 tokens: 1174
Average excess length of abstracts longer than 512 tokens: 128.27683134582622


In [None]:
np.save('embeddings.npy', embeddings)