In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

MODEL_NAME='FremyCompany/BioLORD-2023'

import torch

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
sentences = ['T2D', 'diabetes']


# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) # ('tavakolih/all-MiniLM-L6-v2-pubmed-full')
model = AutoModel.from_pretrained(MODEL_NAME)

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)

  from .autonotebook import tqdm as notebook_tqdm


Sentence embeddings:
tensor([[ 0.0352, -0.0610,  0.0081,  ..., -0.0234, -0.0204, -0.0256],
        [ 0.0725, -0.0013,  0.0417,  ...,  0.0076,  0.0047, -0.0276]])


In [2]:
sentence_embeddings.shape[1]

768

In [3]:
euclidean_distance = F.pairwise_distance(sentence_embeddings[0].unsqueeze(0), sentence_embeddings[1].unsqueeze(0))

# Compute cosine similarity between the two sentence embeddings
cosine_similarity = F.cosine_similarity(sentence_embeddings[0].unsqueeze(0), sentence_embeddings[1].unsqueeze(0))

print("\nEuclidean Distance:", euclidean_distance.item())
print("Cosine Similarity:", cosine_similarity.item())


Euclidean Distance: 1.147909164428711
Cosine Similarity: 0.3411523103713989


In [4]:
import csv
import pickle
import numpy as np
from pronto import Ontology
import spacy
import faiss
from tqdm import tqdm
import warnings
import gc
import re
import pandas

warnings.simplefilter("ignore")



def create_quantized_index(embeddings_np, d, nlist):
    """Create a trained IVFPQ index."""
    m = 32
    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)
    index.train(embeddings_np)
    return index


#
def get_average_embeddings_batched(terms):
    """Return average embeddings for terms."""
    docs = list(nlp.pipe(terms))
    embeddings = []

    for doc in docs:
        # Filtering out tokens without vectors or with unexpected vector sizes
        valid_vectors = [token.vector for token in doc if token.has_vector and token.vector_norm != 0 and token.vector.shape[0] == 300]

        # If no valid vectors, append a zero vector
        if len(valid_vectors) == 0:
            embeddings.append(np.zeros((300,)))
        else:
            average_embedding = np.mean(valid_vectors, axis=0)
            embeddings.append(average_embedding)

    return embeddings


def get_average_embeddings_batched_transformers(sentences, model_name=MODEL_NAME):
    """Return average embeddings for sentences using a Transformers model."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling (mean pooling function is used here)
    def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output[0]  # First element of model_output contains token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    return sentence_embeddings


In [5]:
import pandas as pd

In [6]:
# Filenames

path__ = "/nfs/production/literature/santosh_tirunagari/BACKUP/"
INPUT_FILENAME = path__+"work/github/source_data/knowledge_base/bao/BAO.csv"
OUTPUT_PICKLE_FILENAME = path__+"/work/github/CAPITAL/normalisation/dictionary/bao.pkl"
OUTPUT_LIST = path__+"work/github/CAPITAL/normalisation/dictionary/bao_list.txt"
FAISS_INDEX_FILENAME = path__+"work/github/CAPITAL/normalisation/dictionary/bao_terms.index"
# OUTPUT_INDEXED_TERMS_FILENAME = path__+"work/github/ML_annotations/normalisation/dictionary/bao_indexed_terms.pkl"


In [7]:
def process_column_content(s):
    """Clean and strip unwanted characters and split by pipe if present."""
    # First, clean the string by removing specific patterns
    cleaned = s.strip().lower()
    
    # Check if the cleaned string contains a pipe symbol and split if it does
    if '|' in cleaned:
        return cleaned.split('|')
    else:
        return cleaned


df = pd.read_csv(INPUT_FILENAME, usecols=['Class ID', 'Preferred Label', 'Synonyms', 'Definitions', 'alternative term'], 
                 sep=',', engine='python', on_bad_lines='skip')


term_to_id = {}
embeddings = []  
indexed_terms = []

In [8]:
flattened_data = []
for _, row in df.iterrows():
    term_id = row['Class ID']
    for col in ['Preferred Label', 'Synonyms', 'Definitions', 'alternative term']:
        term_names = row[col]
        if pd.notnull(term_names):  # Check if the term_name is not NaN
            processed_terms = process_column_content(term_names)
            if isinstance(processed_terms, list):
                for term in processed_terms:
                    flattened_data.append((term_id, term))
            else:
                flattened_data.append((term_id, processed_terms))

# Convert flattened data to a DataFrame for easier manipulation
flattened_df = pd.DataFrame(flattened_data, columns=['Class ID', 'Term Name'])

In [9]:
flattened_df

Unnamed: 0,Class ID,Term Name
0,http://purl.obolibrary.org/obo/CHEBI_50444,adenosine phosphodiesterase inhibitor
1,http://purl.obolibrary.org/obo/CHEBI_131787,dopamine receptor d2 antagonist
2,http://purl.obolibrary.org/obo/CHEBI_131787,d2r antagonist
3,http://purl.obolibrary.org/obo/CHEBI_131787,d2 receptor antagonist
4,http://purl.obolibrary.org/obo/CHEBI_131789,runx1 inhibitor
...,...,...
33353,http://purl.obolibrary.org/obo/DOID_3953,adrenal cancer
33354,http://purl.obolibrary.org/obo/DOID_3953,tumor of the adrenal gland
33355,http://purl.obolibrary.org/obo/DOID_3953,malignant neoplasm of adrenal gland
33356,http://purl.obolibrary.org/obo/DOID_3953,malignant adrenal tumor


In [10]:
flattened_df = flattened_df[0:1000]
flattened_df

Unnamed: 0,Class ID,Term Name
0,http://purl.obolibrary.org/obo/CHEBI_50444,adenosine phosphodiesterase inhibitor
1,http://purl.obolibrary.org/obo/CHEBI_131787,dopamine receptor d2 antagonist
2,http://purl.obolibrary.org/obo/CHEBI_131787,d2r antagonist
3,http://purl.obolibrary.org/obo/CHEBI_131787,d2 receptor antagonist
4,http://purl.obolibrary.org/obo/CHEBI_131789,runx1 inhibitor
...,...,...
995,http://purl.obolibrary.org/obo/CHEBI_194423,aquaretics
996,http://purl.obolibrary.org/obo/CHEBI_194423,aquaretic agents
997,http://purl.obolibrary.org/obo/CHEBI_194423,aquaretic agent
998,http://purl.obolibrary.org/obo/CHEBI_77703,ec 4.3.1.3 (histidine ammonia-lyase) inhibitor


In [None]:
embeddings = []
term_to_id = {}
indexed_terms = []

BATCH_SIZE = 100
NLIST = 10
term_batches = []
id_batches = []
current_batch_terms = []
current_batch_ids = []

for _, row in tqdm(flattened_df.iterrows(), total=flattened_df.shape[0], desc="Processing terms"):
    term_id = row['Class ID']
    term_name = row['Term Name']

    # Process the term_name
    term_name = process_column_content(term_name)

    # Check for empty or single character terms and skip them
    if not term_name or len(term_name) <= 1:
        continue

    current_batch_terms.append(term_name)
    current_batch_ids.append(term_id)

    if len(current_batch_terms) == BATCH_SIZE:
        term_batches.append(current_batch_terms)
        id_batches.append(current_batch_ids)
        current_batch_terms = []
        current_batch_ids = []

# Catch any remaining terms not added to a batch
if current_batch_terms:
    term_batches.append(current_batch_terms)
    id_batches.append(current_batch_ids)

for term_batch, id_batch in tqdm(zip(term_batches, id_batches), total=len(term_batches), desc="Generating Embeddings"):
    batch_embeddings = get_average_embeddings_batched_transformers(term_batch)
    
    for term, term_id, embedding in zip(term_batch, id_batch, batch_embeddings):
        norm = np.linalg.norm(embedding)

        # Check if the embedding is a zero vector
        if norm == 0:
            print(f"Term '{term}' with ID '{term_id}' has a zero vector.")

        # Normalizing the vector
        normalized_embedding = embedding if norm == 0 else embedding / norm
        embeddings.append(normalized_embedding)
        term_to_id[term] = term_id
        indexed_terms.append(term)

        # Clear out the current batch to free up memory
    del term_batch, id_batch, batch_embeddings
    gc.collect()

# Assuming we have already calculated sentence_embeddings somewhere in the script
d = embeddings[0].shape[0] if embeddings else 0  # Dynamically get the dimension
embeddings_np = np.array(embeddings).astype('float32')
index = create_quantized_index(embeddings_np, d, nlist=NLIST)
index.add(embeddings_np)

# Free up memory after using embeddings_np
del embeddings, embeddings_np
gc.collect()

print("Saving quantized faiss index...")
faiss.write_index(index, FAISS_INDEX_FILENAME)

# print("Saving term to ID mapping...")
# with open(OUTPUT_PICKLE_FILENAME, "wb") as outfile:
#     pickle.dump(term_to_id, outfile)

print("Saving term to ID mapping and indexed terms...")
with open(OUTPUT_PICKLE_FILENAME, "wb") as outfile:
    pickle.dump({"term_to_id": term_to_id, "indexed_terms": indexed_terms}, outfile)


print("Writing terms to a txt file...")
with open(OUTPUT_LIST, "w") as txt_file:
    for term in term_to_id.keys():
        txt_file.write(term + "\n")

Processing terms: 100%|██████████████████| 1000/1000 [00:00<00:00, 18422.57it/s]
Generating Embeddings: 100%|████████████████████| 10/10 [01:32<00:00,  9.28s/it]


In [9]:
# # Filenames

# path__ = "/nfs/production/literature/santosh_tirunagari/BACKUP/"
# OUTPUT_PICKLE_FILENAME = path__+"/work/github/source_data/dictionary/bao.pkl"
# OUTPUT_LIST = path__+"work/github/CAPITAL/normalisation/dictionary/bao_list.txt"
# FAISS_INDEX_FILENAME = path__+"work/github/CAPITAL/normalisation/dictionary/bao_terms.index"
# # OUTPUT_INDEXED_TERMS_FILENAME = path__+"work/github/ML_annotations/normalisation/dictionary/bao_indexed_terms.pkl"


In [10]:
import faiss
import pickle
import spacy
import numpy as np
from fuzzywuzzy import fuzz

# Load the term to ID mapping and indexed terms
with open(OUTPUT_PICKLE_FILENAME, "rb") as infile:
    data = pickle.load(infile)
    term_to_id = data["term_to_id"]
    indexed_terms = data["indexed_terms"]

# Load the FAISS index
index = faiss.read_index(FAISS_INDEX_FILENAME)


def retrieve_similar_terms(query, k=5):
    """Retrieve top k similar terms given a query."""
    query = query.lower()  # Convert query to lowercase
    query_embedding = get_average_embeddings_batched_transformers([query])[0].numpy()  # Get average embedding of the query

    # Normalize the query embedding
    norm = np.linalg.norm(query_embedding)
    query_embedding = query_embedding if norm == 0 else query_embedding / norm
    query_embedding = query_embedding.reshape(1, -1).astype('float32')

    # Search the index
    D, I = index.search(query_embedding, k)

    similar_terms = []
    for i in range(k):
        term = indexed_terms[I[0][i]]
        score = D[0][i]
        term_id = term_to_id[term]
        similar_terms.append((term, term_id, score))

    return similar_terms

def retrieve_similar_terms_with_fuzzy(query, k):
    """Retrieve k terms similar to the query."""
    query_embedding = get_average_embeddings_batched_transformers([query])[0].numpy()  # Get average embedding of the query

    # Normalize the query embedding
    norm = np.linalg.norm(query_embedding)
    query_embedding = query_embedding if norm == 0 else query_embedding / norm
    query_embedding = query_embedding.reshape(1, -1).astype('float32')

    # Search the index
    D, I = index.search(query_embedding, k)

    # Retrieve the terms from the indexed_terms list
    candidate_terms = [indexed_terms[i] for i in I[0]]

    # Get fuzzy matching scores for these terms
    scores = [fuzz.ratio(query, term) for term in candidate_terms]

    # Pair up terms with their scores
    term_score_pairs = list(zip(candidate_terms, scores))

    # Rank these pairs based on scores
    ranked_term_score_pairs = sorted(term_score_pairs, key=lambda x: x[1], reverse=True)

    return ranked_term_score_pairs[:k]


FileNotFoundError: [Errno 2] No such file or directory: '/nfs/production/literature/santosh_tirunagari/BACKUP//work/github/source_data/dictionaries/bao.pkl'

In [64]:
# Example usage
query = "TR-FRET"# "nucleosome"
results = retrieve_similar_terms(query, 5)

for term, term_id, score in results:
    print(f"Term: {term}, ID: {term_id}, Score: {score}")

Term: tr-fret, ID: http://www.bioassayontology.org/bao#BAO_0000004, Score: 0.3012159466743469
Term: cret, ID: http://www.bioassayontology.org/bao#BAO_0000462, Score: 0.7369926571846008
Term: trupath, ID: http://www.bioassayontology.org/bao#BAO_0010081, Score: 0.7620600461959839
Term: trna, ID: http://www.bioassayontology.org/bao#BAO_0000276, Score: 0.7682545781135559
Term: thale-cress, ID: http://purl.obolibrary.org/obo/NCBITaxon_3702, Score: 0.784970223903656


In [65]:
results = retrieve_similar_terms_with_fuzzy(query, 10)
for term, score in results:
    print(f"Term: {term}, Score: {score}")

Term: tr-fret, Score: 14
Term: cin-quin, Score: 13
Term: thale-cress, Score: 11
Term: cret, Score: 0
Term: trupath, Score: 0
Term: trna, Score: 0
Term: tcep, Score: 0
Term: thale cress, Score: 0
Term: gullet, Score: 0
Term: acumen, Score: 0
