In [1]:
!which python

/nfs/production/literature/santosh_tirunagari/transformers_env/bin/python


In [2]:
# Directory of the spaCy pipeline that is passed to spacy.load() in the cells below.
path_floret_model = '/nfs/production/literature/santosh_tirunagari/BACKUP/work/github/source_data/floret_embeddings/en_floret_model'

In [3]:
# ! python -m pip install floret 'spacy~=3.4.0' pandas --quiet

In [4]:
import spacy

# This is the spaCy pipeline with floret vectors; `nlp_fl` is the handle used
# by the similarity experiments in the cells that follow.
nlp_fl = spacy.load(path_floret_model)



In [7]:
# Sanity check: compare a vocabulary entry with itself.
word_1 = nlp_fl.vocab["sars-cov-2"]
word_2 = nlp_fl.vocab["sars-cov-2"]

# Last expression of the cell, so the similarity value is displayed.
word_1.similarity(word_2)

1.0

In [8]:
# Compare the short form "sars" against the full hyphenated name.
word_1 = nlp_fl.vocab["sars"]
word_2 = nlp_fl.vocab["sars-cov-2"]

# Last expression of the cell, so the similarity value is displayed.
word_1.similarity(word_2)

0.45980381965637207

In [9]:
# Run the whole pipeline on a space-separated phrase; `tokens` is the object
# returned by nlp_fl(...) and is iterated token by token below.
tokens = nlp_fl("sars cov 2")
    
# Phrase-level text, first three vector components and vector norm.
print(tokens.text, tokens.vector[:3], tokens.vector_norm) # Only the first three components of the vector 
    
# The same three fields for every individual token.
for token in tokens:
    print(token.text, token.vector[:3], token.vector_norm)

sars cov 2 [-3.5737886  2.3306377  4.0841546] 55.919779433640024
sars [ 2.3757248  -1.0899751   0.76489997] 42.306488
cov [-1.0599601  1.1186376  3.3393645] 57.941483
2 [-12.03713   6.96325   8.1482 ] 152.65103


In [10]:
import csv
import pickle
import numpy as np
from pronto import Ontology
import spacy
import faiss
from tqdm import tqdm
import warnings
import gc
import re
import pandas as pd

# NOTE(review): this ignores every warning category from here on, which can
# hide numerical warnings (e.g. from numpy) raised by later cells.
warnings.simplefilter("ignore")

# Load the spaCy model
# NOTE(review): same path as the `nlp_fl` pipeline loaded in an earlier cell,
# so the model is loaded a second time under the name `nlp`.
nlp = spacy.load(path_floret_model)


def create_quantized_index(embeddings_np, d, nlist=1000, m=30, nbits=8):
    """Create and train an IVF-PQ FAISS index.

    The index is only trained here; the caller still has to ``add`` the
    vectors it wants to be searchable.

    Args:
        embeddings_np: Training vectors handed to ``index.train``.
        d: Dimensionality of the vectors.
        nlist: Number of inverted lists passed to ``faiss.IndexIVFPQ``
            (default 1000, the value previously hard-coded).
        m: Number of product-quantizer sub-vectors (default 30); ``d`` must
            be a multiple of it.
        nbits: Bits per sub-vector code (default 8).

    Returns:
        The trained ``faiss.IndexIVFPQ`` instance.

    Raises:
        ValueError: If ``m`` is not positive or does not divide ``d``.
    """
    # Fail early with a readable message instead of an error from inside FAISS.
    if m <= 0 or d % m != 0:
        raise ValueError(
            f"Vector dimension d={d} must be a positive multiple of m={m} for product quantization."
        )

    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFPQ(quantizer, d, nlist, m, nbits)
    index.train(embeddings_np)
    return index

#
def get_average_embeddings_batched(terms):
    """Embed each term as the mean of its usable token vectors.

    Every term is run through the module-level ``nlp`` pipeline. A token is
    usable when it reports a vector, its vector norm is non-zero and the
    vector has 300 components. A term without any usable token is represented
    by a 300-dimensional zero vector.

    Returns a list holding one vector per input term, in input order.
    """
    def _is_usable(token):
        return token.has_vector and token.vector_norm != 0 and token.vector.shape[0] == 300

    averaged = []
    for doc in list(nlp.pipe(terms)):
        usable_vectors = [token.vector for token in doc if _is_usable(token)]
        if usable_vectors:
            averaged.append(np.mean(usable_vectors, axis=0))
        else:
            # Nothing to average: fall back to an all-zero placeholder.
            averaged.append(np.zeros((300,)))
    return averaged

In [11]:
# Filenames

# Base directory for every input/output file of this notebook. It already ends
# with "/", so the relative parts appended below must not start with one.
path__ = "/nfs/production/literature/santosh_tirunagari/BACKUP/"
# Source ontology export read with pandas.
INPUT_FILENAME = path__+"work/github/source_data/knowledge_base/bao/BAO.csv"
# Pickle holding the term -> ID mapping together with the indexed term list.
OUTPUT_PICKLE_FILENAME = path__+"work/github/CAPITAL/normalisation/dictionary/bao_1.pkl"
# Plain-text list of the indexed terms, one per line.
OUTPUT_LIST = path__+"work/github/CAPITAL/normalisation/dictionary/bao_1_list.txt"
# Serialized FAISS index.
FAISS_INDEX_FILENAME = path__+"work/github/CAPITAL/normalisation/dictionary/bao_1_terms.index"
# OUTPUT_INDEXED_TERMS_FILENAME = path__+"work/github/ML_annotations/normalisation/dictionary/bao_indexed_terms.pkl"


In [9]:
def process_column_content(s):
    """Normalise a raw cell value and split pipe-separated alternatives.

    The value is stripped of surrounding whitespace and lower-cased. When the
    result contains a ``|`` it is split on that character and every piece is
    stripped again, so ``"a | b"`` yields ``["a", "b"]`` rather than
    ``["a ", " b"]``.

    Args:
        s: The raw string taken from a CSV cell.

    Returns:
        A list of cleaned strings when ``s`` contains a pipe, otherwise the
        single cleaned string.
    """
    cleaned = s.strip().lower()

    if '|' not in cleaned:
        return cleaned

    # Strip each alternative individually: whitespace around the separator is
    # not part of the term.
    return [piece.strip() for piece in cleaned.split('|')]


# Read only the ID column and the four text columns that can contribute terms.
# NOTE(review): on_bad_lines='skip' drops malformed rows without reporting them.
df = pd.read_csv(INPUT_FILENAME, usecols=['Class ID', 'Preferred Label', 'Synonyms', 'Definitions', 'alternative term'], 
                 sep=',', engine='python', on_bad_lines='skip')


# Accumulators filled by the embedding cell further down:
# term -> Class ID, one embedding per kept term, and the kept terms in index order.
term_to_id = {}
embeddings = []  
indexed_terms = []

In [10]:
# Expand every non-null text cell into one (Class ID, term) pair per term.
flattened_data = []
for _, row in df.iterrows():
    term_id = row['Class ID']
    for col in ['Preferred Label', 'Synonyms', 'Definitions', 'alternative term']:
        term_names = row[col]
        if pd.isnull(term_names):
            # Missing cell: nothing to contribute for this column.
            continue
        processed_terms = process_column_content(term_names)
        # A single string and a list of alternatives are handled uniformly.
        if not isinstance(processed_terms, list):
            processed_terms = [processed_terms]
        flattened_data.extend((term_id, term) for term in processed_terms)

# One row per (Class ID, term) pair for easier manipulation downstream.
flattened_df = pd.DataFrame(flattened_data, columns=['Class ID', 'Term Name'])

In [11]:
# Display the flattened (Class ID, Term Name) table for a visual check.
flattened_df

Unnamed: 0,Class ID,Term Name
0,http://purl.obolibrary.org/obo/CHEBI_50444,adenosine phosphodiesterase inhibitor
1,http://purl.obolibrary.org/obo/CHEBI_131787,dopamine receptor d2 antagonist
2,http://purl.obolibrary.org/obo/CHEBI_131787,d2r antagonist
3,http://purl.obolibrary.org/obo/CHEBI_131787,d2 receptor antagonist
4,http://purl.obolibrary.org/obo/CHEBI_131789,runx1 inhibitor
...,...,...
33353,http://purl.obolibrary.org/obo/DOID_3953,adrenal cancer
33354,http://purl.obolibrary.org/obo/DOID_3953,tumor of the adrenal gland
33355,http://purl.obolibrary.org/obo/DOID_3953,malignant neoplasm of adrenal gland
33356,http://purl.obolibrary.org/obo/DOID_3953,malignant adrenal tumor


In [12]:
BATCH_SIZE = 500

# Pass 1: walk the flattened table once and keep every usable term with its ID.
kept_terms = []
kept_ids = []
for _, row in tqdm(flattened_df.iterrows(), total=flattened_df.shape[0], desc="Processing terms"):
    term_id = row['Class ID']
    term_name = process_column_content(row['Term Name'])

    # Empty and single-character terms carry no usable signal, so drop them.
    if not term_name or len(term_name) <= 1:
        continue

    kept_terms.append(term_name)
    kept_ids.append(term_id)

# Pass 2: cut the kept terms/IDs into consecutive chunks of at most BATCH_SIZE;
# the final chunk holds whatever is left over.
batch_starts = range(0, len(kept_terms), BATCH_SIZE)
term_batches = [kept_terms[start:start + BATCH_SIZE] for start in batch_starts]
id_batches = [kept_ids[start:start + BATCH_SIZE] for start in batch_starts]

# (Re)initialise the accumulators here so this cell can be re-run on its own:
# a later statement deletes `embeddings`, and appending to leftovers from a
# previous run would duplicate every term.
term_to_id = {}
embeddings = []
indexed_terms = []

for term_batch, id_batch in tqdm(zip(term_batches, id_batches), total=len(term_batches),
                                 desc="Generating Embeddings"):
    batch_embeddings = get_average_embeddings_batched(term_batch)

    for term, term_id, embedding in zip(term_batch, id_batch, batch_embeddings):
        norm = np.linalg.norm(embedding)

        # Report terms for which no token vector was available.
        if norm == 0:
            print(f"Term '{term}' with ID '{term_id}' has a zero vector.")

        # Scale to unit length; a zero vector cannot be scaled and is kept as is.
        normalized_embedding = embedding if norm == 0 else embedding / norm
        embeddings.append(normalized_embedding)
        # A term seen under several IDs keeps the ID of its last occurrence,
        # while `indexed_terms` keeps one entry per embedding (index order).
        term_to_id[term] = term_id
        indexed_terms.append(term)

    # Drop the per-batch objects before the next iteration to free memory.
    del term_batch, id_batch, batch_embeddings
    gc.collect()

# Vector dimensionality; matches the 300 used by get_average_embeddings_batched.
d = 300
# Stack all embeddings into one matrix and cast it to float32 for the index.
embeddings_np = np.array(embeddings).astype('float32')
# Train the quantized index on the full matrix, then add the same vectors so
# that row i of the index corresponds to indexed_terms[i].
index = create_quantized_index(embeddings_np, d)
index.add(embeddings_np)

# Free up memory after using embeddings_np
# NOTE(review): `embeddings` no longer exists after this line, so cells that
# append to it cannot be re-run without re-creating it first.
del embeddings, embeddings_np
gc.collect()

# Persist the trained and populated FAISS index.
print("Saving quantized faiss index...")
faiss.write_index(index, FAISS_INDEX_FILENAME)

# Persist both lookup structures together: the term -> ID mapping and the
# list of terms in the order their vectors were added to the index.
print("Saving term to ID mapping and indexed terms...")
with open(OUTPUT_PICKLE_FILENAME, "wb") as outfile:
    pickle.dump({"term_to_id": term_to_id, "indexed_terms": indexed_terms}, outfile)


# One unique term per line. The encoding is fixed to UTF-8 so that terms with
# non-ASCII characters do not depend on the locale's default encoding.
print("Writing terms to a txt file...")
with open(OUTPUT_LIST, "w", encoding="utf-8") as txt_file:
    txt_file.writelines(term + "\n" for term in term_to_id)

Processing terms: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 33358/33358 [00:01<00:00, 18510.69it/s]
Generating Embeddings: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 67/67 [00:41<00:00,  1.62it/s]


Saving quantized faiss index...
Saving term to ID mapping and indexed terms...
Writing terms to a txt file...


# TESTING

In [13]:
import numpy as np

def get_average_embedding(term):
    """Average the ``nlp_fl`` vocabulary vectors of a term's tokens.

    The term is split on whitespace; pieces that are not contained in
    ``nlp_fl.vocab`` are left out of the mean.
    """
    vocab = nlp_fl.vocab
    known_vectors = []
    for piece in term.split():
        if piece in vocab:
            known_vectors.append(vocab[piece].vector)
    # Element-wise mean across the collected vectors.
    return np.mean(known_vectors, axis=0)


# Each phrase is looked up in `nlp.vocab` as one string, i.e. it is not
# tokenised first. (There is no `word_3` in this cell.)
word_1 = nlp.vocab["cyclothymic disorder"]
word_2 = nlp.vocab["Cyclothymic personality"]
word_4 = nlp.vocab["Affective personality disorder"]


# Similarity of the first phrase to each of the other two, shown as a tuple.
word_1.similarity(word_2), word_1.similarity(word_4)

(0.5980432629585266, 0.605099081993103)

In [14]:
def get_average_embedding(term):
    """Average the ``nlp`` vocabulary vectors of a term's tokens.

    The term is split on whitespace; pieces that are not contained in
    ``nlp.vocab`` do not take part in the mean.
    """
    vocab = nlp.vocab
    collected = []
    for word in term.split():
        if word in vocab:
            collected.append(vocab[word].vector)
    return np.mean(collected, axis=0)

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector.
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``. When either vector has zero
        norm the similarity is undefined; 0.0 is returned instead of the NaN
        that a division by zero would produce.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

# Vectors 1 and 2: each phrase looked up in the vocabulary as a single string.
word_1_vec = nlp.vocab["cyclothymic disorder"].vector
word_2_vec = nlp.vocab["Cyclothymic personality"].vector
# Vector 3: mean of the per-word vocabulary vectors of the phrase.
word_3_vec = get_average_embedding("Affective personality disorder")
# For word_4, we get the vector of the entire phrase
word_4_doc = nlp("Affective personality disorder")
word_4_vec = word_4_doc.vector

# Compare phrase 1 against each of the three alternatives.
print(cosine_similarity(word_1_vec, word_2_vec))
print(cosine_similarity(word_1_vec, word_3_vec))
print(cosine_similarity(word_1_vec, word_4_vec))

0.5980433
0.69048494
0.67023414


In [58]:
# Check the dimensionality of a vocabulary vector.
word_2_vec.shape

(300,)

In [None]:
# Displacement of [3H]DTG from sigma 2 receptor in rat PC12 cells incubated for 120 mins in presence of [3H]-(+)-pentazocine by liquid scintillation counting method
# 15:34
# Displacement of GDP-BODIPY probe from BTN3A1 full intracellular domain (unknown origin) at 0.1 mM measured after 60 mins by fluorescence polarization assay
# 15:34
# Binding affinity to recombinant human carbonic anhydrase 13 expressed in Escherichia coli expression system assessed as kinetic gibbs free energy change by ITC method
# 15:34
# Binding affinity to MDM2 in human U87MG cells assessed as inhibition of MDM2/p53 protein interaction after 10 mins by quantitative sandwich immuno assay


# Santosh Tirunagari
#   16:01
# http://hl-codon-49-04.ebi.ac.uk:8888/notebooks/notebooks/normalisation%20analysis/BAO.ipynb


# Ines Smit
#   16:09
# TR-FRET assay
# 16:09
# LC-MS analysis
# 16:10
# thermal shift assay
# 16:10
# radioligand competition binding assay
# 16:10
# Kinomescan method

In [12]:
import faiss
import pickle
import spacy
import numpy as np
from fuzzywuzzy import fuzz

# Load spaCy model
# NOTE(review): this loads the same pipeline path that an earlier cell already
# loaded into `nlp`; the reload only matters if this cell runs in a fresh kernel.
nlp = spacy.load(path_floret_model)


def get_average_embeddings_batched(terms):
    """Return one averaged embedding per term.

    Terms are processed with the module-level ``nlp`` pipeline. Only tokens
    that report a vector, have a non-zero vector norm and a 300-component
    vector contribute to the average; a term with no such token gets a
    300-dimensional zero vector. The result is a list in input order.
    """
    results = []
    for doc in list(nlp.pipe(terms)):
        kept = []
        for token in doc:
            # Skip tokens without a usable 300-dimensional vector.
            if not token.has_vector:
                continue
            if token.vector_norm == 0:
                continue
            if token.vector.shape[0] != 300:
                continue
            kept.append(token.vector)

        results.append(np.mean(kept, axis=0) if len(kept) != 0 else np.zeros((300,)))
    return results

# Load the term to ID mapping and indexed terms
# NOTE(review): unpickling can execute arbitrary code, so only load a file
# that this notebook wrote itself.
with open(OUTPUT_PICKLE_FILENAME, "rb") as infile:
    data = pickle.load(infile)
    # Dictionary keys match what the saving cell stored.
    term_to_id = data["term_to_id"]
    indexed_terms = data["indexed_terms"]

# Load the FAISS index
index = faiss.read_index(FAISS_INDEX_FILENAME)


def retrieve_similar_terms(query, k=5):
    """Retrieve the top k indexed terms most similar to a query.

    The query is lower-cased (the indexed terms were lower-cased when the
    index was built), embedded with ``get_average_embeddings_batched``,
    scaled to unit length and searched in the module-level FAISS ``index``.

    Args:
        query: Free-text query string.
        k: Number of neighbours requested from the index.

    Returns:
        A list of ``(term, term_id, score)`` tuples in the order returned by
        the index, where ``score`` is the value FAISS reported for that
        neighbour. The list is shorter than ``k`` when the index returns
        fewer than ``k`` valid neighbours.
    """
    query = query.lower()

    # The batched helper returns a list with one vector per input term, so
    # take the single vector out explicitly instead of relying on implicit
    # list-to-array coercion (which broke for zero-norm queries).
    query_embedding = np.asarray(get_average_embeddings_batched([query])[0])

    norm = np.linalg.norm(query_embedding)
    if norm != 0:
        query_embedding = query_embedding / norm
    query_embedding = query_embedding.reshape(1, -1).astype('float32')

    # Search the index
    D, I = index.search(query_embedding, k)

    similar_terms = []
    for position, score in zip(I[0], D[0]):
        # FAISS pads missing neighbours with -1; using that as a Python index
        # would silently return the last indexed term.
        if position < 0:
            continue
        term = indexed_terms[position]
        similar_terms.append((term, term_to_id[term], score))

    return similar_terms


def retrieve_similar_terms_with_fuzzy(query, k):
    """Retrieve k terms similar to the query."""
    query = query
    
    # Get average embedding of the query
    query_embedding = get_average_embeddings_batched([query])
    
    norm = np.linalg.norm(query_embedding)
    query_embedding = query_embedding if norm == 0 else query_embedding / norm
    query_embedding = query_embedding.reshape(1, -1).astype('float32')

    # Search the index
    D, I = index.search(query_embedding, k)
    
    # Retrieve the terms from the indexed_terms list
    candidate_terms = [indexed_terms[i] for i in I[0]]
    
    # Get fuzzy matching scores for these terms
    scores = [fuzz.ratio(query, term) for term in candidate_terms]
    
    # Pair up terms with their scores
    term_score_pairs = list(zip(candidate_terms, scores))
    
    # Rank these pairs based on scores
    ranked_term_score_pairs = sorted(term_score_pairs, key=lambda x: x[1], reverse=True)
    
    return ranked_term_score_pairs[:k]


In [13]:
# Example usage: fetch the five nearest indexed terms for one query.
query = "TR-FRET"# "nucleosome"
results = retrieve_similar_terms(query, 5)

# `score` is the value returned by index.search for each neighbour.
for term, term_id, score in results:
    print(f"Term: {term}, ID: {term_id}, Score: {score}")

Term: tr-fret, ID: http://www.bioassayontology.org/bao#BAO_0000004, Score: 0.140354186296463
Term: presto-tango, ID: http://www.bioassayontology.org/bao#BAO_0010079, Score: 0.34693869948387146
Term: bronsted-base, ID: http://purl.obolibrary.org/obo/CHEBI_39142, Score: 0.4002431631088257
Term: crispr-cas9, ID: http://www.bioassayontology.org/bao#BAO_0010249, Score: 0.40232881903648376
Term: non-linear qsar, ID: http://www.bioassayontology.org/bao#BAO_0002309, Score: 0.4076499938964844


In [14]:
# Reuses `query` from the previous cell; here `score` is the fuzz.ratio value
# used for re-ranking, not the index distance.
results = retrieve_similar_terms_with_fuzzy(query, 10)
for term, score in results:
    print(f"Term: {term}, Score: {score}")

Term: hep-g2, Score: 15
Term: oci-ly3, Score: 14
Term: acp-tag, Score: 14
Term: cyclin-b1, Score: 12
Term: arrestin-gfp, Score: 11
Term: ubiquitin-rho, Score: 10
Term: htrf kinease-tk, Score: 9
Term: g2/mitotic-specific cyclin-b1, Score: 6
Term: cbf-his/runx1-biotin protein complex, Score: 5
Term: an endocrine gland cancer located_in the adrenal glands which are located above the kidneys., Score: 0
