In [30]:
import json
import pandas as pd

# Replace 'sample.jsonl' with your actual file path
file_path = 'data/patch-27-10-2024-0.api.json'
extracted_data = []

# Manually reading each line as JSON since jsonlines isn't available
# Mapping for type abbreviations
type_map = {
    "go_term": "GO",
    "exp_methods": "EM",
    "gene_protein": "GP",
    "disease": "DS",
    "chemical": "CD",
    "organism": "OG",
    "gene expression": 'GO',
    "methods":'EM'
}

# Open file with 'latin-1' encoding
with open(file_path, 'r', encoding='latin-1') as file:
    for line in file:
        obj = json.loads(line)
        for ann in obj.get('anns', []):
            exact_text = ann['exact']
            uri = ann['tags'][0]['uri']  # Assumes each tag has a single URI
            link = ann['tags'][0]['uri'].split('/')[-1]
            ann_type = type_map.get(ann['type'], "Unknown")  # Map the type or use "Unknown" if not in type_map
            # ann_type = ann['type']
            extracted_data.append({"exact": exact_text, "ground": link, "uri": uri, "type": ann_type})



FileNotFoundError: [Errno 2] No such file or directory: 'data/patch-28-10-2024-0.api.json'

In [28]:
# Convert to DataFrame for easy viewing and manipulation
extracted_df = pd.DataFrame(extracted_data)
print(extracted_df.head())  # Display first few rows


Empty DataFrame
Columns: []
Index: []


In [3]:
extracted_df['type'].unique()

array(['GP', 'DS', 'EM', 'CD', 'OG', 'Unknown', 'GO'], dtype=object)

In [4]:
import faiss
import pickle
import spacy
import numpy as np
from fuzzywuzzy import fuzz, process
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Load spaCy model
nlp = spacy.load("/home/stirunag/work/github/CAPITAL/normalisation/en_floret_model")

# Mapping of file names to each annotation type
file_mapping = {
    'CD': ('chebi_terms.index', 'chebi_terms.pkl'),
    'OG': ('NCBI_terms.index', 'NCBI_terms.pkl'),
    'DS': ('umls_terms.index', 'umls_terms.pkl'),
    'GP': ('uniprot_terms.index', 'uniprot_terms.pkl'),
    'GO': ('go_terms.index', 'go_terms.pkl'),
    'EM': ('em_terms.index', 'em_terms.pkl')
}

# Load data and indices for each annotation type
base_path = "/home/stirunag/work/github/CAPITAL/normalisation/dictionary/"
loaded_data = {}
for annotation_type, (index_file, pkl_file) in file_mapping.items():
    with open(base_path + pkl_file, "rb") as infile:
        data = pickle.load(infile)
    index = faiss.read_index(base_path + index_file)
    
    # Create a reverse mapping from CUI to term
    id_to_term = {v: k for k, v in data["term_to_id"].items()}
    
    # Store loaded data
    loaded_data[annotation_type] = {
        "term_to_id": data["term_to_id"],
        "indexed_terms": data["indexed_terms"],
        "index": index,
        "id_to_term": id_to_term
    }
    print(f"Loaded data for {annotation_type}")


# List of phrases to remove (converted to lowercase)
phrases_to_remove = [
    '--', 'physical finding', 'diagnosis', 'disorder', 'procedure', 'finding',
    'symptom', 'history', 'treatment', 'manifestation', 'disease', 'finding',
    'morphologic abnormality', 'etiology', 'observable entity', 'event',
    'situation', 'degrees', 'in some patients', 'cm', 'mm',
    '#', 'rare', 'degree', 'including anastomotic', 'navigational concept',
    '1 patient', 'qualifier value', 'lab test', 'unintentional',
    'tophi', 'nos', 'msec', 'reni', 'less common', 'as symptom'
]

# Function to clean term
def clean_term(term):
    # Convert term to lowercase for consistent comparison
    term_lower = term.lower()
    
    # Remove specified phrases
    for phrase in phrases_to_remove:
        term_lower = re.sub(rf'\b{re.escape(phrase)}\b', '', term_lower, flags=re.IGNORECASE)
    
    # Remove punctuation
    term_cleaned = term_lower.translate(str.maketrans('', '', string.punctuation))
    
    # Remove extra whitespace
    term_cleaned = ' '.join(term_cleaned.split())
    
    return term_cleaned

# Function to clean term
def clean_term_EM(term):
    # Convert term to lowercase for consistent comparison
    if term.endswith("es"):
        return term[:-2]
    elif term.endswith("s"):
        return term[:-1]
    return term
    
    return term_cleaned
    

# Matching functions
def get_exact_match(term, term_dict):
    return term_dict.get(term)

def get_fuzzy_match(term, term_dict, threshold=70):
    result = process.extractOne(term, term_dict.keys(), scorer=fuzz.ratio)
    if result:
        match, score = result[0], result[1]
        if score >= threshold:
            return term_dict[match]
    return None

def is_flat_index(index):
    return isinstance(index, faiss.IndexFlat)

def get_embedding_match(term, index, indexed_terms, term_dict, model, threshold=0.7):
    term_vector = model(term).vector.reshape(1, -1).astype('float32')
    faiss.normalize_L2(term_vector)
    
    # Perform search on the FAISS index
    _, I = index.search(term_vector, 1)
    
    if I[0][0] != -1:
        matched_term = indexed_terms[I[0][0]]
        similarity = cosine_similarity(term_vector, model(matched_term).vector.reshape(1, -1))[0][0]
        if similarity >= threshold:
            return term_dict.get(matched_term, "No Match")
    return None

def map_terms(entities, annotation_type, model):
    """Map new entities using exact, fuzzy, and embedding matches, with abbreviation fallback."""
    data = loaded_data[annotation_type]
    term_dict = data["term_to_id"]
    indexed_terms = data["indexed_terms"]
    index = data["index"]

    mapped_entities = {}
    for entity in entities:
        match = get_exact_match(entity, term_dict)
        if not match:
            if annotation_type =='DS':
                match = get_embedding_match(clean_term(entity.lower()), index, indexed_terms, term_dict, model)
            elif annotation_type =='EM':
                match = get_embedding_match(clean_term_EM(entity.lower()), index, indexed_terms, term_dict, model)
            else:
                match = get_embedding_match(entity.lower(), index, indexed_terms, term_dict, model)
        mapped_entities[entity] = match if match else "No Match"
    return mapped_entities
    
def map_terms_reverse(entities, annotation_type, model):
    """Map entities using exact, similarity, and embedding matches, returning both code and term."""
    data = loaded_data[annotation_type]
    term_dict = data["term_to_id"]
    id_to_term = data["id_to_term"]
    indexed_terms = data["indexed_terms"]
    index = data["index"]

    mapped_entities = {}
    for entity in entities:
        # Normalize entity based on annotation type requirements
        normalized_entity = entity.lower() if annotation_type != 'GP' else entity
        
        # Try exact match
        match = get_exact_match(normalized_entity, term_dict)
        
        # If exact match fails, try similarity or embedding matching
        if not match:
            if annotation_type =='DS':
                match = get_embedding_match(clean_term(entity.lower()), index, indexed_terms, term_dict, model)
            elif annotation_type =='EM':
                match = get_embedding_match(clean_term_EM(entity.lower()), index, indexed_terms, term_dict, model)
            else:
                match = get_embedding_match(entity.lower(), index, indexed_terms, term_dict, model)
        
        # Set grounded_code and grounded_term based on match
        if match:
            grounded_code = match
            grounded_term = id_to_term.get(match, "Unknown Term")  # Retrieve term from id_to_term or use default
        else:
            grounded_code, grounded_term = "No Match", "No Match"
        
        # Ensure both code and term are consistently stored in the result
        mapped_entities[entity] = (grounded_code, grounded_term)
    
    return mapped_entities



  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Loaded data for CD
Loaded data for OG
Loaded data for DS
Loaded data for GP
Loaded data for GO
Loaded data for EM


In [5]:
extracted_df = extracted_df[(extracted_df["type"] != "Unknown")]
extracted_df

Unnamed: 0,exact,ground,uri,type
0,SR1,Q9LZW4,http://purl.uniprot.org/uniprot/Q9LZW4,GP
1,HRAS,P01115,http://purl.uniprot.org/uniprot/P01115,GP
2,Spitz Nevus,C0206739,http://linkedlifedata.com/resource/umls-concep...,DS
3,Spitz nevus,C0206739,http://linkedlifedata.com/resource/umls-concep...,DS
4,Spitz nevus,C0206739,http://linkedlifedata.com/resource/umls-concep...,DS
...,...,...,...,...
264542,fibrosarcoma,C0016057,http://linkedlifedata.com/resource/umls-concep...,DS
264543,translocation,EFO_0004024,http://www.ebi.ac.uk/efo/EFO_0004024,EM
264544,tumors,C0027651,http://linkedlifedata.com/resource/umls-concep...,DS
264545,MUC4,Q99102,http://purl.uniprot.org/uniprot/Q99102,GP


In [6]:
extracted_df = extracted_df.drop_duplicates(subset="exact", keep="first")
extracted_df

Unnamed: 0,exact,ground,uri,type
0,SR1,Q9LZW4,http://purl.uniprot.org/uniprot/Q9LZW4,GP
1,HRAS,P01115,http://purl.uniprot.org/uniprot/P01115,GP
2,Spitz Nevus,C0206739,http://linkedlifedata.com/resource/umls-concep...,DS
3,Spitz nevus,C0206739,http://linkedlifedata.com/resource/umls-concep...,DS
5,Spitzoid melanoma,C3495721,http://linkedlifedata.com/resource/umls-concep...,DS
...,...,...,...,...
264530,deep fibromatosis,C0079218,http://linkedlifedata.com/resource/umls-concep...,DS
264531,Desmoplastic fibroma of bone,C0206645,http://linkedlifedata.com/resource/umls-concep...,DS
264533,Leiomyosarcoma,C0023269,http://linkedlifedata.com/resource/umls-concep...,DS
264540,Low-grade fibromyxoid sarcoma,C1275282,http://linkedlifedata.com/resource/umls-concep...,DS


In [7]:
from tqdm import tqdm
import re
import string
df = extracted_df.copy()

 
# Initialize columns for grounded codes, terms, and match status
df["grounded_code"] = "No Match"
df["grounded_term"] = "No Match"

# Iterate over each unique annotation type with a progress bar
for anno_type in tqdm(df["type"].unique(), desc="Processing annotation types"):
    # Filter for rows of the current annotation type and get a list of unique exact terms
    anno_list = df[df['type'] == anno_type]['exact'].unique().tolist()
    
    # Get the mapped results for the current annotation type
    mapped_result = map_terms_reverse(anno_list, anno_type, nlp)
    
    # Update the DataFrame with the mapped results
    for exact_term, (grounded_code, grounded_term) in mapped_result.items(): # tqdm(, desc=f"Mapping terms for {anno_type}", leave=False, position=1):
        # Filter rows in df that match both the exact term and the annotation type
        matched_rows = df[(df['type'] == anno_type) & (df['exact'] == exact_term)]
        
        if not matched_rows.empty:
            # Update 'grounded_code' and 'grounded_term' for all matching rows
            df.loc[matched_rows.index, "grounded_code"] = grounded_code
            df.loc[matched_rows.index, "grounded_term"] = grounded_term
        else:
            # Log if the exact term doesn't find a matching row
            print(f"No match found in DataFrame for term: {exact_term} in type: {anno_type}")


# Replace "GO_" with "GO:" and "CHEBI:" with "CHEBI_" in relevant columns
df["grounded_code"] = df["grounded_code"].str.replace("GO_", "GO:", regex=False)
df["grounded_code"] = df["grounded_code"].str.replace("CHEBI:", "CHEBI_", regex=False)




Processing annotation types: 100%|████████████████████████████████████████████████████████████| 6/6 [01:25<00:00, 14.26s/it]


In [8]:
# Enhanced match_status logic with case-insensitive matching for 'exact'
df["match_status"] = df.apply(
    lambda row: (
        row["ground"] == row["grounded_code"] or  # Exact match check
        row["exact"].lower() in row["grounded_term"].lower() or  # exact in grounded_term (case-insensitive)
        row["grounded_term"].lower() in row["exact"].lower()     # grounded_term in exact (case-insensitive)
    ),
    axis=1
)




In [9]:
df

Unnamed: 0,exact,ground,uri,type,grounded_code,grounded_term,match_status
0,SR1,Q9LZW4,http://purl.uniprot.org/uniprot/Q9LZW4,GP,Q3E796,smORF121,False
1,HRAS,P01115,http://purl.uniprot.org/uniprot/P01115,GP,Q60529,HRAS,True
2,Spitz Nevus,C0206739,http://linkedlifedata.com/resource/umls-concep...,DS,C0206739,spindle and or epithelioid cell nevus,True
3,Spitz nevus,C0206739,http://linkedlifedata.com/resource/umls-concep...,DS,C0206739,spindle and or epithelioid cell nevus,True
5,Spitzoid melanoma,C3495721,http://linkedlifedata.com/resource/umls-concep...,DS,C3495721,malignant spitz tumor,True
...,...,...,...,...,...,...,...
264530,deep fibromatosis,C0079218,http://linkedlifedata.com/resource/umls-concep...,DS,C0079218,desmoid type fibromatosis,True
264531,Desmoplastic fibroma of bone,C0206645,http://linkedlifedata.com/resource/umls-concep...,DS,C0206645,desmoid tumor of bone,True
264533,Leiomyosarcoma,C0023269,http://linkedlifedata.com/resource/umls-concep...,DS,C0023269,leiomyosarcoma no subtype,True
264540,Low-grade fibromyxoid sarcoma,C1275282,http://linkedlifedata.com/resource/umls-concep...,DS,C1275282,low grade fibromyxoid sarcoma,True


In [10]:
# Separate into matched, unmatched, and no match DataFrames
no_match_df = df[(df["grounded_code"] == "No Match") | (df["grounded_term"] == "No Match")]
matched_df = df[df["match_status"] == True]
unmatched_df = df[(df["match_status"] == False) & (df["grounded_code"] != "No Match") & (df["grounded_term"] != "No Match")]

In [11]:
# Display results
print("No ML grounding DataFrame:")
no_match_df = no_match_df[['exact', 'ground', 'uri', 'type']]

no_match_df

No ML grounding DataFrame:


Unnamed: 0,exact,ground,uri,type
73,Bisulfite,CHEBI_17137,http://purl.obolibrary.org/obo/CHEBI_17137,CD
80,bisulfite,CHEBI_17137,http://purl.obolibrary.org/obo/CHEBI_17137,CD
95,lipids,CHEBI_18059,http://purl.obolibrary.org/obo/CHEBI_18059,CD
128,hood,GO:1990343,http://identifiers.org/go/GO:1990343,GO
129,Cy3,CHEBI_37990,http://purl.obolibrary.org/obo/CHEBI_37990,CD
...,...,...,...,...
261273,cyclin E1,Q91780,http://purl.uniprot.org/uniprot/Q91780,GP
262314,procure,CHEBI_81784,http://purl.obolibrary.org/obo/CHEBI_81784,CD
262447,Tafinlar,CHEBI_75048,http://purl.obolibrary.org/obo/CHEBI_75048,CD
262947,PI3KÎ´,O00329,http://purl.uniprot.org/uniprot/O00329,GP


In [12]:
print("\nML Grounded:")
matched_df[['exact', 'ground', 'uri', 'type']]




ML Grounded:


Unnamed: 0,exact,ground,uri,type
1,HRAS,P01115,http://purl.uniprot.org/uniprot/P01115,GP
2,Spitz Nevus,C0206739,http://linkedlifedata.com/resource/umls-concep...,DS
3,Spitz nevus,C0206739,http://linkedlifedata.com/resource/umls-concep...,DS
5,Spitzoid melanoma,C3495721,http://linkedlifedata.com/resource/umls-concep...,DS
8,nevi,C0027960,http://linkedlifedata.com/resource/umls-concep...,DS
...,...,...,...,...
264530,deep fibromatosis,C0079218,http://linkedlifedata.com/resource/umls-concep...,DS
264531,Desmoplastic fibroma of bone,C0206645,http://linkedlifedata.com/resource/umls-concep...,DS
264533,Leiomyosarcoma,C0023269,http://linkedlifedata.com/resource/umls-concep...,DS
264540,Low-grade fibromyxoid sarcoma,C1275282,http://linkedlifedata.com/resource/umls-concep...,DS


In [13]:

print("\nMatch Summary by Type:")

# Calculate statistics of matches by annotation type, ensuring both True and False columns exist
match_summary = df.groupby("type")["match_status"].value_counts().unstack().fillna(0)
match_summary = match_summary.reindex(columns=[False, True], fill_value=0)

# Rename columns for clarity
match_summary.columns = ["mismatch_count", "match_count"]

# Calculate error rate as the percentage of mismatches for each type
match_summary["error_rate"] = (match_summary["mismatch_count"] / 
                               (match_summary["match_count"] + match_summary["mismatch_count"])) * 100

# Display the match summary with error rates
print(match_summary)


match_summary




# CD             1672  2753   37.785311
# DS             1441  4490   24.296071
# EM               47   447    9.514170
# GO              699  1380   33.621934
# GP             3520   239   93.641926
# OG              674  2348   22.303111


Match Summary by Type:
      mismatch_count  match_count  error_rate
type                                         
CD              1373         3518   28.071969
DS               958         5009   16.054969
EM                36          473    7.072692
GO               671         1464   31.428571
GP              4218         3258   56.420546
OG               619         2518   19.732228


Unnamed: 0_level_0,mismatch_count,match_count,error_rate
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CD,1373,3518,28.071969
DS,958,5009,16.054969
EM,36,473,7.072692
GO,671,1464,31.428571
GP,4218,3258,56.420546
OG,619,2518,19.732228


In [14]:
print("\nUnmatched Rows DataFrame:")
unmatched_df


Unmatched Rows DataFrame:


Unnamed: 0,exact,ground,uri,type,grounded_code,grounded_term,match_status
0,SR1,Q9LZW4,http://purl.uniprot.org/uniprot/Q9LZW4,GP,Q3E796,smORF121,False
7,melanoma,C0025202,http://linkedlifedata.com/resource/umls-concep...,DS,C1302746,melanocytic neoplasm,False
10,MAPK,O42781,http://purl.uniprot.org/uniprot/O42781,GP,P27638,SPAC31G5.09c,False
41,mice,10088,http://identifiers.org/taxonomy/10088,OG,10095,mus sp.,False
42,SR2,Q9XIW0,http://purl.uniprot.org/uniprot/Q9XIW0,GP,Q3E789,smORF118,False
...,...,...,...,...,...,...,...
263904,cytochrome-B5 reductase,P07514,http://purl.uniprot.org/uniprot/P07514,GP,P83291,F28I16.230,False
263961,cadherin-5,Q6URK6,http://purl.uniprot.org/uniprot/Q6URK6,GP,O02840,CDH5,False
264143,myofibroblastic tumors,C0334121,http://linkedlifedata.com/resource/umls-concep...,DS,C0027070,myoepithelial neoplasm,False
264323,Ae1,P04919,http://purl.uniprot.org/uniprot/P04919,GP,Q9NJQ2,Neurotoxin Ae I,False


In [15]:
unmatched_df[unmatched_df['type']=='GP']

Unnamed: 0,exact,ground,uri,type,grounded_code,grounded_term,match_status
0,SR1,Q9LZW4,http://purl.uniprot.org/uniprot/Q9LZW4,GP,Q3E796,smORF121,False
10,MAPK,O42781,http://purl.uniprot.org/uniprot/O42781,GP,P27638,SPAC31G5.09c,False
42,SR2,Q9XIW0,http://purl.uniprot.org/uniprot/Q9XIW0,GP,Q3E789,smORF118,False
125,Ago2,Q8CJG0,http://purl.uniprot.org/uniprot/Q8CJG0,GP,Q9QZ81,GERp95,False
213,RNA-binding proteins,P28890,http://purl.uniprot.org/uniprot/P28890,GP,Q3KI09,Pfl01_0854,False
...,...,...,...,...,...,...,...
263897,Cadherin 5,Q6URK6,http://purl.uniprot.org/uniprot/Q6URK6,GP,Q925C0,Synaptotagmin 5,False
263904,cytochrome-B5 reductase,P07514,http://purl.uniprot.org/uniprot/P07514,GP,P83291,F28I16.230,False
263961,cadherin-5,Q6URK6,http://purl.uniprot.org/uniprot/Q6URK6,GP,O02840,CDH5,False
264323,Ae1,P04919,http://purl.uniprot.org/uniprot/P04919,GP,Q9NJQ2,Neurotoxin Ae I,False


In [16]:
matched_df[matched_df['type']=='DS']

Unnamed: 0,exact,ground,uri,type,grounded_code,grounded_term,match_status
2,Spitz Nevus,C0206739,http://linkedlifedata.com/resource/umls-concep...,DS,C0206739,spindle and or epithelioid cell nevus,True
3,Spitz nevus,C0206739,http://linkedlifedata.com/resource/umls-concep...,DS,C0206739,spindle and or epithelioid cell nevus,True
5,Spitzoid melanoma,C3495721,http://linkedlifedata.com/resource/umls-concep...,DS,C3495721,malignant spitz tumor,True
8,nevi,C0027960,http://linkedlifedata.com/resource/umls-concep...,DS,C0027960,moles skin,True
11,melanomas,C0025202,http://linkedlifedata.com/resource/umls-concep...,DS,C0025202,mnaevocarcinoma,True
...,...,...,...,...,...,...,...
264530,deep fibromatosis,C0079218,http://linkedlifedata.com/resource/umls-concep...,DS,C0079218,desmoid type fibromatosis,True
264531,Desmoplastic fibroma of bone,C0206645,http://linkedlifedata.com/resource/umls-concep...,DS,C0206645,desmoid tumor of bone,True
264533,Leiomyosarcoma,C0023269,http://linkedlifedata.com/resource/umls-concep...,DS,C0023269,leiomyosarcoma no subtype,True
264540,Low-grade fibromyxoid sarcoma,C1275282,http://linkedlifedata.com/resource/umls-concep...,DS,C1275282,low grade fibromyxoid sarcoma,True


In [17]:
unmatched_df[unmatched_df['type']=='EM']

Unnamed: 0,exact,ground,uri,type,grounded_code,grounded_term,match_status
278,size-exclusion chromatography,MI_0071,http://purl.obolibrary.org/obo/MI_0071,EM,MI_2213,superresolution microscopy,False
2940,gene-trap,EFO_0004030,http://www.ebi.ac.uk/efo/EFO_0004030,EM,MI_2170,splitluciferase complementation,False
15615,pull-down,MI_0096,http://purl.obolibrary.org/obo/MI_0096,EM,MI_0112,ub reconstruction,False
15644,fluorescence activated cell sorting,EFO_0009108,http://www.ebi.ac.uk/efo/EFO_0009108,EM,MI_0054,fluorescenceactivated cell sorting,False
17496,cognitive-behavioural therapy,EFO_0007820,http://www.ebi.ac.uk/efo/EFO_0007820,EM,MI_2222,socioaffinity index scoring,False
19414,enzyme-linked immunosorbent assay,MI_0411,http://purl.obolibrary.org/obo/MI_0411,EM,MI_2170,splitluciferase complementation,False
19754,gel-filtration,MI_0071,http://purl.obolibrary.org/obo/MI_0071,EM,MI_0966,uv/vi,False
24544,protein-folding,MI_1031,http://purl.obolibrary.org/obo/MI_1031,EM,MI_2188,photoactivatableribonucleosideenhanced crossli...,False
27831,Enzyme-Linked Immunosorbent Assay,MI_0411,http://purl.obolibrary.org/obo/MI_0411,EM,MI_2170,splitluciferase complementation,False
27835,Dot-Blot,MI_0049,http://purl.obolibrary.org/obo/MI_0049,EM,EFO_0008833,nucchipseq,False


In [18]:
no_match_df[no_match_df['type']=='EM']

Unnamed: 0,exact,ground,uri,type
4090,GTPases,MI_0419,http://purl.obolibrary.org/obo/MI_0419,EM


In [19]:
no_match_df[no_match_df['type']=='OG']

Unnamed: 0,exact,ground,uri,type
722,T. cruzi,5693,http://identifiers.org/taxonomy/5693,OG
17870,O. insidiosus,83647,http://identifiers.org/taxonomy/83647,OG
17916,T. euproctidis,373118,http://identifiers.org/taxonomy/373118,OG
17977,B. oleae,104688,http://identifiers.org/taxonomy/104688,OG
18066,C. sesamiae,89807,http://identifiers.org/taxonomy/89807,OG
...,...,...,...,...
255250,C. nauseosus,71039,http://identifiers.org/taxonomy/71039,OG
260411,E. coli K1,1392869,http://identifiers.org/taxonomy/1392869,OG
260854,EPV,10376,http://identifiers.org/taxonomy/10376,OG
260857,EPVs,10376,http://identifiers.org/taxonomy/10376,OG


In [20]:
annotation_type = 'DS'
terms = no_match_df[no_match_df['type']==annotation_type].exact.tolist()
# terms = ['CMD']
# Use the updated function to map the terms
results = map_terms_reverse(terms, annotation_type, nlp)

results

{'sacral pressure sores': ('No Match', 'No Match'),
 'Sacral pressure sores': ('No Match', 'No Match'),
 'rare disorder': ('No Match', 'No Match'),
 'rare disease': ('No Match', 'No Match'),
 'hydrocystomas': ('No Match', 'No Match'),
 'opacities': ('No Match', 'No Match'),
 'rare-disease': ('No Match', 'No Match')}

In [21]:
annotation_type = 'CD'
terms = no_match_df[no_match_df['type']==annotation_type].exact.tolist()
# terms = ['CMD']
# Use the updated function to map the terms
results = map_terms_reverse(terms, annotation_type, nlp)

results

{'Bisulfite': ('No Match', 'No Match'),
 'bisulfite': ('No Match', 'No Match'),
 'lipids': ('No Match', 'No Match'),
 'Cy3': ('No Match', 'No Match'),
 'E210': ('No Match', 'No Match'),
 'Waters': ('No Match', 'No Match'),
 'formalin': ('No Match', 'No Match'),
 'tenacity': ('No Match', 'No Match'),
 'Lipids': ('No Match', 'No Match'),
 ' pros': ('No Match', 'No Match'),
 'histones': ('No Match', 'No Match'),
 '-Da': ('No Match', 'No Match'),
 'prospers': ('No Match', 'No Match'),
 'pmsf': ('No Match', 'No Match'),
 'bromo': ('No Match', 'No Match'),
 ' pro-': ('No Match', 'No Match'),
 'temodar': ('No Match', 'No Match'),
 'gamma': ('No Match', 'No Match'),
 'Iressa': ('No Match', 'No Match'),
 'advanced glycation end products': ('No Match', 'No Match'),
 'radio': ('No Match', 'No Match'),
 'Gamma': ('No Match', 'No Match'),
 'Velcade': ('No Match', 'No Match'),
 'ketone bodies': ('No Match', 'No Match'),
 'Temodar': ('No Match', 'No Match'),
 '-Met': ('No Match', 'No Match'),
 'flash

In [22]:
annotation_type = 'GO'
terms = no_match_df[no_match_df['type']==annotation_type].exact.tolist()
# terms = ['CMD']
# Use the updated function to map the terms
results = map_terms_reverse(terms, annotation_type, nlp)

results

{'hood': ('No Match', 'No Match'),
 'S1 Development': ('No Match', 'No Match'),
 'breakdown': ('No Match', 'No Match'),
 'deaths': ('No Match', 'No Match'),
 'efflux pump': ('No Match', 'No Match'),
 'efflux pumps': ('No Match', 'No Match'),
 'Upar': ('No Match', 'No Match'),
 'parasitism': ('No Match', 'No Match'),
 'Parasitism': ('No Match', 'No Match'),
 'macropain': ('No Match', 'No Match'),
 'Hood': ('No Match', 'No Match'),
 'Discs': ('No Match', 'No Match'),
 'discs': ('No Match', 'No Match'),
 'germplasm': ('No Match', 'No Match'),
 'mutualisms': ('No Match', 'No Match'),
 'Germplasm': ('No Match', 'No Match'),
 'taste': ('No Match', 'No Match'),
 'chorion': ('No Match', 'No Match'),
 'crown': ('No Match', 'No Match'),
 'crowns': ('No Match', 'No Match'),
 'mucous': ('No Match', 'No Match'),
 'lands': ('No Match', 'No Match'),
 'smoother': ('No Match', 'No Match'),
 'short term memory': ('No Match', 'No Match'),
 'excretions': ('No Match', 'No Match'),
 'breakdowns': ('No Match

In [23]:
annotation_type ='CD'
data = loaded_data[annotation_type]
term_dict = data["term_to_id"]
id_to_term = data["id_to_term"]
indexed_terms = data["indexed_terms"]
index = data["index"]

In [24]:
term_dict

{'(+)-atherospermoline': 'CHEBI_10',
 '(-)-medicarpin': 'CHEBI_100',
 'vismione d': 'CHEBI_10000',
 '(2s,3s,4r)-3-[4-(3-cyclopentylprop-1-ynyl)phenyl]-4-(hydroxymethyl)-1-(2-methoxy-1-oxoethyl)-2-azetidinecarbonitrile': 'CHEBI_100000',
 'n-[(2r,3s,6r)-2-(hydroxymethyl)-6-[2-[[oxo-[4-(trifluoromethyl)anilino]methyl]amino]ethyl]-3-oxanyl]-3-pyridinecarboxamide': 'CHEBI_100001',
 '3-chloro-n-[(5s,6s,9s)-5-methoxy-3,6,9-trimethyl-2-oxo-11-oxa-3,8-diazabicyclo[10.4.0]hexadeca-1(12),13,15-trien-14-yl]benzenesulfonamide': 'CHEBI_100002',
 '(4r,7s,8r)-8-methoxy-4,7,10-trimethyl-11-oxo-14-(1-oxobutylamino)-n-propyl-2-oxa-5,10-diazabicyclo[10.4.0]hexadeca-1(12),13,15-triene-5-carboxamide': 'CHEBI_100003',
 '1-(2,5-difluorophenyl)-3-[(5s,6s,9s)-5-methoxy-3,6,9-trimethyl-2-oxo-8-[oxo(2-pyrazinyl)methyl]-11-oxa-3,8-diazabicyclo[10.4.0]hexadeca-1(12),13,15-trien-14-yl]urea': 'CHEBI_100004',
 'n-[(1s,3s,4as,9ar)-1-(hydroxymethyl)-3-[2-oxo-2-(1-piperidinyl)ethyl]-3,4,4a,9a-tetrahydro-1h-pyrano[3,4-b]b

In [25]:
# Filter keys that start with "hand"
keys_starting_with_hand = [key for key in term_dict if key.lower().endswith("atom")]

# Display the result
print(keys_starting_with_hand)

[]


In [26]:
get_embedding_match('lutetium', index, indexed_terms, term_dict, nlp, threshold=0.7)

'CHEBI_37301'