In [1]:
# correct the working directory if needed
import os
if os.getcwd().endswith('scripts'):
    os.chdir('..')
    
os.environ['FOR_DISABLE_CONSOLE_CTRL_HANDLER'] = '1'


In [2]:
import math
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from modules.ModelFunctions import auto_load_model, encode_concepts
from modules.timed_logger import logger
from modules.Dataset import PositiveDataset, NegativeDataset, FalsePositiveDataset, CombinedDataset
from modules.metrics import evaluate_performance
from modules.FaissDB import build_index, is_initialized, delete_repository, search_similar
from pprint import pprint


2025-10-10 15:28:46 - Loading faiss with AVX512 support.
2025-10-10 15:28:46 - Could not load library with AVX512 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx512'")
2025-10-10 15:28:46 - Loading faiss with AVX2 support.
2025-10-10 15:28:46 - Successfully loaded faiss with AVX2 support.
2025-10-10 15:28:46 - Failed to load GPU Faiss: name 'GpuIndexIVFFlat' is not defined. Will not load constructor refs for GPU indexes. This is only an error if you're trying to use GPU Faiss.


# Load models

In [3]:
model0, tokenizer0 = auto_load_model("output/all-MiniLM-L6-v2")
model1, tokenizer1 = auto_load_model("output/finetune_initial/2025-10-02_15-38-11")

# model2, tokenizer2, train_config2 = auto_load_model("output/finetune/2025-07-30_13-33-46")


2025-10-10 15:28:46 - Use pytorch device_name: cuda:0
2025-10-10 15:28:46 - Load pretrained SentenceTransformer: output/all-MiniLM-L6-v2\init


2025-10-10 15:28:47 - Use pytorch device_name: cuda:0
2025-10-10 15:28:47 - Load pretrained SentenceTransformer: output/finetune_initial/2025-10-02_15-38-11\checkpoint-78690


Loaded latest auto-saved model from: output/all-MiniLM-L6-v2\init
Loaded latest auto-saved model from: output/finetune_initial/2025-10-02_15-38-11\checkpoint-78690


# Read concept data

In [4]:
matching_base_path = "data/matching"
relation_base_path = "data/relation"
omop_base_path = "data/omop_feather"

concept= pd.read_feather('data/omop_feather/concept.feather')
concept_ancestor = pd.read_feather(os.path.join(omop_base_path, 'concept_ancestor.feather'))
std_bridge = pd.read_feather(os.path.join(omop_base_path, 'std_bridge.feather'))

condition_concept = concept[concept['domain_id'] == 'Condition'].reset_index(drop=True)
std_condition_concept = condition_concept[condition_concept['standard_concept'] == 'S'].reset_index(drop=True)
nonstd_condition_concept = condition_concept[condition_concept['standard_concept'] != 'S'].reset_index(drop=True)

target_concepts = pd.read_feather(os.path.join(matching_base_path, 'target_concepts.feather'))

reserved_vocab = "CIEL"
reserved_concepts = nonstd_condition_concept[nonstd_condition_concept.vocabulary_id == reserved_vocab]

## Exclude non-standard concepts that do not have standard mappings
reserved_concepts = reserved_concepts[reserved_concepts.concept_id.isin(std_bridge.concept_id)]
reserved_concepts = reserved_concepts.reset_index(drop=True)


# Generate embeddings 

In [5]:
target_concepts['model0_embedding'] = encode_concepts(
    model0, 
    target_concepts.concept_name).tolist()

target_concepts['model1_embedding'] = encode_concepts(
    model1, 
    target_concepts.concept_name).tolist()





Batches:   0%|          | 0/5007 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Batches:   0%|          | 0/5007 [00:00<?, ?it/s]

In [6]:
def evaluate_model(model_name, model, query_concepts, target_concepts, query_positive_mapping):
    top_k = 100
    model_emb = build_index(
        model = model, 
        corpus_ids = target_concepts.concept_id, 
        corpus_names = target_concepts.concept_name, 
        corpus_embeddings = target_concepts[f'{model_name}_embedding'],
        repos=model_name
    )

    model_top = search_similar(
        query_ids = query_concepts.concept_id, 
        query_names = query_concepts.concept_name, 
        top_k=top_k, 
        repos=model_name
        )

    model_top = model_top.merge(
        query_positive_mapping,
        on=['query_id', 'corpus_id'],
        how='left'
    ) 
    model_top['label'] = model_top['label'].fillna(0).astype(int)


    model_eval = evaluate_performance(
        query_ids=model_top['query_id'],
        similarities=model_top['score'],
        labels=model_top['label']
    )
    return model_top, model_eval

In [7]:
# Create matching query
query_concepts = reserved_concepts[['concept_id', 'concept_name']]
query_positive_mapping = std_bridge[std_bridge.concept_id.isin(reserved_concepts.concept_id)].reset_index(drop=True).rename(
        columns={
            'concept_id': 'query_id',
            'std_concept_id': 'corpus_id'
        }
    )
query_positive_mapping['label'] = 1
query_positive_mapping.head(3)

Unnamed: 0,query_id,corpus_id,label
0,45913718,43530910,1
1,45946993,4240605,1
2,45954546,37164909,1


In [8]:
model0_top, model0_eval = evaluate_model(
    model_name='model0',
    model=model0,
    query_concepts=query_concepts,
    target_concepts=target_concepts,
    query_positive_mapping=query_positive_mapping
)


Batches:   0%|          | 0/1066 [00:00<?, ?it/s]

Searching: 100%|██████████| 34/34 [00:00<00:00, 52.82it/s]


In [16]:
print(model0_top.head(3))
pprint(model0_eval)
model0_top.to_feather('output/tmp/model0_top.feather')

   query_id         query_name  corpus_id                   corpus_name  \
0  45908352  Disorder of Penis     196158             Disorder of penis   
1  45908352  Disorder of Penis   45766654     Disorder of skin of penis   
2  45908352  Disorder of Penis    4127726  Functional disorder of penis   

      score  label  
0  1.000062      1  
1  0.867125      0  
2  0.841977      0  
{'accuracy': 0.9501184497299346,
 'average_precision': 0.8267695255536552,
 'best_hit1': 0.551304906917614,
 'best_hit10': 0.6044071660285822,
 'best_hit100': 0.6204518221936465,
 'best_hit20': 0.6125610076862378,
 'best_hit3': 0.5806178215506912,
 'best_hit5': 0.5922494666393898,
 'best_hit50': 0.6187859835754157,
 'best_reciprocal_rank': 0.5693656508201438,
 'f1_score': 0.20289096266182102,
 'precision': 0.11385265923126556,
 'recall': 0.9308996702779087,
 'roc_auc': 0.9862677554313737,
 'worst_hit1': 0.551304906917614,
 'worst_hit10': 0.6044071660285822,
 'worst_hit100': 0.6204518221936465,
 'worst_hit20'

In [10]:
model1_top, model1_eval = evaluate_model(
    model_name='model1',
    model=model1,
    query_concepts=query_concepts,
    target_concepts=target_concepts,
    query_positive_mapping=query_positive_mapping
)

Batches:   0%|          | 0/1066 [00:00<?, ?it/s]

Searching: 100%|██████████| 34/34 [00:00<00:00, 57.26it/s]


In [15]:
print(model1_top.head(3))
pprint(model1_eval)
model1_top.to_feather('output/tmp/model1_top.feather')

   query_id                               query_name  corpus_id  \
0  45927650  Nondependent Cocaine Abuse in Remission     436098   
1  45927650  Nondependent Cocaine Abuse in Remission     432302   
2  45927650  Nondependent Cocaine Abuse in Remission     433994   

                               corpus_name     score  label  
0  Nondependent cocaine abuse in remission  0.999991      1  
1          Cocaine dependence in remission  0.995519      0  
2           Cocaine dependence, continuous  0.986028      0  
{'accuracy': 0.18584148518004578,
 'average_precision': 0.8129458707239686,
 'best_hit1': 0.7125405500189964,
 'best_hit10': 0.7932314346669784,
 'best_hit100': 0.8216675921325657,
 'best_hit20': 0.8076979279305608,
 'best_hit3': 0.7586579770289622,
 'best_hit5': 0.7749656603442733,
 'best_hit50': 0.8195049244527574,
 'best_reciprocal_rank': 0.7408016952762927,
 'f1_score': 0.025684167812125288,
 'precision': 0.013009226769667759,
 'recall': 0.9995376298193199,
 'roc_auc': 0.978

# Random Check

In [14]:
# delete the query if the first match is exact match
model1_mismatched = model1_top.groupby('query_id').apply(lambda x: x if not (x.iloc[0]['label'] == 1) else pd.DataFrame()).reset_index(drop=True)
model1_mismatched

  model1_mismatched = model1_top.groupby('query_id').apply(lambda x: x if not (x.iloc[0]['label'] == 1) else pd.DataFrame()).reset_index(drop=True)


Unnamed: 0,query_id,query_name,corpus_id,corpus_name,score,label
0,45905817.0,HIV STAGING - MINOR MUCOCUTANEOUS MANIFESTATIONS,37017125.0,Disorder of skin co-occurrent with human immun...,0.961349,0.0
1,45905817.0,HIV STAGING - MINOR MUCOCUTANEOUS MANIFESTATIONS,4224566.0,Skin disorder with AIDS (acquired immunodefici...,0.959360,0.0
2,45905817.0,HIV STAGING - MINOR MUCOCUTANEOUS MANIFESTATIONS,37017106.0,Eruption of skin co-occurrent with human immun...,0.955670,0.0
3,45905817.0,HIV STAGING - MINOR MUCOCUTANEOUS MANIFESTATIONS,4220603.0,Skin rash with AIDS (acquired immunodeficiency...,0.955544,0.0
4,45905817.0,HIV STAGING - MINOR MUCOCUTANEOUS MANIFESTATIONS,606047.0,Human immunodeficiency virus modified skin dis...,0.955205,0.0
...,...,...,...,...,...,...
787836,45956673.0,Battered Baby,4170972.0,Perinatal cardiovascular disorders,0.732831,0.0
787837,45956673.0,Battered Baby,607915.0,Complication of embryo transfer,0.732206,0.0
787838,45956673.0,Battered Baby,4156891.0,Antenatal screening finding,0.728478,0.0
787839,45956673.0,Battered Baby,600528.0,Neonatal disorder due to maternal disorder of ...,0.721022,0.0


In [23]:
query_with_name = query_positive_mapping.merge(
    concept[['concept_id', 'concept_name']].rename(columns={'concept_id': 'corpus_id', 'concept_name': 'corpus_name'}, inplace=False),
    on='corpus_id',
    how='left'
).merge(
    concept[['concept_id', 'concept_name']].rename(columns={'concept_id': 'query_id', 'concept_name': 'query_name'}, inplace=False),
    on='query_id',
    how='left'
)

In [24]:
query_with_name[query_with_name.query_id == 45914571]

Unnamed: 0,query_id,corpus_id,label,corpus_name,query_name
20479,45914571,4296478,1,Stilbestrol-related vaginal adenosis,Stilbestrol-Related Vaginal Adenosis


In [26]:

model_top = search_similar(
    query_ids = [45914571], 
    query_names = ['Stilbestrol-Related Vaginal Adenosis'], 
    top_k=100, 
    repos="model1"
    )
model_top.sort_values(by='score', ascending=False).head(10)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Searching: 100%|██████████| 1/1 [00:00<?, ?it/s]


Unnamed: 0,query_id,query_name,corpus_id,corpus_name,score
64,45914571,Stilbestrol-Related Vaginal Adenosis,4242227,Vaginal adenosis,0.942984
42,45914571,Stilbestrol-Related Vaginal Adenosis,4338106,Vaginal hyperplasia,0.93771
68,45914571,Stilbestrol-Related Vaginal Adenosis,4048525,Chronic hypertrophic vulvitis,0.928483
16,45914571,Stilbestrol-Related Vaginal Adenosis,194412,Dysplasia of vagina,0.926026
12,45914571,Stilbestrol-Related Vaginal Adenosis,198198,Polyp of vagina,0.915985
13,45914571,Stilbestrol-Related Vaginal Adenosis,4182189,Atrophy of vagina,0.913577
9,45914571,Stilbestrol-Related Vaginal Adenosis,4174893,Vaginal intraepithelial neoplasia grade 2,0.911297
48,45914571,Stilbestrol-Related Vaginal Adenosis,42593508,Hypertrophy of vagina,0.910539
46,45914571,Stilbestrol-Related Vaginal Adenosis,4318850,Leukoplakia of female genital organs,0.909806
23,45914571,Stilbestrol-Related Vaginal Adenosis,4227101,Vulval intraepithelial neoplasia with squamous...,0.908898


In [43]:
query_embeddings = encode_concepts(model1, ['Stilbestrol-Related Vaginal Adenosis'])

corpus_embeddings = target_concepts[target_concepts.concept_id == 4296478].model1_embedding.values[0]

# distance between two embeddings using inner product
np.inner(query_embeddings[0], corpus_embeddings)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

1.000000040874812

In [44]:
from modules.FaissDB import GLOBAL_SPACE
faiss_index = GLOBAL_SPACE['model1']['faiss_index']
batch_scores, batch_indices = faiss_index.search(query_embeddings, 100)

In [46]:
batch_indices[0]

array([144762,  75366, 149478,  87619, 121544, 115379, 116082, 111109,
       157468, 153461, 123732, 130675, 111115, 123736,  90095, 105238,
       111108,  96094,  70908,  81214,  70907, 122449, 101072, 131869,
       133099, 123639,  90093,  68968, 107125, 132978, 123637,  69788,
       126014, 157467,  70904, 123734, 143141,  93090, 137148, 111111,
       123731,  72742, 101215, 105393,  72743, 153458, 108670, 121621,
       118933, 118254, 135823, 131320, 131709,   6896, 108912, 106197,
        96012, 134888,  58795,  72744, 151205, 137120, 106889,  70110,
        72739,  72709, 139601, 123304, 158341,  80025, 142468, 151502,
        76092, 138727, 104700, 142815,   8446,     -1,     -1,     -1,
           -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
           -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
           -1,     -1,     -1,     -1], dtype=int64)

In [47]:
target_concepts.iloc[batch_indices[0]]

Unnamed: 0,concept_id,concept_name,model0_embedding,model1_embedding
144762,4296478,Stilbestrol-related vaginal adenosis,"[-0.058009956032037735, -0.09925009310245514, ...","[-0.03850644454360008, 0.025586023926734924, -..."
75366,4242227,Vaginal adenosis,"[-0.01245673093944788, -0.007761321030557156, ...","[-0.022294696420431137, 0.03283543512225151, -..."
149478,4338106,Vaginal hyperplasia,"[0.03229188174009323, 0.052380699664354324, -0...","[-0.026677146553993225, 0.05071567744016647, -..."
87619,4048525,Chronic hypertrophic vulvitis,"[0.007250822149217129, 0.02138751558959484, -0...","[-0.05998328700661659, 0.002729268977418542, -..."
121544,194412,Dysplasia of vagina,"[-0.021183805540204048, 0.03292396664619446, 0...","[-0.020671933889389038, 0.06681733578443527, -..."
...,...,...,...,...
160287,42605008,"Hemagglutination detected, in vitro","[0.019596392288804054, 0.05268771946430206, -0...","[-0.037472911179065704, 0.06335137039422989, -..."
160287,42605008,"Hemagglutination detected, in vitro","[0.019596392288804054, 0.05268771946430206, -0...","[-0.037472911179065704, 0.06335137039422989, -..."
160287,42605008,"Hemagglutination detected, in vitro","[0.019596392288804054, 0.05268771946430206, -0...","[-0.037472911179065704, 0.06335137039422989, -..."
160287,42605008,"Hemagglutination detected, in vitro","[0.019596392288804054, 0.05268771946430206, -0...","[-0.037472911179065704, 0.06335137039422989, -..."


Unnamed: 0,concept_id,concept_name,model0_embedding,model1_embedding
144762,4296478,Stilbestrol-related vaginal adenosis,"[-0.058009956032037735, -0.09925009310245514, ...","[-0.03850644454360008, 0.025586023926734924, -..."
