In [2]:
import os, sys, warnings, re, wandb
warnings.filterwarnings("ignore")
import pandas as pd

sys.path.append('../')
from src.CrossEncoder import CrossEncoderReranker
import src.FaissEncoder as faiss_enc
from src.metrics import calculate_topk_accuracy


In [4]:
# Corpus under evaluation; also selects which test file is loaded further down.
CORPUS = "SympTEMIST"
# Root of the per-corpus data directory and its processed-data sub-folder.
CORPUS_PATH = f"../../EntityLinking/data/{CORPUS}/"
DATA_PATH = os.path.join(CORPUS_PATH, "processed_data/")
# Cut-offs handed to calculate_topk_accuracy by the evaluation functions.
TOP_K_VALUES = [1,5,25,50,100,200]
# Second and third positional arguments passed to FaissEncoder below.
# NOTE(review): presumably the FAISS index type and the bi-encoder max token length — confirm in src.FaissEncoder.
F_TYPE = "FlatIP"
MAX_LENGTH = 256

# Maps the model name parsed out of a cross-encoder checkpoint path to the
# display name stored in the W&B run config; unknown names are logged unchanged.
mapping_dict = {
    "Spanish_SapBERT_grandparents": "ClinLinker-KB-GP",
    "Spanish_SapBERT_parents": "ClinLinker-KB-P",
    "SapBERT-XLM-R": "SapBERT-XLM-R-base",
    "SapBERT-XLM-R-large": "SapBERT-XLM-R-large"
}

In [16]:
def log_metrics(data, category):
    """Log one W&B record per cut-off in ``data``.

    Args:
        data: Mapping of cut-off ``k`` to accuracy.
        category: Label appended to ``"Accuracy/"`` to form the metric name.

    Each record carries the cut-off under ``"k_value"`` and the accuracy under
    ``"Accuracy/<category>"``; records are emitted in the mapping's own order.
    """
    metric_key = f"Accuracy/{category}"
    for cutoff, score in data.items():
        record = {"k_value": cutoff, metric_key: score}
        wandb.log(record)

def log_columns_acc_table(dict, category):
    """Log every entry of ``dict`` to W&B as its own metric named ``"<category>-<key>"``.

    Args:
        dict: Mapping of cut-off to accuracy. The name shadows the builtin but is
            kept because it is part of the function's signature.
        category: Prefix for the metric name.
    """
    for cutoff, score in dict.items():
        column = f"{category}-{cutoff}"
        wandb.log({column: score})

def evaluate(model_path, mapping_dict, df_gs, df_link_train, df_gaz, corpus, top_k_values):
    """Rerank the candidates in ``df_gs`` with a cross-encoder and log results to W&B.

    The run configuration is parsed from ``model_path``, which must contain a
    segment of the form
    ``/cef_<corpus>_<model>_<sim|kg|bkg>[_<depth>]_cand_<n>_epoch_<n>_bs_<n>``.

    Accuracy is computed on three nested subsets of the reranked predictions:
    all rows, rows whose ``code`` is absent from ``df_link_train['code']``, and,
    of those, rows whose ``term`` is absent from ``df_gaz['term']``.

    Args:
        model_path: Path of the cross-encoder checkpoint (see format above).
        mapping_dict: Maps the parsed model name to the name stored in the run
            config; names without an entry are stored unchanged.
        df_gs: Frame with ``term``, ``code``, ``candidates`` and ``codes`` columns.
        df_link_train: Frame whose ``code`` column defines the "seen" codes.
        df_gaz: Frame whose ``term`` column defines the "seen" mentions.
        corpus: Corpus name stored in the run config.
        top_k_values: Cut-offs forwarded to ``calculate_topk_accuracy``.

    Raises:
        ValueError: If ``model_path`` does not match the expected format.
    """
    print(model_path.split('/')[-1])
    regex = r'/cef_([a-zA-Z0-9]+)_((?:[\w-]+_)*[\w-]+?)_(sim|kg|bkg)(?:_(\d+))?_cand_(\d+)_epoch_(\d+)_bs_(\d+)'
    match = re.search(regex, model_path)
    if not match:
        raise ValueError("Invalid model_path format.")

    # Groups 1 (corpus tag) and 5 (candidate count) are matched but not logged.
    model = match.group(2)
    model = mapping_dict.get(model, model)
    triplet_type = match.group(3)
    depth = match.group(4) if match.group(4) is not None else '0'  # Default depth to '0' if not specified
    epoch = int(match.group(6))
    batch_size = int(match.group(7))

    wandb.init(project='Cross-encoder_KB_Enrichment', entity='fgallego', reinit=True)
    try:
        wandb.config.update({
            'model': model,
            'corpus': corpus,
            'hard_triplet_type': triplet_type,
            'depth': depth,
            'epoch': epoch,
            'batch_size': batch_size
        })

        # NOTE(review): max_seq_length is fixed at 120 here — confirm this is intended.
        crossreranker = CrossEncoderReranker(model_name=model_path, model_type="st", max_seq_length=120)
        df_preds_gs = crossreranker.rerank_candidates(df_gs, "term", "candidates", "codes")
        res_gs = calculate_topk_accuracy(df_preds_gs, top_k_values)

        # Unseen codes: gold code does not occur in the linking training set.
        df_preds_uc = df_preds_gs[~df_preds_gs['code'].isin(df_link_train['code'])]
        res_uc = calculate_topk_accuracy(df_preds_uc, top_k_values)

        # Unseen mentions: unseen-code rows whose term is also missing from the gazetteer.
        df_preds_um = df_preds_uc[~df_preds_uc['term'].isin(df_gaz['term'])]
        res_um = calculate_topk_accuracy(df_preds_um, top_k_values)

        prediction_sets = (
            ('predictions_gs', df_preds_gs),
            ('predictions_uc', df_preds_uc),
            ('predictions_um', df_preds_um),
        )
        artifacts = []
        for artifact_name, df_preds in prediction_sets:
            artifact = wandb.Artifact(artifact_name, type='dataset')
            with artifact.new_file(f'{artifact_name}.tsv', mode='w') as f:
                # The files are named .tsv, so write them tab-separated.
                df_preds.to_csv(f, sep='\t', index=False)
            artifacts.append(artifact)
        for artifact in artifacts:
            wandb.log_artifact(artifact)

        log_metrics(res_gs, "Gold_Standard")
        log_metrics(res_uc, "Unseen_Codes")
        log_metrics(res_um, "Unseen_Mentions")

        log_columns_acc_table(res_gs, "GS")
        log_columns_acc_table(res_uc, "UC")
        log_columns_acc_table(res_um, "UM")
    finally:
        # Close the run even when reranking or logging fails, so an error does
        # not leave an open run behind for the next call.
        wandb.finish()

In [3]:
def evaluate_test(model_path, mapping_dict, df_gs, df_link_train, df_gaz, corpus, top_k_values):
    """Rerank ``df_gs`` with a cross-encoder and print top-k accuracy without W&B logging.

    Three result dicts are printed on one line, in this order: all rows, rows
    whose ``code`` is absent from ``df_link_train['code']``, and, of those, rows
    whose ``term`` is absent from ``df_gaz['term']``.

    ``mapping_dict`` and ``corpus`` are accepted but not used in the body.
    Returns ``None``.
    """
    reranker = CrossEncoderReranker(model_name=model_path, model_type="st", max_seq_length=120)
    ranked_all = reranker.rerank_candidates(df_gs, "term", "candidates", "codes")
    acc_all = calculate_topk_accuracy(ranked_all, top_k_values)

    # Drop rows whose gold code occurs in the linking training set.
    code_in_training = ranked_all['code'].isin(df_link_train['code'])
    ranked_unseen_codes = ranked_all[~code_in_training]
    acc_unseen_codes = calculate_topk_accuracy(ranked_unseen_codes, top_k_values)

    # Of the remaining rows, drop those whose term occurs in the gazetteer.
    term_in_gazetteer = ranked_unseen_codes['term'].isin(df_gaz['term'])
    ranked_unseen_mentions = ranked_unseen_codes[~term_in_gazetteer]
    acc_unseen_mentions = calculate_topk_accuracy(ranked_unseen_mentions, top_k_values)

    print(*(acc_all, acc_unseen_codes, acc_unseen_mentions))



In [6]:
# Training-side tables, all read from DATA_PATH with `code` kept as a string.
df_train = pd.read_csv(os.path.join(DATA_PATH, "df_link_gaz_train.tsv"), sep="\t", header=0, dtype={"code": str})
df_link_train = pd.read_csv(os.path.join(DATA_PATH, "df_link_train.tsv"), sep="\t", header=0, dtype={"code": str})
df_gaz = pd.read_csv(os.path.join(DATA_PATH, "gazetteer_term_code.tsv"), sep="\t", header=0, dtype={"code": str})

# Per-corpus test file and the name of the column holding the mention text,
# which is renamed to `term` so every corpus exposes the same schema.
TEST_SET_SPECS = {
    "SympTEMIST": ("../../data/SympTEMIST/symptemist-complete_240208/symptemist_test/subtask2-linking/symptemist_tsv_test_subtask2.tsv", "text"),
    "MedProcNER": ("../../data/MedProcNER/medprocner_gs_train+test+gazz+multilingual+crossmap_230808/medprocner_test/tsv/medprocner_tsv_test_subtask2.tsv", "text"),
    "DisTEMIST": ("../../data/DisTEMIST/distemist_zenodo/test_annotated/subtrack2_linking/distemist_subtrack2_test_linking.tsv", "span"),
}

# Fail loudly on an unknown corpus rather than leaving df_test undefined
# (or left over from an earlier run of this cell).
if CORPUS not in TEST_SET_SPECS:
    raise ValueError(f"Unsupported CORPUS {CORPUS!r}; expected one of {sorted(TEST_SET_SPECS)}")

test_set_path, mention_column = TEST_SET_SPECS[CORPUS]
df_test = pd.read_csv(test_set_path, sep="\t", header=0, dtype={"code": str})
df_test = df_test.rename(columns={mention_column: 'term'})


In [7]:
# Candidate retriever: a FaissEncoder built from the multilingual SapBERT
# XLM-R large checkpoint over df_train, using the F_TYPE / MAX_LENGTH settings.
faiss_encoder = faiss_enc.FaissEncoder("cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR-large", F_TYPE, MAX_LENGTH, df_train)
# NOTE(review): fitFaiss presumably encodes df_train and builds the index — confirm in src.FaissEncoder.
faiss_encoder.fitFaiss()

Encoding: 100%|██████████| 5218/5218 [04:41<00:00, 18.55it/s]


In [8]:
# Retrieve 200 candidates per test mention; k matches the largest cut-off in
# TOP_K_VALUES. `similarities` is not used in this cell.
candidates, codes, similarities = faiss_encoder.getCandidates(df_test["term"].tolist(), k=200)
# Attach the retrieved candidates to df_test in place, under the column names
# that the reranker call inside evaluate_test reads.
df_test["candidates"] = candidates
df_test["codes"] = codes
# Print-only evaluation (no W&B run) of the `sim` cross-encoder checkpoint.
evaluate_test(f"../../models/cross-encoders/SapBERT_Multilingue_XLMR-large/cef_{CORPUS.lower()}_SapBERT_Multilingue_XLMR-large_sim_cand_200_epoch_1_bs_64", mapping_dict, df_test, df_link_train, df_gaz, CORPUS, TOP_K_VALUES)

Encoding: 100%|██████████| 45/45 [00:05<00:00,  8.39it/s]
Reranking candidates: 100%|██████████| 2848/2848 [22:18<00:00,  2.13it/s]


{1: 0.5807584269662921, 5: 0.7155898876404494, 25: 0.8058286516853933, 50: 0.8279494382022472, 100: 0.8472612359550562, 200: 0.8542837078651685} {1: 0.2967309304274937, 5: 0.47778709136630343, 25: 0.622799664710813, 50: 0.6596814752724225, 100: 0.6898575020955574, 200: 0.7032690695725062} {1: 0.234375, 5: 0.43106617647058826, 25: 0.5882352941176471, 50: 0.6286764705882353, 100: 0.6617647058823529, 200: 0.6764705882352942}


In [6]:
# Refresh the 200 retrieved candidates per test mention from the current
# faiss_encoder; `similarities` is not used in this cell.
candidates, codes, similarities = faiss_encoder.getCandidates(df_test["term"].tolist(), k=200)
# Overwrite the candidate columns on df_test in place.
df_test["candidates"] = candidates
df_test["codes"] = codes
# Evaluate the `sim` cross-encoder checkpoint and log the run to W&B.
evaluate(f"../../models/cross-encoders/SapBERT_Multilingue_XLMR-large/cef_{CORPUS.lower()}_SapBERT_Multilingue_XLMR-large_sim_cand_200_epoch_1_bs_64", mapping_dict, df_test, df_link_train, df_gaz, CORPUS, TOP_K_VALUES)

Encoding: 100%|██████████| 45/45 [00:05<00:00,  8.51it/s]
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


Reranking candidates: 100%|██████████| 2848/2848 [22:00<00:00,  2.16it/s]


0,1
Accuracy/Gold_Standard,▁▄▇▇██
Accuracy/Unseen_Codes,▁▄▇▇██
Accuracy/Unseen_Mentions,▁▄▇▇██
GS-1,▁
GS-100,▁
GS-200,▁
GS-25,▁
GS-5,▁
GS-50,▁
UC-1,▁

0,1
Accuracy/Gold_Standard,0.85428
Accuracy/Unseen_Codes,0.70327
Accuracy/Unseen_Mentions,0.67647
GS-1,0.58076
GS-100,0.84726
GS-200,0.85428
GS-25,0.80583
GS-5,0.71559
GS-50,0.82795
UC-1,0.29673


In [7]:
# Refresh the 200 retrieved candidates per test mention from the current
# faiss_encoder; `similarities` is not used in this cell.
candidates, codes, similarities = faiss_encoder.getCandidates(df_test["term"].tolist(), k=200)
# Overwrite the candidate columns on df_test in place.
df_test["candidates"] = candidates
df_test["codes"] = codes
# Evaluate the `kg_1` cross-encoder checkpoint (triplet type `kg`, depth 1 as
# parsed by evaluate) and log the run to W&B.
evaluate(f"../../models/cross-encoders/SapBERT_Multilingue_XLMR-large/cef_{CORPUS.lower()}_SapBERT_Multilingue_XLMR-large_kg_1_cand_200_epoch_1_bs_64", mapping_dict, df_test, df_link_train, df_gaz, CORPUS, TOP_K_VALUES)

Encoding: 100%|██████████| 45/45 [00:04<00:00, 10.00it/s]


Reranking candidates: 100%|██████████| 2848/2848 [22:04<00:00,  2.15it/s]


0,1
Accuracy/Gold_Standard,▁▃▅▆▇█
Accuracy/Unseen_Codes,▁▃▅▆▇█
Accuracy/Unseen_Mentions,▁▂▄▆▇█
GS-1,▁
GS-100,▁
GS-200,▁
GS-25,▁
GS-5,▁
GS-50,▁
UC-1,▁

0,1
Accuracy/Gold_Standard,0.85428
Accuracy/Unseen_Codes,0.70327
Accuracy/Unseen_Mentions,0.67647
GS-1,0.32268
GS-100,0.79916
GS-200,0.85428
GS-25,0.62781
GS-5,0.47296
GS-50,0.71278
UC-1,0.14585


In [8]:
# Refresh the 200 retrieved candidates per test mention from the current
# faiss_encoder; `similarities` is not used in this cell.
candidates, codes, similarities = faiss_encoder.getCandidates(df_test["term"].tolist(), k=200)
# Overwrite the candidate columns on df_test in place.
df_test["candidates"] = candidates
df_test["codes"] = codes
# Evaluate the `bkg_1` cross-encoder checkpoint and log the run to W&B.
# `mapping_dict` is passed as the second argument, as evaluate's signature
# requires; without it the call raises TypeError on a fresh kernel.
evaluate(f"../../models/cross-encoders/SapBERT_Multilingue_XLMR-large/cef_{CORPUS.lower()}_SapBERT_Multilingue_XLMR-large_bkg_1_cand_200_epoch_1_bs_64", mapping_dict, df_test, df_link_train, df_gaz, CORPUS, TOP_K_VALUES)

Encoding: 100%|██████████| 45/45 [00:04<00:00, 10.02it/s]


Reranking candidates: 100%|██████████| 2848/2848 [21:58<00:00,  2.16it/s]


0,1
Accuracy/Gold_Standard,▁▃▅▆▇█
Accuracy/Unseen_Codes,▁▃▅▆▆█
Accuracy/Unseen_Mentions,▁▃▅▆▆█
GS-1,▁
GS-100,▁
GS-200,▁
GS-25,▁
GS-5,▁
GS-50,▁
UC-1,▁

0,1
Accuracy/Gold_Standard,0.85428
Accuracy/Unseen_Codes,0.70327
Accuracy/Unseen_Mentions,0.67647
GS-1,0.47999
GS-100,0.77739
GS-200,0.85428
GS-25,0.70506
GS-5,0.59199
GS-50,0.74298
UC-1,0.20034


In [11]:
# Switch the candidate retriever to the locally stored "parents" bi-encoder
# checkpoint; this rebinds the notebook-wide `faiss_encoder`.
BIENCODER_PATH = "../../models/spanish_sapbert_models/sapbert_15_parents_1epoch/"
faiss_encoder = faiss_enc.FaissEncoder(BIENCODER_PATH, F_TYPE, MAX_LENGTH, df_train)
faiss_encoder.fitFaiss()
# Retrieve 200 candidates per test mention with the new retriever and overwrite
# the candidate columns on df_test in place. `similarities` is not used here.
candidates, codes, similarities = faiss_encoder.getCandidates(df_test["term"].tolist(), k=200)
df_test["candidates"] = candidates
df_test["codes"] = codes

Encoding: 100%|██████████| 5218/5218 [01:04<00:00, 81.42it/s]
Encoding: 100%|██████████| 45/45 [00:01<00:00, 40.34it/s]


In [15]:
# Refresh the 200 retrieved candidates per test mention from the current
# faiss_encoder; `similarities` is not used in this cell.
candidates, codes, similarities = faiss_encoder.getCandidates(df_test["term"].tolist(), k=200)
# Overwrite the candidate columns on df_test in place.
df_test["candidates"] = candidates
df_test["codes"] = codes
# Evaluate the Spanish_SapBERT_parents `sim` cross-encoder checkpoint and log
# the run to W&B; mapping_dict maps this model name to "ClinLinker-KB-P".
evaluate(f"../../models/cross-encoders/Spanish_SapBERT_parents/cef_{CORPUS.lower()}_Spanish_SapBERT_parents_sim_cand_200_epoch_1_bs_128", mapping_dict, df_test, df_link_train, df_gaz, CORPUS, TOP_K_VALUES)

Encoding: 100%|██████████| 45/45 [00:01<00:00, 39.79it/s]


Reranking candidates: 100%|██████████| 2848/2848 [05:01<00:00,  9.43it/s]


0,1
Accuracy/Gold_Standard,▁▄▇▇██
Accuracy/Unseen_Codes,▁▄▇▇██
Accuracy/Unseen_Mentions,▁▄▇▇██
GS-1,▁
GS-100,▁
GS-200,▁
GS-25,▁
GS-5,▁
GS-50,▁
UC-1,▁

0,1
Accuracy/Gold_Standard,0.92837
Accuracy/Unseen_Codes,0.8399
Accuracy/Unseen_Mentions,0.82445
GS-1,0.59129
GS-100,0.92135
GS-200,0.92837
GS-25,0.86798
GS-5,0.74192
GS-50,0.90063
UC-1,0.3202


In [17]:
# Switch the candidate retriever to the locally stored "grandparents"
# bi-encoder checkpoint; this rebinds the notebook-wide `faiss_encoder`.
BIENCODER_PATH = "../../models/spanish_sapbert_models/sapbert_15_grandparents_1epoch/"
faiss_encoder = faiss_enc.FaissEncoder(BIENCODER_PATH, F_TYPE, MAX_LENGTH, df_train)
faiss_encoder.fitFaiss()
# Retrieve 200 candidates per test mention with the new retriever and overwrite
# the candidate columns on df_test in place. `similarities` is not used here.
candidates, codes, similarities = faiss_encoder.getCandidates(df_test["term"].tolist(), k=200)
df_test["candidates"] = candidates
df_test["codes"] = codes

Encoding: 100%|██████████| 5218/5218 [01:02<00:00, 83.31it/s]
Encoding: 100%|██████████| 45/45 [00:01<00:00, 40.54it/s]


In [18]:
# Refresh the 200 retrieved candidates per test mention from the current
# faiss_encoder; `similarities` is not used in this cell.
candidates, codes, similarities = faiss_encoder.getCandidates(df_test["term"].tolist(), k=200)
# Overwrite the candidate columns on df_test in place.
df_test["candidates"] = candidates
df_test["codes"] = codes
# Evaluate the Spanish_SapBERT_grandparents `sim` cross-encoder checkpoint and
# log the run to W&B; mapping_dict maps this model name to "ClinLinker-KB-GP".
evaluate(f"../../models/cross-encoders/Spanish_SapBERT_grandparents/cef_{CORPUS.lower()}_Spanish_SapBERT_grandparents_sim_cand_200_epoch_1_bs_128", mapping_dict, df_test, df_link_train, df_gaz, CORPUS, TOP_K_VALUES)

Encoding: 100%|██████████| 45/45 [00:01<00:00, 38.51it/s]


cef_symptemist_Spanish_SapBERT_grandparents_sim_cand_200_epoch_1_bs_128


Reranking candidates: 100%|██████████| 2848/2848 [05:09<00:00,  9.19it/s]


0,1
Accuracy/Gold_Standard,▁▄▇▇██
Accuracy/Unseen_Codes,▁▄▇▇██
Accuracy/Unseen_Mentions,▁▄▇▇██
GS-1,▁
GS-100,▁
GS-200,▁
GS-25,▁
GS-5,▁
GS-50,▁
UC-1,▁

0,1
Accuracy/Gold_Standard,0.93013
Accuracy/Unseen_Codes,0.84409
Accuracy/Unseen_Mentions,0.82996
GS-1,0.59059
GS-100,0.92486
GS-200,0.93013
GS-25,0.87254
GS-5,0.74508
GS-50,0.90379
UC-1,0.3135
