In [1]:
import os, sys
import pandas as pd
import json

sys.path.append(os.path.join(os.getcwd(), '../src'))
from crossEncoder import CrossEncoderReranker
import faissEncoder as faiss_enc
from metrics import calculate_topk_accuracy
from utils import *


In [2]:
# --- Experiment configuration -------------------------------------------------

# Corpus under evaluation; it is used as the lookup key into ClinLinker_model
# and is interpolated into the data/result paths of later cells.
CORPUS = "MedProcNER"

# Corpus name -> ClinLinker variant evaluated for that corpus.
ClinLinker_model = dict(
    DisTEMIST="ClinLinker-KB-GP",
    MedProcNER="ClinLinker-KB-GP",
    SympTEMIST="ClinLinker-KB-P",
)

# Both values are forwarded unchanged to faiss_enc.FaissEncoder.
# NOTE(review): "FlatIP" presumably selects a flat inner-product FAISS index and
# MAX_LENGTH a tokenizer truncation length — confirm in faissEncoder.
F_TYPE, MAX_LENGTH = "FlatIP", 256

# Root directory handed to load_corpus_data.
DATA_PATH = "../../data/"

# Cut-offs passed to calculate_topk_accuracy.
TOP_K_VALUES = [1, 5, 25, 50, 100, 200]


In [3]:
# Accumulators for the accuracy results of each evaluated model, one per split.
um_results = {}
uc_results = {}

# Evaluation splits for the selected corpus; "code" is forced to str on read.
# NOTE(review): "um"/"uc" presumably mean unseen mentions / unseen codes — confirm.
# NOTE(review): these files are read from ../data/ while DATA_PATH points at
# ../../data/ — confirm the two locations are intentionally different.
um_df = pd.read_csv(
    f"../data/{CORPUS}/df_um.tsv",
    sep="\t",
    dtype={"code": str},
)
uc_df = pd.read_csv(
    f"../data/{CORPUS}/df_uc.tsv",
    sep="\t",
    dtype={"code": str},
)

# The first value returned by load_corpus_data is not needed here.
_, train_df, gaz_df = load_corpus_data(DATA_PATH, CORPUS)

# Stack the (term, code) pairs of the training set and the gazetteer into the
# single frame that is later handed to the FAISS encoder.
term_code_cols = ["term", "code"]
train_gaz_df = pd.concat(
    [train_df[term_code_cols], gaz_df[term_code_cols]],
    ignore_index=True,
)

In [4]:
# Baseline bi-encoder: build the FAISS encoder from the Hugging Face SapBERT
# checkpoint over the train + gazetteer pairs, then fit it.
MODEL = "cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR-large"
faiss_encoder = faiss_enc.FaissEncoder(MODEL, F_TYPE, MAX_LENGTH, train_gaz_df)
faiss_encoder.fitFaiss()

# Retrieve 200 candidates per query term for the "um" split and attach them
# (together with their codes) to a copy of the split.
candidates, codes, _ = faiss_encoder.getCandidates(um_df["term"].tolist(), k=200)
xlmr_um_preds = um_df.assign(candidates=candidates, codes=codes)

# Same retrieval for the "uc" split.
candidates, codes, _ = faiss_encoder.getCandidates(uc_df["term"].tolist(), k=200)
xlmr_uc_preds = uc_df.assign(candidates=candidates, codes=codes)

# Results are keyed by the checkpoint name without its organisation prefix,
# suffixed with "-bi".
baseline_key = MODEL.split("/")[-1] + "-bi"
um_results[baseline_key] = calculate_topk_accuracy(xlmr_um_preds, TOP_K_VALUES)
uc_results[baseline_key] = calculate_topk_accuracy(xlmr_uc_preds, TOP_K_VALUES)


Encoding:   0%|          | 0/7486 [00:00<?, ?it/s]

Encoding:   0%|          | 0/28 [00:00<?, ?it/s]

Encoding:   0%|          | 0/14 [00:00<?, ?it/s]

In [5]:
# ClinLinker bi-encoder: select the checkpoints that match the corpus, retrieve
# candidates for both evaluation splits and record their top-k accuracy.
clinlinker_variant = ClinLinker_model[CORPUS]
if clinlinker_variant == "ClinLinker-KB-GP":
    be_path = "../../models/NEL/spanish_sapbert_models/sapbert_15_grandparents_1epoch/"
    ce_path = f"../../models/NEL/cross-encoders/Spanish_SapBERT_grandparents/cef_{CORPUS.lower()}_Spanish_SapBERT_grandparents_sim_cand_200_epoch_1_bs_128/"
elif clinlinker_variant == "ClinLinker-KB-P":
    be_path = "../../models/NEL/spanish_sapbert_models/sapbert_15_parents_1epoch/"
    ce_path = f"../../models/NEL/cross-encoders/Spanish_SapBERT_parents/cef_{CORPUS.lower()}_Spanish_SapBERT_parents_sim_cand_200_epoch_1_bs_128/"
else:
    # Fail loudly: without this branch be_path/ce_path would be undefined, or
    # silently reused from an earlier run still alive in the kernel.
    raise ValueError(
        f"Unsupported ClinLinker variant {clinlinker_variant!r} for corpus {CORPUS!r}"
    )
# NOTE(review): ce_path is not consumed in this cell; presumably it feeds a
# cross-encoder re-ranking step elsewhere — confirm.

faiss_encoder = faiss_enc.FaissEncoder(be_path, F_TYPE, MAX_LENGTH, train_gaz_df)
faiss_encoder.fitFaiss()

# Retrieve 200 candidates per query term and attach them, with their codes, to a
# copy of each evaluation split.
candidates, codes, _ = faiss_encoder.getCandidates(um_df["term"].tolist(), k=200)
cl_um_preds = um_df.assign(candidates=candidates, codes=codes)

candidates, codes, _ = faiss_encoder.getCandidates(uc_df["term"].tolist(), k=200)
cl_uc_preds = uc_df.assign(candidates=candidates, codes=codes)

# Key the results by the checkpoint directory name. Taking the last path
# component (ignoring the trailing slash) keeps working if the models are moved
# to a directory at a different depth, unlike a fixed split index.
clinlinker_key = be_path.rstrip("/").rsplit("/", 1)[-1] + "-bi"
um_results[clinlinker_key] = calculate_topk_accuracy(cl_um_preds, TOP_K_VALUES)
uc_results[clinlinker_key] = calculate_topk_accuracy(cl_uc_preds, TOP_K_VALUES)

Encoding:   0%|          | 0/7486 [00:00<?, ?it/s]

Encoding:   0%|          | 0/28 [00:00<?, ?it/s]

Encoding:   0%|          | 0/14 [00:00<?, ?it/s]

In [6]:
# One row per evaluated model for the "um" split: the dict keys become a "name"
# column and the table is written as a tab-separated file.
(
    pd.DataFrame.from_dict(um_results, orient='index')
    .reset_index()
    .rename(columns={'index': 'name'})
    .to_csv(f"../results/{CORPUS}/um_results-bi.tsv", sep='\t', index=False)
)

In [7]:
# One row per evaluated model for the "uc" split: the dict keys become a "name"
# column and the table is written as a tab-separated file.
(
    pd.DataFrame.from_dict(uc_results, orient='index')
    .reset_index()
    .rename(columns={'index': 'name'})
    .to_csv(f"../results/{CORPUS}/uc_results-bi.tsv", sep='\t', index=False)
)