In [1]:
import os, sys
import pandas as pd
import json

sys.path.append(os.path.join(os.getcwd(), '../src'))
from crossEncoder import CrossEncoderReranker
import faissEncoder as faiss_enc
from metrics import calculate_topk_accuracy
from utils import *


In [2]:
# --- Experiment configuration -------------------------------------------------
# Corpus under evaluation; must be one of the keys of ClinLinker_model below.
CORPUS = "DisTEMIST"

# ClinLinker variant used for each corpus (looked up later as ClinLinker_model[CORPUS]).
ClinLinker_model = dict(
    DisTEMIST="ClinLinker-KB-GP",
    MedProcNER="ClinLinker-KB-GP",
    SympTEMIST="ClinLinker-KB-P",
)

# F_TYPE is forwarded to FaissEncoder; MAX_LENGTH goes to both FaissEncoder and
# CrossEncoderReranker (as max_seq_length).
F_TYPE, MAX_LENGTH = "FlatIP", 256

# Root directory handed to load_corpus_data.
DATA_PATH = "../../data/"

# Cut-offs at which top-k accuracy is reported.
TOP_K_VALUES = [1, 5, 25, 50, 100, 200]


In [3]:
# Top-k accuracy accumulators, one dict per evaluation set; later cells add one
# entry per model/variant.
um_results = {}
uc_results = {}

# Evaluation frames for the selected corpus; "code" is parsed as str.
tsv_read_options = {"sep": "\t", "dtype": {"code": str}}
um_df = pd.read_csv(f"../data/{CORPUS}/df_um.tsv", **tsv_read_options)
uc_df = pd.read_csv(f"../data/{CORPUS}/df_uc.tsv", **tsv_read_options)

# Only the second and third values returned by load_corpus_data are needed here.
_, train_df, gaz_df = load_corpus_data(DATA_PATH, CORPUS)

# Stack the (term, code) pairs of both frames into the single table that is
# handed to FaissEncoder in the retrieval cells.
pair_columns = ["term", "code"]
train_gaz_df = pd.concat(
    [train_df[pair_columns], gaz_df[pair_columns]],
    ignore_index=True,
)

In [4]:
# First-stage retrieval with the multilingual SapBERT (XLM-R large) bi-encoder.
MODEL = "cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR-large"

faiss_encoder = faiss_enc.FaissEncoder(MODEL, F_TYPE, MAX_LENGTH, train_gaz_df)
faiss_encoder.fitFaiss()

# For each evaluation frame, fetch 200 candidates per term and attach the
# candidate strings and their codes as new columns on a copy of the frame.
xlmr_retrieved = []
for split_df in (um_df, uc_df):
    candidates, codes, _ = faiss_encoder.getCandidates(split_df["term"].tolist(), k=200)
    split_preds = split_df.copy()
    split_preds["candidates"] = candidates
    split_preds["codes"] = codes
    xlmr_retrieved.append(split_preds)

xlmr_um_preds, xlmr_uc_preds = xlmr_retrieved

# The encoder is not needed once the candidates are stored; drop the reference.
del faiss_encoder


  return self.fget.__get__(instance, owner)()


Encoding:   0%|          | 0/4716 [00:00<?, ?it/s]

Encoding:   0%|          | 0/22 [00:00<?, ?it/s]

Encoding:   0%|          | 0/18 [00:00<?, ?it/s]

In [5]:
# Rerank the XLM-R SapBERT candidates with the "sim" cross-encoder checkpoint and
# record top-k accuracy for both evaluation sets under the same result key.
result_key = MODEL.split("/")[-1] + "-sim"
crossreranker = CrossEncoderReranker(
    model_name=f"../../models/NEL/cross-encoders/SapBERT_Multilingue_XLMR-large/cef_{CORPUS.lower()}_SapBERT_Multilingue_XLMR-large_sim_cand_200_epoch_1_bs_64/",
    model_type="st",
    max_seq_length=MAX_LENGTH,
)
for results, preds in ((um_results, xlmr_um_preds), (uc_results, xlmr_uc_preds)):
    # Work on a deep copy so the retrieved candidates stay reusable by other cells.
    reranked = crossreranker.rerank_candidates(preds.copy(deep=True), "term", "candidates", "codes")
    results[result_key] = calculate_topk_accuracy(reranked, TOP_K_VALUES)

Reranking candidates: 100%|██████████| 1375/1375 [13:36<00:00,  1.68it/s]
Reranking candidates: 100%|██████████| 1115/1115 [11:02<00:00,  1.68it/s]


In [6]:
# Rerank the XLM-R SapBERT candidates with the "kg_1" cross-encoder checkpoint and
# record top-k accuracy for both evaluation sets under the same result key.
result_key = MODEL.split("/")[-1] + "-kg-1"
crossreranker = CrossEncoderReranker(
    model_name=f"../../models/NEL/cross-encoders/SapBERT_Multilingue_XLMR-large/cef_{CORPUS.lower()}_SapBERT_Multilingue_XLMR-large_kg_1_cand_200_epoch_1_bs_64/",
    model_type="st",
    max_seq_length=MAX_LENGTH,
)
for results, preds in ((um_results, xlmr_um_preds), (uc_results, xlmr_uc_preds)):
    # Work on a deep copy so the retrieved candidates stay reusable by other cells.
    reranked = crossreranker.rerank_candidates(preds.copy(deep=True), "term", "candidates", "codes")
    results[result_key] = calculate_topk_accuracy(reranked, TOP_K_VALUES)

Reranking candidates: 100%|██████████| 1375/1375 [13:49<00:00,  1.66it/s]
Reranking candidates: 100%|██████████| 1115/1115 [10:55<00:00,  1.70it/s]


In [7]:
# Rerank the XLM-R SapBERT candidates with the "kg_2" cross-encoder checkpoint and
# record top-k accuracy for both evaluation sets under the same result key.
result_key = MODEL.split("/")[-1] + "-kg-2"
crossreranker = CrossEncoderReranker(
    model_name=f"../../models/NEL/cross-encoders/SapBERT_Multilingue_XLMR-large/cef_{CORPUS.lower()}_SapBERT_Multilingue_XLMR-large_kg_2_cand_200_epoch_1_bs_64/",
    model_type="st",
    max_seq_length=MAX_LENGTH,
)
for results, preds in ((um_results, xlmr_um_preds), (uc_results, xlmr_uc_preds)):
    # Work on a deep copy so the retrieved candidates stay reusable by other cells.
    reranked = crossreranker.rerank_candidates(preds.copy(deep=True), "term", "candidates", "codes")
    results[result_key] = calculate_topk_accuracy(reranked, TOP_K_VALUES)

Reranking candidates: 100%|██████████| 1375/1375 [13:34<00:00,  1.69it/s]
Reranking candidates: 100%|██████████| 1115/1115 [10:45<00:00,  1.73it/s]


In [8]:
# Rerank the XLM-R SapBERT candidates with the "bkg_1" cross-encoder checkpoint and
# record top-k accuracy for both evaluation sets under the same result key.
result_key = MODEL.split("/")[-1] + "-bkg-1"
crossreranker = CrossEncoderReranker(
    model_name=f"../../models/NEL/cross-encoders/SapBERT_Multilingue_XLMR-large/cef_{CORPUS.lower()}_SapBERT_Multilingue_XLMR-large_bkg_1_cand_200_epoch_1_bs_64/",
    model_type="st",
    max_seq_length=MAX_LENGTH,
)
for results, preds in ((um_results, xlmr_um_preds), (uc_results, xlmr_uc_preds)):
    # Work on a deep copy so the retrieved candidates stay reusable by other cells.
    reranked = crossreranker.rerank_candidates(preds.copy(deep=True), "term", "candidates", "codes")
    results[result_key] = calculate_topk_accuracy(reranked, TOP_K_VALUES)

Reranking candidates: 100%|██████████| 1375/1375 [13:40<00:00,  1.68it/s]
Reranking candidates: 100%|██████████| 1115/1115 [10:50<00:00,  1.71it/s]


In [10]:
# Rerank the XLM-R SapBERT candidates with the "bkg_2" cross-encoder checkpoint and
# record top-k accuracy for both evaluation sets under the same result key.
result_key = MODEL.split("/")[-1] + "-bkg-2"
crossreranker = CrossEncoderReranker(
    model_name=f"../../models/NEL/cross-encoders/SapBERT_Multilingue_XLMR-large/cef_{CORPUS.lower()}_SapBERT_Multilingue_XLMR-large_bkg_2_cand_200_epoch_1_bs_64/",
    model_type="st",
    max_seq_length=MAX_LENGTH,
)
for results, preds in ((um_results, xlmr_um_preds), (uc_results, xlmr_uc_preds)):
    # Work on a deep copy so the retrieved candidates stay reusable by other cells.
    reranked = crossreranker.rerank_candidates(preds.copy(deep=True), "term", "candidates", "codes")
    results[result_key] = calculate_topk_accuracy(reranked, TOP_K_VALUES)

Reranking candidates: 100%|██████████| 1375/1375 [12:47<00:00,  1.79it/s]
Reranking candidates: 100%|██████████| 1115/1115 [10:54<00:00,  1.70it/s]


In [11]:
# Tabulate the um_results accumulator: one row per model/variant (its key goes
# into the "name" column), one column per top-k cut-off; show up to 20 rows.
(
    pd.DataFrame.from_dict(um_results, orient="index")
    .rename_axis("name")
    .reset_index()
    .head(20)
)

Unnamed: 0,name,1,5,25,50,100,200
0,SapBERT-UMLS-2020AB-all-lang-from-XLMR-large-sim,0.373818,0.583273,0.689455,0.721455,0.741091,0.749091
1,SapBERT-UMLS-2020AB-all-lang-from-XLMR-large-kg-1,0.284364,0.448727,0.581818,0.634182,0.672,0.749091
2,SapBERT-UMLS-2020AB-all-lang-from-XLMR-large-kg-2,0.050909,0.144727,0.332364,0.453818,0.608,0.749091
3,SapBERT-UMLS-2020AB-all-lang-from-XLMR-large-b...,0.088,0.128,0.341818,0.444364,0.587636,0.749091
4,SapBERT-UMLS-2020AB-all-lang-from-XLMR-large-b...,0.357818,0.529455,0.643636,0.68,0.712727,0.749091


In [12]:
# Tabulate the uc_results accumulator: one row per model/variant (its key goes
# into the "name" column), one column per top-k cut-off; show up to 20 rows.
(
    pd.DataFrame.from_dict(uc_results, orient="index")
    .rename_axis("name")
    .reset_index()
    .head(20)
)

Unnamed: 0,name,1,5,25,50,100,200
0,SapBERT-UMLS-2020AB-all-lang-from-XLMR-large-sim,0.408072,0.591031,0.69148,0.719283,0.740807,0.747085
1,SapBERT-UMLS-2020AB-all-lang-from-XLMR-large-kg-1,0.346188,0.487892,0.593722,0.641256,0.678924,0.747085
2,SapBERT-UMLS-2020AB-all-lang-from-XLMR-large-kg-2,0.068161,0.172197,0.350673,0.460987,0.609865,0.747085
3,SapBERT-UMLS-2020AB-all-lang-from-XLMR-large-b...,0.114798,0.146188,0.356054,0.456502,0.596413,0.747085
4,SapBERT-UMLS-2020AB-all-lang-from-XLMR-large-b...,0.403587,0.560538,0.651121,0.683408,0.713004,0.747085


In [13]:
# Pick the ClinLinker bi-encoder (be_path) and the matching "sim" cross-encoder
# checkpoint (ce_path) for the configured corpus. The other cross-encoder
# variants are derived from ce_path in the reranking cells.
clinlinker_variant = ClinLinker_model[CORPUS]
if clinlinker_variant == "ClinLinker-KB-GP":
    be_path = "../../models/NEL/spanish_sapbert_models/sapbert_15_grandparents_1epoch/"
    ce_path = f"../../models/NEL/cross-encoders/Spanish_SapBERT_grandparents/cef_{CORPUS.lower()}_Spanish_SapBERT_grandparents_sim_cand_200_epoch_1_bs_128/"
elif clinlinker_variant == "ClinLinker-KB-P":
    be_path = "../../models/NEL/spanish_sapbert_models/sapbert_15_parents_1epoch/"
    ce_path = f"../../models/NEL/cross-encoders/Spanish_SapBERT_parents/cef_{CORPUS.lower()}_Spanish_SapBERT_parents_sim_cand_200_epoch_1_bs_128/"
else:
    # Fail here with a clear message instead of a NameError on be_path further down.
    raise ValueError(
        f"Unsupported ClinLinker model {clinlinker_variant!r} configured for corpus {CORPUS!r}"
    )

# First-stage retrieval with the selected bi-encoder over train_gaz_df.
faiss_encoder = faiss_enc.FaissEncoder(be_path, F_TYPE, MAX_LENGTH, train_gaz_df)
faiss_encoder.fitFaiss()

# Fetch 200 candidates per term for the um evaluation frame and attach them,
# together with their codes, to a copy of the frame.
candidates, codes, _ = faiss_encoder.getCandidates(um_df["term"].tolist(), k=200)
cl_um_preds = um_df.copy()
cl_um_preds["candidates"] = candidates
cl_um_preds["codes"] = codes

# Same for the uc evaluation frame.
candidates, codes, _ = faiss_encoder.getCandidates(uc_df["term"].tolist(), k=200)
cl_uc_preds = uc_df.copy()
cl_uc_preds["candidates"] = candidates
cl_uc_preds["codes"] = codes

# The encoder is not needed once the candidates are stored; drop the reference.
del faiss_encoder

Encoding:   0%|          | 0/4716 [00:00<?, ?it/s]

Encoding:   0%|          | 0/22 [00:00<?, ?it/s]

Encoding:   0%|          | 0/18 [00:00<?, ?it/s]

In [14]:
# Rerank the ClinLinker candidates with the "sim" cross-encoder checkpoint and
# record top-k accuracy for both evaluation sets.
# The result key is the bi-encoder directory name. basename(normpath(...)) replaces
# the fixed split("/")[5] index, so the key no longer depends on the path depth.
result_key = os.path.basename(os.path.normpath(be_path)) + "-sim"
crossreranker = CrossEncoderReranker(
    model_name=ce_path,
    model_type="st",
    max_seq_length=MAX_LENGTH,
)
for results, preds in ((um_results, cl_um_preds), (uc_results, cl_uc_preds)):
    # Work on a deep copy so the retrieved candidates stay reusable by other cells.
    reranked = crossreranker.rerank_candidates(preds.copy(deep=True), "term", "candidates", "codes")
    results[result_key] = calculate_topk_accuracy(reranked, TOP_K_VALUES)

Reranking candidates: 100%|██████████| 1375/1375 [03:19<00:00,  6.90it/s]
Reranking candidates: 100%|██████████| 1115/1115 [02:41<00:00,  6.91it/s]


In [15]:
# Rerank the ClinLinker candidates with the "kg_1" cross-encoder checkpoint and
# record top-k accuracy for both evaluation sets.
# The result key is the bi-encoder directory name. basename(normpath(...)) replaces
# the fixed split("/")[5] index, so the key no longer depends on the path depth.
result_key = os.path.basename(os.path.normpath(be_path)) + "-kg-1"
crossreranker = CrossEncoderReranker(
    # Anchor on "_sim_cand_" so only the variant segment of the checkpoint name is
    # swapped, even if another part of the path ever contains "sim".
    model_name=ce_path.replace("_sim_cand_", "_kg_1_cand_"),
    model_type="st",
    max_seq_length=MAX_LENGTH,
)
for results, preds in ((um_results, cl_um_preds), (uc_results, cl_uc_preds)):
    # Work on a deep copy so the retrieved candidates stay reusable by other cells.
    reranked = crossreranker.rerank_candidates(preds.copy(deep=True), "term", "candidates", "codes")
    results[result_key] = calculate_topk_accuracy(reranked, TOP_K_VALUES)

Reranking candidates: 100%|██████████| 1375/1375 [03:19<00:00,  6.88it/s]
Reranking candidates: 100%|██████████| 1115/1115 [02:41<00:00,  6.89it/s]


In [16]:
# Rerank the ClinLinker candidates with the "kg_2" cross-encoder checkpoint and
# record top-k accuracy for both evaluation sets.
# The result key is the bi-encoder directory name. basename(normpath(...)) replaces
# the fixed split("/")[5] index, so the key no longer depends on the path depth.
result_key = os.path.basename(os.path.normpath(be_path)) + "-kg-2"
crossreranker = CrossEncoderReranker(
    # Anchor on "_sim_cand_" so only the variant segment of the checkpoint name is
    # swapped, even if another part of the path ever contains "sim".
    model_name=ce_path.replace("_sim_cand_", "_kg_2_cand_"),
    model_type="st",
    max_seq_length=MAX_LENGTH,
)
for results, preds in ((um_results, cl_um_preds), (uc_results, cl_uc_preds)):
    # Work on a deep copy so the retrieved candidates stay reusable by other cells.
    reranked = crossreranker.rerank_candidates(preds.copy(deep=True), "term", "candidates", "codes")
    results[result_key] = calculate_topk_accuracy(reranked, TOP_K_VALUES)

Reranking candidates: 100%|██████████| 1375/1375 [03:19<00:00,  6.90it/s]
Reranking candidates: 100%|██████████| 1115/1115 [02:41<00:00,  6.91it/s]


In [17]:
# Rerank the ClinLinker candidates with the "bkg_1" cross-encoder checkpoint and
# record top-k accuracy for both evaluation sets.
# The result key is the bi-encoder directory name. basename(normpath(...)) replaces
# the fixed split("/")[5] index, so the key no longer depends on the path depth.
result_key = os.path.basename(os.path.normpath(be_path)) + "-bkg-1"
crossreranker = CrossEncoderReranker(
    # Anchor on "_sim_cand_" so only the variant segment of the checkpoint name is
    # swapped, even if another part of the path ever contains "sim".
    model_name=ce_path.replace("_sim_cand_", "_bkg_1_cand_"),
    model_type="st",
    max_seq_length=MAX_LENGTH,
)
for results, preds in ((um_results, cl_um_preds), (uc_results, cl_uc_preds)):
    # Work on a deep copy so the retrieved candidates stay reusable by other cells.
    reranked = crossreranker.rerank_candidates(preds.copy(deep=True), "term", "candidates", "codes")
    results[result_key] = calculate_topk_accuracy(reranked, TOP_K_VALUES)

Reranking candidates: 100%|██████████| 1375/1375 [03:20<00:00,  6.86it/s]
Reranking candidates: 100%|██████████| 1115/1115 [02:41<00:00,  6.90it/s]


In [18]:
# Rerank the ClinLinker candidates with the "bkg_2" cross-encoder checkpoint and
# record top-k accuracy for both evaluation sets.
# The result key is the bi-encoder directory name. basename(normpath(...)) replaces
# the fixed split("/")[5] index, so the key no longer depends on the path depth.
result_key = os.path.basename(os.path.normpath(be_path)) + "-bkg-2"
crossreranker = CrossEncoderReranker(
    # Anchor on "_sim_cand_" so only the variant segment of the checkpoint name is
    # swapped, even if another part of the path ever contains "sim".
    model_name=ce_path.replace("_sim_cand_", "_bkg_2_cand_"),
    model_type="st",
    max_seq_length=MAX_LENGTH,
)
for results, preds in ((um_results, cl_um_preds), (uc_results, cl_uc_preds)):
    # Work on a deep copy so the retrieved candidates stay reusable by other cells.
    reranked = crossreranker.rerank_candidates(preds.copy(deep=True), "term", "candidates", "codes")
    results[result_key] = calculate_topk_accuracy(reranked, TOP_K_VALUES)

Reranking candidates: 100%|██████████| 1375/1375 [03:18<00:00,  6.94it/s]
Reranking candidates: 100%|██████████| 1115/1115 [02:40<00:00,  6.94it/s]


In [19]:
# Write the um_results table (one row per model/variant, key in the "name" column)
# as a TSV file. to_csv does not create missing parent directories, so make sure the
# per-corpus results folder exists first; otherwise the write fails after the long
# reranking runs.
um_results_path = f"../results/{CORPUS}/um_results.tsv"
os.makedirs(os.path.dirname(um_results_path), exist_ok=True)
pd.DataFrame.from_dict(um_results, orient='index').reset_index().rename(columns={'index': 'name'}).to_csv(um_results_path, sep='\t', index=False)

In [20]:
# Write the uc_results table (one row per model/variant, key in the "name" column)
# as a TSV file. to_csv does not create missing parent directories, so make sure the
# per-corpus results folder exists first; otherwise the write fails after the long
# reranking runs.
uc_results_path = f"../results/{CORPUS}/uc_results.tsv"
os.makedirs(os.path.dirname(uc_results_path), exist_ok=True)
pd.DataFrame.from_dict(uc_results, orient='index').reset_index().rename(columns={'index': 'name'}).to_csv(uc_results_path, sep='\t', index=False)