In [43]:
import os
import sys
import warnings
import re
warnings.filterwarnings("ignore")
import pandas as pd

sys.path.append(os.path.join(os.getcwd(), '../src'))
from crossEncoder import CrossEncoderReranker
import faissEncoder as faiss_enc
import utils
from logger import setup_custom_logger  

In [44]:
# --- Experiment configuration ------------------------------------------------
# Corpus identifier: handed to utils.load_corpus_data and interpolated into
# model and result paths further down the notebook.
CORPUS = "SympTEMIST"
# Cut-off values k forwarded to the evaluation helpers.
TOP_K_VALUES = [1, 5, 25, 50, 100, 200]
# Second positional argument of FaissEncoder (presumably the FAISS index type
# -- confirm in faissEncoder).
F_TYPE = "FlatIP"
# Length limit passed to FaissEncoder and, as max_seq_length, to the cross-encoders.
MAX_LENGTH = 256

# One accumulator per result table written at the end of the notebook
# (gs_results.tsv / uc_results.tsv / um_results.tsv). Every evaluated model
# appends one entry to each list.
res_gs = []
res_uc = []
res_um = []

In [45]:
# Load the three frames for the configured corpus (names suggest test split,
# training split and gazetteer -- see utils.load_corpus_data for the schema).
test_df, train_df, gaz_df = utils.load_corpus_data(CORPUS)

# Linking dictionary: the (code, term) columns of the training frame followed
# by those of the gazetteer, with a fresh 0..n-1 index.
link_gaz_df = pd.concat(
    [frame[["code", "term"]] for frame in (train_df, gaz_df)],
    ignore_index=True,
)

In [46]:
# Bi-encoder checkpoint evaluated below under the label "ClinLinker-KB-GP".
# The encoder is built over the linking dictionary (link_gaz_df) and then
# fitted; fitFaiss presumably encodes the dictionary terms and builds the
# index -- confirm in faissEncoder.
faiss_encoder = faiss_enc.FaissEncoder(
    "../../../models/NEL/spanish_sapbert_models/sapbert_15_grandparents_1epoch",
    F_TYPE,
    MAX_LENGTH,
    link_gaz_df,
)
faiss_encoder.fitFaiss()

Encoding:   0%|          | 0/5410 [00:00<?, ?it/s]

In [47]:
# Retrieve 200 candidates per test term; the third return value is not used.
candidates, codes, _ = faiss_encoder.getCandidates(test_df["term"].tolist(), k=200)

# Attach the retrieval output to a copy of the test frame (test_df itself is
# left untouched). This frame is reused later by the cross-encoder cell.
clinlinker_granparent_preds = test_df.assign(candidates=candidates, codes=codes)

gs_aux, uc_aux, um_aux = utils.evaluate_model(
    "ClinLinker-KB-GP", clinlinker_granparent_preds, train_df, gaz_df, TOP_K_VALUES
)
# Record the three result entries in their matching accumulators.
for bucket, metrics in zip((res_gs, res_uc, res_um), (gs_aux, uc_aux, um_aux)):
    bucket.append(metrics)

Encoding:   0%|          | 0/45 [00:00<?, ?it/s]

In [48]:
# Bi-encoder checkpoint evaluated below under the label "ClinLinker-KB-P".
# Rebinds faiss_encoder: the previous encoder object is no longer referenced
# by this name after the cell runs.
faiss_encoder = faiss_enc.FaissEncoder(
    "../../../models/NEL/spanish_sapbert_models/sapbert_15_parents_1epoch",
    F_TYPE,
    MAX_LENGTH,
    link_gaz_df,
)
faiss_encoder.fitFaiss()

Encoding:   0%|          | 0/5410 [00:00<?, ?it/s]

In [49]:
# Retrieve 200 candidates per test term; the third return value is not used.
candidates, codes, _ = faiss_encoder.getCandidates(test_df["term"].tolist(), k=200)

# Predictions live on a copy of the test frame; this frame is reused later
# by the matching cross-encoder cell.
clinlinker_parent_preds = test_df.assign(candidates=candidates, codes=codes)

gs_aux, uc_aux, um_aux = utils.evaluate_model(
    "ClinLinker-KB-P", clinlinker_parent_preds, train_df, gaz_df, TOP_K_VALUES
)
# Record the three result entries in their matching accumulators.
for bucket, metrics in zip((res_gs, res_uc, res_um), (gs_aux, uc_aux, um_aux)):
    bucket.append(metrics)

Encoding:   0%|          | 0/45 [00:00<?, ?it/s]

In [50]:
# Bi-encoder checkpoint evaluated below under the label "ClinLinker".
# Rebinds faiss_encoder to a new encoder fitted on the same linking dictionary.
faiss_encoder = faiss_enc.FaissEncoder(
    "../../../models/NEL/spanish_sapbert_models/sapbert_15_noparents_1epoch",
    F_TYPE,
    MAX_LENGTH,
    link_gaz_df,
)
faiss_encoder.fitFaiss()

Encoding:   0%|          | 0/5410 [00:00<?, ?it/s]

In [51]:
# Retrieve 200 candidates per test term; the third return value is not used.
candidates, codes, _ = faiss_encoder.getCandidates(test_df["term"].tolist(), k=200)

# Predictions live on a copy of the test frame; this frame is reused later
# by the matching cross-encoder cell.
clinlinker_preds = test_df.assign(candidates=candidates, codes=codes)

gs_aux, uc_aux, um_aux = utils.evaluate_model(
    "ClinLinker", clinlinker_preds, train_df, gaz_df, TOP_K_VALUES
)
# Record the three result entries in their matching accumulators.
for bucket, metrics in zip((res_gs, res_uc, res_um), (gs_aux, uc_aux, um_aux)):
    bucket.append(metrics)

Encoding:   0%|          | 0/45 [00:00<?, ?it/s]

In [52]:
# Bi-encoder whose checkpoint directory is derived from the lower-cased
# corpus name; evaluated below under the label f"{CORPUS}-bi-encoder".
faiss_encoder = faiss_enc.FaissEncoder(
    f"../../../models/NEL/corpus-specific_bi-encoders/{CORPUS.lower()}-biencoder",
    F_TYPE,
    MAX_LENGTH,
    link_gaz_df,
)
faiss_encoder.fitFaiss()

Encoding:   0%|          | 0/5410 [00:00<?, ?it/s]

In [53]:
# Retrieve 200 candidates per test term; the third return value is not used.
candidates, codes, _ = faiss_encoder.getCandidates(test_df["term"].tolist(), k=200)

# Predictions live on a copy of the test frame, leaving test_df untouched.
corpus_specific_preds = test_df.assign(candidates=candidates, codes=codes)

gs_aux, uc_aux, um_aux = utils.evaluate_model(
    f"{CORPUS}-bi-encoder", corpus_specific_preds, train_df, gaz_df, TOP_K_VALUES
)
# Record the three result entries in their matching accumulators.
for bucket, metrics in zip((res_gs, res_uc, res_um), (gs_aux, uc_aux, um_aux)):
    bucket.append(metrics)

Encoding:   0%|          | 0/45 [00:00<?, ?it/s]

In [54]:
# Baseline identified by a hub-style model name rather than a local path;
# evaluated below under the label "SapBERT-XLM-R-large".
faiss_encoder = faiss_enc.FaissEncoder(
    "cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR-large",
    F_TYPE,
    MAX_LENGTH,
    link_gaz_df,
)
faiss_encoder.fitFaiss()

Encoding:   0%|          | 0/5410 [00:00<?, ?it/s]

In [55]:
# Retrieve 200 candidates per test term; the third return value is not used.
candidates, codes, _ = faiss_encoder.getCandidates(test_df["term"].tolist(), k=200)

# Predictions live on a copy of the test frame, leaving test_df untouched.
baseline_preds = test_df.assign(candidates=candidates, codes=codes)

gs_aux, uc_aux, um_aux = utils.evaluate_model(
    "SapBERT-XLM-R-large", baseline_preds, train_df, gaz_df, TOP_K_VALUES
)
# Record the three result entries in their matching accumulators.
for bucket, metrics in zip((res_gs, res_uc, res_um), (gs_aux, uc_aux, um_aux)):
    bucket.append(metrics)

Encoding:   0%|          | 0/45 [00:00<?, ?it/s]

In [56]:
# Baseline identified by a hub-style model name; evaluated below under the
# label "SapBERT-XLM-R-base".
faiss_encoder = faiss_enc.FaissEncoder(
    "cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR",
    F_TYPE,
    MAX_LENGTH,
    link_gaz_df,
)
faiss_encoder.fitFaiss()

Encoding:   0%|          | 0/5410 [00:00<?, ?it/s]

In [57]:
# Retrieve 200 candidates per test term; the third return value is not used.
candidates, codes, _ = faiss_encoder.getCandidates(test_df["term"].tolist(), k=200)

# Predictions live on a copy of the test frame, leaving test_df untouched.
xlmr_base_preds = test_df.assign(candidates=candidates, codes=codes)

gs_aux, uc_aux, um_aux = utils.evaluate_model(
    "SapBERT-XLM-R-base", xlmr_base_preds, train_df, gaz_df, TOP_K_VALUES
)
# Record the three result entries in their matching accumulators.
for bucket, metrics in zip((res_gs, res_uc, res_um), (gs_aux, uc_aux, um_aux)):
    bucket.append(metrics)

Encoding:   0%|          | 0/45 [00:00<?, ?it/s]

In [58]:
# Baseline identified by a hub-style model name; evaluated below under the
# label "Roberta-base-biomedical-es".
# NOTE: the recorded output of this cell shows a warning that the pooler
# weights of this checkpoint are newly initialised on load.
faiss_encoder = faiss_enc.FaissEncoder(
    "PlanTL-GOB-ES/roberta-base-biomedical-clinical-es",
    F_TYPE,
    MAX_LENGTH,
    link_gaz_df,
)
faiss_encoder.fitFaiss()

Some weights of RobertaModel were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Encoding:   0%|          | 0/5410 [00:00<?, ?it/s]

In [59]:
# Retrieve 200 candidates per test term; the third return value is not used.
candidates, codes, _ = faiss_encoder.getCandidates(test_df["term"].tolist(), k=200)

# Predictions live on a copy of the test frame, leaving test_df untouched.
roberta_base_preds = test_df.assign(candidates=candidates, codes=codes)

gs_aux, uc_aux, um_aux = utils.evaluate_model(
    "Roberta-base-biomedical-es", roberta_base_preds, train_df, gaz_df, TOP_K_VALUES
)
# Record the three result entries in their matching accumulators.
for bucket, metrics in zip((res_gs, res_uc, res_um), (gs_aux, uc_aux, um_aux)):
    bucket.append(metrics)

Encoding:   0%|          | 0/45 [00:00<?, ?it/s]

In [60]:
# Cross-encoder stage for the "grandparents" model: the checkpoint path is
# derived from the lower-cased corpus name, and the reranker is evaluated on
# the candidates stored earlier in clinlinker_granparent_preds.
cross_encoder = CrossEncoderReranker(
    f"../../../models/NEL/cross-encoders/Spanish_SapBERT_grandparents/cef_{CORPUS.lower()}_Spanish_SapBERT_grandparents_sim_cand_200_epoch_1_bs_128/",
    model_type="mask",
    max_seq_length=MAX_LENGTH,
)
gs_aux, uc_aux, um_aux = utils.evaluate_crossencoder(
    "ClinLinker-KB-GP_CE",
    cross_encoder,
    clinlinker_granparent_preds,
    train_df,
    gaz_df,
    TOP_K_VALUES,
)
# Record the three result entries in their matching accumulators.
for bucket, metrics in zip((res_gs, res_uc, res_um), (gs_aux, uc_aux, um_aux)):
    bucket.append(metrics)

Reranking candidates: 100%|██████████| 2848/2848 [06:32<00:00,  7.26it/s]


In [61]:
# Cross-encoder stage for the "parents" model: the checkpoint path is derived
# from the lower-cased corpus name, and the reranker is evaluated on the
# candidates stored earlier in clinlinker_parent_preds.
cross_encoder = CrossEncoderReranker(
    f"../../../models/NEL/cross-encoders/Spanish_SapBERT_parents/cef_{CORPUS.lower()}_Spanish_SapBERT_parents_sim_cand_200_epoch_1_bs_128/",
    model_type="mask",
    max_seq_length=MAX_LENGTH,
)
gs_aux, uc_aux, um_aux = utils.evaluate_crossencoder(
    "ClinLinker-KB-P_CE",
    cross_encoder,
    clinlinker_parent_preds,
    train_df,
    gaz_df,
    TOP_K_VALUES,
)
# Record the three result entries in their matching accumulators.
for bucket, metrics in zip((res_gs, res_uc, res_um), (gs_aux, uc_aux, um_aux)):
    bucket.append(metrics)

Reranking candidates: 100%|██████████| 2848/2848 [06:26<00:00,  7.37it/s]


In [62]:
# Cross-encoder stage for the "noparents" model: the checkpoint path is
# derived from the lower-cased corpus name, and the reranker is evaluated on
# the candidates stored earlier in clinlinker_preds.
cross_encoder = CrossEncoderReranker(
    f"../../../models/NEL/cross-encoders/Spanish_SapBERT_noparents/cef_{CORPUS.lower()}_Spanish_SapBERT_noparents_sim_cand_200_epoch_1_bs_128/",
    model_type="mask",
    max_seq_length=MAX_LENGTH,
)
gs_aux, uc_aux, um_aux = utils.evaluate_crossencoder(
    "ClinLinker_CE",
    cross_encoder,
    clinlinker_preds,
    train_df,
    gaz_df,
    TOP_K_VALUES,
)
# Record the three result entries in their matching accumulators.
for bucket, metrics in zip((res_gs, res_uc, res_um), (gs_aux, uc_aux, um_aux)):
    bucket.append(metrics)

Reranking candidates: 100%|██████████| 2848/2848 [05:58<00:00,  7.95it/s]


In [63]:
# Persist one tab-separated file per result accumulator under ../results/<CORPUS>/
# (path is relative to the notebook's working directory).
results_dir = f"../results/{CORPUS}"
# DataFrame.to_csv does not create missing parent directories. Without this,
# a fresh checkout (or a new CORPUS value) would fail here only after all the
# encoding and reranking cells above have already been run.
os.makedirs(results_dir, exist_ok=True)

# utils.results2tsv turns each list of per-model results into a DataFrame;
# the res_*_df names are kept so they remain available for inspection.
res_gs_df = utils.results2tsv(res_gs)
res_gs_df.to_csv(f"{results_dir}/gs_results.tsv", sep="\t", index=False)

res_uc_df = utils.results2tsv(res_uc)
res_uc_df.to_csv(f"{results_dir}/uc_results.tsv", sep="\t", index=False)

res_um_df = utils.results2tsv(res_um)
res_um_df.to_csv(f"{results_dir}/um_results.tsv", sep="\t", index=False)