In [22]:
# Standard-library imports.
import os
import sys
import warnings
import re
# Silence every warning for the rest of the session. This also hides deprecation
# notices coming from the libraries imported below.
warnings.filterwarnings("ignore")
import pandas as pd

# Put `<cwd>/../src` on the import path. The path is resolved against the current
# working directory, so the imports below only succeed when the kernel's cwd has a
# sibling `src` directory (presumably the notebook's own folder - TODO confirm).
sys.path.append(os.path.join(os.getcwd(), '../src'))
# Project-local modules; they must be imported after the sys.path tweak above.
# NOTE(review): `re` and `setup_custom_logger` are not referenced in the cells visible here.
from crossEncoder import CrossEncoderReranker
import faissEncoder as faiss_enc
import utils
from logger import setup_custom_logger  

In [23]:
# --- Experiment configuration ---
# Corpus identifier: passed to the data loader and interpolated into model/result paths.
CORPUS = "SympTEMIST"
# Cut-offs handed to the evaluation helpers.
TOP_K_VALUES = [1, 5, 25, 50, 100, 200]
# Second positional argument of FaissEncoder.
F_TYPE = "FlatIP"
# Length limit shared by the FaissEncoder and CrossEncoderReranker constructors.
MAX_LENGTH = 256

# Accumulator: each evaluation cell appends its result object here.
res_cleaned = list()

In [24]:
# Load the three frames for the configured corpus.
test_df, train_df, gaz_df = utils.load_corpus_data(CORPUS)

# Linking dictionary used to build every FAISS index: the (code, term) pairs of the
# training set followed by those of the gazetteer, with a fresh 0..n-1 index.
link_gaz_df = pd.concat([train_df[['code', 'term']], gaz_df[['code', 'term']]], ignore_index=True)

# Drop test rows whose code contains "NO_CODE" or a literal "+".
# The pattern is a raw string: in a normal string literal "\+" is an invalid escape
# sequence (DeprecationWarning today, SyntaxWarning on Python 3.12+). The regex
# itself is unchanged.
test_df = test_df[~test_df['code'].str.contains(r'NO_CODE|\+', regex=True)]

In [25]:
# Build the encoder from the `sapbert_15_grandparents_1epoch` checkpoint over the
# linking dictionary, then fit it.
faiss_encoder = faiss_enc.FaissEncoder(
    "../../../models/NEL/spanish_sapbert_models/sapbert_15_grandparents_1epoch",
    F_TYPE,
    MAX_LENGTH,
    link_gaz_df,
)
faiss_encoder.fitFaiss()

Encoding:   0%|          | 0/7486 [00:00<?, ?it/s]

In [26]:
# Ask the fitted encoder for 200 candidates per test term (third return value unused).
candidates, codes, _ = faiss_encoder.getCandidates(
    test_df["term"].tolist(),
    k=200,
)

# Copy of the test frame with the retrieved candidates and codes attached; kept under
# its own name because the cross-encoder cell reuses it later.
clinlinker_granparent_preds = test_df.assign(candidates=candidates, codes=codes)

# Evaluate and store the first element of the returned triple.
cleaned_aux, _, _ = utils.evaluate_model(
    "ClinLinker-KB-GP",
    clinlinker_granparent_preds,
    train_df,
    gaz_df,
    TOP_K_VALUES,
)
res_cleaned.append(cleaned_aux)

Encoding:   0%|          | 0/55 [00:00<?, ?it/s]

In [27]:
# Build the encoder from the `sapbert_15_parents_1epoch` checkpoint over the
# linking dictionary, then fit it.
faiss_encoder = faiss_enc.FaissEncoder(
    "../../../models/NEL/spanish_sapbert_models/sapbert_15_parents_1epoch",
    F_TYPE,
    MAX_LENGTH,
    link_gaz_df,
)
faiss_encoder.fitFaiss()

Encoding:   0%|          | 0/7486 [00:00<?, ?it/s]

In [28]:
# Ask the fitted encoder for 200 candidates per test term (third return value unused).
candidates, codes, _ = faiss_encoder.getCandidates(
    test_df["term"].tolist(),
    k=200,
)

# Copy of the test frame with the retrieved candidates and codes attached; kept under
# its own name because the cross-encoder cell reuses it later.
clinlinker_parent_preds = test_df.assign(candidates=candidates, codes=codes)

# Evaluate and store the first element of the returned triple.
cleaned_aux, _, _ = utils.evaluate_model(
    "ClinLinker-KB-P",
    clinlinker_parent_preds,
    train_df,
    gaz_df,
    TOP_K_VALUES,
)
res_cleaned.append(cleaned_aux)

Encoding:   0%|          | 0/55 [00:00<?, ?it/s]

In [29]:
# Build the encoder from the `sapbert_15_noparents_1epoch` checkpoint over the
# linking dictionary, then fit it.
faiss_encoder = faiss_enc.FaissEncoder(
    "../../../models/NEL/spanish_sapbert_models/sapbert_15_noparents_1epoch",
    F_TYPE,
    MAX_LENGTH,
    link_gaz_df,
)
faiss_encoder.fitFaiss()

Encoding:   0%|          | 0/7486 [00:00<?, ?it/s]

In [30]:
# Ask the fitted encoder for 200 candidates per test term (third return value unused).
candidates, codes, _ = faiss_encoder.getCandidates(
    test_df["term"].tolist(),
    k=200,
)

# Copy of the test frame with the retrieved candidates and codes attached; kept under
# its own name because the cross-encoder cell reuses it later.
clinlinker_preds = test_df.assign(candidates=candidates, codes=codes)

# Evaluate and store the first element of the returned triple.
cleaned_aux, _, _ = utils.evaluate_model(
    "ClinLinker",
    clinlinker_preds,
    train_df,
    gaz_df,
    TOP_K_VALUES,
)
res_cleaned.append(cleaned_aux)

Encoding:   0%|          | 0/55 [00:00<?, ?it/s]

In [31]:
# Build the encoder from the corpus-specific bi-encoder checkpoint (directory name is
# derived from the lower-cased CORPUS) over the linking dictionary, then fit it.
faiss_encoder = faiss_enc.FaissEncoder(
    f"../../../models/NEL/corpus-specific_bi-encoders/{CORPUS.lower()}-biencoder",
    F_TYPE,
    MAX_LENGTH,
    link_gaz_df,
)
faiss_encoder.fitFaiss()

Encoding:   0%|          | 0/7486 [00:00<?, ?it/s]

In [32]:
# Ask the fitted encoder for 200 candidates per test term (third return value unused).
candidates, codes, _ = faiss_encoder.getCandidates(
    test_df["term"].tolist(),
    k=200,
)

# Copy of the test frame with the retrieved candidates and codes attached.
corpus_specific_preds = test_df.assign(candidates=candidates, codes=codes)

# Evaluate and store the first element of the returned triple.
cleaned_aux, _, _ = utils.evaluate_model(
    f"{CORPUS}-bi-encoder",
    corpus_specific_preds,
    train_df,
    gaz_df,
    TOP_K_VALUES,
)
res_cleaned.append(cleaned_aux)

Encoding:   0%|          | 0/55 [00:00<?, ?it/s]

In [33]:
# Build the encoder from the `cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR-large`
# model identifier over the linking dictionary, then fit it.
faiss_encoder = faiss_enc.FaissEncoder(
    "cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR-large",
    F_TYPE,
    MAX_LENGTH,
    link_gaz_df,
)
faiss_encoder.fitFaiss()

Encoding:   0%|          | 0/7486 [00:00<?, ?it/s]

In [34]:
# Ask the fitted encoder for 200 candidates per test term (third return value unused).
candidates, codes, _ = faiss_encoder.getCandidates(
    test_df["term"].tolist(),
    k=200,
)

# Copy of the test frame with the retrieved candidates and codes attached.
baseline_preds = test_df.assign(candidates=candidates, codes=codes)

# Evaluate and store the first element of the returned triple.
cleaned_aux, _, _ = utils.evaluate_model(
    "SapBERT-XLM-R-large",
    baseline_preds,
    train_df,
    gaz_df,
    TOP_K_VALUES,
)
res_cleaned.append(cleaned_aux)

Encoding:   0%|          | 0/55 [00:00<?, ?it/s]

In [35]:
# Build the encoder from the `cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR`
# model identifier over the linking dictionary, then fit it.
faiss_encoder = faiss_enc.FaissEncoder(
    "cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR",
    F_TYPE,
    MAX_LENGTH,
    link_gaz_df,
)
faiss_encoder.fitFaiss()

Encoding:   0%|          | 0/7486 [00:00<?, ?it/s]

In [36]:
# Ask the fitted encoder for 200 candidates per test term (third return value unused).
candidates, codes, _ = faiss_encoder.getCandidates(
    test_df["term"].tolist(),
    k=200,
)

# Copy of the test frame with the retrieved candidates and codes attached.
xlmr_base_preds = test_df.assign(candidates=candidates, codes=codes)

# Evaluate and store the first element of the returned triple.
cleaned_aux, _, _ = utils.evaluate_model(
    "SapBERT-XLM-R-base",
    xlmr_base_preds,
    train_df,
    gaz_df,
    TOP_K_VALUES,
)
res_cleaned.append(cleaned_aux)

Encoding:   0%|          | 0/55 [00:00<?, ?it/s]

In [37]:
# Build the encoder from the `PlanTL-GOB-ES/roberta-base-biomedical-clinical-es`
# model identifier over the linking dictionary, then fit it.
faiss_encoder = faiss_enc.FaissEncoder(
    "PlanTL-GOB-ES/roberta-base-biomedical-clinical-es",
    F_TYPE,
    MAX_LENGTH,
    link_gaz_df,
)
faiss_encoder.fitFaiss()

Some weights of RobertaModel were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Encoding:   0%|          | 0/7486 [00:00<?, ?it/s]

In [38]:
# Ask the fitted encoder for 200 candidates per test term (third return value unused).
candidates, codes, _ = faiss_encoder.getCandidates(
    test_df["term"].tolist(),
    k=200,
)

# Copy of the test frame with the retrieved candidates and codes attached.
roberta_base_preds = test_df.assign(candidates=candidates, codes=codes)

# Evaluate and store the first element of the returned triple.
cleaned_aux, _, _ = utils.evaluate_model(
    "Roberta-base-biomedical",
    roberta_base_preds,
    train_df,
    gaz_df,
    TOP_K_VALUES,
)
res_cleaned.append(cleaned_aux)

Encoding:   0%|          | 0/55 [00:00<?, ?it/s]

In [39]:
# Load the cross-encoder checkpoint for this corpus (grandparents variant).
cross_encoder = CrossEncoderReranker(
    f"../../../models/NEL/cross-encoders/Spanish_SapBERT_grandparents/cef_{CORPUS.lower()}_Spanish_SapBERT_grandparents_sim_cand_200_epoch_1_bs_128/",
    model_type="mask",
    max_seq_length=MAX_LENGTH,
)

# Evaluate it on the candidates retrieved earlier by the matching bi-encoder and
# store the first element of the returned triple.
cleaned_aux, _, _ = utils.evaluate_crossencoder(
    "ClinLinker-KB-GP_CE",
    cross_encoder,
    clinlinker_granparent_preds,
    train_df,
    gaz_df,
    TOP_K_VALUES,
)
res_cleaned.append(cleaned_aux)

Reranking candidates: 100%|██████████| 3512/3512 [08:17<00:00,  7.06it/s]


In [40]:
# Load the cross-encoder checkpoint for this corpus (parents variant).
cross_encoder = CrossEncoderReranker(
    f"../../../models/NEL/cross-encoders/Spanish_SapBERT_parents/cef_{CORPUS.lower()}_Spanish_SapBERT_parents_sim_cand_200_epoch_1_bs_128/",
    model_type="mask",
    max_seq_length=MAX_LENGTH,
)

# Evaluate it on the candidates retrieved earlier by the matching bi-encoder and
# store the first element of the returned triple.
cleaned_aux, _, _ = utils.evaluate_crossencoder(
    "ClinLinker-KB-P_CE",
    cross_encoder,
    clinlinker_parent_preds,
    train_df,
    gaz_df,
    TOP_K_VALUES,
)
res_cleaned.append(cleaned_aux)


Reranking candidates: 100%|██████████| 3512/3512 [08:16<00:00,  7.07it/s]


In [41]:
# Load the cross-encoder checkpoint for this corpus (noparents variant).
cross_encoder = CrossEncoderReranker(
    f"../../../models/NEL/cross-encoders/Spanish_SapBERT_noparents/cef_{CORPUS.lower()}_Spanish_SapBERT_noparents_sim_cand_200_epoch_1_bs_128/",
    model_type="mask",
    max_seq_length=MAX_LENGTH,
)

# Evaluate it on the candidates retrieved earlier by the matching bi-encoder and
# store the first element of the returned triple.
cleaned_aux, _, _ = utils.evaluate_crossencoder(
    "ClinLinker_CE",
    cross_encoder,
    clinlinker_preds,
    train_df,
    gaz_df,
    TOP_K_VALUES,
)
res_cleaned.append(cleaned_aux)

Reranking candidates: 100%|██████████| 3512/3512 [07:29<00:00,  7.82it/s]


In [42]:
# Convert the accumulated per-model results into a single table.
res_cleaned_df = utils.results2tsv(res_cleaned)

# Write the table as a tab-separated file without the index column.
# to_csv does not create missing parent directories, so create the per-corpus results
# folder first; otherwise a fresh checkout fails here, after all the reranking has run.
output_path = f"../results/{CORPUS}/cleaned_results.tsv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
res_cleaned_df.to_csv(output_path, sep="\t", index=False)