In [1]:
import os, sys, warnings
warnings.filterwarnings("ignore")
import pandas as pd
import pickle
from tqdm.auto import tqdm
sys.path.append('../')
import src.FaissEncoder as faiss_enc
import src.TripletsGeneration as tg

In [2]:
# --- Corpus selection and derived data locations ---
# CORPUS picks both the processed-data folder and the test split loaded below.
CORPUS = "SympTEMIST"
CORPUS_PATH = f"../../EntityLinking/data/{CORPUS}/"
DATA_PATH = os.path.join(CORPUS_PATH, "processed_data/")

# --- Encoder / index settings (both are handed to faiss_enc.FaissEncoder) ---
# presumably "FlatIP" selects a flat inner-product FAISS index — TODO confirm
F_TYPE = "FlatIP"
MAX_LENGTH = 256

# --- Triplet-generation settings ---
# DEPTH is passed to tg.HardTripletsKG in later cells.
DEPTH = 1
# NOTE(review): TOP_K_VALUES is not referenced by any cell visible in this notebook.
TOP_K_VALUES = [1, 5, 25, 50, 100, 200]

In [3]:
# Parsing options shared by every TSV in this cell: tab-separated, header on
# the first row, and the `code` column read as str.
TSV_READ_KWARGS = {"sep": "\t", "header": 0, "dtype": {"code": str}}

# Processed training / gazetteer tables for the selected corpus.
df_train = pd.read_csv(os.path.join(DATA_PATH, "df_link_gaz_train.tsv"), **TSV_READ_KWARGS)
df_link_train = pd.read_csv(os.path.join(DATA_PATH, "df_link_train.tsv"), **TSV_READ_KWARGS)
df_gaz = pd.read_csv(os.path.join(DATA_PATH, "gazetteer_term_code.tsv"), **TSV_READ_KWARGS)
# NOTE(review): this reads the same file as df_link_train (only low_memory
# differs). Both names are kept because other cells may rely on either one;
# consider dropping one of them.
df_train_link = pd.read_csv(os.path.join(DATA_PATH, "df_link_train.tsv"), low_memory=False, **TSV_READ_KWARGS)

# Test split per corpus: (path to the linking TSV, name of the column that
# holds the mention text). The mention column is renamed to "term" so the
# rest of the notebook can use one column name for all corpora.
TEST_SETS = {
    "SympTEMIST": (
        "../../data/SympTEMIST/symptemist-complete_240208/symptemist_test/subtask2-linking/symptemist_tsv_test_subtask2.tsv",
        "text",
    ),
    "MedProcNER": (
        "../../data/MedProcNER/medprocner_gs_train+test+gazz+multilingual+crossmap_230808/medprocner_test/tsv/medprocner_tsv_test_subtask2.tsv",
        "text",
    ),
    "DisTEMIST": (
        "../../data/DisTEMIST/distemist_zenodo/test_annotated/subtrack2_linking/distemist_subtrack2_test_linking.tsv",
        "span",
    ),
}

# Fail here with a clear message instead of leaving df_test undefined and
# hitting a NameError in a later cell.
if CORPUS not in TEST_SETS:
    raise ValueError(f"Unsupported CORPUS {CORPUS!r}; expected one of {sorted(TEST_SETS)}")

test_path, mention_column = TEST_SETS[CORPUS]
df_test = pd.read_csv(test_path, **TSV_READ_KWARGS).rename(columns={mention_column: "term"})


In [4]:
# Load two pickled objects from the current working directory: `G` and
# `mapping_dict`. Both are passed to tg.HardTripletsKG in later cells.
# presumably G is a concept graph and mapping_dict maps SCUI -> CUI (going by
# the file names) — TODO confirm
# NOTE(review): pickle.load can execute arbitrary code; only load these files
# if they come from a trusted source.
with open("graph_G.pkl", "rb") as f:
    G = pickle.load(f)
with open("scui_to_cui_dict.pkl", "rb") as handle:
    mapping_dict = pickle.load(handle)

In [5]:
# Directory of the bi-encoder checkpoint handed to FaissEncoder.
BIENCODER_PATH = "../../models/spanish_sapbert_models/sapbert_15_parents_1epoch/"
# Build the encoder from the checkpoint path, index type, max length and the
# gazetteer frame loaded earlier.
faiss_encoder = faiss_enc.FaissEncoder(BIENCODER_PATH, F_TYPE, MAX_LENGTH, df_gaz)
# presumably encodes the gazetteer terms and builds the FAISS index —
# TODO confirm against src.FaissEncoder
faiss_encoder.fitFaiss()

Encoding: 100%|██████████| 5150/5150 [00:59<00:00, 86.47it/s]


In [6]:
# Retrieve candidates for every test mention (k=5) and attach the three
# returned sequences to df_test as new columns.
# NOTE(review): the triplets below are generated from the test split
# (df_test); confirm this is for inspection only and not used for training.
test_terms = df_test["term"].tolist()
candidates, codes, similarities = faiss_encoder.getCandidates(test_terms, k=5)
for column, values in zip(
    ("candidates", "codes", "similarities"),
    (candidates, codes, similarities),
):
    df_test[column] = values

# Build similarity-based hard triplets with a 0.35 similarity threshold.
similarity_triplet_builder = tg.SimilarityHardTriplets(df_test)
df_hard_triplets = similarity_triplet_builder.generate_triplets(similarity_threshold=0.35)
df_hard_triplets.shape

Encoding: 100%|██████████| 45/45 [00:01<00:00, 42.68it/s]


(20, 3)

In [7]:
# Preview the first rows of the triplets currently held in df_hard_triplets.
df_hard_triplets.head()

Unnamed: 0,anchor,positive,negative
0,falleció,falleció,fallecido en domicilio
1,falleció,falleció,fallecido en cirugía
2,falleció,falleció,fallecido en el hospital
3,falleció,falleció,fallecido en la calle
4,disnea,disnea,disnea espiratoria


In [8]:
# Hard triplets from tg.HardTripletsKG with bidirectional=False, using the
# graph, the mapping dict and the current DEPTH.
# Note: this overwrites the df_hard_triplets produced by the previous cell.
kg_triplet_builder = tg.HardTripletsKG(
    df_test,
    G,
    mapping_dict,
    DEPTH,
    bidirectional=False,
)
df_hard_triplets = kg_triplet_builder.generate_triplets()
df_hard_triplets.shape

(222, 3)

In [9]:
# Preview the first rows of the triplets currently held in df_hard_triplets.
df_hard_triplets.head()

Unnamed: 0,anchor,positive,negative
0,falleció,muerto,fallecido en su domicilio
1,falleció,muerto,fallecido en el hospital (hallazgo)
2,falleció,muerto,fallecido en la calle (hallazgo)
3,falleció,muerto,lugar de la muerte - hallazgo
4,falleció,muerto,fallecido en la calle


In [10]:
# Same tg.HardTripletsKG generation, this time with bidirectional=True.
# Note: this overwrites the df_hard_triplets produced by the previous cell.
kg_triplet_builder = tg.HardTripletsKG(
    df_test,
    G,
    mapping_dict,
    DEPTH,
    bidirectional=True,
)
df_hard_triplets = kg_triplet_builder.generate_triplets()
df_hard_triplets.shape

(418, 3)

In [11]:
# Preview the first rows of the triplets currently held in df_hard_triplets.
df_hard_triplets.head()

Unnamed: 0,anchor,positive,negative
0,falleció,muerto,muerte - esperada (hallazgo)
1,falleció,muerto,muerto - muerte sin testigos (hallazgo)
2,falleció,muerto,fallecido en su domicilio
3,falleció,muerto,fallecido en el hospital (hallazgo)
4,falleció,muerto,muerto sin signos de enfermedad (hallazgo)


In [None]:
# Raise DEPTH for the tg.HardTripletsKG cells that follow.
# NOTE(review): reassigning DEPTH here means re-running earlier cells after
# this one would silently use 2 instead of 1.
DEPTH = 2

# NOTE(review): DEPTH is not passed to any call in this cell, so this appears
# to repeat the earlier candidate retrieval and similarity-triplet generation
# unchanged — confirm whether the rerun is intended.
test_terms = df_test["term"].tolist()
candidates, codes, similarities = faiss_encoder.getCandidates(test_terms, k=5)
for column, values in zip(
    ("candidates", "codes", "similarities"),
    (candidates, codes, similarities),
):
    df_test[column] = values

similarity_triplet_builder = tg.SimilarityHardTriplets(df_test)
df_hard_triplets = similarity_triplet_builder.generate_triplets(similarity_threshold=0.35)
df_hard_triplets.shape

In [None]:
# tg.HardTripletsKG triplets with bidirectional=False, using whatever value
# DEPTH currently holds.
kg_triplet_builder = tg.HardTripletsKG(
    df_test,
    G,
    mapping_dict,
    DEPTH,
    bidirectional=False,
)
df_hard_triplets = kg_triplet_builder.generate_triplets()
df_hard_triplets.shape

In [None]:
# tg.HardTripletsKG triplets with bidirectional=True, using whatever value
# DEPTH currently holds.
kg_triplet_builder = tg.HardTripletsKG(
    df_test,
    G,
    mapping_dict,
    DEPTH,
    bidirectional=True,
)
df_hard_triplets = kg_triplet_builder.generate_triplets()
df_hard_triplets.shape