In [1]:
import os, sys, warnings
warnings.filterwarnings("ignore")
import pandas as pd
import pickle
import gc

from tqdm.auto import tqdm
sys.path.append(os.path.join(os.getcwd(), '../src'))
import faissEncoder as faiss_enc
import tripletsGeneration as tg
from utils import *

In [2]:
# --- Notebook configuration -------------------------------------------------
# Root folder handed to load_corpus_data() for every corpus.
DATA_PATH = "../../data/"
# Index type and maximum sequence length forwarded to faiss_enc.FaissEncoder.
F_TYPE = "FlatIP"
MAX_LENGTH = 256
# Corpora processed, in this order, by the triplet-counting loop.
CORPORA = ["DisTEMIST", "MedProcNER", "SympTEMIST"]
# ClinLinker variant to use for each corpus (keys follow the CORPORA order).
ClinLinker_model = dict(
    DisTEMIST="ClinLinker-KB-GP",
    MedProcNER="ClinLinker-KB-GP",
    SympTEMIST="ClinLinker-KB-P",
)

In [3]:
# Load the pickled objects that are later passed to tg.HardTripletsKG as
# `G` and `mapping_dict`.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point these paths at trusted, locally produced files.
with open("../src/graph_G.pkl", "rb") as f:
    G = pickle.load(f)
with open("../src/scui_to_cui_dict.pkl", "rb") as handle:
    mapping_dict = pickle.load(handle)

# Bi-encoder checkpoint directory for each ClinLinker variant.
BI_ENCODER_PATHS = {
    "ClinLinker-KB-GP": "../../models/NEL/spanish_sapbert_models/sapbert_15_grandparents_1epoch/",
    "ClinLinker-KB-P": "../../models/NEL/spanish_sapbert_models/sapbert_15_parents_1epoch/",
}
# Number of candidates requested from FaissEncoder.getCandidates per term.
N_CANDIDATES = 200
# Threshold forwarded to SimilarityHardTriplets.generate_triplets.
SIMILARITY_THRESHOLD = 0.35
# (fourth positional argument, bidirectional) pairs for tg.HardTripletsKG, in
# the column order kg_1, bkg_1, kg_2, bkg_2.
# NOTE(review): the integer is presumably a graph distance - confirm in
# tripletsGeneration.
KG_SETTINGS = [(1, False), (1, True), (2, False), (2, True)]


def count_unique_triplets(generator, **generate_kwargs):
    """Count the distinct rows returned by ``generator.generate_triplets``.

    Parameters
    ----------
    generator : object
        Any object exposing ``generate_triplets(**kwargs)`` whose result
        supports ``drop_duplicates()`` and ``len()``.
    **generate_kwargs
        Forwarded unchanged to ``generate_triplets``.

    Returns
    -------
    int
        Number of rows left after ``drop_duplicates()``.
    """
    # len() of the de-duplicated result equals its shape[0]; re-indexing the
    # (potentially huge) frame just to count rows is unnecessary.
    return len(generator.generate_triplets(**generate_kwargs).drop_duplicates())


# One row per corpus: [corpus, sim, kg_1, bkg_1, kg_2, bkg_2].
triplets_size = list()
for CORPUS in CORPORA:
    # Resolve the checkpoint first so a configuration mistake fails fast and
    # loudly instead of raising NameError or silently reusing the path left
    # over from the previous corpus.
    model_name = ClinLinker_model[CORPUS]
    if model_name not in BI_ENCODER_PATHS:
        raise ValueError(
            f"No bi-encoder path configured for model {model_name!r} (corpus {CORPUS!r})"
        )
    be_path = BI_ENCODER_PATHS[model_name]

    _, df_train_link, df_gaz = load_corpus_data(DATA_PATH, CORPUS)

    faiss_encoder = faiss_enc.FaissEncoder(be_path, F_TYPE, MAX_LENGTH, df_gaz)
    faiss_encoder.fitFaiss()
    candidates, codes, similarities = faiss_encoder.getCandidates(
        df_train_link["term"].tolist(), N_CANDIDATES, MAX_LENGTH
    )
    # The encoder is not needed for triplet generation; drop the reference so
    # the gc.collect() below can actually reclaim it.
    del faiss_encoder

    df_train_link["candidates"] = candidates
    df_train_link["codes"] = codes
    df_train_link["similarities"] = similarities

    sim_triplets = count_unique_triplets(
        tg.SimilarityHardTriplets(df_train_link),
        similarity_threshold=SIMILARITY_THRESHOLD,
    )
    kg_counts = [
        count_unique_triplets(
            tg.HardTripletsKG(df_train_link, G, mapping_dict, depth, bidirectional=bidirectional)
        )
        for depth, bidirectional in KG_SETTINGS
    ]
    triplets_size.append([CORPUS, sim_triplets, *kg_counts])
    gc.collect()

Encoding:   0%|          | 0/4603 [00:00<?, ?it/s]

Encoding:   0%|          | 0/15 [00:00<?, ?it/s]

Encoding:   0%|          | 0/7334 [00:00<?, ?it/s]

Encoding:   0%|          | 0/19 [00:00<?, ?it/s]

Encoding:   0%|          | 0/5151 [00:00<?, ?it/s]

Encoding:   0%|          | 0/33 [00:00<?, ?it/s]

In [4]:
# Tabulate the per-corpus triplet counts collected in `triplets_size`;
# the column order matches the order of the values appended for each corpus.
summary_columns = ["corpus", "sim", "kg_1", "bkg_1", "kg_2", "bkg_2"]
pd.DataFrame(triplets_size, columns=summary_columns).head(10)

Unnamed: 0,corpus,sim,kg_1,bkg_1,kg_2,bkg_2
0,DisTEMIST,426646,5484631,16131267,7645351,121865655
1,MedProcNER,514424,7252631,24825237,9321604,139258253
2,SympTEMIST,986499,12063769,20892503,15989653,181462527


In [5]:
# Echo the raw list of per-corpus rows (the same data the DataFrame cell tabulates).
triplets_size

[['DisTEMIST', 426646, 5484631, 16131267, 7645351, 121865655],
 ['MedProcNER', 514424, 7252631, 24825237, 9321604, 139258253],
 ['SympTEMIST', 986499, 12063769, 20892503, 15989653, 181462527]]