In [1]:
import os, sys

import pandas as pd
import torch
from torch.utils.data import DataLoader
import pickle

sys.path.append('../')
import src.faissEncoder as faiss_enc
from src.crossEncoder import CrossEncoderReranker
from src.tripletsGeneration import  HardTripletsKG, SimilarityHardTriplets, TopHardTriplets

In [2]:
# Short labels for known checkpoints, keyed by model directory name
# (every key ends with "/"); the label is used when naming the output directory.
model_mapping = {
    "SapBERT-UMLS-2020AB-all-lang-from-XLMR/": "SapBERT_Multilingue_XLMR",
    "SapBERT-UMLS-2020AB-all-lang-from-XLMR-large/": "SapBERT_Multilingue_XLMR-large",
    "spanish_sapbert_models/sapbert_15_grandparents_1epoch/": "Spanish_SapBERT_grandparents",
    "spanish_sapbert_models/sapbert_15_parents_1epoch/": "Spanish_SapBERT_parents",
    "spanish_sapbert_models/sapbert_15_noparents_1epoch/": "Spanish_SapBERT_no_parents",
    "sapbert_B_256_E_1/": "Spanish_SapBERT_ICB",
    "biencoder_symptemist_enriched_triplets_5_epoch_32_batch_5_parents_stag/": "BiEncoder_SympTEMIST",
    "biencoder_medprocner_enriched_triplets_1_epoch_32_batch_5_parents_stag/": "BiEncoder_MedProcNER",
    "biencoder_distemist_enriched_triplets_5_epoch_32_batch_5_parents_stag/": "BiEncoder_DisTEMIST",
    "roberta-base-biomedical-clinical-es/": "Roberta-base-biomedical-clinical-es",
}

# --- Corpus and checkpoint ---------------------------------------------------
CORPUS = "MedProcNER"
CORPUS_PATH = "../../EntityLinking/data/" + CORPUS + "/processed_data/"
MODEL_PATH = "cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR"

# --- Candidate retrieval and hard-triplet generation -------------------------
F_TYPE = "FlatIP"            # index type handed to FaissEncoder
MAX_LENGTH = 128             # passed to FaissEncoder, getCandidates and the cross-encoder
CANDIDATES = 200             # number of candidates requested from getCandidates
HARD_TRIPLETS_TYPE = "kg"    # strategy selector: 'top', 'sim', 'kg' or 'bkg'
NUM_NEGATIVES = 200          # passed to TopHardTriplets; also part of the run name
DEPTH = 1                    # passed to HardTripletsKG; part of the run name when non-zero

# --- Training hyper-parameters -----------------------------------------------
# NOTE(review): LR, WEIGHT_DECAY, EVAL_STEPS and TEST_SIZE are not referenced in the
# cells visible here; presumably consumed by the training call — confirm.
BATCH_SIZE = 128
EPOCHS = 1
LR = 2e-5
WEIGHT_DECAY = 0.01
EVAL_STEPS = 250000
TEST_SIZE = None


In [3]:
# Resolve a short, human-readable label for the checkpoint.
# The keys of `model_mapping` are directory-style names that end with "/" (some span
# several path components), so the bare last component of MODEL_PATH can never equal
# a key. Compare the slash-stripped path against each slash-stripped key instead,
# requiring a "/" boundary so that e.g. "...-XLMR" does not match "...-XLMR-large".
_model_path = MODEL_PATH.rstrip("/")
model = _model_path.split("/")[-1]
mapped_name = next(
    (
        label
        for key, label in model_mapping.items()
        if _model_path == key.rstrip("/") or _model_path.endswith("/" + key.rstrip("/"))
    ),
    model,  # unknown checkpoint: fall back to the last path component
)

# DEPTH only appears in the run name when it is non-zero.
_depth_tag = "" if DEPTH == 0 else f"_{DEPTH}"
output_path = os.path.join(
    "../models/",
    f"cef_{CORPUS.lower()}_{mapped_name}_{HARD_TRIPLETS_TYPE}{_depth_tag}"
    f"_cand_{NUM_NEGATIVES}_epoch_{EPOCHS}_bs_{BATCH_SIZE}",
)
def _load_tsv(file_name, **read_kwargs):
    """Read one tab-separated file from CORPUS_PATH, forcing the `code` column to str."""
    return pd.read_csv(
        os.path.join(CORPUS_PATH, file_name),
        sep="\t",
        header=0,
        dtype={"code": str},
        **read_kwargs,
    )


# Corpus splits and the gazetteer; the two larger reads disable pandas' chunked
# type inference (low_memory=False).
df_test = _load_tsv("df_link_test.tsv")
df_unseen_codes = _load_tsv("df_unseen_codes.tsv")
df_train = _load_tsv("df_link_gaz_train.tsv")
df_gaz = _load_tsv("gazetteer_term_code.tsv", low_memory=False)
df_train_link = _load_tsv("df_link_train.tsv", low_memory=False)


In [4]:
# Set up the FAISS-backed encoder with the gazetteer frame and fit it.
# NOTE(review): presumably fitFaiss() embeds and indexes the gazetteer terms — confirm
# in src.faissEncoder.
faiss_encoder = faiss_enc.FaissEncoder(MODEL_PATH, F_TYPE, MAX_LENGTH, df_gaz)
faiss_encoder.fitFaiss()

# Retrieve CANDIDATES candidates for every training mention and attach them to the frame.
train_mentions = df_train_link["term"].tolist()
candidates, codes, similarities = faiss_encoder.getCandidates(train_mentions, CANDIDATES, MAX_LENGTH)
df_train_link["candidates"] = candidates
df_train_link["codes"] = codes

# Build the hard triplets (anchor / positive / negative) with the strategy selected by
# HARD_TRIPLETS_TYPE:
#   'top' -> TopHardTriplets, using NUM_NEGATIVES
#   'sim' -> SimilarityHardTriplets, using the retrieval similarities
#   'kg'  -> HardTripletsKG over the pickled graph, bidirectional=False
#   'bkg' -> HardTripletsKG over the pickled graph, bidirectional=True
if HARD_TRIPLETS_TYPE == 'top':
    df_hard_triplets = TopHardTriplets(df_train_link).generate_triplets(NUM_NEGATIVES)
elif HARD_TRIPLETS_TYPE == 'sim':
    df_train_link["similarities"] = similarities
    df_hard_triplets = SimilarityHardTriplets(df_train_link).generate_triplets(similarity_threshold=0.35)
elif HARD_TRIPLETS_TYPE in ('kg', 'bkg'):
    # Both graph-based variants read the same two resources from ../src/utils/; the
    # 'bkg' branch previously opened the mapping relative to the working directory.
    # SECURITY: pickle.load can execute arbitrary code — only load trusted files.
    with open("../src/utils/graph_G.pkl", "rb") as f:
        G = pickle.load(f)
    with open("../src/utils/scui_to_cui_dict.pkl", "rb") as handle:
        mapping_dict = pickle.load(handle)
    df_hard_triplets = HardTripletsKG(
        df_train_link,
        G,
        mapping_dict,
        DEPTH,
        bidirectional=(HARD_TRIPLETS_TYPE == 'bkg'),
    ).generate_triplets()
else:
    # Fail with a clear message instead of a NameError on df_hard_triplets below.
    raise ValueError(
        f"Unknown HARD_TRIPLETS_TYPE {HARD_TRIPLETS_TYPE!r}; expected one of 'top', 'sim', 'kg', 'bkg'"
    )

# Drop repeated triplets and renumber the rows.
df_hard_triplets = df_hard_triplets.drop_duplicates().reset_index(drop=True)


Encoding:   0%|          | 0/7321 [00:00<?, ?it/s]

Encoding:   0%|          | 0/37 [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: 'graph_G.pkl'

In [None]:
# Preview the first rows of the generated triplets (columns: anchor, positive, negative).
df_hard_triplets.head()

Unnamed: 0,anchor,positive,negative
0,auscultación pulmonar,auscultación del tracto respiratorio inferior,incentivo de una conducta
1,auscultación pulmonar,auscultación del tracto respiratorio inferior,valvulotomía pulmonar
2,auscultación pulmonar,auscultación del tracto respiratorio inferior,procedimiento de medición
3,auscultación pulmonar,auscultación del tracto respiratorio inferior,percusión mediata (procedimiento)
4,auscultación pulmonar,auscultación del tracto respiratorio inferior,condensado de aliento exhalado


In [None]:
# Instantiate the cross-encoder reranker from the same MODEL_PATH checkpoint that was
# passed to FaissEncoder, with MAX_LENGTH as its maximum sequence length.
# NOTE(review): the meaning of model_type="mask" is defined in src.crossEncoder — confirm there.
cross_encoder = CrossEncoderReranker(MODEL_PATH, model_type="mask", max_seq_length=MAX_LENGTH)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 3 GPUs!


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/19092 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Hand the de-duplicated hard triplets to the reranker.
# NOTE(review): presumably converts the anchor/positive/negative rows into training
# examples stored on the reranker — confirm in src.crossEncoder.
cross_encoder.prepare_triplets(df_hard_triplets)