In [1]:
import os
import sys
import pandas as pd

sys.path.append(os.path.join(os.getcwd(), '../src'))
import faissEncoder as faiss_enc
import utils

In [2]:
corpus = "SympTEMIST"
ensemble_preds = "../data/icb-uma-ensemble.tsv"
baseline_model = "cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR-large"
p_be_model = f"../../../models/NEL/spanish_sapbert_models/sapbert_15_parents_1epoch/"
gp_be_model = f"../../../models/NEL/spanish_sapbert_models/sapbert_15_grandparents_1epoch/"

In [3]:
test_df, train_df, gaz_df = utils.load_corpus_data(corpus)

In [4]:
df_ensemble = pd.read_csv(ensemble_preds, sep="\t", header=0, dtype={"code": str}).rename(columns={'start_span': 'span_ini', "end_span": "span_end", "text": "term"})
df_ensemble = df_ensemble.sort_values(by=['filename','span_ini', 'span_end'])
df_ensemble.head()

Unnamed: 0,filename,label,span_ini,span_end,term
0,S0004-06142006000100010-1,SINTOMA,118,169,"dolor lumbar derecho, esporádico de baja inten..."
1,S0004-06142006000100010-1,SINTOMA,174,203,exploración física fue normal
2,S0004-06142006000100010-1,SINTOMA,251,289,masa suprarrenal derecha hipoecogénica
3,S0004-06142006000100010-1,SINTOMA,370,405,proceso expansivo en la suprarrenal
4,S0004-06142006000100010-1,SINTOMA,418,471,polo superior de riñón derecho desplazado haci...


In [5]:
test_df = test_df.sort_values(by=['filename','span_ini', 'span_end'])
test_df.head()

Unnamed: 0,filename,label,span_ini,span_end,term,code,sem_rel,is_abbrev,is_composite,need_context
651,S0004-06142006000100010-1,SINTOMA,118,130,dolor lumbar,40709005,EXACT,False,False,False
652,S0004-06142006000100010-1,SINTOMA,174,203,exploración física fue normal,NO_CODE,NO_CODE,False,False,False
2140,S0004-06142006000100010-1,SINTOMA,251,289,masa suprarrenal derecha hipoecogénica,237783006,EXACT,False,False,False
2428,S0004-06142006000100010-1,SINTOMA,370,405,proceso expansivo en la suprarrenal,237783006,EXACT,False,False,False
2411,S0004-06142006000100010-1,SINTOMA,418,471,polo superior de riñón derecho desplazado haci...,366263008,NARROW,False,False,False


In [6]:
ner_scores = utils.calculate_ner(test_df, df_ensemble, f"../logs/{corpus}_ner.log")["total"]
ner_scores

{'recall': 0.7124, 'precision': 0.752, 'f_score': 0.7317}

In [7]:
train_gaz_df = (
    pd.concat([
        train_df[['code', 'term']],
        gaz_df[['code', 'term']]
    ])
    .drop_duplicates()
    .assign(term=lambda x: x['term'].str.replace('«', '').str.replace('»', '').str.lower())
    .query('code != "" and code != "NO_CODE"')
)
train_gaz_df.head()


Unnamed: 0,code,term
0,246658005,manchas en el campo visual
1,171250001,5hiaa en orina de 24 horas estaba dentro de lo...
2,166315009,a nivel analítico no presentaba alteración
3,126825008,a nivel del cardias masa mamelonada y ulcerada
4,9209005,abdomen agudo


In [8]:
F_TYPE = "FlatIP"
MAX_LENGTH = 256
faiss_encoder = faiss_enc.FaissEncoder(baseline_model, F_TYPE, MAX_LENGTH, train_gaz_df)
faiss_encoder.fitFaiss()

candidates, codes, similarities = faiss_encoder.getCandidates(df_ensemble["term"].tolist(), k=200)
baseline_preds = df_ensemble.copy()
baseline_preds["candidates"] = candidates
baseline_preds["codes"] = codes

baseline_preds['code'] = baseline_preds['codes'].apply(lambda x: x[0] if x else None)
baseline_scores = utils.calculate_norm(test_df, baseline_preds, f"../logs/{corpus}_baseline_preds.log")["total"]
baseline_scores


  return self.fget.__get__(instance, owner)()


Encoding:   0%|          | 0/5303 [00:00<?, ?it/s]

Encoding:   0%|          | 0/43 [00:00<?, ?it/s]

{'recall': 0.4972, 'precision': 0.5248, 'f_score': 0.5106}

In [9]:
faiss_encoder = faiss_enc.FaissEncoder(p_be_model, F_TYPE, MAX_LENGTH, train_gaz_df)
faiss_encoder.fitFaiss()

candidates, codes, similarities = faiss_encoder.getCandidates(df_ensemble["term"].tolist(), k=200)
p_be_preds = df_ensemble.copy()
p_be_preds["candidates"] = candidates
p_be_preds["codes"] = codes

p_be_preds['code'] = p_be_preds['codes'].apply(lambda x: x[0] if x else None)
p_be_scores = utils.calculate_norm(test_df, p_be_preds, f"../logs/{corpus}_p_be_preds.log")["total"]
p_be_scores

Encoding:   0%|          | 0/5303 [00:00<?, ?it/s]

Encoding:   0%|          | 0/43 [00:00<?, ?it/s]

{'recall': 0.5179, 'precision': 0.5467, 'f_score': 0.5319}

In [10]:
faiss_encoder = faiss_enc.FaissEncoder(gp_be_model, F_TYPE, MAX_LENGTH, train_gaz_df)
faiss_encoder.fitFaiss()

candidates, codes, similarities = faiss_encoder.getCandidates(df_ensemble["term"].tolist(), k=200)
gp_be_preds = df_ensemble.copy()

gp_be_preds["candidates"] = candidates
gp_be_preds["codes"] = codes

gp_be_preds['code'] = p_be_preds['codes'].apply(lambda x: x[0] if x else None)

gp_be_scores = utils.calculate_norm(test_df, gp_be_preds, f"../logs/{corpus}_gp_be_preds.log")["total"]
gp_be_scores

Encoding:   0%|          | 0/5303 [00:00<?, ?it/s]

Encoding:   0%|          | 0/43 [00:00<?, ?it/s]

{'recall': 0.5179, 'precision': 0.5467, 'f_score': 0.5319}

In [11]:
df_corpus = pd.DataFrame.from_dict({
    'Model': ['SapBERT-XLM-R-large', 'Clinlinker-KB-P', 'ClinLinker-KB-GP'],
    'Precision': [baseline_scores['precision'], p_be_scores['precision'], gp_be_scores['precision']],
    'Recall': [baseline_scores['recall'], p_be_scores['recall'], gp_be_scores['recall']],
    'F1-score': [baseline_scores['f_score'], p_be_scores['f_score'], p_be_scores['f_score']]
})
df_corpus.to_csv(f"../results/{corpus}.tsv", sep="\t", index=False)