In [13]:
# Author: Fernando Gallego
# Affiliation: Researcher at the Computational Intelligence (ICB) Group, University of Málaga
# Code adapted from https://github.com/cambridgeltl/sapbert/blob/main/training_data/generate_pretraining_data.ipynb
#
# (Kept as comments: a bare string literal at the end of a cell is echoed as the cell's output.)

import os
import sys

import pandas as pd

# Make the project-local modules in ../src importable. The path is resolved against the
# current working directory, so the notebook has to be launched from its own folder.
SRC_DIR = os.path.join(os.getcwd(), '../src')
if SRC_DIR not in sys.path:  # do not pile up duplicate entries when the cell is re-run
    sys.path.append(SRC_DIR)

import utils
from logger import setup_custom_logger

In [14]:
# Directory from which MRCONSO.ctl and MRCONSO.RRF are read further down.
UMLS_PATH = "../../../data/UMLS/2023AA/META/"
# Chunk size handed to utils.read_rrf_file_in_chunks when loading MRCONSO.RRF.
CHUNK_SIZE = 250000
# Corpus identifier: passed to utils.load_corpus_data and lower-cased into the output file name.
CORPUS = "DisTEMIST"
# NOTE(review): the recorded output shows every log record twice; presumably a new handler is
# attached each time this cell is re-run — confirm in logger.py.
logger = setup_custom_logger('data generation')

In [15]:
# Load the corpus; the first element returned by the loader is not used in this notebook.
_, train_df, gaz_df = utils.load_corpus_data(CORPUS)

# Strip guillemets from both the gazetteer terms and the annotated mentions.
GUILLEMET_MAP = {'«': '', '»': ''}
gaz_df['term'] = gaz_df['term'].replace(GUILLEMET_MAP, regex=True)
train_df['term'] = train_df['term'].replace(GUILLEMET_MAP, regex=True)

# Discard annotations that cannot be linked to a single code: "NO_CODE" entries, codes
# containing "+", and rows whose code is missing (na=True marks them as unusable too,
# instead of letting a NaN in the mask break the `~` negation).
# The pattern is a raw string so that `\+` is a regex escape, not an invalid Python escape.
unusable_code = train_df['code'].str.contains(r"NO_CODE|\+", regex=True, na=True)
# .copy() detaches the result from the unfiltered frame, so columns can be added to
# train_df afterwards without a SettingWithCopyWarning.
train_df = train_df[~unusable_code].copy()

In [16]:
# Preview the first rows of the filtered training annotations.
train_df.head()

Unnamed: 0,filename,mark,label,off0,off1,term,code,semantic_rel
0,es-S0210-56912007000900007-3,T1,ENFERMEDAD,164,166,DM,73211009,EXACT
1,es-S0210-56912007000900007-3,T2,ENFERMEDAD,362,376,deshidratación,34095006,EXACT
2,es-S0210-56912007000900007-3,T3,ENFERMEDAD,575,590,hiperamilasemia,275739007,EXACT
3,es-S0210-56912007000900007-3,T4,ENFERMEDAD,715,733,pancreatitis aguda,197456007,EXACT
4,es-S0210-56912007000900007-3,T5,ENFERMEDAD,1402,1459,formación polipoidea sésil situada junto al es...,88580009,EXACT


In [17]:
# Preview the first rows of the gazetteer (code / term pairs).
gaz_df.head()

Unnamed: 0,code,language,term,semantic_tag,mainterm
0,9989000,es,anomalía congénita de dedo del pie,disorder,1
1,9989000,es,malformación congénita de dedo del pie,disorder,0
2,9984005,es,exfoliación de dientes por enfermedad sistémica,disorder,1
3,9982009,es,intoxicación causada por cocaína,disorder,1
4,998008,es,enfermedad de Chagas con compromiso del corazón,disorder,1


In [18]:
# Locations of the MRCONSO control file and table inside UMLS_PATH.
mrconso_ctl_path = os.path.join(UMLS_PATH, "MRCONSO.ctl")
mrconso_rrf_path = os.path.join(UMLS_PATH, "MRCONSO.RRF")

# Column names are extracted from the .ctl file and then used to read the .RRF table
# in chunks, forcing the CUI column to str.
colnames = utils.extract_column_names_from_ctl_file(mrconso_ctl_path)
df_conso = utils.read_rrf_file_in_chunks(
    mrconso_rrf_path,
    CHUNK_SIZE,
    colnames,
    dtype_dict={"CUI": str},
)
logger.info("Processed MRCONSO.RRF")

Processing:   0%|          | 0/13609918 [00:00<?, ?line/s]

2024-07-19 13:17:29,367 - INFO - Processed MRCONSO.RRF (2933879257.py:3)
2024-07-19 13:17:29,367 - INFO - Processed MRCONSO.RRF (2933879257.py:3)


In [19]:
# Keep only the MRCONSO rows whose source vocabulary (SAB) is SCTSPA.
df_conso_sn = df_conso[df_conso['SAB'].isin(["SCTSPA"])]

# Map every source concept id (SCUI) to the list of distinct CUIs it occurs with.
# The list is sorted because set iteration order for strings changes between interpreter
# runs (hash randomisation); a later cell keeps only the first element, so an unsorted
# list would make the chosen CUI vary from run to run. key=str keeps the sort from
# failing should a non-string value ever appear in the column.
scui_to_cui_dict = (
    df_conso_sn.groupby('SCUI')['CUI']
    .agg(lambda cuis: sorted(set(cuis), key=str))
    .to_dict()
)

In [20]:
def _first_cui(cuis):
    """Return the first CUI of a list; pass anything else (e.g. NaN for an unmapped code) through."""
    return cuis[0] if isinstance(cuis, list) else cuis


# Attach one CUI per annotation. `assign` returns a new frame instead of writing a column
# into train_df in place: train_df is a filtered selection at this point, so in-place
# assignment raises SettingWithCopyWarning, and a fresh frame keeps the cell idempotent.
# Codes absent from scui_to_cui_dict end up with NaN in the CUI column.
train_df = train_df.assign(CUI=train_df['code'].map(scui_to_cui_dict).map(_first_cui))

In [21]:
# Inner join on the shared 'code' column; columns present in both frames (such as 'term')
# receive the '_x' (train_df) and '_y' (gaz_df) suffixes.
merged_df = train_df.merge(gaz_df, how='inner', on='code', suffixes=('_x', '_y'))

In [22]:
# Rows whose code could not be mapped carry a null CUI; formatting them would emit
# "nan||mention||term" lines that lump unrelated concepts under one bogus label, so they
# are left out.
linked_df = merged_df.dropna(subset=['CUI'])

# One "CUI||mention||gazetteer term" string per row. Column-wise concatenation replaces
# the row-wise apply: it is faster and also yields an empty list for an empty frame
# (DataFrame.apply(axis=1) on an empty frame returns a DataFrame, which has no .tolist()).
triplets = (
    linked_df['CUI'].astype(str)
    + '||' + linked_df['term_x'].astype(str)
    + '||' + linked_df['term_y'].astype(str)
).tolist()

In [23]:
# Show the first three generated lines as a sanity check.
triplets[:3]

['C0011849||DM||diabetes mellitus',
 'C0011849||DM||diabetes sacarina',
 'C0011849||diabético||diabetes mellitus']

In [24]:
# NOTE(review): the file name says "uncased", but the triplets are written with their
# original casing (e.g. 'DM') — confirm that lower-casing happens downstream.
output_file_path = f'../data/training_file_umls2023aa_esp_uncased_{CORPUS.lower()}.txt'

# Create the target directory if it is missing so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# The terms contain accented characters; an explicit UTF-8 encoding keeps the file
# independent of the platform's default locale encoding.
with open(output_file_path, 'w', encoding='utf-8') as f:
    f.writelines(f"{triplet}\n" for triplet in triplets)

print(f"Pairs saved in {output_file_path}")


Pairs saved in ../data/training_file_umls2023aa_esp_uncased_distemist.txt
