In [2]:
import pandas as pd
import os
import numpy as np
from umls_api import API
from dotenv import load_dotenv
import pickle
import itertools
from collections import Counter
from tqdm.notebook import tqdm
import joblib 


load_dotenv()

True

In [3]:
# Get a list of all the cuis
# df_cuis = pd.concat([pd.read_csv("..\\data\\raw\\train_concepts.csv").set_index("ID"),pd.read_csv("..\\data\\raw\\test_concepts.csv").set_index("ID")])
df_cuis = pd.read_csv("..\\data\\raw\\cui_mapping.csv")
xss = [cuis.split(";") for cuis in df_cuis["CUI"]]
cuis = [x for xs in xss for x in xs]

In [5]:


cuis_semantic_types = dict()

def get_cui_semantic_types(cui):
    try:
        api = API(api_key=os.getenv("UMLS_API_KEY"))  # Initialize once
        resp = api.get_cui(cui=cui)
        cui_semantic_types = resp["result"]["semanticTypes"]
        names = [st["name"] for st in cui_semantic_types]
        return {cui: set(names)}
    except Exception as e:
        print(f"Error processing CUI {cui}: {e}")
        return {cui: set()}


results = joblib.Parallel(n_jobs=-1, verbose=10)(
    joblib.delayed(get_cui_semantic_types)(cui) 
    for cui in tqdm(cuis, desc="Processing CUIs")
)
    
cuis_semantic_types = dict()
for d in results:
    key = [*d][0]
    cuis_semantic_types[key] = d[key]

with open("..\\data\\interim\\cuis_semantic_types.pkl", "wb") as f:
    pickle.dump(cuis_semantic_types, f)


Processing CUIs:   0%|          | 0/1935 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   29.3s
[Parallel(n_jobs=-1)]: Done 122 tasks      | elapsed:   33.2s
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:   37.2s
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:   39.6s
[Parallel(n_jobs=-1)]: Done 181 tasks      | elapsed:   45.7s
[Parallel(n_jobs=-1)]: Done 202 tasks      | elapsed:   51.8s
[Parallel(n_jobs=-1)]: Done 225 tasks      | elapsed:  

In [4]:
cuis_semantic_types = pickle.load(open("..\\data\\interim\\cuis_semantic_types.pkl", "rb"))

In [5]:
for cui, st in cuis_semantic_types.items():
    if len(st) == 0:
        print(f"CUI {cui} has no semantic types.")

CUI C0241790 has no semantic types.
CUI C1134719 has no semantic types.
CUI C0206702 has no semantic types.


In [None]:
# CUI C0241790 has no semantic types.
# CUI C0021818 has no semantic types.
# CUI C1134719 has no semantic types.
# CUI C0206702 has no semantic types.

In [6]:
cuis_semantic_types["C0241790"] = {'Congenital Abnormality'}
cuis_semantic_types["C0021818"] = {'Disease or Syndrome'}
cuis_semantic_types["C1134719"] = {'Neoplastic Process'}
cuis_semantic_types["C0206702"] = {'Neoplastic Process'}


In [7]:
semantic_types_freq = {k:v for k,v in Counter(itertools.chain.from_iterable(cuis_semantic_types.values())).items()}
semantic_types_freq

{'Disease or Syndrome': 271,
 'Congenital Abnormality': 53,
 'Body Part, Organ, or Organ Component': 809,
 'Finding': 2,
 'Medical Device': 43,
 'Acquired Abnormality': 26,
 'Body Location or Region': 157,
 'Body Space or Junction': 115,
 'Anatomical Abnormality': 59,
 'Pathologic Function': 119,
 'Substance': 2,
 'Neoplastic Process': 98,
 'Diagnostic Procedure': 27,
 'Functional Concept': 40,
 'Qualitative Concept': 21,
 'Tissue': 38,
 'Organ or Tissue Function': 8,
 'Manufactured Object': 4,
 'Body Substance': 26,
 'Spatial Concept': 4,
 'Organism Function': 6,
 'Body System': 9}

In [8]:
df = pd.read_csv("..\\data\\raw\\train_concepts.csv")
df.head()

Unnamed: 0,ID,CUIs
0,ROCOv2_2023_train_000001,C0040405
1,ROCOv2_2023_train_000002,C0041618
2,ROCOv2_2023_train_000003,C0040405;C0817096;C0205271
3,ROCOv2_2023_train_000004,C0002978;C0036033;C1266909;C0225317
4,ROCOv2_2023_train_000005,C0040405;C0817096;C0497156


Disease or Syndrome (271) → è il più importante, rappresenta direttamente malattie e sindromi.<br>

Neoplastic Process (98) → include tumori e neoplasie, molto rilevante in radiologia.<br>

Pathologic Function (119) → processi patologici generici (es. infiammazione).<br>

Congenital Abnormality (53) → malformazioni congenite.<br>

Acquired Abnormality (26) → anomalie acquisite (traumi, degenerazioni).<br>

Anatomical Abnormality (59) → strutture anatomiche anormali (es. aneurisma).<br>

In [9]:
relevant_st = {'Disease or Syndrome': 0, 
               'Neoplastic Process': 0, 
               'Pathologic Function': 0, 
               'Congenital Abnormality': 0, 
               'Acquired Abnormality': 0, 
               'Anatomical Abnormality': 0
               }

for cuis in df["CUIs"]:
    cuis = cuis.split(";")
    for cui in cuis:
        if cui not in cuis_semantic_types:
            continue
        st = cuis_semantic_types[cui]
        for semantic in st:
            if semantic in relevant_st:
                relevant_st[semantic] += 1

relevant_st
    

{'Disease or Syndrome': 7676,
 'Neoplastic Process': 3427,
 'Pathologic Function': 11213,
 'Congenital Abnormality': 902,
 'Acquired Abnormality': 828,
 'Anatomical Abnormality': 2857}

Disease or Syndrome + Neoplastic Process + Anatomical Abnormality

In [22]:
important_st = ['Disease or Syndrome', 'Neoplastic Process', 'Anatomical Abnormality']

def get_dataframe(split):
    cuis_semantic_types = pickle.load(open("..\\data\\interim\\cuis_semantic_types.pkl", "rb"))

    base_dir = "..\\data\\raw\\"
    df = pd.read_csv(os.path.join(base_dir, f"{split}_concepts.csv"))
    
    print(f"Initial shape: {df.shape}")
    # split cuis in a list
    df["CUIs"] = df["CUIs"].apply(lambda x: x.split(";"))
    
    # get cuis of interest (Disease or Syndrome, Neoplastic Process, Anatomical Abnormality)
    def filter_cuis(cuis):
        filtered = [
            cui 
            for cui in cuis 
            if cui in cuis_semantic_types
            for semantic in cuis_semantic_types[cui] 
            if semantic in important_st
        ]
        return filtered if filtered else None
    
    df["CUIs"] = df["CUIs"].apply(filter_cuis)
    
    # eliminate empty cuis
    df = df.dropna(subset=["CUIs"])
    print(f"Shape after filtering CUIs: {df.shape}")
    
    # get captions
    df_captions = pd.read_csv(os.path.join(base_dir, f"{split}_captions.csv")).set_index("ID")
    df = df.set_index("ID").join(df_captions, how="inner").reset_index()
    print(f"Shape after joining with captions: {df.shape}")
    
    # get images
    df_images = pd.DataFrame()
    image_files = os.listdir(os.path.join(base_dir, split))
    df_images["Image"] = image_files
    df_images["ID"] = df_images["Image"].apply(lambda x: x.split(".")[0])
    
    df = df.set_index("ID").join(df_images.set_index("ID"), how="inner").reset_index()
    print(f"Shape after joining with images: {df.shape}")
    
    return df

df = get_dataframe("train")
df.to_pickle("..\\data\\interim\\train_interim.pkl")

Initial shape: (59958, 2)
Shape after filtering CUIs: (12122, 2)
Shape after joining with captions: (12122, 3)
Shape after joining with images: (12119, 4)


In [4]:
df = pd.read_pickle("..\\data\\interim\\train_interim.pkl")

Now, let's see the frequence of every cui in the dataframe

In [25]:
cui_freq = Counter(itertools.chain.from_iterable(df["CUIs"]))
cui_freq = {k: v for k, v in cui_freq.items() if v >= 100}
cui_freq

{'C0497156': 300,
 'C0001304': 270,
 'C0028259': 738,
 'C0003962': 249,
 'C0027651': 1392,
 'C1290884': 229,
 'C2733397': 228,
 'C0006826': 141,
 'C0032326': 313,
 'C0000833': 206,
 'C2939419': 304,
 'C0009450': 131,
 'C5203670': 141,
 'C0006267': 105,
 'C0032285': 178,
 'C0031039': 366,
 'C1510420': 589,
 'C0016169': 546,
 'C0020295': 163,
 'C0025062': 222,
 'C0032320': 121}

In [26]:
df_cui_map = pd.read_csv("..\\data\\raw\\cui_mapping.csv")

df_cui_map = df_cui_map[df_cui_map["CUI"].apply(lambda cui: cui in cui_freq)]
df_cui_map["Frequency"] = df_cui_map["CUI"].apply(lambda cui: cui_freq.get(cui, 0))

df_cui_map = df_cui_map.sort_values(by="Frequency", ascending=False)

df_cui_map

Unnamed: 0,CUI,Canonical name,Frequency
706,C0027651,Neoplasms,1392
1371,C0028259,Nodule,738
618,C1510420,Cavitation,589
1232,C0016169,pathologic fistula,546
1796,C0031039,Pericardial effusion,366
642,C0032326,Pneumothorax,313
1823,C2939419,Secondary Neoplasm,304
1584,C0497156,Lymphadenopathy,300
1169,C0001304,Acute abscess,270
1732,C0003962,Ascites,249


In [13]:
df_cui_map

Unnamed: 0,CUI,Canonical name,Frequency
706,C0027651,Neoplasms,1392
1371,C0028259,Nodule,738
618,C1510420,Cavitation,589
1232,C0016169,pathologic fistula,546
1796,C0031039,Pericardial effusion,366
642,C0032326,Pneumothorax,313
1823,C2939419,Secondary Neoplasm,304
1584,C0497156,Lymphadenopathy,300


In [27]:
def filter_cuis_final(cuis):
    filtered = [cui for cui in cuis if cui in cui_freq]
    return filtered if filtered else None
print(f"Shape before filtering by frequency: {df.shape}")
df["CUIs"] = df["CUIs"].apply(filter_cuis_final)
df = df.dropna(subset=["CUIs"])
print(f"Shape after filtering by frequency: {df.shape}")
df.to_pickle("..\\data\\interim\\train_interim_filtered.pkl")

Shape before filtering by frequency: (12119, 4)
Shape after filtering by frequency: (6458, 4)


In [5]:
df = pd.read_pickle("..\\data\\interim\\train_interim_filtered.pkl")

In [28]:
sum(1 for cuis in df["CUIs"] if len(cuis) > 1)

450

In [29]:
all_cuis = set(df["CUIs"].explode().tolist())
print(len(all_cuis))

21


The ones with < 300 may cause imbalance

In [30]:
all_cuis = set(df["CUIs"].explode().tolist())

all_cuis = {k:v for k,v in zip(all_cuis, range(1,len(all_cuis)+1))}

cuis_vec = [
            [
                all_cuis[cui] 
                for cui in cuis
            ]
            for cuis in df["CUIs"] 
            ]

df["CUIs_vec"] = cuis_vec

df.to_pickle("..\\data\\interim\\train_interim_filtered_vec.pkl")

In [31]:
len(df)

6458

## Making the dataset single-label

In [32]:
print(len(df))
# remove multiple labels
df = df[df["CUIs"].map(len) == 1].reset_index(drop=True)
print(len(df))

6458
6008


In [33]:
df["CUI"] = df["CUIs"].apply(lambda x: x[0])
df["CUI_vec"] = df["CUIs_vec"].apply(lambda x: x[0])
df

Unnamed: 0,ID,CUIs,Caption,Image,CUIs_vec,CUI,CUI_vec
0,ROCOv2_2023_train_000005,[C0497156],Thoracic CT scan showing perihilar pulmonary l...,ROCOv2_2023_train_000005.jpg,[10],C0497156,10
1,ROCOv2_2023_train_000007,[C0001304],Repeat CT abdomen and pelvis showing resolutio...,ROCOv2_2023_train_000007.jpg,[7],C0001304,7
2,ROCOv2_2023_train_000013,[C0028259],Enhanced magnetic resonance imaging of spinal ...,ROCOv2_2023_train_000013.jpg,[18],C0028259,18
3,ROCOv2_2023_train_000042,[C0003962],A slide from CT abdomen that shows that the pa...,ROCOv2_2023_train_000042.jpg,[21],C0003962,21
4,ROCOv2_2023_train_000044,[C0027651],Retroperitoneal abscess adjacent to the sigmoi...,ROCOv2_2023_train_000044.jpg,[3],C0027651,3
...,...,...,...,...,...,...,...
6003,ROCOv2_2023_train_060111,[C0016169],Coronal T2-weighted HR/SENSE MR image depictin...,ROCOv2_2023_train_060111.jpg,[8],C0016169,8
6004,ROCOv2_2023_train_060121,[C0028259],Chest CT scan that shows a nodule at the right...,ROCOv2_2023_train_060121.jpg,[18],C0028259,18
6005,ROCOv2_2023_train_060122,[C0028259],Chest CT scan that shows multiple small round ...,ROCOv2_2023_train_060122.jpg,[18],C0028259,18
6006,ROCOv2_2023_train_060145,[C0027651],CT Scan of The Mediastina. Post-Chemotherapy F...,ROCOv2_2023_train_060145.jpg,[3],C0027651,3


In [34]:
df.to_pickle("..\\data\\interim\\train_interim_filtered_singlelabel.pkl")

In [35]:
selected_cuis = [
    "C0032285",  # Pneumonia
    "C0020295",  # Hydronephrosis
    "C5203670",  # COVID19 (disease)
    "C0006826",  # Malignant neoplastic disease
    "C0006267",  # Bronchiectasis
    "C0031039",  # Pericardial effusion
    "C0032326",  # Pneumothorax
    "C0003962",  # Ascites
    "C0497156",  # Lymphadenopathy
    "C0001304",  # Acute abscess
    "C0000833",  # Abscess
    "C0025062",  # Mediastinal emphysema
]

print(f"Shape before filtering by selected CUIs: {df.shape}")
df = df[df["CUI"].isin(selected_cuis)]
print(f"Shape after filtering by selected CUIs: {df.shape}")
df.to_pickle("..\\data\\interim\\train_interim_filtered_singlelabel_selectedcuis.pkl")

Shape before filtering by selected CUIs: (6008, 7)
Shape after filtering by selected CUIs: (2227, 7)


In [36]:
df

Unnamed: 0,ID,CUIs,Caption,Image,CUIs_vec,CUI,CUI_vec
0,ROCOv2_2023_train_000005,[C0497156],Thoracic CT scan showing perihilar pulmonary l...,ROCOv2_2023_train_000005.jpg,[10],C0497156,10
1,ROCOv2_2023_train_000007,[C0001304],Repeat CT abdomen and pelvis showing resolutio...,ROCOv2_2023_train_000007.jpg,[7],C0001304,7
3,ROCOv2_2023_train_000042,[C0003962],A slide from CT abdomen that shows that the pa...,ROCOv2_2023_train_000042.jpg,[21],C0003962,21
7,ROCOv2_2023_train_000090,[C0006826],"FDG-PET showed accumulation, with a maximum st...",ROCOv2_2023_train_000090.jpg,[20],C0006826,20
8,ROCOv2_2023_train_000093,[C0032326],"Chest radiography, posterior view showing pneu...",ROCOv2_2023_train_000093.jpg,[13],C0032326,13
...,...,...,...,...,...,...,...
5992,ROCOv2_2023_train_060034,[C0032285],Chest X-ray demonstrating bilateral pulmonary ...,ROCOv2_2023_train_060034.jpg,[11],C0032285,11
5993,ROCOv2_2023_train_060039,[C0031039],Transgastric apical short-axis view by transes...,ROCOv2_2023_train_060039.jpg,[15],C0031039,15
5994,ROCOv2_2023_train_060040,[C0031039],A straying tip of the IMPELLA® in the left ven...,ROCOv2_2023_train_060040.jpg,[15],C0031039,15
5999,ROCOv2_2023_train_060071,[C0020295],"A 45-year-old woman with a large, asymptomatic...",ROCOv2_2023_train_060071.jpg,[17],C0020295,17
