In [None]:
import pandas as pd
import os
import numpy as np
from umls_api import API
from dotenv import load_dotenv
import pickle
import itertools

load_dotenv()

: 

: 

In [9]:
# Get a list of all the cuis
df_cuis = pd.concat([pd.read_csv("..\\data\\raw\\train_concepts.csv").set_index("ID"),pd.read_csv("..\\data\\raw\\test_concepts.csv").set_index("ID")])
xss = [cuis.split(";") for cuis in df_cuis["CUIs"]]
cuis = [x for xs in xss for x in xs]

In [12]:
# get the 200 most frequent cuis
def get_top_cuis_rank(cui_list, k_items=200):
    # conta le occorrenze
    counts = {}
    for cui in cui_list:
        counts[cui] = counts.get(cui, 0) + 1

    # ordina per frequenza decrescente
    sorted_cuis = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    # prendi solo i k più frequenti
    top_k = itertools.islice(sorted_cuis, k_items)

    # crea mapping CUI -> rank (partendo da 1)
    rank_mapping = {cui: rank for rank, (cui, _) in enumerate(top_k, start=1)}
    return rank_mapping


most_frequent_cuis = get_top_cuis_rank(cuis)

# Creating interim dataframe
Get image, text, CUIs of the train and test split.

In [None]:
def create_dataframe(split):
    """Creates a pandas dataframe for the RocoV2 dataset"""
    base_dir = "..\\data\\raw"
    images_dir = f"{base_dir}\\{split}"
    captions_path = f"{base_dir}\\{split}_captions.csv"
    concepts_path = f"{base_dir}\\{split}_concepts.csv"
    
    
    # ---------------------- CUIs filering ----------------------
    
    # create cuis_list and make cuis into a list
    df = pd.read_csv(concepts_path)    
    df["CUIs"] = df["CUIs"].str.split(";") 

    
    def filter_and_clean_cuis(cuis_list, frequent_cuis_set):
        """Deletes every non-frequent CUI"""
        if not isinstance(cuis_list, list):
            return None
        
        filtered = [c for c in cuis_list if c in frequent_cuis_set]
        return filtered if filtered else None
    
    # deletes non frequent cuis
    df["CUIs"] = df["CUIs"].apply(lambda x: filter_and_clean_cuis(x, set(most_frequent_cuis.keys())))
    # deletes rows without cuis
    df= df.dropna(subset=["CUIs"])
    
    df = df.set_index("ID")

    # --------------------- CUIs filered! ----------------------

    # add vec
    df["CUIs_vec"] = df["CUIs"].apply(lambda cui_list: [most_frequent_cuis[cui] for cui in cui_list])

    # add text
    df = df.join(pd.read_csv(captions_path).set_index("ID"), how="inner")

    # -------------------- images filering ---------------------
    
    print("Checking images, this might take a while...")
    def check_image(data_id):
        img_path = f"{images_dir}\\{data_id}.jpg"
        return img_path if os.path.isfile(img_path) else None

    df["image_path"] = df.index.map(check_image) 

    # delete rows without image
    df = df.dropna(subset=["image_path"])
    
    # ------------------- images filered! ----------------------
    print(f"Created dataframe for {split} with {len(df)} items")
    return df

In [16]:
df_train = create_dataframe("train")
df_train.to_pickle("..\\data\\interim\\df_train_interim.pkl")

df_test = create_dataframe("test")
df_test.to_pickle("..\\data\\interim\\df_test_interim.pkl")


Checking images, this might take a while...
Checking images, this might take a while...


In [18]:
df_train.head()

Unnamed: 0_level_0,CUIs,CUIs_vec,Caption,image_path
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ROCOv2_2023_train_000001,[C0040405],[1],Head CT demonstrating left parotiditis.,..\data\raw\train\ROCOv2_2023_train_000001.jpg
ROCOv2_2023_train_000002,[C0041618],[4],Acquired renal cysts in end-stage renal failur...,..\data\raw\train\ROCOv2_2023_train_000002.jpg
ROCOv2_2023_train_000003,"[C0040405, C0817096, C0205271]","[1, 5, 38]",Computed tomography of the chest showing the r...,..\data\raw\train\ROCOv2_2023_train_000003.jpg
ROCOv2_2023_train_000004,"[C0002978, C0036033, C1266909, C0225317]","[7, 157, 22, 28]",Lateral view of the sacrum showing the low con...,..\data\raw\train\ROCOv2_2023_train_000004.jpg
ROCOv2_2023_train_000005,"[C0040405, C0817096, C0497156]","[1, 5, 94]",Thoracic CT scan showing perihilar pulmonary l...,..\data\raw\train\ROCOv2_2023_train_000005.jpg


# Adding Semantic types
To do this, we first have to connect to the UMLS API and iterate through every cui to get it's semantic. <br>
To use this, you need an UMLS API KEY that you can request for free at https://www.nlm.nih.gov/research/umls/index.html <br>
Unfortunately, the api does not support batch requests, so the process will take a while.


In [19]:
cui_set = set()
for cui in df_train["CUIs"].explode().to_list() + df_test["CUIs"].explode().to_list():
    cui_set.add(cui)

In [None]:
cuis_semantic_types = dict()
for idx,cui in enumerate(cui_set):
    print(f"\rProcessing CUI {idx+1}/{len(cui_set)}", flush=True, end="")
    # get cui information
    api = API(api_key=os.getenv("UMLS_API_KEY"))
    resp = api.get_cui(cui=cui)
    
    # get semantic types
    cui_semantic_types = resp["result"]["semanticTypes"]
    names = [st["name"] for st in cui_semantic_types]
    
    # get all the semantic types for that cui 
    cuis_semantic_types[cui] = set()
    for name in names:
        cuis_semantic_types[cui].add(name)

with open("..\\data\\interim\\cuis_semantic_types.pkl", "wb") as f:
    pickle.dump(cuis_semantic_types, f)

# Creating final dataframe
Now, we get the final dataframe with the semantic types

In [25]:
def add_semantic_types(df):
    semantic_types = []
    for cuis in df["CUIs"]:
        data_semantic_types = set()
        for cui in cuis:
            for semantic_type in cuis_semantic_types.get(cui):
                data_semantic_types.add(semantic_type)
        semantic_types.append(list(data_semantic_types))
    
    df["Semantic"] = semantic_types

In [27]:
add_semantic_types(df_train)
add_semantic_types(df_test)

In [28]:
all_semantic_types = set(df_train["Semantic"].explode().tolist() + df_train["Semantic"].explode().tolist())

semantic_types_num = {k: i for k, i in zip(all_semantic_types, range(1, len(all_semantic_types)+1))} # get a number for each semantic type

In [29]:
def add_semantic_vec(df):
    semantic_vec = []
    for semantics in df["Semantic"]:
        semantic_nums = []
        for semantic in semantics:
            semantic_nums.append(semantic_types_num.get(semantic))
        semantic_vec.append(semantic_nums)
    df["Semantic_vec"] = semantic_vec

In [30]:
add_semantic_vec(df_train)
add_semantic_vec(df_test)

In [31]:
df_train.head()

Unnamed: 0_level_0,CUIs,CUIs_vec,Caption,image_path,Semantic,Semantic_vec
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ROCOv2_2023_train_000001,[C0040405],[1],Head CT demonstrating left parotiditis.,..\data\raw\train\ROCOv2_2023_train_000001.jpg,[Diagnostic Procedure],[5]
ROCOv2_2023_train_000002,[C0041618],[4],Acquired renal cysts in end-stage renal failur...,..\data\raw\train\ROCOv2_2023_train_000002.jpg,[Diagnostic Procedure],[5]
ROCOv2_2023_train_000003,"[C0040405, C0817096, C0205271]","[1, 5, 38]",Computed tomography of the chest showing the r...,..\data\raw\train\ROCOv2_2023_train_000003.jpg,"[Body Location or Region, Qualitative Concept,...","[1, 8, 5]"
ROCOv2_2023_train_000004,"[C0002978, C0036033, C1266909, C0225317]","[7, 157, 22, 28]",Lateral view of the sacrum showing the low con...,..\data\raw\train\ROCOv2_2023_train_000004.jpg,"[Body Location or Region, Tissue, Diagnostic P...","[1, 15, 5, 6]"
ROCOv2_2023_train_000005,"[C0040405, C0817096, C0497156]","[1, 5, 94]",Thoracic CT scan showing perihilar pulmonary l...,..\data\raw\train\ROCOv2_2023_train_000005.jpg,"[Body Location or Region, Diagnostic Procedure...","[1, 5, 4]"


In [32]:
df_train.to_pickle("..\\data\\processed\\df_train.pkl")
df_test.to_pickle("..\\data\\processed\\df_test.pkl")