In [1]:
from pathlib import Path
import os
import random
import pandas as pd


In [12]:
# Collect the image ids listed in the dataset file.
#   all_images : every id found in the first comma-separated field of a line
#   no_finding : the subset of ids whose line contains the text 'No finding'
# NOTE(review): despite its name, `all_test_files` is the path of a single file.
all_test_files = Path('/vol/biomedic3/bglocker/ugproj2324/nns20/datasets/VinDr-CXR/image_text_reasoning_datasets/test_all_left_or_right')
no_finding = set()
all_images = set()
with open(all_test_files, 'r') as f:
    for line in f:
        # Drop the trailing newline and surrounding whitespace so that a line
        # without any comma cannot yield an id ending in '\n'.
        line = line.strip()
        if not line:
            # Blank lines would otherwise register '' as an image id.
            continue
        image_id = line.split(',')[0].strip()
        if not image_id:
            # A line that starts with a comma carries no usable id.
            continue
        all_images.add(image_id)
        if 'No finding' in line:
            no_finding.add(image_id)

In [26]:
# Randomly partition the collected image ids into train / val / test subsets.

# Fraction of the image pool assigned to each subset. The test subset receives
# whatever remains after the train and val slices have been taken.
train_proportion = 0.75
val_proportion = 0.10
test_proportion = 0.15
assert abs(train_proportion + val_proportion + test_proportion - 1.0) < 1e-9, \
    "train/val/test proportions must sum to 1"

# Fixed seed so that re-running the notebook reproduces the same split.
split_seed = 42

# Sort before shuffling: `all_images` starts out as a set of strings, and the
# iteration order of such a set can differ between interpreter runs, which
# would make the shuffle irreproducible even with a fixed seed. Sorting also
# makes this cell idempotent when it is re-run on the already shuffled list.
all_images = sorted(all_images)
# A dedicated Random instance keeps the global `random` state untouched.
random.Random(split_seed).shuffle(all_images)

num_images = len(all_images)
train_end = int(train_proportion * num_images)
val_end = int((train_proportion + val_proportion) * num_images)

train_images = all_images[:train_end]
val_images = all_images[train_end:val_end]
test_images = all_images[val_end:]


In [27]:
# Report, for every split, how many of its images are NOT in `no_finding`
# (split size minus the number of its ids flagged as 'No finding').

def _count_with_findings(split_ids):
    """Return how many ids in `split_ids` are absent from the `no_finding` set."""
    return len(split_ids) - len(no_finding.intersection(split_ids))

print(f"train set images with findings: {_count_with_findings(train_images)}")
print(f"val set images with findings: {_count_with_findings(val_images)}")
print(f"test set images with findings: {_count_with_findings(test_images)}")


train set images with findings: 728
val set images with findings: 96
test set images with findings: 125


In [18]:
# Persist each subset to its own text file, one image id per line.
split_outputs = (
    ('VinDr_test_train_split.txt', train_images),
    ('VinDr_test_val_split.txt', val_images),
    ('VinDr_test_test_split.txt', test_images),
)
for split_filename, split_ids in split_outputs:
    with open(split_filename, 'w') as split_handle:
        split_handle.writelines(image_id + '\n' for image_id in split_ids)

In [7]:

# CSV holding one row per image with an 'image_id' column plus label columns.
image_labels_test_csv = Path('/vol/biodata/data/chest_xray/VinDr-CXR/1.0.0_png_512/raw/annotations/image_labels_test.csv')

# Load the label table.
df = pd.read_csv(image_labels_test_csv)

# Build image_id -> list of column names whose value equals 1 in that row.
# Names keep the column order of the CSV; the 'image_id' column is never listed.
image_to_pathologies = {}
for record in df.to_dict(orient='records'):
    image_to_pathologies[record['image_id']] = [
        column
        for column, value in record.items()
        if column != 'image_id' and value == 1
    ]



In [2]:
# Paths (relative to the working directory) of the per-split id lists and of the
# per-split "<image_id>,<pathology>,<pathology>,..." label files derived from them.
train_split_file, val_split_file, test_split_file = map(Path, (
    'VinDr_test_train_split.txt',
    'VinDr_test_val_split.txt',
    'VinDr_test_test_split.txt',
))

(
    train_split_file_with_labels,
    val_split_file_with_labels,
    test_split_file_with_labels,
) = map(Path, (
    'VinDr_test_train_split_with_labels.csv',
    'VinDr_test_val_split_with_labels.csv',
    'VinDr_test_test_split_with_labels.csv',
))

# --- DISABLED -------------------------------------------------------------
# The commented-out code below is not executed. When enabled it reads the ids
# back from the split files and, using `image_to_pathologies`, writes one
# "<image_id>,<pathology>,..." line per image to the *_with_labels.csv files.
# --------------------------------------------------------------------------

# with open(train_split_file,'r') as f:
#     train_images = f.readlines()
#     train_images = [image.strip() for image in train_images]

# with open(val_split_file,'r') as f:
#     val_images = f.readlines()
#     val_images = [image.strip() for image in val_images]

# with open(test_split_file,'r') as f:
#     test_images = f.readlines()
#     test_images = [image.strip() for image in test_images]

# with open(train_split_file_with_labels,'w') as f:

#     for image_id in train_images:
#         pathologies = image_to_pathologies[image_id]
#         f.write(image_id+','+','.join(pathologies)+'\n')

# with open(val_split_file_with_labels,'w') as f:

#     for image_id in val_images:
#         pathologies = image_to_pathologies[image_id]
#         f.write(image_id+','+','.join(pathologies)+'\n')

# with open(test_split_file_with_labels,'w') as f:

#     for image_id in test_images:
#         pathologies = image_to_pathologies[image_id]
#         f.write(image_id+','+','.join(pathologies)+'\n')
        



In [5]:
# Set-up for encoding label names as a fixed-length list of 0s and 1s.

# Destination of the encoded test split.
test_split_file_with_one_hot_labels = Path('VinDr_test_test_split_with_one_hot_labels.csv')

# Label vocabulary. The position of a name in this list is its position in the
# encoded vector and its column position in the output header.
vindr_pathologies = [
    "Aortic enlargement",
    "Atelectasis",
    "Calcification",
    "Cardiomegaly",
    "Clavicle fracture",
    "Consolidation",
    "Emphysema",
    "Enlarged PA",
    "ILD",
    "Infiltration",
    "Lung Opacity",
    "Lung cavity",
    "Lung cyst",
    "Mediastinal shift",
    "Nodule/Mass",
    "Pleural effusion",
    "Pleural thickening",
    "Pneumothorax",
    "Pulmonary fibrosis",
    "Rib fracture",
    "Other lesion",
    "No finding",
]

# Lookup from label name to its position in `vindr_pathologies`.
pathology_indices = dict(zip(vindr_pathologies, range(len(vindr_pathologies))))

def convert_pathologies_to_array(pathology_list, pathology_indices):
    """Encode a list of label names as a list of 0/1 flags.

    Args:
        pathology_list: iterable of label names to mark as present.
        pathology_indices: mapping from label name to its position in the
            result; the result has one entry per key of this mapping.

    Returns:
        A list of ints, 1 at the position of every name found in
        `pathology_indices` and 0 elsewhere. Names missing from the mapping
        are ignored.
    """
    flags = [0 for _ in pathology_indices]
    for name in pathology_list:
        position = pathology_indices.get(name)
        if position is None:
            # Unknown label: leave the vector untouched.
            continue
        flags[position] = 1
    return flags

def process_file(input_file_path, output_file_path):
    """Rewrite a "<image_id>,<label>,<label>,..." file as a 0/1-encoded CSV.

    The output starts with a header row 'image_id,<names in vindr_pathologies>'
    and then holds one row per non-blank input line: the image id followed by
    one 0/1 value per entry of `vindr_pathologies`, in that order.

    Args:
        input_file_path: path of the text file to read; the first
            comma-separated field of each line is the image id, the remaining
            fields are label names.
        output_file_path: path of the CSV to create (overwritten if present).

    Relies on the module-level `vindr_pathologies`, `pathology_indices` and
    `convert_pathologies_to_array`. Label names that are not keys of
    `pathology_indices` are ignored by the encoder.
    """
    with open(input_file_path, 'r') as file, open(output_file_path, 'w') as out_file:
        out_file.write('image_id,' + ','.join(vindr_pathologies) + '\n')
        for line in file:
            line = line.strip()
            if not line:
                # A blank line would otherwise produce a row with an empty
                # image id and an all-zero label vector.
                continue
            # Strip each field so whitespace around commas cannot turn a known
            # label into an unknown one that is silently dropped.
            image_id, *pathologies = (field.strip() for field in line.split(','))
            pathology_array = convert_pathologies_to_array(pathologies, pathology_indices)
            out_file.write(f"{image_id}," + ','.join(map(str, pathology_array)) + '\n')

# Encode the test split only. The labelled input file is opened for reading,
# so it must already exist on disk when this cell runs.
process_file(
    input_file_path=test_split_file_with_labels,
    output_file_path=test_split_file_with_one_hot_labels,
)
