In [15]:
# Keep only the entries of "train.csv" whose class_name is exactly "No finding" or "Pleural effusion"

# Author: Marc Padrós 
# Date: 28th April 2022

# Find dataset at: https://www.kaggle.com/competitions/vinbigdata-chest-xray-abnormalities-detection/

# Read "train.csv" and, in the same pass, write the filtered entries to "train_process_stage-1.csv"
# This cell was implemented by taking into account the following documentation:
# https://docs.python.org/3/library/csv.html
import csv
# Stream "train.csv" and copy to "train_process_stage-1.csv" only the rows whose
# class_name is 'No finding' or 'Pleural effusion', keeping just the two columns
# that the following cells need.
with open('train.csv', 'r', newline='') as src, open('train_process_stage-1.csv', 'w', newline='') as dst:
    stage1Writer = csv.DictWriter(dst, fieldnames=['image_id', 'class_name'])
    stage1Writer.writeheader()
    # The generator is consumed lazily, so rows are still written one at a time.
    stage1Writer.writerows(
        {'image_id': entry['image_id'], 'class_name': entry['class_name']}
        for entry in csv.DictReader(src)
        if entry['class_name'] in ('No finding', 'Pleural effusion')
    )

In [16]:
# Split train and validation images into "normal" and "effusion" folders
# NOTE: There has to be the same number of "normal" and "effusion" images

# Process "train_process_stage-1.csv"

import os 

pathTrainVal = "processed_ds/train_val/"

# Every stored train/validation image is named "<image_id>.dicom.png".
imgSuffix = ".dicom.png"

# Ids of the images stored in pathTrainVal. Only entries that really end with the image
# suffix are kept: blindly cutting the last 10 characters of every entry would also count
# non-image entries (e.g. the "train_val_process/" sub-folder created inside this folder
# by a later cell) and turn them into bogus ids.
trainValImgsIds = [
    fname[:-len(imgSuffix)]
    for fname in os.listdir(pathTrainVal)
    if fname.endswith(imgSuffix)
]

print("Number of train/valid ims stored: ", len(trainValImgsIds))

# Ids that have not been written to the new csv yet. A set gives constant-time lookup and
# removal, instead of scanning the whole list once per csv row.
pendingImgsIds = set(trainValImgsIds)

nEffusionNormalIms = 0

# Copy to "train_process_stage-2.csv" the first row of "train_process_stage-1.csv" found
# for each stored image.
with open('train_process_stage-1.csv', 'r', newline='') as fin, open('train_process_stage-2.csv', 'w', newline='') as fout:
    reader = csv.DictReader(fin)
    fieldnames = ['image_id', 'class_name']
    writer = csv.DictWriter(fout, fieldnames=fieldnames)
    writer.writeheader()
    for row in reader:
        imageId = row['image_id']
        if imageId in pendingImgsIds:
            pendingImgsIds.remove(imageId)  # to avoid having rows on the new csv with duplicated 'image_id'
            writer.writerow({'image_id': imageId, 'class_name': row['class_name']})
            nEffusionNormalIms += 1

# As before, trainValImgsIds ends up holding only the ids that were NOT written to the csv.
trainValImgsIds = [imgId for imgId in trainValImgsIds if imgId in pendingImgsIds]

print("How many of the train/valid ims stored are normal/effusion?: ", nEffusionNormalIms)


Number of train/valid ims stored:  1999
How many of the train/valid ims stored are normal/effusion?:  1555


In [17]:
# Read "train_process_stage-2.csv"

# File names ("<image_id>.dicom.png") of the images listed in "train_process_stage-2.csv",
# grouped by class.
normalImgsNames = []
effusionImgsNames = []

with open('train_process_stage-2.csv', 'r', newline='') as stage2File:
    for entry in csv.DictReader(stage2File):
        imgName = entry['image_id'] + ".dicom.png"
        # Any row that is not 'No finding' is treated as 'Pleural effusion'.
        bucket = normalImgsNames if entry['class_name'] == 'No finding' else effusionImgsNames
        bucket.append(imgName)

print("Total effusion images: ", len(effusionImgsNames))

print("Total normal images: ", len(normalImgsNames))


Total effusion images:  133
Total normal images:  1422


In [24]:
import cv2

# Use the same number of effusion and normal images: the size of the smaller class
# (133 according to the previous cell output)
nEffusionNormalImgstoSplit = min(len(effusionImgsNames), len(normalImgsNames))

pathTrainValProcess = pathTrainVal + "train_val_process/"
pathNormalImgs = pathTrainValProcess + "normal/"
pathEffusionImgs = pathTrainValProcess + "effusion/"

# Before storing the split normal/effusion imgs, remove the content of the path where the imgs will be stored
#import shutil
#shutil.rmtree(pathTrainValProcess)

os.makedirs(pathNormalImgs, exist_ok=True) # create normal folder
os.makedirs(pathEffusionImgs, exist_ok=True) # create effusion folder


def _storeAsGrayscale(imgName, srcDir, dstDir):
    """Read srcDir/imgName as a grayscale image and write it to dstDir/imgName.

    Raises:
        OSError: if the image cannot be read or cannot be written.
    """
    srcPath = os.path.join(srcDir, imgName)
    imgOri = cv2.imread(srcPath, cv2.IMREAD_GRAYSCALE)
    # cv2.imread does not raise when the file is missing or unreadable, it returns None;
    # fail here with the offending path instead of inside cv2.imwrite.
    if imgOri is None:
        raise OSError("Could not read image (missing or unreadable): " + srcPath)
    dstPath = os.path.join(dstDir, imgName)
    # cv2.imwrite reports a failed write through its boolean result, so check it.
    if not cv2.imwrite(dstPath, imgOri):
        raise OSError("Could not write image: " + dstPath)


# 1st option: store effusion/normal imgs at the same time
for i in range(nEffusionNormalImgstoSplit):
    # Store a normal image inside "normal" folder
    _storeAsGrayscale(normalImgsNames[i], pathTrainVal, pathNormalImgs)
    # Store an effusion image inside "effusion" folder
    _storeAsGrayscale(effusionImgsNames[i], pathTrainVal, pathEffusionImgs)

'''
# 2nd option: store normal imgs first and then store effusion imgs
for i in range(nEffusionNormalImgstoSplit):
    # Store a normal image inside "normal" folder
    imgOri = cv2.imread(pathTrainVal + normalImgsNames[i], cv2.IMREAD_GRAYSCALE) 
    cv2.imwrite(os.path.join(pathNormalImgs, normalImgsNames[i]), imgOri)

for j in range(nEffusionNormalImgstoSplit):
    # Store an effusion image inside "effusion" folder
    imgOri = cv2.imread(pathTrainVal + effusionImgsNames[j], cv2.IMREAD_GRAYSCALE) 
    cv2.imwrite(os.path.join(pathEffusionImgs, effusionImgsNames[j]), imgOri)
'''