In [14]:
import os,sys
sys.path.insert(0,"..")
from glob import glob
import matplotlib.pyplot as plt
import shutil
import numpy as np
import pandas as pd
import pathlib
import torch
import torchvision
import torchxrayvision as xrv
import ast
import math
import re
import skimage
from PIL import Image
from torchvision.transforms.functional import to_pil_image

from concurrent.futures import ThreadPoolExecutor
import os
import skimage.io
import skimage.color

In [12]:
def convert_size(size_bytes):
    if size_bytes == 0:
        return "0B"
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    i = int(math.floor(math.log(size_bytes, 1024)))
    p = math.pow(1024, i)
    s = round(size_bytes / p, 2)
    return f"{s} {size_name[i]}"

def get_dir_size(start_path = '.'):
    total_size = 0
    total_files = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_files += 1
            if not os.path.islink(fp):
                total_size += os.path.getsize(fp)
    return convert_size(total_size), total_files

In [16]:
print(get_dir_size("/ssd/jpmokc/analysis/Notebooks/test_imgs_pad"))
print(get_dir_size("/ssd/jpmokc/analysis/Notebooks/test_imgs_pad/PNG"))
print(get_dir_size("/ssd/jpmokc/analysis/Notebooks/test_imgs_pad/JPEG"))
print(get_dir_size("/ssd/jpmokc/analysis/Notebooks/test_imgs_mim"))
print(get_dir_size("/ssd/jpmokc/analysis/Notebooks/test_imgs_mim/PNG"))
print(get_dir_size("/ssd/jpmokc/analysis/Notebooks/test_imgs_mim/JPEG"))

('65.64 MB', 27)
('186.6 KB', 9)
('47.85 KB', 9)
('14.09 MB', 27)
('200.27 KB', 9)
('52.92 KB', 9)


In [31]:
#Avery's code
def print_image_shapes(folder_path):
    for dirpath, _, filenames in os.walk(folder_path):
        for filename in filenames:
            if filename.endswith('.jpg') or filename.endswith('.png'):
                image_path = os.path.join(dirpath, filename)
                img = skimage.io.imread(image_path)
                print(f"Image {filename} shape: {img.shape}")
                
def process_image(source_file_path, image_folder, file_format):
    transform = torchvision.transforms.Compose([xrv.datasets.XRayCenterCrop(), xrv.datasets.XRayResizer(224)])
    dest_dir_path = os.path.join(image_folder, file_format.upper())
    os.makedirs(dest_dir_path, exist_ok=True)
    base_filename, _ = os.path.splitext(os.path.basename(source_file_path))
    dest_file_path = os.path.join(dest_dir_path, f"{base_filename}.{file_format.lower()}")

    img = skimage.io.imread(source_file_path)
    img = img / img.max()

    if len(img.shape) == 3:
        if img.shape[2] == 4:
            img = skimage.color.rgba2rgb(img)
        img = skimage.color.rgb2gray(img)
    elif len(img.shape) < 2:
        print("error, dimension lower than 2 for image")

    img = img[None, :, :]

    if transform is not None:
        img = transform(img)

    img = torch.from_numpy(img)

    pil_transform = torchvision.transforms.ToPILImage(mode='L')
    img = pil_transform(img)

    save_format = file_format if file_format != 'JPG' else 'JPEG'  # Use 'JPEG' for PIL saving
    img.save(dest_file_path, format=save_format)

def process_images_in_folder(image_folder):
    original_size, original_files = get_dir_size(image_folder)
    print(f"Original size of folder {image_folder}: {original_size}")

    image_paths = [os.path.join(image_folder, filename) for filename in os.listdir(image_folder) if filename.endswith('.jpg') or filename.endswith('.png')]

    for file_format in ['JPG', 'PNG']:
        with ThreadPoolExecutor() as executor:
            list(executor.map(process_image, image_paths, [image_folder]*len(image_paths), [file_format]*len(image_paths)))

    jpg_size, jpg_files = get_dir_size(os.path.join(image_folder, 'JPG'))
    png_size, png_files = get_dir_size(os.path.join(image_folder, 'PNG'))
    print(f"New JPG folder size: {jpg_size}, New PNG folder size: {png_size}")
    # print_image_shapes(os.path.join(image_folder, 'JPG'))
    # print_image_shapes(os.path.join(image_folder, 'PNG'))


In [36]:
mimic_path = "/ssd2/jpmokc/datasets/physionet.org/files/mimic-cxr-jpg/2.0.0/files"
mimic_reduced_path = "/ssd2/jpmokc/datasets/training/data/mimic"

pad_path= "/ssd2/jpmokc/datasets/PADCHEST_SJ/image_zips"
pad_reduced_path = "/ssd2/jpmokc/datasets/training/data/pad-chest/no-finding/ap"

print(get_dir_size(mimic_path))
print(get_dir_size(mimic_reduced_path))
print(get_dir_size(pad_path))
print(get_dir_size(pad_reduced_path))

('488.25 GB', 620669)
('83.46 GB', 148998)
('840.79 GB', 139409)
('2.27 GB', 291)


In [32]:
test_img_dir_pad = "/ssd/jpmokc/analysis/Notebooks/test_imgs_pad/"
test_img_dir_mim = "/ssd/jpmokc/analysis/Notebooks/test_imgs_mim/"

pad_dir_eff_ap = "/ssd2/jpmokc/datasets/training/data/pad-chest/effusion/ap"
pad_dir_eff_pa = "/ssd2/jpmokc/datasets/training/data/pad-chest/effusion/pa"
pad_dir_cardio_ap = "/ssd2/jpmokc/datasets/training/data/pad-chest/cardiomegaly/ap"
pad_dir_cardio_pa = "/ssd2/jpmokc/datasets/training/data/pad-chest/cardiomegaly/pa"
pad_dir_norm_ap = "/ssd2/jpmokc/datasets/training/data/pad-chest/no-finding/ap"
pad_dir_norm_pa = "/ssd2/jpmokc/datasets/training/data/pad-chest/no-finding/pa"
pad_dir_pnu_ap = "/ssd2/jpmokc/datasets/training/data/pad-chest/pneumonia/ap"
pad_dir_pnu_pa = "/ssd2/jpmokc/datasets/training/data/pad-chest/pneumonia/pa"

mimic_dir_eff_ap = "/ssd2/jpmokc/datasets/training/data/mimic/effusion/ap"
mimic_dir_eff_pa = "/ssd2/jpmokc/datasets/training/data/mimic/effusion/pa"
mimic_dir_cardio_ap = "/ssd2/jpmokc/datasets/training/data/mimic/cardiomegaly/ap"
mimic_dir_cardio_pa = "/ssd2/jpmokc/datasets/training/data/mimic/cardiomegaly/pa"
mimic_dir_norm_ap = "/ssd2/jpmokc/datasets/training/data/mimic/no-finding/ap"
mimic_dir_norm_pa = "/ssd2/jpmokc/datasets/training/data/mimic/no-finding/pa"
mimic_dir_pnu_ap = "/ssd2/jpmokc/datasets/training/data/mimic/pneumonia/ap"
mimic_dir_pnu_pa = "/ssd2/jpmokc/datasets/training/data/mimic/pneumonia/pa"


In [33]:

process_images_in_folder(mimic_dir_eff_pa)
process_images_in_folder(mimic_dir_eff_ap)
process_images_in_folder(mimic_dir_norm_pa)
process_images_in_folder(mimic_dir_norm_ap)
process_images_in_folder(mimic_dir_pnu_pa)
process_images_in_folder(mimic_dir_pnu_ap)
process_images_in_folder(pad_dir_eff_pa)
process_images_in_folder(pad_dir_eff_ap)
process_images_in_folder(pad_dir_norm_pa)
process_images_in_folder(pad_dir_norm_ap)
process_images_in_folder(pad_dir_pnu_pa)
process_images_in_folder(pad_dir_pnu_ap)


Original size of folder /ssd2/jpmokc/datasets/training/data/mimic/effusion/pa: 1.11 GB
New JPG folder size: 4.52 MB, New PNG folder size: 17.17 MB
Original size of folder /ssd2/jpmokc/datasets/training/data/mimic/effusion/ap: 1.05 GB
New JPG folder size: 3.54 MB, New PNG folder size: 13.51 MB
Original size of folder /ssd2/jpmokc/datasets/training/data/mimic/no-finding/pa: 51.75 GB
New JPG folder size: 186.69 MB, New PNG folder size: 705.22 MB
Original size of folder /ssd2/jpmokc/datasets/training/data/mimic/no-finding/ap: 19.18 GB
New JPG folder size: 63.9 MB, New PNG folder size: 243.9 MB
Original size of folder /ssd2/jpmokc/datasets/training/data/mimic/pneumonia/pa: 2.01 GB
New JPG folder size: 7.45 MB, New PNG folder size: 28.03 MB
Original size of folder /ssd2/jpmokc/datasets/training/data/mimic/pneumonia/ap: 699.74 MB
New JPG folder size: 2.39 MB, New PNG folder size: 8.99 MB
Original size of folder /ssd2/jpmokc/datasets/training/data/pad-chest/effusion/pa: 6.62 GB
New JPG folder 

SyntaxError: broken PNG file (chunk b'\x00\x00\x00\x00') (<string>)