In [1]:
# This is used to increase the notebook's width to fill the screen, allowing for better plot visualization
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import os
import cv2
import time
import shutil
import numpy as np
import pandas as pd

from tqdm import tqdm

  from IPython.core.display import display, HTML


# Path to Dataset

In [2]:
# Relative path to dataset
data_dir = os.path.join( "..", "..", "..", "..", "Datasets", "COVID19", "Tomografia", "COVIDx CT-3A" )
assert os.path.exists( data_dir ), "Unable to find the relative path to COVIDx CT-3A, please check data_dir..."

# Path to metadata csv
csv_path = os.path.join( data_dir, "new_split_metadata.csv" )

# Reads metadata as dataframe, "age" column is treated as str since "N/A" can't be int
samples_df = pd.read_csv(csv_path, sep = ";", na_filter = False, dtype={"age": str})
print( len(samples_df) )
samples_df.head()

425024


Unnamed: 0,filename,patient_id,source,class,country,sex,age,partition,slice_selection,x_min,y_min,x_max,y_max,verified_finding,view,modality
0,NCP_96_1328_0032.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,9,94,512,405,Yes,Axial,CT
1,NCP_96_1328_0035.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,10,106,512,405,Yes,Axial,CT
2,NCP_96_1328_0036.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,10,105,512,406,Yes,Axial,CT
3,NCP_96_1328_0037.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,11,104,512,406,Yes,Axial,CT
4,NCP_96_1328_0038.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,11,103,512,406,Yes,Axial,CT


# Resize Images

In [3]:
def save_img( exp_path, src_img ):
    # Saves the resized image (creates saving directory if needed)
    exp_dir = os.path.dirname(exp_path)
    if not os.path.exists(exp_dir):
        os.makedirs(exp_dir)
    cv2.imwrite(exp_path, src_img)
    return

def resize_samples( df, import_dir, sub_dir, export_dir, dataset, sizes, seed = 42 ):
    
    # Sets a seed for reprodutibility
    np.random.seed( seed )
    
    # Filters inputed df to select only rows from the current source
    source_df = df[ df["source"] == dataset ].copy(deep = True)
    source_df.reset_index(drop = True, inplace = True)
    
    # Converts to dict for faster iteration through rows
    df_dict = source_df.to_dict( "records" )
    for row in tqdm(df_dict):
        
        # Paths to import the original image (512x512) and export the resized one (256x256)
        import_path = os.path.join( import_dir, sub_dir, row["filename"] )
        ex_csv_path = os.path.join( row["source"], row["partition"], row["filename"] )
        
        if all([os.path.exists(os.path.join( export_dir, f"{size}x{size}", ex_csv_path )) for size in sizes]):
            continue
        
        # Loads original image
        src_img = cv2.imread( import_path, -1 )
                
        for size in sizes:
            export_path = os.path.join( export_dir, f"{size}x{size}", ex_csv_path )
            if os.path.exists(export_path):
                continue

            # Resizes using a random interpolation method
            # As a seed was set at the start, the interpolation for each sample will be the same for every execution
            # Used Interpolations are: 0 = INTER_NEAREST, 1 = INTER_LINEAR, 2 = INTER_CUBIC, 3 = INTER_AREA, 4 = INTER_LANCZOS4
            dst_img = cv2.resize( src_img, (size, size), interpolation = np.random.randint(0, 5) )

            # Copies image (creates saving directory if needed)
            save_img( export_path, dst_img )
    
    # Saving dataset with updated paths
    source_df["path"] = source_df.apply( lambda row: os.path.join(row["source"], row["partition"], row["filename"]), axis = 1)
    
    # Saves the dataframe
    dataset_csv_path = os.path.join( export_dir, "{}_data.csv".format(dataset) )
    source_df.to_csv( dataset_csv_path, index = False, sep = ";" )
    
    return

In [4]:
# export_dir = os.path.join( "..", "data", "classification" )
export_dir = os.path.join( "..", "..", "..", "..", "Datasets", "COVID19", "CT", "classification" )

# List of unique sources (datasets used to build COVIDx CT-3A)
source_list = [ "radiopaedia.org", "LIDC-IDRI", "COVID-CTset", "Stony Brook", 
                "COVID-CT-MD", "iCTCF", "CNCB", "COVID-19-CT-Seg", "TCIA", "STOIC" ]

for src in np.unique( samples_df["source"].to_list() ):
    print(src in source_list, src)
print("\n")

sizes_list = [224, 240, 256, 260, 299, 300, 384, 456, 480, 512]

for idx, src in enumerate(source_list):
    print( "{}/{} - {}:\n".format(idx+1, len(source_list), src) )
    resize_samples( samples_df, data_dir, "3A_images", export_dir, src, sizes_list )
    print("\n")

True CNCB
True COVID-19-CT-Seg
True COVID-CT-MD
True COVID-CTset
True LIDC-IDRI
True STOIC
True Stony Brook
True TCIA
True iCTCF
True radiopaedia.org


1/10 - radiopaedia.org:



100%|██████████████████████████████████████████████████████████████████████████████| 3574/3574 [02:46<00:00, 21.47it/s]




2/10 - LIDC-IDRI:



100%|██████████████████████████████████████████████████████████████████████████████| 3999/3999 [03:56<00:00, 16.90it/s]




3/10 - COVID-CTset:



100%|████████████████████████████████████████████████████████████████████████████| 12058/12058 [16:20<00:00, 12.29it/s]




4/10 - Stony Brook:



100%|████████████████████████████████████████████████████████████████████████████| 14461/14461 [22:34<00:00, 10.68it/s]




5/10 - COVID-CT-MD:



100%|██████████████████████████████████████████████████████████████████████████| 23280/23280 [1:17:07<00:00,  5.03it/s]




6/10 - iCTCF:



100%|██████████████████████████████████████████████████████████████████████████| 45912/45912 [1:04:53<00:00, 11.79it/s]




7/10 - CNCB:



100%|████████████████████████████████████████████████████████████████████████| 115837/115837 [2:31:36<00:00, 12.73it/s]




8/10 - COVID-19-CT-Seg:



100%|██████████████████████████████████████████████████████████████████████████████| 1726/1726 [01:56<00:00, 14.80it/s]




9/10 - TCIA:



100%|████████████████████████████████████████████████████████████████████████████| 11816/11816 [12:33<00:00, 15.69it/s]




10/10 - STOIC:



100%|████████████████████████████████████████████████████████████████████████| 192361/192361 [4:51:39<00:00, 10.99it/s]






# Verify Paths

In [5]:
def check_paths( i_dir, data_source, sizes ):
    
    metadata_csv_path = os.path.join(i_dir, "{}_data.csv".format(data_source))
    df = pd.read_csv(metadata_csv_path, sep = ";", na_filter = False, dtype={"age": str})
    
    # List of all paths
    path_list = df["path"].to_list()

    # List of paths per partition
    train_path_list = df[df["partition"] == "train"]["path"].to_list()
    val_path_list   = df[df["partition"] ==   "val"]["path"].to_list()
    test_path_list  = df[df["partition"] ==  "test"]["path"].to_list()

    for p in tqdm(path_list):
    
        for size in sizes:

            path = os.path.join( i_dir, f"{size}x{size}", p )

            if not os.path.exists( path ):
                print("\n\tPath '{}' does not exist...".format(path))

            num_partitions = np.sum([ (p in train_path_list), (p in val_path_list), (p in test_path_list) ])

            if num_partitions > 1:
                print("\n\tSample '{}' belongs to >1 partition...".format(path))

            if num_partitions < 1:
                print("\n\tSample '{}' does not belong to any partition...".format(path))
    
    return

In [6]:
dataset_list = list(source_list)

# Relative path to dataset
image_dir = os.path.join( "..", "..", "..", "..", "Datasets", "COVID19", "CT", "classification" )
assert os.path.exists( image_dir ), "Unable to find the relative path to resized images, please check image_dir..."

for idx, dataset_name in enumerate(dataset_list):
    print( "{}/{} - {}:\n".format(idx+1, len(dataset_list), dataset_name) )
    check_paths( image_dir, dataset_name, sizes_list )
    print("\n\n")
    

1/10 - radiopaedia.org:



100%|█████████████████████████████████████████████████████████████████████████████| 3574/3574 [00:35<00:00, 101.48it/s]





2/10 - LIDC-IDRI:



100%|█████████████████████████████████████████████████████████████████████████████| 3999/3999 [00:28<00:00, 139.08it/s]





3/10 - COVID-CTset:



100%|███████████████████████████████████████████████████████████████████████████| 12058/12058 [01:17<00:00, 155.39it/s]





4/10 - Stony Brook:



100%|███████████████████████████████████████████████████████████████████████████| 14461/14461 [02:12<00:00, 109.06it/s]





5/10 - COVID-CT-MD:



100%|███████████████████████████████████████████████████████████████████████████| 23280/23280 [02:21<00:00, 164.73it/s]





6/10 - iCTCF:



100%|███████████████████████████████████████████████████████████████████████████| 45912/45912 [06:04<00:00, 125.95it/s]





7/10 - CNCB:



100%|██████████████████████████████████████████████████████████████████████████| 115837/115837 [31:55<00:00, 60.47it/s]





8/10 - COVID-19-CT-Seg:



100%|█████████████████████████████████████████████████████████████████████████████| 1726/1726 [00:11<00:00, 152.74it/s]





9/10 - TCIA:



100%|███████████████████████████████████████████████████████████████████████████| 11816/11816 [01:02<00:00, 190.46it/s]





10/10 - STOIC:



100%|████████████████████████████████████████████████████████████████████████| 192361/192361 [1:27:57<00:00, 36.45it/s]







