In [1]:
# This is used to increase the notebook's width to fill the screen, allowing for better plot visualization
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import os
import cv2
import time
import shutil
import numpy as np
import pandas as pd

from tqdm import tqdm

  from IPython.core.display import display, HTML


# Path to Dataset

In [2]:
# Relative path to dataset
data_dir = os.path.join( "..", "..", "..", "..", "Datasets", "COVID19", "Tomografia", "COVIDx CT-3A" )
assert os.path.exists( data_dir ), "Unable to find the relative path to COVIDx CT-3A, please check data_dir..."

# Path to metadata csv
csv_path = os.path.join( data_dir, "new_split_metadata.csv" )

# Reads metadata as dataframe, "age" column is treated as str since "N/A" can't be int
samples_df = pd.read_csv(csv_path, sep = ";", na_filter = False, dtype={"age": str})
print( len(samples_df) )
samples_df.head()

425024


Unnamed: 0,filename,patient_id,source,class,country,sex,age,partition,slice_selection,x_min,y_min,x_max,y_max,verified_finding,view,modality
0,NCP_96_1328_0032.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,9,94,512,405,Yes,Axial,CT
1,NCP_96_1328_0035.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,10,106,512,405,Yes,Axial,CT
2,NCP_96_1328_0036.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,10,105,512,406,Yes,Axial,CT
3,NCP_96_1328_0037.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,11,104,512,406,Yes,Axial,CT
4,NCP_96_1328_0038.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,11,103,512,406,Yes,Axial,CT


# Resize Images

In [3]:
def save_img( exp_path, src_img ):
    # Saves the resized image (creates saving directory if needed)
    exp_dir = os.path.dirname(exp_path)
    if not os.path.exists(exp_dir):
        os.makedirs(exp_dir)
    cv2.imwrite(exp_path, src_img)
    return

def resize_samples( df, import_dir, sub_dir, export_dir, dataset, seed = 42 ):
    
    # Sets a seed for reprodutibility
    np.random.seed( seed )
    
    # Filters inputed df to select only rows from the current source
    source_df = df[ df["source"] == dataset ].copy(deep = True)
    source_df.reset_index(drop = True, inplace = True)
    
    # Converts to dict for faster iteration through rows
    df_dict = source_df.to_dict( "records" )
    for row in tqdm(df_dict):
        
        # Paths to import the original image (512x512) and export the resized one (256x256)
        import_path = os.path.join( import_dir, sub_dir, row["filename"] )
        export_path_256 = os.path.join( export_dir, "256x256", row["source"], row["partition"], row["filename"] )
        export_path_512 = os.path.join( export_dir, "512x512", row["source"], row["partition"], row["filename"] )
        
        if os.path.exists(export_path_512) and os.path.exists(export_path_256):
            continue
        
        # Loads original image
        src_img = cv2.imread( import_path, -1 )
        
        # Resizes using a random interpolation method
        # As a seed was set at the start, the interpolation for each sample will be the same for every execution
        # Used Interpolations are: 0 = INTER_NEAREST, 1 = INTER_LINEAR, 2 = INTER_CUBIC, 3 = INTER_AREA, 4 = INTER_LANCZOS4
        dst_512_img = cv2.resize( src_img, (512, 512), interpolation = np.random.randint(0, 5) )
        dst_256_img = cv2.resize( src_img, (256, 256), interpolation = np.random.randint(0, 5) )
        
        # Copies image (creates saving directory if needed)
        save_img( export_path_512, dst_512_img )
        
        # Saves the resized image (creates saving directory if needed)
        save_img( export_path_256, dst_256_img )
    
    # Saving dataset with updated paths
    source_df["path_512"] = source_df.apply( lambda row: os.path.join("512x512", row["source"], row["partition"], row["filename"]), axis = 1)
    
    # Saving dataset with updated paths
    source_df["path_256"] = source_df.apply( lambda row: os.path.join("256x256", row["source"], row["partition"], row["filename"]), axis = 1)
    
    # Saves the dataframe
    dataset_csv_path = os.path.join( export_dir, "{}_data.csv".format(dataset) )
    source_df.to_csv( dataset_csv_path, index = False, sep = ";" )
    
    return

In [4]:
export_dir = os.path.join( "..", "data", "classification" )

# List of unique sources (datasets used to build COVIDx CT-2A)
source_list = np.unique( samples_df["source"].to_list() )

for idx, src in enumerate(reversed(source_list)):
    print( "{}/{} - {}:\n".format(idx+1, len(source_list), src) )
    resize_samples( samples_df, data_dir, "3A_images", export_dir, src )
    print("\n")

1/10 - radiopaedia.org:



100%|████████████████████████████████████████████████████████████████████████████| 3574/3574 [00:02<00:00, 1426.25it/s]




2/10 - iCTCF:



100%|██████████████████████████████████████████████████████████████████████████| 45912/45912 [00:16<00:00, 2720.09it/s]




3/10 - TCIA:



100%|██████████████████████████████████████████████████████████████████████████| 11816/11816 [00:02<00:00, 3939.80it/s]




4/10 - Stony Brook:



100%|██████████████████████████████████████████████████████████████████████████| 14461/14461 [00:06<00:00, 2375.92it/s]




5/10 - STOIC:



100%|████████████████████████████████████████████████████████████████████████| 192361/192361 [01:10<00:00, 2721.63it/s]




6/10 - LIDC-IDRI:



100%|████████████████████████████████████████████████████████████████████████████| 3999/3999 [00:01<00:00, 3376.19it/s]




7/10 - COVID-CTset:



100%|██████████████████████████████████████████████████████████████████████████| 12058/12058 [00:06<00:00, 1778.35it/s]




8/10 - COVID-CT-MD:



100%|██████████████████████████████████████████████████████████████████████████| 23280/23280 [00:10<00:00, 2291.78it/s]




9/10 - COVID-19-CT-Seg:



100%|█████████████████████████████████████████████████████████████████████████████| 1726/1726 [00:02<00:00, 802.40it/s]




10/10 - CNCB:



100%|████████████████████████████████████████████████████████████████████████| 115837/115837 [00:46<00:00, 2483.91it/s]






# CSVs for COVIDx CT-2A

In [5]:
# # Saving dataset with updated paths
# samples_df["path"] = samples_df.apply( lambda row: os.path.join(row["source"], row["partition"], row["filename"]), axis = 1)
    
# # Saves the dataframe
# dataset_csv_path = os.path.join( data_dir, "resized", "metadata.csv" )
# samples_df.to_csv( dataset_csv_path, index = False, sep = ";" )

# for partition in ["train", "val", "test"]:

#     # Filters isource_df to select only rows from the current partition
#     part_df = samples_df[ samples_df["partition"] == partition ].copy(deep = True)
#     part_df.reset_index(drop = True, inplace = True)

#     # Saves the dataframe
#     part_csv_path = os.path.join( data_dir, "resized", "{}_data.csv".format(partition) )
#     part_df.to_csv( part_csv_path, index = False, sep = ";" )

# Verify Paths

In [6]:
def check_paths( i_dir, data_source ):
    
    metadata_csv_path = os.path.join(i_dir, "{}_data.csv".format(data_source))
    df = pd.read_csv(metadata_csv_path, sep = ";", na_filter = False, dtype={"age": str})
    
    for path_key in ["path_256", "path_512"]:
    
        # List of all paths
        path_list = df[path_key].to_list()

        # List of paths per partition
        train_path_list = df[df["partition"] == "train"][path_key].to_list()
        val_path_list   = df[df["partition"] ==   "val"][path_key].to_list()
        test_path_list  = df[df["partition"] ==  "test"][path_key].to_list()

        for p in tqdm(path_list):

            path = os.path.join( i_dir, p )

            if not os.path.exists( path ):
                print("\n\tPath '{}' does not exist...".format(path))

            num_partitions = np.sum([ (p in train_path_list), (p in val_path_list), (p in test_path_list) ])

            if num_partitions > 1:
                print("\n\tSample '{}' belongs to >1 partition...".format(path))

            if num_partitions < 1:
                print("\n\tSample '{}' does not belong to any partition...".format(path))
    
    return

In [7]:
dataset_list = list(source_list)

# Relative path to dataset
image_dir = os.path.join( "..", "data", "classification" )
assert os.path.exists( image_dir ), "Unable to find the relative path to resized images, please check image_dir..."

for idx, dataset_name in enumerate(dataset_list):
    print( "{}/{} - {}:\n".format(idx+1, len(dataset_list), dataset_name) )
    check_paths( image_dir, dataset_name )
    print("\n\n")
    

1/10 - CNCB:



100%|█████████████████████████████████████████████████████████████████████████| 115837/115837 [02:54<00:00, 662.06it/s]
100%|█████████████████████████████████████████████████████████████████████████| 115837/115837 [03:07<00:00, 617.18it/s]





2/10 - COVID-19-CT-Seg:



100%|████████████████████████████████████████████████████████████████████████████| 1726/1726 [00:00<00:00, 6470.15it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1726/1726 [00:00<00:00, 6492.27it/s]





3/10 - COVID-CT-MD:



100%|██████████████████████████████████████████████████████████████████████████| 23280/23280 [00:10<00:00, 2195.54it/s]
100%|██████████████████████████████████████████████████████████████████████████| 23280/23280 [00:11<00:00, 2026.33it/s]





4/10 - COVID-CTset:



100%|██████████████████████████████████████████████████████████████████████████| 12058/12058 [00:03<00:00, 3629.13it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12058/12058 [00:03<00:00, 3196.64it/s]





5/10 - LIDC-IDRI:



100%|████████████████████████████████████████████████████████████████████████████| 3999/3999 [00:00<00:00, 4708.76it/s]
100%|████████████████████████████████████████████████████████████████████████████| 3999/3999 [00:00<00:00, 4664.42it/s]





6/10 - STOIC:



100%|█████████████████████████████████████████████████████████████████████████| 192361/192361 [08:19<00:00, 385.19it/s]
100%|█████████████████████████████████████████████████████████████████████████| 192361/192361 [08:05<00:00, 396.50it/s]





7/10 - Stony Brook:



100%|██████████████████████████████████████████████████████████████████████████| 14461/14461 [00:04<00:00, 3153.05it/s]
100%|██████████████████████████████████████████████████████████████████████████| 14461/14461 [00:04<00:00, 3348.68it/s]





8/10 - TCIA:



100%|██████████████████████████████████████████████████████████████████████████| 11816/11816 [00:03<00:00, 3911.74it/s]
100%|██████████████████████████████████████████████████████████████████████████| 11816/11816 [00:03<00:00, 3935.33it/s]





9/10 - iCTCF:



100%|██████████████████████████████████████████████████████████████████████████| 45912/45912 [00:32<00:00, 1421.66it/s]
100%|██████████████████████████████████████████████████████████████████████████| 45912/45912 [00:32<00:00, 1394.21it/s]





10/10 - radiopaedia.org:



100%|████████████████████████████████████████████████████████████████████████████| 3574/3574 [00:00<00:00, 5223.33it/s]
100%|████████████████████████████████████████████████████████████████████████████| 3574/3574 [00:00<00:00, 5863.02it/s]







