In [1]:
# This is used to increase the notebook's width to fill the screen, allowing for better plot visualization
# `display` and `HTML` are taken from IPython.display: importing them from IPython.core.display is deprecated
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import os
import cv2
import time
import shutil
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

from tqdm import tqdm
from utils import plot
from IPython.display import display, HTML

  from IPython.core.display import display, HTML


# Loading Dataframes

In [2]:
# Relative path to dataset
# The path is relative, so it is resolved against the current working directory of the kernel
data_dir = os.path.join( "..", "..", "..", "..", "Datasets", "COVID19", "Tomografia", "COVIDx CT-3A" )
# Fails fast with a readable message when the dataset folder cannot be found
assert os.path.exists( data_dir ), "Unable to find the relative path to COVIDx CT-3A, please check data_dir..."

## Samples Dataframe

In [3]:
# Path to metadata csv
csv_path = os.path.join( data_dir, "new_split_metadata.csv" )

# Reads metadata as dataframe, "age" column is treated as str since "N/A" can't be int
# na_filter = False disables missing-value detection, so empty / "N/A" cells are kept as plain strings
samples_df = pd.read_csv(csv_path, sep = ";", na_filter = False, dtype={"age": str})

# Preview of the first rows (one row per image file)
samples_df.head()

Unnamed: 0,filename,patient_id,source,class,country,sex,age,partition,slice_selection,x_min,y_min,x_max,y_max,verified_finding,view,modality
0,NCP_96_1328_0032.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,9,94,512,405,Yes,Axial,CT
1,NCP_96_1328_0035.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,10,106,512,405,Yes,Axial,CT
2,NCP_96_1328_0036.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,10,105,512,406,Yes,Axial,CT
3,NCP_96_1328_0037.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,11,104,512,406,Yes,Axial,CT
4,NCP_96_1328_0038.png,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,11,103,512,406,Yes,Axial,CT


## Patients Dataframe

In [4]:
# Creates a new dataframe with metadata sorted by patient
# NOTE(review): the preview below suggests one row per patient plus a "sample_count" column;
# confirm against utils.plot.convert_df_sample2patient, which is not visible from this notebook
patient_df = plot.convert_df_sample2patient( samples_df )

patient_df.head()

Unnamed: 0,patient_id,source,class,country,sex,age,partition,slice_selection,verified_finding,view,modality,sample_count
0,NCP_96,CNCB,COVID-19,China,M,74.0,train,Expert,Yes,Axial,CT,121
1,NCP_270,CNCB,COVID-19,China,,,val,Expert,Yes,Axial,CT,124
2,NCP_322,CNCB,COVID-19,China,,,train,Expert,Yes,Axial,CT,40
3,NCP_1276,CNCB,COVID-19,China,,,test,Expert,Yes,Axial,CT,32
4,NCP_2683,CNCB,COVID-19,China,,,test,Expert,Yes,Axial,CT,9


# Stats per Source

In [12]:
def get_sub_dfs(s_df, p_df, src):
    """Selects the rows of both dataframes that belong to a single data source.

    Args:
        s_df: Samples dataframe, must contain a "source" column.
        p_df: Patients dataframe, must contain a "source" column.
        src: Value of the "source" column to keep.

    Returns:
        Tuple (sub_s_df, sub_p_df) with the filtered samples and patients dataframes.
        Both are new objects renumbered from 0; the previous row labels are kept in
        an "index" column. The input dataframes are left untouched.
    """
    # reset_index() returns a new frame, which avoids mutating a filtered slice in place
    # (the in-place variant is what pandas reports through SettingWithCopyWarning)

    # Extracts the patients dataframe for the current data source
    sub_p_df = p_df[p_df["source"] == src].reset_index()

    # Extracts the samples dataframe for the current data source
    sub_s_df = s_df[s_df["source"] == src].reset_index()

    return sub_s_df, sub_p_df

def path_from_fname(import_dir, filename):
    """Returns the path of `filename` inside the "3A_images" folder of `import_dir`."""
    images_dir = os.path.join(import_dir, "3A_images")
    return os.path.join(images_dir, filename)

def get_sample_stats(s_df, import_dir=None):
    """Reads every image listed in a samples dataframe and collects per-image statistics.

    Args:
        s_df: Samples dataframe. Each row must provide "filename", "partition",
            "x_min", "y_min", "x_max" and "y_max".
        import_dir: Dataset root passed to path_from_fname. Defaults to the
            notebook-level `data_dir` when omitted.

    Returns:
        Dataframe with one row per image that could be read and the columns
        "image_height", "image_width", "image_roi_frac", "partition",
        "image_min_pixel_value", "image_avg_pixel_value" and "image_max_pixel_value".
        Rows whose file is missing or cannot be decoded are reported and skipped.
    """
    if import_dir is None:
        import_dir = data_dir

    cols = ["image_height", "image_width", "image_roi_frac", "partition",
            "image_min_pixel_value", "image_avg_pixel_value", "image_max_pixel_value"]
    stats_dict = { col: [] for col in cols }

    # Converts to dict for faster iteration through rows
    s_dict = s_df.to_dict( "records" )
    for row in tqdm(s_dict):

        # Gets the path for the current file
        path = path_from_fname(import_dir, row["filename"])

        # Reads image if the path exists
        if not os.path.exists(path):
            print(f"\nPath '{path}' does not exist...\n")
            continue
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)

        # cv2.imread signals an unreadable / corrupted file by returning None instead of raising
        if img is None:
            print(f"\nUnable to read image '{path}'...\n")
            continue

        # Gets image shape
        H, W = img.shape[:2]

        # Image total area in pixels
        img_area = H * W

        # Area for the inner crop
        roi_h = row["y_max"] - row["y_min"]
        roi_w = row["x_max"] - row["x_min"]
        img_roi_area = roi_h * roi_w

        # Fraction of the image occupied by the inner crop
        img_roi_frac = img_roi_area / img_area

        stats_dict["image_height"].append(H)
        stats_dict["image_width"].append(W)
        stats_dict["image_roi_frac"].append(img_roi_frac)
        stats_dict["partition"].append(row["partition"])
        stats_dict["image_min_pixel_value"].append( np.min(img).astype(int) )
        stats_dict["image_avg_pixel_value"].append( np.mean(img).astype(float) )
        stats_dict["image_max_pixel_value"].append( np.max(img).astype(int) )

    sub_sample_stats_df = pd.DataFrame.from_dict(stats_dict)
    return sub_sample_stats_df

def get_interval(values):
    """Formats the range of `values` as "[min, max]", or "[value]" when min equals max.

    Float bounds are printed with three decimals, any other type is printed as is.
    """
    lower = np.min(values)
    upper = np.max(values)
    single_value = (lower == upper)

    # Bounds are rendered as text only after the equality check above
    if any(isinstance(bound, float) for bound in (lower, upper)):
        lower_txt, upper_txt = f"{lower:.3f}", f"{upper:.3f}"
    else:
        lower_txt, upper_txt = f"{lower}", f"{upper}"

    return f"[{lower_txt}]" if single_value else f"[{lower_txt}, {upper_txt}]"

def get_distribution(values):
    """Formats the mean and standard deviation of `values` as "mean +- std".

    When both statistics are whole numbers they are printed as integers,
    otherwise both are printed with three decimals.
    """
    mean_val = np.mean(values)
    std_val = np.std(values)

    # Whole-valued statistics are turned into ints so that they can be printed without decimals
    mean_val = int(mean_val) if mean_val.is_integer() else mean_val
    std_val = int(std_val) if std_val.is_integer() else std_val

    if not (isinstance(mean_val, float) or isinstance(std_val, float)):
        return f"{mean_val} +- {std_val}"
    return f"{mean_val:.3f} +- {std_val:.3f}"

def mount_sample_stats_table(s_stats_df):
    """Builds a summary table with one row per column of `s_stats_df`.

    The returned dataframe is indexed by "Attribute" (the column name) and holds the
    "Interval" and "Distribution" strings computed by get_interval and get_distribution.
    """
    rows = []
    for attribute in s_stats_df.columns:
        column_values = s_stats_df[attribute].to_list()
        rows.append({
            "Attribute": attribute,
            # Interval for values of the current column
            "Interval": get_interval(column_values),
            # Distribution for values of the current column
            "Distribution": get_distribution(column_values),
        })

    summary = pd.DataFrame(rows, columns=["Attribute", "Interval", "Distribution"])
    return summary.set_index("Attribute")

def mount_stats_table_per_partition(s_stats_df):
    """Builds the stats table for the whole dataframe and for each partition found in it.

    Args:
        s_stats_df: Per-sample stats dataframe with a "partition" column whose values
            are matched against "train", "val" and "test".

    Returns:
        Dataframe indexed by attribute with two-level columns: the first level is
        "Dataset" or "<Name> Partition", the second level holds the columns returned
        by mount_sample_stats_table. Groups without rows are omitted. An empty
        dataframe is returned when `s_stats_df` has no rows at all.
    """
    partitions = ("train", "val", "test")
    stats_table_list = []

    # "dataset" stands for the whole dataset
    for part in ("dataset", *partitions):

        # Filters the input df for the rows corresponding to the current partition
        if part in partitions:
            sub_stats_df = s_stats_df[s_stats_df["partition"] == part]
            main_header = f"{part.title()} Partition"
        else:
            sub_stats_df = s_stats_df
            main_header = part.title()

        if len(sub_stats_df) == 0:
            continue

        # Drops the partition column; drop() returns a new frame, so the input is never modified
        sub_stats_df = sub_stats_df.drop(columns=["partition"]).reset_index(drop=True)

        part_stats_table = mount_sample_stats_table(sub_stats_df)

        # Stacks the group name on top of the existing column names (two-level header)
        header = [ [main_header for _ in part_stats_table.columns], part_stats_table.columns ]
        part_stats_table.columns = header

        stats_table_list.append(part_stats_table)

    # pd.concat raises on an empty list, which happens when there are no rows to summarise
    if not stats_table_list:
        return pd.DataFrame()

    return pd.concat(stats_table_list, axis=1)

## radiopaedia.org

In [6]:
# Restricts the samples / patients dataframes to the rows of the current data source
src = "radiopaedia.org"
sub_samples_df, sub_patient_df = get_sub_dfs(samples_df, patient_df, src)
# Reports how many samples and patients were found for this source
print(f"Found {len(sub_samples_df)} samples from {len(sub_patient_df)} patients for source '{src}'")

Found 3574 samples from 105 patients for source 'radiopaedia.org'


### Sample Stats

In [7]:
# Reads every image of the current source and collects its per-sample statistics
sub_sample_stats_df = get_sample_stats(sub_samples_df)
# Summarises the statistics for the whole source and for each partition present in it
sample_stats_table = mount_stats_table_per_partition(sub_sample_stats_df)
display(sample_stats_table)

100%|█████████████████████████████████████████████████████████████████████████████| 3574/3574 [00:27<00:00, 128.64it/s]


Unnamed: 0_level_0,Dataset,Dataset,Test Partition,Test Partition
Unnamed: 0_level_1,Interval,Distribution,Interval,Distribution
Attribute,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
image_height,"[330, 630]",619.400 +- 41.344,"[330, 630]",619.400 +- 41.344
image_width,"[533, 630]",626.617 +- 14.354,"[533, 630]",626.617 +- 14.354
image_roi_frac,"[0.340, 1.000]",0.675 +- 0.131,"[0.340, 1.000]",0.675 +- 0.131
image_min_pixel_value,"[0, 56]",8.675 +- 12.807,"[0, 56]",8.675 +- 12.807
image_avg_pixel_value,"[58.607, 219.672]",123.052 +- 19.719,"[58.607, 219.672]",123.052 +- 19.719
image_max_pixel_value,[255],255.000 +- 0.000,[255],255.000 +- 0.000


### Patient Metadata

## COVID-CTset

In [8]:
# Restricts the samples / patients dataframes to the rows of the current data source
src = "COVID-CTset"
sub_samples_df, sub_patient_df = get_sub_dfs(samples_df, patient_df, src)
# Reports how many samples and patients were found for this source
print(f"Found {len(sub_samples_df)} samples from {len(sub_patient_df)} patients for source '{src}'")

Found 12058 samples from 377 patients for source 'COVID-CTset'


### Sample Stats

In [9]:
# Reads every image of the current source and collects its per-sample statistics
sub_sample_stats_df = get_sample_stats(sub_samples_df)
# Summarises the statistics for the whole source and for each partition present in it
sample_stats_table = mount_stats_table_per_partition(sub_sample_stats_df)
display(sample_stats_table)

100%|███████████████████████████████████████████████████████████████████████████| 12058/12058 [01:10<00:00, 171.43it/s]


Unnamed: 0_level_0,Dataset,Dataset,Train Partition,Train Partition,Val Partition,Val Partition,Test Partition,Test Partition
Unnamed: 0_level_1,Interval,Distribution,Interval,Distribution,Interval,Distribution,Interval,Distribution
Attribute,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
image_height,[512],512.000 +- 0.000,[512],512.000 +- 0.000,[512],512.000 +- 0.000,[512],512.000 +- 0.000
image_width,[512],512.000 +- 0.000,[512],512.000 +- 0.000,[512],512.000 +- 0.000,[512],512.000 +- 0.000
image_roi_frac,"[0.324, 0.969]",0.655 +- 0.103,"[0.369, 0.922]",0.653 +- 0.102,"[0.324, 0.969]",0.657 +- 0.115,"[0.382, 0.910]",0.655 +- 0.091
image_min_pixel_value,[55],55.000 +- 0.000,[55],55.000 +- 0.000,[55],55.000 +- 0.000,[55],55.000 +- 0.000
image_avg_pixel_value,"[94.800, 186.098]",133.354 +- 14.358,"[94.800, 182.575]",132.870 +- 14.391,"[95.044, 186.098]",134.473 +- 16.298,"[103.086, 172.709]",133.684 +- 11.905
image_max_pixel_value,[255],255.000 +- 0.000,[255],255.000 +- 0.000,[255],255.000 +- 0.000,[255],255.000 +- 0.000


## LIDC-IDRI

In [10]:
# Restricts the samples / patients dataframes to the rows of the current data source
src = "LIDC-IDRI"
sub_samples_df, sub_patient_df = get_sub_dfs(samples_df, patient_df, src)
# Reports how many samples and patients were found for this source
print(f"Found {len(sub_samples_df)} samples from {len(sub_patient_df)} patients for source '{src}'")

Found 3999 samples from 39 patients for source 'LIDC-IDRI'


### Sample Stats

In [13]:
# Reads every image of the current source and collects its per-sample statistics
sub_sample_stats_df = get_sample_stats(sub_samples_df)
# Summarises the statistics for the whole source and for each partition present in it
sample_stats_table = mount_stats_table_per_partition(sub_sample_stats_df)
display(sample_stats_table)

100%|█████████████████████████████████████████████████████████████████████████████| 3999/3999 [00:19<00:00, 204.38it/s]


Unnamed: 0_level_0,Dataset,Dataset,Train Partition,Train Partition,Val Partition,Val Partition,Test Partition,Test Partition
Unnamed: 0_level_1,Interval,Distribution,Interval,Distribution,Interval,Distribution,Interval,Distribution
Attribute,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
image_height,[512],512 +- 0,[512],512 +- 0,[512],512 +- 0,[512],512 +- 0
image_width,[512],512 +- 0,[512],512 +- 0,[512],512 +- 0,[512],512 +- 0
image_roi_frac,"[0.372, 0.855]",0.604 +- 0.101,"[0.372, 0.855]",0.586 +- 0.106,"[0.419, 0.757]",0.610 +- 0.073,"[0.400, 0.825]",0.647 +- 0.095
image_min_pixel_value,[0],0 +- 0,[0],0 +- 0,[0],0 +- 0,[0],0 +- 0
image_avg_pixel_value,"[89.507, 159.623]",116.165 +- 13.352,"[89.507, 156.058]",113.415 +- 13.662,"[101.500, 148.890]",118.347 +- 9.745,"[97.349, 159.623]",121.926 +- 13.141
image_max_pixel_value,[255],255 +- 0,[255],255 +- 0,[255],255 +- 0,[255],255 +- 0


## Stony Brook

In [None]:
# Restricts the samples / patients dataframes to the rows of the current data source
src = "Stony Brook"
sub_samples_df, sub_patient_df = get_sub_dfs(samples_df, patient_df, src)
print(f"Found {len(sub_samples_df)} samples from {len(sub_patient_df)} patients for source '{src}'")

# Reads every image of the current source and collects its per-sample statistics
sub_sample_stats_df = get_sample_stats(sub_samples_df)
# The stats dataframe carries a string-valued "partition" column that cannot be averaged, so the
# table is built per partition (which drops that column) instead of over every column directly
sample_stats_table = mount_stats_table_per_partition(sub_sample_stats_df)
display(sample_stats_table)

## COVID-CT-MD

In [None]:
# Restricts the samples / patients dataframes to the rows of the current data source
src = "COVID-CT-MD"
sub_samples_df, sub_patient_df = get_sub_dfs(samples_df, patient_df, src)
print(f"Found {len(sub_samples_df)} samples from {len(sub_patient_df)} patients for source '{src}'")

# Reads every image of the current source and collects its per-sample statistics
sub_sample_stats_df = get_sample_stats(sub_samples_df)
# The stats dataframe carries a string-valued "partition" column that cannot be averaged, so the
# table is built per partition (which drops that column) instead of over every column directly
sample_stats_table = mount_stats_table_per_partition(sub_sample_stats_df)
display(sample_stats_table)

## iCTCF

In [None]:
# Restricts the samples / patients dataframes to the rows of the current data source
src = "iCTCF"
sub_samples_df, sub_patient_df = get_sub_dfs(samples_df, patient_df, src)
print(f"Found {len(sub_samples_df)} samples from {len(sub_patient_df)} patients for source '{src}'")

# Reads every image of the current source and collects its per-sample statistics
sub_sample_stats_df = get_sample_stats(sub_samples_df)
# The stats dataframe carries a string-valued "partition" column that cannot be averaged, so the
# table is built per partition (which drops that column) instead of over every column directly
sample_stats_table = mount_stats_table_per_partition(sub_sample_stats_df)
display(sample_stats_table)

## CNCB

In [None]:
# Restricts the samples / patients dataframes to the rows of the current data source
src = "CNCB"
sub_samples_df, sub_patient_df = get_sub_dfs(samples_df, patient_df, src)
print(f"Found {len(sub_samples_df)} samples from {len(sub_patient_df)} patients for source '{src}'")

# Reads every image of the current source and collects its per-sample statistics
sub_sample_stats_df = get_sample_stats(sub_samples_df)
# The stats dataframe carries a string-valued "partition" column that cannot be averaged, so the
# table is built per partition (which drops that column) instead of over every column directly
sample_stats_table = mount_stats_table_per_partition(sub_sample_stats_df)
display(sample_stats_table)

## COVID-19-CT-Seg

In [None]:
# Restricts the samples / patients dataframes to the rows of the current data source
src = "COVID-19-CT-Seg"
sub_samples_df, sub_patient_df = get_sub_dfs(samples_df, patient_df, src)
print(f"Found {len(sub_samples_df)} samples from {len(sub_patient_df)} patients for source '{src}'")

# Reads every image of the current source and collects its per-sample statistics
sub_sample_stats_df = get_sample_stats(sub_samples_df)
# The stats dataframe carries a string-valued "partition" column that cannot be averaged, so the
# table is built per partition (which drops that column) instead of over every column directly
sample_stats_table = mount_stats_table_per_partition(sub_sample_stats_df)
display(sample_stats_table)

## TCIA

In [None]:
# Restricts the samples / patients dataframes to the rows of the current data source
src = "TCIA"
sub_samples_df, sub_patient_df = get_sub_dfs(samples_df, patient_df, src)
print(f"Found {len(sub_samples_df)} samples from {len(sub_patient_df)} patients for source '{src}'")

# Reads every image of the current source and collects its per-sample statistics
sub_sample_stats_df = get_sample_stats(sub_samples_df)
# The stats dataframe carries a string-valued "partition" column that cannot be averaged, so the
# table is built per partition (which drops that column) instead of over every column directly
sample_stats_table = mount_stats_table_per_partition(sub_sample_stats_df)
display(sample_stats_table)

## STOIC

In [None]:
# Restricts the samples / patients dataframes to the rows of the current data source
src = "STOIC"
sub_samples_df, sub_patient_df = get_sub_dfs(samples_df, patient_df, src)
print(f"Found {len(sub_samples_df)} samples from {len(sub_patient_df)} patients for source '{src}'")

# Reads every image of the current source and collects its per-sample statistics
sub_sample_stats_df = get_sample_stats(sub_samples_df)
# The stats dataframe carries a string-valued "partition" column that cannot be averaged, so the
# table is built per partition (which drops that column) instead of over every column directly
sample_stats_table = mount_stats_table_per_partition(sub_sample_stats_df)
display(sample_stats_table)