# COVIDx CT Dataset Constructor
This notebook constructs the COVIDx CT dataset from scratch using the raw data. See [docs/dataset.md](docs/dataset.md) for details on the manual steps that must be completed before running it.

In [1]:
import os
import cv2
import glob
import numpy as np

## Setting Paths
Paths to the source data and output location should be set here. Note that the window width and window level for processing scans in Hounsfield units are defined in [data_utils.py](data_utils.py) as `HU_WINDOW_WIDTH = 1500` and `HU_WINDOW_LEVEL = -600`.

In [2]:
# Select dataset version. Major version (e.g., 2) should match current release.
# Check out earlier versions of the repo to construct previous dataset versions.
# The first character selects the `splits/v<major>` directory used during
# verification, and the last character selects the variant: a trailing 'B'
# additionally processes the MosMedData studies.
DATASET_VERSION = '2A'

# CNCB AI Diagnosis paths
# (exclude list, extra lesion slices CSV, and raw data directory passed to
# cncb.process_cncb_data)
CNCB_EXCLUDE_FILE = 'dataset_construction/metadata/cncb_exclude_list.txt'
CNCB_EXTRA_LESION_FILE = 'dataset_construction/metadata/cncb_extra_lesions_slices.csv'
CNCB_DIR = 'data/CNCB_AIDiagnosis'

# Radiopaedia/coronacases paths
# The CT and segmentation directories feed the segmented radiopaedia/coronacases
# processing step; the metadata CSV and exclude list feed the additional
# radiopaedia processing step.
RADIOPAEDIA_CORONACASES_CT_DIR = 'data/Coronacases_Radiopaedia/COVID-19-CT-Seg_20cases'
RADIOPAEDIA_CORONACASES_SEG_DIR = 'data/Coronacases_Radiopaedia/Infection_Mask'
RADIOPAEDIA_META_CSV = 'dataset_construction/metadata/radiopaedia_metadata.csv'
RADIOPAEDIA_EXCLUDE_FILE = 'dataset_construction/metadata/radiopaedia_exclude_list.txt'

# LIDC-IDRI paths
# (only a metadata CSV is configured here; no raw data directory is passed to
# the LIDC-IDRI processing step)
LIDC_META_CSV = 'dataset_construction/metadata/lidc_idri_metadata.csv'

# COVID-19-20 paths
COVID_19_20_DIR = 'data/COVID-19-20_v2/Train'

# TCIA COVID-19 paths
TCIA_COVID_META_CSV = 'dataset_construction/metadata/tcia_covid_metadata.csv'
TCIA_DIR = 'data/CT_Images_in_COVID-19_August_2020'

# COVID-CTset paths
# NOTE: the metadata CSV lives inside the downloaded dataset directory rather
# than under dataset_construction/metadata.
COVID_CTSET_META_CSV = 'data/COVID-CTset/Labels&Detailes/Patient_details.csv'
COVID_CTSET_DIR = 'data/COVID-CTset/Train&Validation'

# iCTCF paths
ICTCF_META_CSV = 'dataset_construction/metadata/ictcf_metadata.csv'
ICTCF_DIR = 'data/iCTCF'

# MosMedData paths
# (only used when DATASET_VERSION ends with 'B')
MOSMED_CT_DIR = 'data/MosMedData/COVID19_1110/studies'
MOSMED_SEG_DIR = 'data/MosMedData/COVID19_1110/masks'
MOSMED_META_CSV = 'dataset_construction/metadata/mosmed_metadata.csv'

# Output directory path
# The directory name embeds the dataset version, e.g. 'data/COVIDx_CT-2A'.
OUTPUT_DIR = 'data/COVIDx_CT-{}'.format(DATASET_VERSION)  # directory to save the images in

# Make output directory
# exist_ok=True lets this cell be re-run against an existing output directory.
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Data Preparation
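Each cell below processes the data from one source and appends the resulting filenames and class labels to the shared `filenames` and `classes` lists. Because the cells append to these lists, re-run the list-initialization cell before re-running any of them.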

In [3]:
# Running lists shared by every processing cell below:
# `filenames` collects the image filenames and `classes` the matching labels.
# The processing cells append to these lists, so re-run this cell first if
# any of them needs to be re-run.
filenames = []
classes = []

In [4]:
# CNCB AI Diagnosis: process the source data and append the resulting
# filenames and class labels to the running lists.
from dataset_construction import cncb

fnames, cls = cncb.process_cncb_data(
    CNCB_EXCLUDE_FILE,
    CNCB_DIR,
    OUTPUT_DIR,
    extra_lesion_files=[CNCB_EXTRA_LESION_FILE],
)
filenames.extend(fnames)
classes.extend(cls)

100%|████████████████████████████████████████████████████████████████████████| 115837/115837 [00:11<00:00, 9903.68it/s]


In [5]:
# Segmented radiopaedia and coronacases scans: process the CT volumes with
# their segmentation directory and append the results to the running lists.
from dataset_construction import radiopaedia_coronacases as rc

fnames, cls = rc.process_radiopaedia_and_coronacases_seg_data(
    RADIOPAEDIA_CORONACASES_CT_DIR,
    RADIOPAEDIA_CORONACASES_SEG_DIR,
    OUTPUT_DIR,
)
filenames.extend(fnames)
classes.extend(cls)

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:47<00:00,  2.35s/it]


In [6]:
# Additional radiopaedia data: driven by the metadata CSV and exclude list;
# results are appended to the running lists.
from dataset_construction import radiopaedia

fnames, cls = radiopaedia.process_radiopaedia_data(
    RADIOPAEDIA_META_CSV,
    RADIOPAEDIA_EXCLUDE_FILE,
    OUTPUT_DIR,
)
filenames.extend(fnames)
classes.extend(cls)

100%|████████████████████████████████████████████████████████████████████████████████| 118/118 [00:44<00:00,  2.68it/s]


In [7]:
# LIDC-IDRI: process the studies listed in the metadata CSV and append the
# results to the running lists.
from dataset_construction import lidc_idri

fnames, cls = lidc_idri.process_lidc_idri_data(
    LIDC_META_CSV,
    OUTPUT_DIR,
)
filenames.extend(fnames)
classes.extend(cls)

100%|██████████████████████████████████████████████████████████████████████████████████| 39/39 [02:03<00:00,  3.16s/it]


In [8]:
# COVID-19-20 challenge: process the training directory and append the
# results to the running lists.
from dataset_construction import covid_19_20

fnames, cls = covid_19_20.process_covid_19_20_data(
    COVID_19_20_DIR,
    OUTPUT_DIR,
)
filenames.extend(fnames)
classes.extend(cls)

100%|████████████████████████████████████████████████████████████████████████████████| 199/199 [03:35<00:00,  1.08s/it]


In [9]:
# Additional TCIA COVID-19 studies: process them using the metadata CSV and
# append the results to the running lists.
from dataset_construction import tcia_covid

fnames, cls = tcia_covid.process_tcia_covid_data(
    TCIA_COVID_META_CSV,
    TCIA_DIR,
    OUTPUT_DIR,
)
filenames.extend(fnames)
classes.extend(cls)

100%|████████████████████████████████████████████████████████████████████████████████| 445/445 [06:33<00:00,  1.13it/s]


In [10]:
# COVID-CTset: process the data using the patient details CSV and append the
# results to the running lists.
from dataset_construction import covid_ctset

fnames, cls = covid_ctset.process_covid_ctset_data(
    COVID_CTSET_META_CSV,
    COVID_CTSET_DIR,
    OUTPUT_DIR,
)
filenames.extend(fnames)
classes.extend(cls)

100%|████████████████████████████████████████████████████████████████████████████████| 377/377 [00:09<00:00, 40.33it/s]


In [11]:
# iCTCF: process the data using the metadata CSV and append the results to
# the running lists.
from dataset_construction import ictcf

fnames, cls = ictcf.process_ictcf_data(
    ICTCF_META_CSV,
    ICTCF_DIR,
    OUTPUT_DIR,
)
filenames.extend(fnames)
classes.extend(cls)

100%|████████████████████████████████████████████████████████████████████████████████| 709/709 [01:03<00:00, 11.25it/s]


In [12]:
# MosMedData studies are only included in the "B" variant of the dataset,
# i.e. when the last character of DATASET_VERSION is 'B'.
if DATASET_VERSION[-1] == 'B':
    from dataset_construction import mosmed

    # Segmented studies: the 'CT-1' subdirectory paired with the mask directory
    ct_1_dir = os.path.join(MOSMED_CT_DIR, 'CT-1')
    fnames, cls = mosmed.process_mosmed_seg_data(
        ct_1_dir,
        MOSMED_SEG_DIR,
        OUTPUT_DIR,
    )
    filenames.extend(fnames)
    classes.extend(cls)

    # Unsegmented studies: selected through the metadata CSV
    fnames, cls = mosmed.process_mosmed_unseg_data(
        MOSMED_META_CSV,
        MOSMED_CT_DIR,
        OUTPUT_DIR,
    )
    filenames.extend(fnames)
    classes.extend(cls)

In [13]:
# Report how many images were collected for each class
from dataset_construction.utils import CLASS_MAP

uniq_classes, counts = np.unique(classes, return_counts=True)
print('Image Counts')
for name, cls in CLASS_MAP.items():
    # Boolean mask picks out this class's count; the result is a (possibly
    # empty) array, which is printed as-is.
    class_count = counts[uniq_classes == cls]
    print('{}: {}'.format(name, class_count))

Image Counts
Normal: [60083]
Pneumonia: [40291]
COVID-19: [94548]


## Verification
Optionally, check that every file listed in the split files is present in the output directory.

In [14]:
# Locate the split files for the selected dataset version. They live in
# 'splits/v<major>' and are named '<split>_COVIDx_CT-<version>.txt'.
split_dir = 'splits/v' + DATASET_VERSION[0]
split_files = glob.glob(os.path.join(split_dir, '*_COVIDx_CT-{}.txt'.format(DATASET_VERSION)))
if not split_files:
    raise ValueError('Split files for COVIDx CT-{} not found'.format(DATASET_VERSION))

# Three split files are expected. When the selected variant does not provide
# all of them, fill in the missing splits from the "A" variant of the same
# major version. Splits are identified by the filename prefix before the
# first underscore.
if len(split_files) != 3:
    a_var = DATASET_VERSION[0] + 'A'
    existing_splits = {os.path.basename(path).split('_')[0] for path in split_files}
    split_files_a = [
        path
        for path in glob.glob(os.path.join(split_dir, '*_COVIDx_CT-{}.txt'.format(a_var)))
        if os.path.basename(path).split('_')[0] not in existing_splits
    ]
    split_files = split_files + split_files_a

# Check that all files from all splits are present in the constructed data.
# Each non-blank line of a split file starts with the image filename, which is
# resolved relative to OUTPUT_DIR.
count = 0  # number of listed files found on disk
total = 0  # number of files listed across all split files
for split_file in split_files:
    with open(split_file, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            fname = fields[0]
            fpath = os.path.join(OUTPUT_DIR, fname)

            total += 1
            if os.path.exists(fpath):
                count += 1
            else:
                print('Missing', fname)

# `count` holds the files that were found, so the number missing is the
# difference from the total (previously the found count was reported as missing).
missing = total - count
incomplete = missing > 0
if incomplete:
    print('{}/{} files are missing, dataset is incomplete!'.format(missing, total))
else:
    print('{}/{} files created, dataset successfully constructed!'.format(count, total))

194922/194922 files created, dataset successfully constructed!
