In [None]:
import os
import shutil
import zipfile
import pandas as pd
import numpy as np
from tqdm import tqdm

## Downloading the Datasets

In [None]:
# Google Drive folders used throughout the notebook
root_dir = "/content/drive/MyDrive/Colab Notebooks"
# Derived from root_dir so the two locations cannot drift apart
original_datasets_dir = os.path.join(root_dir, "original_datasets")
# exist_ok=True already makes this a no-op when the folder is present
os.makedirs(original_datasets_dir, exist_ok=True)

In [None]:
# Write the Kaggle API credentials file; replace the two placeholders with your own values
# -p keeps mkdir from failing when this cell is re-run and the directory already exists
!mkdir -p /root/.kaggle
!echo '{"username":"{INSERT_KAGGLE_USERNAME}","key":"{INSERT_KAGGLE_USER_API_KEY}"}' > /root/.kaggle/kaggle.json
# Restrict the credentials file to the owner
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
# Re-writes the Kaggle credentials file; replace the placeholder username and key with your own
# -p keeps mkdir from failing when the directory already exists
!mkdir -p /root/.kaggle
!echo '{"username":"replaceWithUsername","key":"replaceWithAPIKey"}' > /root/.kaggle/kaggle.json
# Restrict the credentials file to the owner
!chmod 600 /root/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
# Downloading APTOS-2019 Dataset from Kaggle
# The target folder comes from original_datasets_dir (expanded by IPython), so the download
# does not depend on the current working directory
!kaggle competitions download -c aptos2019-blindness-detection -p "{original_datasets_dir}"

Downloading aptos2019-blindness-detection.zip to drive/MyDrive/Colab Notebooks/original_datasets
100% 9.51G/9.51G [01:32<00:00, 129MB/s]
100% 9.51G/9.51G [01:32<00:00, 110MB/s]


In [None]:
# Downloading Messidor-2 Dataset from Kaggle
# The target folder comes from original_datasets_dir (expanded by IPython), so the download
# does not depend on the current working directory
!kaggle datasets download -d xyaustin/messidor2 -p "{original_datasets_dir}"

Downloading messidor2.zip to drive/MyDrive/Colab Notebooks/original_datasets
100% 2.29G/2.29G [00:26<00:00, 132MB/s]
100% 2.29G/2.29G [00:26<00:00, 92.3MB/s]


In [None]:
# Downloading Diabetic Retinopathy Detection Competition Dataset from Kaggle
# The target folder comes from original_datasets_dir (expanded by IPython), so the download
# does not depend on the current working directory
!kaggle competitions download -c diabetic-retinopathy-detection -p "{original_datasets_dir}"

Downloading diabetic-retinopathy-detection.zip to drive/MyDrive/Colab Notebooks/original_datasets
100% 82.2G/82.2G [14:40<00:00, 154MB/s]
100% 82.2G/82.2G [14:40<00:00, 100MB/s]


*IDRiD dataset was retrieved from the IDRiD website:<br>
https://ieee-dataport.org/open-access/indian-diabetic-retinopathy-image-dataset-idrid*


*Alternatively, it can be downloaded from Kaggle with the API command:*

<code>
!kaggle datasets download -d mariaherrerot/idrid-dataset -p "drive/MyDrive/Colab Notebooks/original_datasets"
</code>

## Extracting Datasets from Zip

In [None]:
def unzip_dataset(data_root_path, dataset_name):
    '''
    Unpack "<dataset_name>.zip" found in data_root_path into a folder named dataset_name.

    data_root_path: directory that holds the zip file
    dataset_name: zip file name without the ".zip" extension; also the name of the
        output folder created inside data_root_path
    '''
    archive_file = os.path.join(data_root_path, f"{dataset_name}.zip")
    extraction_dir = os.path.join(data_root_path, dataset_name)
    with zipfile.ZipFile(archive_file, 'r') as dataset_archive:
        # Every member of the archive is written below extraction_dir
        dataset_archive.extractall(extraction_dir)

In [None]:
unzip_dataset(original_datasets_dir, "aptos2019-blindness-detection")

In [None]:
unzip_dataset(original_datasets_dir, "messidor2")

In [None]:
# The IDRiD archive ("B. Disease Grading.zip") is not downloaded by any cell in this notebook;
# it has to be present in original_datasets_dir before this cell runs
unzip_dataset(original_datasets_dir, "B. Disease Grading")

In [None]:
unzip_dataset(original_datasets_dir, "diabetic-retinopathy-detection")

# The labels csv for the test set has to be downloaded manually and placed in the
# diabetic-retinopathy-detection directory (a later cell reads 'retinopathy_solution.csv' from there):
# https://www.kaggle.com/competitions/diabetic-retinopathy-detection/discussion/16149

### Extracting split archives in Diabetic Retinopathy Detection Dataset Directory

In [None]:
# Initialise path to Diabetic Retinopathy Detection Dataset directory
dr_detection_dataset_dir = os.path.join(original_datasets_dir, "diabetic-retinopathy-detection")

In [None]:
def concatenate_multipart_archives(source_dir, archive_prefix, target_dir):
    '''
    Join the parts of a split archive into a single "archive.zip" inside target_dir.

    source_dir: the path where the archives are stored
    archive_prefix: the prefix for the archives' file names; matching files are
        joined in sorted file-name order
    target_dir: the path to where the final archive is to be stored

    Returns the path of the concatenated archive. If that file already exists it is
    returned as-is and nothing is rebuilt.
    Raises FileNotFoundError if no file in source_dir starts with archive_prefix.
    '''
    archive_path = os.path.join(target_dir, 'archive.zip')
    if os.path.exists(archive_path):
        return archive_path

    # get the list of all archive files
    archive_files = sorted(f for f in os.listdir(source_dir) if f.startswith(archive_prefix))
    if not archive_files:
        # Without this check an empty archive.zip would be written and later reused
        raise FileNotFoundError(
            f"No files starting with '{archive_prefix}' found in '{source_dir}'"
        )

    # create target directory if it does not exist
    os.makedirs(target_dir, exist_ok=True)

    # Build the archive under a temporary name: an interrupted run must not leave a
    # truncated archive.zip behind, because the early return above would reuse it
    partial_path = archive_path + '.part'

    # concatenate parts of archive into a single file
    with tqdm(total=len(archive_files), desc='Concatenating parts') as pbar:
        with open(partial_path, 'wb') as combined:
            for file in archive_files:
                with open(os.path.join(source_dir, file), 'rb') as part:
                    # Stream in chunks instead of loading a multi-gigabyte part into memory
                    shutil.copyfileobj(part, combined)
                pbar.update(1)

    os.replace(partial_path, archive_path)
    return archive_path

In [None]:
def extract_multipart_archives(source_dir, archive_prefix, target_dir, archive_path=None):
    '''
    Extract a (multi-part) zip archive into target_dir with a per-file progress bar.

    source_dir: the path where the archives are stored
    archive_prefix: the prefix for the archives' file names
    target_dir: the path the files are extracted to; when archive_path is None the
        concatenated archive is also stored here
    archive_path: path to an already concatenated archive; when None the parts are
        first joined with concatenate_multipart_archives

    Returns None. An invalid zip file is reported with a printed message instead of
    an exception. The concatenated archive is not deleted after extraction.
    '''
    if archive_path is None:
        archive_path = concatenate_multipart_archives(source_dir, archive_prefix, target_dir)

    # create target directory if it does not exist
    os.makedirs(target_dir, exist_ok=True)

    try:
        # open the concatenated archive file
        with zipfile.ZipFile(archive_path) as archive:
            # read the member list once; it drives both the bar total and the loop
            members = archive.namelist()
            with tqdm(total=len(members), unit='file', desc='Extracting files') as pbar:
                # extract all files to target directory
                for member in members:
                    archive.extract(member, target_dir)
                    pbar.update(1)
    except zipfile.BadZipFile as e:
        # BadZipFile is the current name; BadZipfile is a deprecated alias of it
        print(f"Extraction failed: {e}")

In [None]:
# Concatenate the DR train archives
# Files starting with 'train.zip.' are joined into train/archive.zip; the call returns that path
concatenate_multipart_archives(
    dr_detection_dataset_dir,
    'train.zip.',
    os.path.join(dr_detection_dataset_dir, 'train')
)

Concatenating parts: 100%|██████████| 5/5 [03:53<00:00, 46.72s/it]


'/content/drive/MyDrive/Colab Notebooks/original_datasets/diabetic-retinopathy-detection/train/archive.zip'

In [None]:
# Extract the train archive
# If train/archive.zip already exists it is reused rather than rebuilt from the parts
extract_multipart_archives(
    dr_detection_dataset_dir,
    'train.zip.',
    os.path.join(dr_detection_dataset_dir, 'train')
)

Extracting files: 100%|██████████| 35127/35127 [11:12<00:00, 52.20file/s]


In [None]:
# Concatenate the DR test archives
# Files starting with 'test.zip.' are joined into test/archive.zip; the call returns that path
concatenate_multipart_archives(
    dr_detection_dataset_dir,
    'test.zip.',
    os.path.join(dr_detection_dataset_dir, 'test')
)

Concatenating parts: 100%|██████████| 7/7 [07:46<00:00, 66.59s/it]


'/content/drive/MyDrive/Colab Notebooks/original_datasets/diabetic-retinopathy-detection/test/archive.zip'

In [None]:
# Extract the DR test archive
# If test/archive.zip already exists it is reused rather than rebuilt from the parts
extract_multipart_archives(
    dr_detection_dataset_dir,
    'test.zip.',
    os.path.join(dr_detection_dataset_dir, 'test')
)

Extracting files: 100%|██████████| 53577/53577 [33:44<00:00, 26.46file/s]


In [None]:
# Extract the train labels csv from its zip file
zip_file_name = os.path.join(dr_detection_dataset_dir, "trainLabels.csv.zip")
# The handle is deliberately not called `zip`: at cell level that name would replace
# the builtin zip() for every cell that runs afterwards
with zipfile.ZipFile(zip_file_name, 'r') as labels_archive:
    labels_archive.extractall(dr_detection_dataset_dir)

## Collating the Datasets

In [None]:
# Folder that receives one sub-folder per diagnosis class
collated_dataset_dir = os.path.join(root_dir, 'collated_datasets')
# exist_ok=True already makes this a no-op when the folder is present
os.makedirs(collated_dataset_dir, exist_ok=True)

In [None]:
def copy_images(
    csv_data,
    image_name_column,
    diagnosis_column,
    source_images_dir,
    destination_dir,
    image_file_extension=None,
    suffix=''
):
    '''
    Copy images to new folder based on diagnosis

    Each labelled image is copied to "<destination_dir>/<diagnosis as int>/".
    Rows with a missing image name or diagnosis are skipped. csv_data is not modified.

    csv_data: pandas dataframe containing image names and diagnosis
    image_name_column: column name of image names
    diagnosis_column: column name of diagnosis
    source_images_dir: path to directory containing images
    destination_dir: path to output directory
    image_file_extension: extension/format of images, appended to the image name;
        pass None when the names in the csv already include their extension
    suffix: string added to image name (before the extension) for customisation

    Returns None. An existing file at the output path is overwritten by shutil.copy2.
    '''
    # Select rows that don't have null values in the specified columns
    labelled_rows = csv_data.loc[
        csv_data[image_name_column].notnull() &
        csv_data[diagnosis_column].notnull(),
        [image_name_column, diagnosis_column]
    ]

    # Build (source path, output folder, output path) for every image up front
    copy_jobs = []
    for image_name, diagnosis in labelled_rows.itertuples(index=False, name=None):
        image_name = str(image_name)
        if image_file_extension is None:
            # The name already carries its extension; split it so the suffix goes before it
            stem, extension = os.path.splitext(image_name)
            source_name = image_name
        else:
            stem, extension = image_name, image_file_extension
            source_name = image_name + image_file_extension

        output_folder = os.path.join(destination_dir, str(int(diagnosis)))
        copy_jobs.append((
            os.path.join(source_images_dir, source_name),
            output_folder,
            os.path.join(output_folder, stem + suffix + extension),
        ))

    # Make sure all output folders exist
    for output_folder in {job[1] for job in copy_jobs}:
        os.makedirs(output_folder, exist_ok=True)

    # Copy images to their output folders; tqdm wraps the iteration so the bar advances per file
    for source_path, _, output_path in tqdm(copy_jobs, desc='Copying images', unit='file'):
        shutil.copy2(source_path, output_path)

In [None]:
def get_image_paths(dataset_dir, class_dir):
    """
    Auxiliary function to get the image paths for a class

    dataset_dir: path to the directory that contains the class folders
    class_dir: name of the class folder inside dataset_dir

    Returns a list of paths, sorted by file name, for the files in the class folder
    whose extension is .jpeg, .jpg or .png (matched case-insensitively).
    """
    class_dir_path = os.path.join(dataset_dir, class_dir)
    # os.listdir order is arbitrary; sorting keeps the result the same from run to run
    return [
        os.path.join(class_dir_path, image)
        for image in sorted(os.listdir(class_dir_path))
        if image.lower().endswith(('.jpeg', '.jpg', '.png'))
    ]

In [None]:
def add_to_classes_dict(collated_dataset_dir, dataset_name):
    '''
    Add number of images in each class to dictionary

    For every class folder, prints the current image total together with how many
    images were added since the previous call, then stores the new total in the
    notebook-level classes_dict (keyed by class folder name).

    collated_dataset_dir: path to collated dataset directory
    dataset_name: label printed next to the number of newly added images
    '''
    for class_dir in os.listdir(collated_dataset_dir):
        # Only class folders are counted; stray files in the directory are ignored
        if not os.path.isdir(os.path.join(collated_dataset_dir, class_dir)):
            continue

        class_images = get_image_paths(collated_dataset_dir, class_dir)
        # A class seen for the first time has no previous total, so start from 0
        added_dataset_count = len(class_images) - classes_dict.get(class_dir, 0)
        print(f"Class {class_dir} Total: {len(class_images)} | Class {class_dir} {dataset_name}: {added_dataset_count}")
        classes_dict[class_dir] = len(class_images)

In [None]:
classes_dict = dict()

### Collating APTOS-2019 Blindness Detection Dataset

In [None]:
aptos_2019_dir = os.path.join(original_datasets_dir, 'aptos2019-blindness-detection')
aptos_train_csv = os.path.join(aptos_2019_dir, 'train.csv')
aptos_train_data = pd.read_csv(aptos_train_csv)

# Display first 5 rows in the csv
print(aptos_train_data.head())

        id_code  diagnosis
0  000c1434d8d7          2
1  001639a390f0          4
2  0024cdab0c1e          1
3  002c21358ce6          0
4  005b95c28852          0


In [None]:
# Copy APTOS-2019 images to new folder
copy_images(
    aptos_train_data, 
    'id_code', 
    'diagnosis', 
    os.path.join(aptos_2019_dir, 'train_images'),
    collated_dataset_dir,
    '.png'
)

In [None]:
# Display count for each class
print(aptos_train_data['diagnosis'].value_counts())

0    1805
2     999
1     370
4     295
3     193
Name: diagnosis, dtype: int64


In [None]:
# dataset_name is a required argument; it labels the per-dataset count in the printed summary
add_to_classes_dict(collated_dataset_dir, 'APTOS-2019')

Class 2: 999
Class 4: 295
Class 1: 370
Class 0: 1805
Class 3: 193


### Collating Messidor-2 Dataset

In [None]:
messidor_dir = os.path.join(original_datasets_dir, 'messidor2/messidor-2')
messidor_csv = os.path.join(messidor_dir, 'messidor_data.csv')
messidor_data = pd.read_csv(messidor_csv)

# Display first 5 rows in the csv
print(messidor_data.head())

                     image_id  adjudicated_dr_grade  adjudicated_dme  \
0  20051020_43808_0100_PP.png                   0.0              0.0   
1  20051020_43832_0100_PP.png                   1.0              0.0   
2  20051020_43882_0100_PP.png                   1.0              0.0   
3  20051020_43906_0100_PP.png                   2.0              1.0   
4  20051020_44261_0100_PP.png                   0.0              0.0   

   adjudicated_gradable  
0                     1  
1                     1  
2                     1  
3                     1  
4                     1  


In [None]:
# Check for missing values
print(messidor_data.isnull().sum())

image_id                0
adjudicated_dr_grade    4
adjudicated_dme         4
adjudicated_gradable    0
dtype: int64


In [None]:
# Copy Messidor images to new folder
# No image_file_extension is passed: the image_id values shown above already include it (".png").
# Rows without an adjudicated_dr_grade are skipped inside copy_images.
copy_images(
    messidor_data, 
    'image_id', 
    'adjudicated_dr_grade', 
    os.path.join(messidor_dir, 'images'),
    collated_dataset_dir
)

In [None]:
# Display count for each class where the images are gradable
print(messidor_data['adjudicated_dr_grade'].value_counts(dropna=True))

0.0    1017
2.0     347
1.0     270
3.0      75
4.0      35
Name: adjudicated_dr_grade, dtype: int64


In [None]:
add_to_classes_dict(collated_dataset_dir, 'Messidor')

Class 2 Total: 1346 | Class 2 Messidor: 347
Class 4 Total: 330 | Class 4 Messidor: 35
Class 1 Total: 640 | Class 1 Messidor: 270
Class 0 Total: 2822 | Class 0 Messidor: 1017
Class 3 Total: 268 | Class 3 Messidor: 75


### Collating IDRiD Dataset

In [None]:
idrid_dir = os.path.join(original_datasets_dir, 'B. Disease Grading/B. Disease Grading')
idrid_train_csv = os.path.join(idrid_dir, '2. Groundtruths/a. IDRiD_Disease Grading_Training Labels.csv')
idrid_test_csv = os.path.join(idrid_dir, '2. Groundtruths/b. IDRiD_Disease Grading_Testing Labels.csv')
idrid_train_data = pd.read_csv(idrid_train_csv)
idrid_test_data = pd.read_csv(idrid_test_csv)

In [None]:
# Display first 5 rows in the train csv
print(idrid_train_data.head())

  Image name  Retinopathy grade  Risk of macular edema   Unnamed: 3  \
0  IDRiD_001                  3                       2         NaN   
1  IDRiD_002                  3                       2         NaN   
2  IDRiD_003                  2                       2         NaN   
3  IDRiD_004                  3                       2         NaN   
4  IDRiD_005                  4                       0         NaN   

   Unnamed: 4  Unnamed: 5  Unnamed: 6  Unnamed: 7  Unnamed: 8  Unnamed: 9  \
0         NaN         NaN         NaN         NaN         NaN         NaN   
1         NaN         NaN         NaN         NaN         NaN         NaN   
2         NaN         NaN         NaN         NaN         NaN         NaN   
3         NaN         NaN         NaN         NaN         NaN         NaN   
4         NaN         NaN         NaN         NaN         NaN         NaN   

   Unnamed: 10 Unnamed: 11  
0          NaN         NaN  
1          NaN         NaN  
2          NaN         

In [None]:
# Copy IDRiD train images to new folder
# The '_train' suffix keeps these copies apart from the test images, whose names
# (IDRiD_001, IDRiD_002, ...) also appear in the training labels
copy_images(
    idrid_train_data, 
    'Image name', 
    'Retinopathy grade', 
    os.path.join(idrid_dir, '1. Original Images/a. Training Set'),
    collated_dataset_dir,
    '.jpg',
    '_train'
)

In [None]:
# Display count for each class in train csv
print(idrid_train_data['Retinopathy grade'].value_counts())

2    136
0    134
3     74
4     49
1     20
Name: Retinopathy grade, dtype: int64


In [None]:
add_to_classes_dict(collated_dataset_dir, 'IDRiD')

Class 2 Total: 1482 | Class 2 IDRiD: 136
Class 4 Total: 379 | Class 4 IDRiD: 49
Class 1 Total: 660 | Class 1 IDRiD: 20
Class 0 Total: 2956 | Class 0 IDRiD: 134
Class 3 Total: 342 | Class 3 IDRiD: 74


In [None]:
# Display first 5 rows in the test csv
print(idrid_test_data.head())

  Image name  Retinopathy grade  Risk of macular edema 
0  IDRiD_001                  4                       0
1  IDRiD_002                  4                       1
2  IDRiD_003                  4                       0
3  IDRiD_004                  4                       0
4  IDRiD_005                  4                       1


In [None]:
# Copy IDRiD test images to new folder
# The '_test' suffix keeps these copies apart from the training images, whose names
# (IDRiD_001, IDRiD_002, ...) also appear in the testing labels
copy_images(
    idrid_test_data, 
    'Image name', 
    'Retinopathy grade', 
    os.path.join(idrid_dir, '1. Original Images/b. Testing Set'),
    collated_dataset_dir,
    '.jpg',
    '_test'
)

In [None]:
# Display count for each class in test csv
print(idrid_test_data['Retinopathy grade'].value_counts())

0    34
2    32
3    19
4    13
1     5
Name: Retinopathy grade, dtype: int64


In [None]:
add_to_classes_dict(collated_dataset_dir, 'IDRiD Test')

Class 2 Total: 1514 | Class 2 IDRiD Test: 32
Class 4 Total: 392 | Class 4 IDRiD Test: 13
Class 1 Total: 665 | Class 1 IDRiD Test: 5
Class 0 Total: 2990 | Class 0 IDRiD Test: 34
Class 3 Total: 361 | Class 3 IDRiD Test: 19


### Collating Diabetic Retinopathy Detection Train Dataset

In [None]:
dr_detection_train_csv = os.path.join(dr_detection_dataset_dir, 'trainLabels.csv')
dr_detection_train_data = pd.read_csv(dr_detection_train_csv)

# Display first 5 rows in the csv
print(dr_detection_train_data.head())

      image  level
0   10_left      0
1  10_right      0
2   13_left      0
3  13_right      0
4   15_left      1


In [None]:
# Copy DR Detection Dataset train images to new folder
copy_images(
    dr_detection_train_data, 
    'image', 
    'level', 
    os.path.join(dr_detection_dataset_dir, 'train/train'),
    collated_dataset_dir,
    '.jpeg',
    '_DRtrain'
)

In [None]:
# Display count for each class
print(dr_detection_train_data['level'].value_counts())

0    25810
2     5292
1     2443
3      873
4      708
Name: level, dtype: int64


In [None]:
add_to_classes_dict(collated_dataset_dir, 'DR Detection Train Dataset')

Class 2 Total: 6806 | Class 2 DR Detection Train Dataset: 5292
Class 4 Total: 1100 | Class 4 DR Detection Train Dataset: 708
Class 1 Total: 3108 | Class 1 DR Detection Train Dataset: 2443
Class 0 Total: 28800 | Class 0 DR Detection Train Dataset: 25810
Class 3 Total: 1234 | Class 3 DR Detection Train Dataset: 873


### Collating Diabetic Retinopathy Detection Test Dataset

In [None]:
# 'retinopathy_solution.csv' is not produced by any extraction cell in this notebook;
# presumably it is the manually downloaded test-set labels csv linked after the unzip step
dr_detection_test_csv = os.path.join(dr_detection_dataset_dir, 'retinopathy_solution.csv')
dr_detection_test_data = pd.read_csv(dr_detection_test_csv)

# Display first 5 rows in the csv
print(dr_detection_test_data.head())

     image  level    Usage
0   1_left      0  Private
1  1_right      0  Private
2   2_left      0   Public
3  2_right      0   Public
4   3_left      2  Private


In [None]:
# Copy DR Detection Dataset test images to new folder
# The test archive is extracted into <dataset dir>/test by this notebook, so the images
# are read from 'test/test', mirroring the 'train/train' layout used for the train set
copy_images(
    dr_detection_test_data,
    'image',
    'level',
    os.path.join(dr_detection_dataset_dir, 'test/test'),
    collated_dataset_dir,
    '.jpeg',
    '_DRtest'
)

  0%|          | 0/53576 [00:00<?, ?it/s]


In [None]:
# Display count for each class
print(dr_detection_test_data['level'].value_counts())

0    39533
2     7861
1     3762
3     1214
4     1206
Name: level, dtype: int64


In [None]:
add_to_classes_dict(collated_dataset_dir, 'DR Detection Test Dataset')

Class 2 Total: 14667 | Class 2 DR Detection Test Dataset: 7861
Class 4 Total: 2306 | Class 4 DR Detection Test Dataset: 1206
Class 1 Total: 6870 | Class 1 DR Detection Test Dataset: 3762
Class 0 Total: 68333 | Class 0 DR Detection Test Dataset: 39533
Class 3 Total: 2448 | Class 3 DR Detection Test Dataset: 1214


In [None]:
# Confirm total number of images in all classes
# Folder names are converted to int so the classes print in numeric order; this
# assumes every entry in collated_dataset_dir is a numerically named class folder
class_dirs = sorted([int(class_num) for class_num in os.listdir(collated_dataset_dir)])
for class_dir in class_dirs:
    class_images = get_image_paths(collated_dataset_dir, str(class_dir))
    print(f"Class {class_dir} Total: {len(class_images)}")

Class 0 Total: 68333
Class 1 Total: 6870
Class 2 Total: 14667
Class 3 Total: 2448
Class 4 Total: 2306


In [None]:
# Create dictionary of class image paths
# Keys are the class folder names as strings, values are the lists of image paths in that folder
class_dirs = sorted([int(class_num) for class_num in os.listdir(collated_dataset_dir)])
class_image_dict = dict()
for class_dir in class_dirs:
    class_images = get_image_paths(collated_dataset_dir, str(class_dir))
    print(f"Class {class_dir} Total: {len(class_images)}")
    class_image_dict[str(class_dir)] = class_images

# Save dictionary to npz file
# Each dictionary entry is passed as a keyword argument, so it is stored as one array named after its class
np.savez(os.path.join(root_dir, 'class_image_paths.npz'), **class_image_dict)

Class 0 Total: 68333
Class 1 Total: 6870
Class 2 Total: 14667
Class 3 Total: 2448
Class 4 Total: 2306
