In [None]:
# Mount Google Drive at /content/drive (the google.colab module is provided by Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install tqdm

Collecting tqdm
  Using cached tqdm-4.66.1-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.66.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.66.1


In [2]:
import numpy as np
import os
import matplotlib.pyplot as plt
import time
import shutil
import math
import random
import itertools
import pprint
import json
from tqdm import tqdm
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
import tensorflow as tf
from numpy import expand_dims
import cv2
from pathlib import Path
from itertools import product

In [3]:
def count_images(folder_path):
    """
    Count the JPEG files stored under a folder, subfolders included.

    A file is counted when it is a regular file whose name ends in ``.jpg`` or
    ``.jpeg``. The comparison ignores letter case, so ``.JPG``, ``.JPEG`` and
    mixed-case spellings such as ``.Jpg`` are counted as well.

    :param folder_path: directory to scan recursively
    :return: number of matching files; 0 when folder_path does not exist or is
             not a directory
    """
    if not os.path.isdir(folder_path):
        return 0
    jpeg_suffixes = ('.jpg', '.jpeg')
    total_files = 0
    for foldername, _subfolders, filenames in os.walk(folder_path):
        for filename in filenames:
            # Lower-casing the name accepts every capitalisation of the extension.
            if not filename.lower().endswith(jpeg_suffixes):
                continue
            if os.path.isfile(os.path.join(foldername, filename)):
                total_files += 1
    return total_files

def partition_file_list(ratios, items):
    """
    Cut ``items`` into consecutive slices, one slice per entry of ``ratios``.

    Every slice except the last holds ceil(ratio * len(items)) elements (the
    size is rounded UP); the last slice receives whatever is left, regardless
    of its own ratio.

    :param ratios: [0.15, 0.15, 0.7]
    :param items: ["f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9"]
    :return: [["f0", "f1"],  # ceil(0.15 * 10) = 2 items
              ["f2", "f3"],  # ceil(0.15 * 10) = 2 items
              ["f4", "f5", "f6", "f7", "f8", "f9"]  # the remainder
              ]
    """
    item_count = len(items)
    last_position = len(ratios) - 1
    chunks = []
    cursor = 0
    for position, fraction in enumerate(ratios):
        boundary = cursor + int(math.ceil(fraction * item_count))
        if position != last_position:
            chunks.append(items[cursor:boundary])
            cursor = boundary
            continue
        # Final partition: take everything that has not been handed out yet.
        chunks.append(items[cursor:])
    return chunks

def create_splitted_folders(src_dataset, dst_dataset):
    """
    Copy the files of each class ('esca', 'healthy') from the unsplit dataset
    into the test / validation / train folders described by dst_dataset.

    For every class the source file names are sorted, shuffled with a fixed
    seed and partitioned with partition_file_list using the 'ratio' values of
    the test, validation and train entries (in that order). Destination
    folders are created when missing. A file that cannot be copied is reported
    on stdout and skipped; the remaining files are still processed.

    :param src_dataset: {
        'esca':    os.path.join(base_folder, 'esca_dataset_unsplitted', 'esca'),
        'healthy': os.path.join(base_folder, 'esca_dataset_unsplitted', 'healthy')
    }
    :param dst_dataset: {
        'train': {
            'esca':    os.path.join(base_folder, 'esca_dataset', 'train', 'esca'),
            'healthy': os.path.join(base_folder, 'esca_dataset', 'train', 'healthy'),
            'ratio': 0.7
        },
        'test': {
            'esca':    os.path.join(base_folder, 'esca_dataset', 'test', 'esca'),
            'healthy': os.path.join(base_folder, 'esca_dataset', 'test', 'healthy'),
            'ratio': 0.15
        },
        'validation': {
            'esca':    os.path.join(base_folder, 'esca_dataset', 'validation', 'esca'),
            'healthy': os.path.join(base_folder, 'esca_dataset', 'validation', 'healthy'),
            'ratio': 0.15
        }
    }
    :return: None; creates the folders indicated in dst_dataset and copies the files from src_dataset
    """
    split_names = ['test', 'validation', 'train']
    for class_name in ['esca', 'healthy']:
        # os.listdir returns entries in arbitrary order; sorting first makes the
        # seeded shuffle below produce the same split on every run and machine.
        src_file_list = sorted(os.listdir(src_dataset[class_name]))
        random.seed(1)  # set seed for replicability
        random.shuffle(src_file_list)
        file_lists = partition_file_list(
            [dst_dataset[split_name]['ratio'] for split_name in split_names],
            src_file_list
        )
        for split_name, file_list in zip(split_names, file_lists):
            dst_folder = dst_dataset[split_name][class_name]
            # exist_ok=True already covers the "folder is there" case.
            os.makedirs(dst_folder, exist_ok=True)
            print(f"Creating folder {split_name}/{class_name}")
            for file_name in tqdm(file_list):
                try:
                    shutil.copy(
                        os.path.join(src_dataset[class_name], file_name),
                        os.path.join(dst_folder, file_name)
                    )
                except OSError as error:
                    # Only copy failures are tolerated; KeyboardInterrupt and
                    # programming errors are no longer swallowed.
                    print(f"Error occurred while copying file {file_name}: {error}")

    print("Data partitioning completed.")

def find_duplicate_strings(L):
    """
    Return the values that appear more than once in ``L``.

    Each duplicated value is reported a single time, in the order of its
    first appearance in ``L``.

    :param L: iterable of hashable values (typically strings)
    :return: list of the values occurring at least twice
    """
    tally = {}
    for item in L:
        if item in tally:
            tally[item] += 1
        else:
            tally[item] = 1
    return [item for item, seen in tally.items() if seen > 1]

In [4]:
# Remember the directory the kernel is currently working in.
current_directory = os.getcwd()
# NOTE(review): '__file__' is a quoted string literal here, not the __file__ variable, so
# abspath() resolves it against the current working directory and dirname() yields that
# same directory -- the chdir below therefore appears to stay where it already is. Confirm
# whether a different base directory was intended.
base_dir = os.path.dirname(os.path.abspath('__file__'))
os.chdir(base_dir)

**THE PICTURES ARE ALL STORED IN TWO FOLDERS:**

* f"{base_folder}/esca_dataset_unsplitted/esca"
* f"{base_folder}/esca_dataset_unsplitted/healthy"

In [5]:
# Root folder; every dataset path below is built relative to it.
base_folder = '.'


def _class_folders(*parts):
    """Return {'.': folder, 'esca': folder/esca, 'healthy': folder/healthy} for base_folder/<parts>."""
    entries = {'.': os.path.join(base_folder, *parts)}
    for class_name in ('esca', 'healthy'):
        entries[class_name] = os.path.join(base_folder, *parts, class_name)
    return entries


def _split_folders(dataset_name):
    """Return the folder tree of a split dataset: its root plus train/test/validation entries with their ratio."""
    tree = {'.': os.path.join(base_folder, dataset_name)}
    for split_name, ratio in (('train', 0.7), ('test', 0.15), ('validation', 0.15)):
        split_entry = _class_folders(dataset_name, split_name)
        split_entry['ratio'] = ratio
        tree[split_name] = split_entry
    return tree


# Paths of every dataset used by the notebook: two unsplit datasets (one folder
# per class) and their split counterparts (train/test/validation, each with one
# folder per class and the share of files assigned to that split).
data_folders = {
    '.': base_folder,
    'esca_dataset_unsplitted': _class_folders('esca_dataset_unsplitted'),
    'esca_dataset': _split_folders('esca_dataset'),
    'esca_dataset_swapped_bg_unsplitted': _class_folders('esca_dataset_swapped_bg_unsplitted'),
    'esca_dataset_swapped_bg': _split_folders('esca_dataset_swapped_bg'),
}

# Serializing json
json_object = json.dumps(data_folders, indent = 4)
print(json_object)

{
    ".": ".",
    "esca_dataset_unsplitted": {
        ".": "./esca_dataset_unsplitted",
        "esca": "./esca_dataset_unsplitted/esca",
        "healthy": "./esca_dataset_unsplitted/healthy"
    },
    "esca_dataset": {
        ".": "./esca_dataset",
        "train": {
            ".": "./esca_dataset/train",
            "esca": "./esca_dataset/train/esca",
            "healthy": "./esca_dataset/train/healthy",
            "ratio": 0.7
        },
        "test": {
            ".": "./esca_dataset/test",
            "esca": "./esca_dataset/test/esca",
            "healthy": "./esca_dataset/test/healthy",
            "ratio": 0.15
        },
        "validation": {
            ".": "./esca_dataset/validation",
            "esca": "./esca_dataset/validation/esca",
            "healthy": "./esca_dataset/validation/healthy",
            "ratio": 0.15
        }
    },
    "esca_dataset_swapped_bg_unsplitted": {
        ".": "./esca_dataset_swapped_bg_unsplitted",
        "esca": "./esca

**DATA AUGMENTATION IN THE TRAIN FOLDERS**

In [7]:
def blur(img):
    """Return ``img`` smoothed with ``cv2.blur`` using a 30x30 kernel."""
    kernel_size = (30, 30)
    return cv2.blur(img, kernel_size)

def horizontal_flip(img):
    """Return ``img`` mirrored left-to-right via ``tf.image.flip_left_right``."""
    flipped = tf.image.flip_left_right(img)
    return flipped

def vertical_flip(img):
    """Return ``img`` mirrored top-to-bottom via ``tf.image.flip_up_down``."""
    flipped = tf.image.flip_up_down(img)
    return flipped

def contrast(img):
    """Return ``img`` passed through ``tf.image.adjust_contrast`` with a contrast factor of 0.5."""
    contrast_factor = 0.5
    return tf.image.adjust_contrast(img, contrast_factor)

def saturation(img):
    """Return ``img`` passed through ``tf.image.adjust_saturation`` with a saturation factor of 3."""
    saturation_factor = 3
    return tf.image.adjust_saturation(img, saturation_factor)

def hue(img):
    """Return ``img`` passed through ``tf.image.adjust_hue`` with a delta of 0.1."""
    hue_delta = 0.1
    return tf.image.adjust_hue(img, hue_delta)

def gamma(img):
    """Return ``img`` passed through ``tf.image.adjust_gamma`` with a gamma of 2."""
    gamma_value = 2
    return tf.image.adjust_gamma(img, gamma_value)

def augment_with_13transformations(splitted_dataset):
    """
    Save 13 augmented variants of every image of the 'train' split.

    For each class folder ('esca', 'healthy') of the train split, every
    non-hidden file whose name ends in 'jpg' is loaded once and passed through
    each of the 13 ImageDataGenerator configurations below. Each generator
    writes one JPEG into the same folder, with a file name prefixed by
    "<original stem>_<transformation name>".

    NOTE: the generated files land in the folder that is being listed and also
    end in 'jpg', so calling this function again on an already augmented
    folder would augment the generated files as well.

    :param splitted_dataset:  {
            'train': {
                'esca':  <path of the folder with the esca training images>,
                'healthy': <path of the folder with the healthy training images>
            }
    }
    :return: None; the augmented images are written to disk
    """

    transformations = [
        {'name' : 'horizontalFlip',
        'datagen' : ImageDataGenerator(preprocessing_function=horizontal_flip)},
        {'name' : 'verticalFlip',
        'datagen' : ImageDataGenerator(preprocessing_function=vertical_flip)},
        {'name' : 'rotation',
        'datagen' : ImageDataGenerator(rotation_range = 40, fill_mode='nearest')},
        {'name' : 'widthShift',
        'datagen' : ImageDataGenerator(width_shift_range = 0.2, fill_mode='nearest')},
        {'name' : 'heightShift',
        'datagen' : ImageDataGenerator(height_shift_range = 0.2, fill_mode='nearest')},
        {'name' : 'shearRange',
        'datagen' : ImageDataGenerator(shear_range = 0.2)},
        {'name' : 'zoom',
        'datagen' : ImageDataGenerator(zoom_range = [0.5, 1.0])},
        {'name' : 'blur',
        'datagen' : ImageDataGenerator(preprocessing_function=blur)},
        {'name' : 'brightness',
        'datagen' : ImageDataGenerator(brightness_range = [1.1, 1.5])},
        {'name' : 'contrast',
        'datagen' : ImageDataGenerator(preprocessing_function=contrast)},
        {'name' : 'saturation',
        'datagen' : ImageDataGenerator(preprocessing_function=saturation)},
        {'name' : 'hue',
        'datagen' : ImageDataGenerator(preprocessing_function=hue)},
        {'name' : 'gamma',
        'datagen' : ImageDataGenerator(preprocessing_function=gamma)}
    ]
    # Only the train split is augmented.
    split_class_list = list(product(['train'], ['esca', 'healthy']))
    for s, c in split_class_list:
        folder = splitted_dataset[s][c]
        print(f"Augmenting data in folder {folder}")
        # The listing is taken before any file is generated, so the images
        # written below are not picked up during this call.
        list_of_files = [f for f in os.listdir(folder) if not f.startswith('.') and f.endswith('jpg')]
        for filename in tqdm(list_of_files):
            file_path = os.path.join(folder, filename)
            stem = Path(filename).stem
            # Read and decode the image once per file instead of once per transformation.
            data = img_to_array(load_img(file_path))
            for t in transformations:
                print(f"Applying {t['name']} transformation to file {file_path}")
                # Each transformation gets its own copy of the pixels so that
                # none of them can affect the input of the following ones.
                samples = expand_dims(data, 0).copy()
                it = t['datagen'].flow(samples, batch_size = 1,
                    save_to_dir = folder,
                    save_prefix = '_'.join([stem, t['name']]),
                    save_format ='jpg')
                # Drawing one batch is what makes the generator write the image to save_to_dir.
                next(it)


In [None]:
# Copy the files of the unsplit dataset into the esca_dataset test/validation/train folders.
create_splitted_folders(
    data_folders['esca_dataset_unsplitted'],
    data_folders['esca_dataset']
)

In [None]:
# Same split, applied to the swapped-background dataset.
create_splitted_folders(
    data_folders['esca_dataset_swapped_bg_unsplitted'],
    data_folders['esca_dataset_swapped_bg']
)

In [7]:
# Report how many JPEG images each dataset root currently contains.
dataset_roots = {
    'esca_dataset_unsplitted': data_folders['esca_dataset_unsplitted']['.'],
    'esca_dataset_swapped_bg_unsplitted': data_folders['esca_dataset_swapped_bg_unsplitted']['.'],
    'esca_dataset': data_folders['esca_dataset']['.'],
    'esca_dataset_swapped_bg': data_folders['esca_dataset_swapped_bg']['.'],
    # data_folders has no 'esca_dataset_mixed' entry (looking it up would raise
    # KeyError), so the path of the combined dataset is built directly.
    'esca_dataset_mixed': os.path.join(base_folder, 'esca_dataset_mixed'),
}
for dataset_name, dataset_root in dataset_roots.items():
    print(f"{dataset_name}: {count_images(dataset_root)} images.")

esca_dataset_unsplitted: 1770 images.
esca_dataset_swapped_bg_unsplitted: 1770 images.
esca_dataset: 17851 images.


KeyboardInterrupt: 

In [18]:
def _count_classes(folders):
    """Return the number of images found in the 'esca' and 'healthy' folders of ``folders``."""
    return {
        'esca':    count_images(folders['esca']),
        'healthy': count_images(folders['healthy'])
    }


def _count_splits(dataset):
    """Return, for each split of ``dataset``, the per-class image counts and the split's share of all images."""
    # Walk the dataset root a single time; it is the denominator of every proportion.
    total_images = count_images(dataset['.'])
    info = {}
    for split_name in ('train', 'test', 'validation'):
        split_info = _count_classes(dataset[split_name])
        # An empty (or missing) dataset yields a proportion of 0 instead of a division by zero.
        split_info['proportion'] = count_images(dataset[split_name]['.'])/total_images if total_images > 0 else 0
        info[split_name] = split_info
    return info


# Image counts per class for every dataset, plus the share of each split.
data_folders_info = {
    'esca_dataset_unsplitted': _count_classes(data_folders['esca_dataset_unsplitted']),
    'esca_dataset': _count_splits(data_folders['esca_dataset']),
    'esca_dataset_swapped_bg_unsplitted': _count_classes(data_folders['esca_dataset_swapped_bg_unsplitted']),
    'esca_dataset_swapped_bg': _count_splits(data_folders['esca_dataset_swapped_bg'])
}

# Serializing json
json_object = json.dumps(data_folders_info, indent = 4)
print(json_object)

{
    "esca_dataset_unsplitted": {
        "esca": 888,
        "healthy": 882
    },
    "esca_dataset": {
        "train": {
            "esca": 8680,
            "healthy": 8638,
            "proportion": 0.9701417287546916
        },
        "test": {
            "esca": 134,
            "healthy": 133,
            "proportion": 0.014957145257968742
        },
        "validation": {
            "esca": 134,
            "healthy": 132,
            "proportion": 0.014901125987339645
        }
    },
    "esca_dataset_swapped_bg_unsplitted": {
        "esca": 882,
        "healthy": 888
    },
    "esca_dataset_swapped_bg": {
        "train": {
            "esca": 8638,
            "healthy": 8694,
            "proportion": 0.9702194357366771
        },
        "test": {
            "esca": 133,
            "healthy": 134,
            "proportion": 0.014946260635915809
        },
        "validation": {
            "esca": 132,
            "healthy": 133,
            "proportion": 0.

In [None]:
# Augment the train images of esca_dataset; the generated files are written into the train folders themselves.
augment_with_13transformations(
    splitted_dataset=data_folders['esca_dataset']
    )

In [None]:
# Augment the train images of the swapped-background dataset in the same way.
augment_with_13transformations(
    splitted_dataset=data_folders['esca_dataset_swapped_bg']
    )

In [19]:
import os
from PIL import Image

# Supported image file formats (compared with the format name reported by PIL)
supported_formats = ('JPEG', 'PNG', 'GIF', 'BMP')

def filter_image_files(folder):
    """
    Delete every file under ``folder`` (subfolders included) that is not a supported image.

    A file is removed when PIL cannot open it, or when the format PIL reports
    for it is not listed in ``supported_formats``. Each removal is printed.
    WARNING: the deletion is permanent.

    :param folder: root directory to clean
    :return: None
    """
    for dirpath, _, filenames in os.walk(folder):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            try:
                # Only read the format inside the `with`; the file is closed
                # again before anything is deleted.
                with Image.open(file_path) as img:
                    image_format = img.format
            except OSError:
                # Handle errors (e.g., unable to open file)
                print(f"Error processing: {file_path}")
                image_format = None
            if image_format not in supported_formats:
                # Removing after the handle is closed avoids deleting an open
                # file, and keeps a failing os.remove from being mistaken for
                # an unreadable image.
                os.remove(file_path)
                print(f"Removed: {file_path}")

# Filter image files in the folder and its subdirectories
# NOTE: this permanently deletes files from both dataset folders (see the
# "Removed: ..." lines it prints), e.g. the .DS_Store files left by macOS.
filter_image_files(data_folders['esca_dataset_swapped_bg']['.'])
filter_image_files(data_folders['esca_dataset']['.'])

Error processing: ./esca_dataset_swapped_bg/.DS_Store
Removed: ./esca_dataset_swapped_bg/.DS_Store
Error processing: ./esca_dataset_swapped_bg/validation/healthy/.DS_Store
Removed: ./esca_dataset_swapped_bg/validation/healthy/.DS_Store
Error processing: ./esca_dataset_swapped_bg/validation/esca/.DS_Store
Removed: ./esca_dataset_swapped_bg/validation/esca/.DS_Store
Error processing: ./esca_dataset/.DS_Store
Removed: ./esca_dataset/.DS_Store
Error processing: ./esca_dataset/train/esca/.DS_Store
Removed: ./esca_dataset/train/esca/.DS_Store
Error processing: ./esca_dataset/validation/healthy/.DS_Store
Removed: ./esca_dataset/validation/healthy/.DS_Store


In [6]:
def combine_folders(source_folder1, source_folder2, destination_folder):
    """
    Merge two folder trees into ``destination_folder``.

    Every file of ``source_folder1`` and then of ``source_folder2`` is copied
    with ``shutil.copy2`` to the same relative location below
    ``destination_folder``; the needed destination subfolders are created on
    the fly. Source folders that contain no files are not recreated. Because
    the second tree is copied after the first, a file present at the same
    relative path in both trees ends up as the copy from ``source_folder2``.

    :param source_folder1: first tree to copy
    :param source_folder2: second tree to copy (copied last)
    :param destination_folder: root of the merged tree
    :return: None
    """
    for source_root in (source_folder1, source_folder2):
        for current_dir, _subdirs, file_names in os.walk(source_root):
            if not file_names:
                # Nothing to copy from this directory, so no destination folder is made.
                continue
            # Mirror the position of the directory relative to its source root.
            target_dir = os.path.join(destination_folder, os.path.relpath(current_dir, source_root))
            os.makedirs(target_dir, exist_ok=True)
            for file_name in file_names:
                shutil.copy2(os.path.join(current_dir, file_name), target_dir)

# Define the source folders and the destination folder
# (relative paths, i.e. resolved against the current working directory)
esca_dataset = "esca_dataset"
esca_dataset_swapped_bg = "esca_dataset_swapped_bg"
esca_dataset_mixed = "esca_dataset_mixed"

# Combine the folders: the mixed dataset receives the files of both datasets
combine_folders(esca_dataset, esca_dataset_swapped_bg, esca_dataset_mixed)

print("Folders combined successfully!")

Folders combined successfully!
