<a href="https://colab.research.google.com/github/Inefra21/Colour-bias-experiment/blob/main/Creating_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing packages

In [None]:
import os
import shutil
import numpy as np
from PIL import Image

# Loading data

In [None]:
# Mount Google Drive at /content/drive so that the dataset folders referenced
# below (under /content/drive/MyDrive) can be read and written.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Root of the filtered cats-and-dogs dataset on Drive, and the paths of its
# 'train' and 'validation' splits
base_dir = '/content/drive/MyDrive/cats_and_dogs_filtered'
train_dir, val_dir = (
    os.path.join(base_dir, split) for split in ('train', 'validation')
)

In [None]:
# Paths to the per-class folders inside each split
train_cats_dir, train_dogs_dir = (
    os.path.join(train_dir, species) for species in ('cats', 'dogs')
)
val_cats_dir, val_dogs_dir = (
    os.path.join(val_dir, species) for species in ('cats', 'dogs')
)

# Paths to the 'Light' and 'Dark' subfolders of each training class folder
train_cats_light_dir, train_cats_dark_dir = (
    os.path.join(train_cats_dir, shade) for shade in ('Light', 'Dark')
)
train_dogs_light_dir, train_dogs_dark_dir = (
    os.path.join(train_dogs_dir, shade) for shade in ('Light', 'Dark')
)

# Paths to the 'Light' and 'Dark' subfolders of each validation class folder
val_cats_light_dir, val_cats_dark_dir = (
    os.path.join(val_cats_dir, shade) for shade in ('Light', 'Dark')
)
val_dogs_light_dir, val_dogs_dark_dir = (
    os.path.join(val_dogs_dir, shade) for shade in ('Light', 'Dark')
)

# Understanding the data

In [None]:
# Count the entries of each class by adding up its Light and Dark subfolders
num_cats_tr = sum(
    len(os.listdir(folder))
    for folder in (train_cats_light_dir, train_cats_dark_dir)
)
num_dogs_tr = sum(
    len(os.listdir(folder))
    for folder in (train_dogs_light_dir, train_dogs_dark_dir)
)

num_cats_val = sum(
    len(os.listdir(folder))
    for folder in (val_cats_light_dir, val_cats_dark_dir)
)
num_dogs_val = sum(
    len(os.listdir(folder))
    for folder in (val_dogs_light_dir, val_dogs_dark_dir)
)

# Per-split totals over both classes
total_train = sum((num_cats_tr, num_dogs_tr))
total_val = sum((num_cats_val, num_dogs_val))

In [None]:
# Report the per-class counts first ...
for label, amount in (
    ('total training cat images:', num_cats_tr),
    ('total training dog images:', num_dogs_tr),
    ('total validation cat images:', num_cats_val),
    ('total validation dog images:', num_dogs_val),
):
    print(label, amount)

# ... then a separator followed by the totals of each split
print("--")
for label, amount in (
    ("Total training images:", total_train),
    ("Total validation images:", total_val),
):
    print(label, amount)

# Resize images

In [None]:
# Names of the regular files (sub-directories excluded) found in the dark
# training cats folder
files = list(filter(
    lambda name: os.path.isfile(os.path.join(train_cats_dark_dir, name)),
    os.listdir(train_cats_dark_dir),
))

# Keep only names ending in an image extension (.png, .jpg, .jpeg),
# compared case-insensitively
image_files = list(filter(
    lambda name: name.lower().endswith(('.png', '.jpg', '.jpeg')),
    files,
))

In [None]:
# Define a function to shrink an image file by lowering its save quality
def resize_image(image_path, output_path, max_size_kb=40):
    """Re-save an image at decreasing quality until it fits ``max_size_kb``.

    Despite the name, the pixel dimensions are left untouched: only the
    ``quality`` value passed to ``Image.save`` is lowered, in steps of 5
    starting below 85, until the written file is small enough or the
    quality reaches 10.

    Args:
        image_path: Path of the image file to read.
        output_path: Path the re-encoded image is written to. It may be
            the same as ``image_path`` to overwrite the original.
        max_size_kb: Upper bound for the file size, in KB.

    Returns:
        None. Nothing is written when the source file is already within
        ``max_size_kb``.
    """
    with Image.open(image_path) as img:
        # Ensure correct mode: palette, grayscale and alpha images are
        # converted to plain RGB before saving
        if img.mode in ("P", "L", "RGBA"):
            img = img.convert("RGB")
        # Measure the SOURCE file first. The previous version measured
        # output_path here, which fails when the output does not exist yet
        # and reads an unrelated file's size when it does.
        size_kb = os.path.getsize(image_path) / 1024
        quality = 85
        # NOTE(review): `quality` is expected to matter only for lossy
        # formats such as JPEG; for other formats this loop may rewrite the
        # file without shrinking it - confirm against the Pillow docs.
        while size_kb > max_size_kb and quality > 10:
            quality -= 5
            img.save(output_path, quality=quality)
            size_kb = os.path.getsize(output_path) / 1024

# Run resize_image on every listed image whose file is larger than 40KB
for image_file in image_files:
    # Source and destination are the same path, so the original file is
    # overwritten
    image_path = output_path = os.path.join(train_cats_dark_dir, image_file)
    size_kb = os.path.getsize(image_path) / 1024
    if size_kb <= 40:
        # Already small enough: leave the file untouched
        print(f'Skipped: {image_file}')
        continue
    resize_image(image_path, output_path)
    print(f'Resized: {image_file}')

# Create datasets

Run the cells below for each dataset, changing the number in the file name and the INDEX value accordingly.

In [None]:
# Create the folder tree for the new dataset in Drive
CaD_50_dir = '/content/drive/MyDrive/c&d_50'
CaD_50_train_dir = '/content/drive/MyDrive/c&d_50/train'
CaD_50_train_c_dir = '/content/drive/MyDrive/c&d_50/train/cats'
CaD_50_train_d_dir = '/content/drive/MyDrive/c&d_50/train/dogs'
CaD_50_val_dir = '/content/drive/MyDrive/c&d_50/validation'
CaD_50_val_c_dir = '/content/drive/MyDrive/c&d_50/validation/cats'
CaD_50_val_d_dir = '/content/drive/MyDrive/c&d_50/validation/dogs'

# Parents are listed before their children, so every os.mkdir call finds its
# parent folder already in place.
folders = [CaD_50_dir, CaD_50_train_dir, CaD_50_train_c_dir, CaD_50_train_d_dir, CaD_50_val_dir, CaD_50_val_c_dir, CaD_50_val_d_dir]

# Skip folders that already exist so that re-running this cell does not
# raise FileExistsError. os.mkdir (rather than os.makedirs) is kept on
# purpose: it still fails loudly if the Drive root itself is missing.
for directory in folders:
    if not os.path.isdir(directory):
        os.mkdir(directory)

In [None]:
# Define a function to copy the first INDEX files of a folder to another destination
def Copy_files(source_dir, dest_dir, INDEX):
    """Copy the first ``INDEX`` files of ``source_dir`` into ``dest_dir``.

    Only regular files are considered, and they are taken in sorted name
    order. ``os.listdir`` returns entries in arbitrary order, so without
    the sort the selected subset could differ between runs; filtering
    before slicing ensures sub-directories do not use up slots.

    Args:
        source_dir: Folder to copy files from.
        dest_dir: Existing folder the files are copied into.
        INDEX: Maximum number of files to copy.
    """
    file_names = sorted(
        name for name in os.listdir(source_dir)
        if os.path.isfile(os.path.join(source_dir, name))
    )
    for file_name in file_names[:INDEX]:
        shutil.copy(os.path.join(source_dir, file_name), dest_dir)

# Number of files taken from each Light/Dark subfolder (the INDEX value)
n_train = int(np.ceil(1000*0.5))
n_val = int(np.ceil(200*0.5))

# (source folder, destination folder, INDEX) for every copy, in the order
# the copies are performed
copy_jobs = [
    # training cats
    (train_cats_light_dir, CaD_50_train_c_dir, n_train),
    (train_cats_dark_dir, CaD_50_train_c_dir, n_train),
    # training dogs
    (train_dogs_light_dir, CaD_50_train_d_dir, n_train),
    (train_dogs_dark_dir, CaD_50_train_d_dir, n_train),
    # validation cats
    (val_cats_light_dir, CaD_50_val_c_dir, n_val),
    (val_cats_dark_dir, CaD_50_val_c_dir, n_val),
    # validation dogs
    (val_dogs_light_dir, CaD_50_val_d_dir, n_val),
    (val_dogs_dark_dir, CaD_50_val_d_dir, n_val),
]

for src_folder, dst_folder, limit in copy_jobs:
    Copy_files(src_folder, dst_folder, limit)