# Data Split

Split the dataset into two parts: one for training and testing (90%) and a separate holdout set used only for making predictions (10%).

Load Libraries

In [1]:
from google.colab import drive
import os
import shutil
import random
import zipfile
import shutil
import random


Mount Google Drive

In [2]:
# Attach Google Drive to the Colab filesystem at this mount point
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


Unzip the Dataset

In [3]:
# Archive stored on Drive and the directory it is unpacked into
zip_path = "/content/drive/MyDrive/IronHack/animals10.zip"
extract_path = "/content"  # extraction is expected to yield /content/animals10

# Extraction is skipped when the image folder is already present
if os.path.exists("/content/animals10/raw-img"):
    print("✅ Dataset already extracted.")
else:
    with zipfile.ZipFile(zip_path, mode='r') as archive:
        archive.extractall(path=extract_path)
    print("✅ Dataset extracted.")

✅ Dataset extracted.


Split Off a Holdout Set (10%)

In [4]:
# Move a reproducible 10% sample of every class folder out of the main dataset
# into a parallel holdout tree, then print per-class counts for both trees.
dataset_path = "/content/animals10/raw-img"
holdout_path = "/content/animals10/holdout/raw-img"

HOLDOUT_FRACTION = 0.10  # share of each class that is moved to the holdout set
HOLDOUT_SEED = 42        # fixed seed so a fresh run selects the same files

# Make sure holdout folder exists
os.makedirs(holdout_path, exist_ok=True)

# Move 10% from each class to holdout
for class_name in sorted(os.listdir(dataset_path)):
    class_folder = os.path.join(dataset_path, class_name)
    if not os.path.isdir(class_folder):
        continue

    holdout_class_folder = os.path.join(holdout_path, class_name)
    os.makedirs(holdout_class_folder, exist_ok=True)

    # Re-running this cell must not carve a further 10% out of the images
    # that remain, so a class whose holdout folder is already populated is
    # left untouched.
    if os.listdir(holdout_class_folder):
        print(f"Holdout for {class_name} already exists, skipping.")
        continue

    # os.listdir returns entries in arbitrary order; sort them so the seeded
    # sample below always sees the same sequence.
    images = sorted(os.listdir(class_folder))
    num_holdout = int(len(images) * HOLDOUT_FRACTION)

    # One generator per class, seeded from the class name, so the selection
    # for a class does not depend on which other classes were processed.
    class_rng = random.Random(f"{HOLDOUT_SEED}:{class_name}")
    holdout_images = class_rng.sample(images, num_holdout)

    for image in holdout_images:
        src = os.path.join(class_folder, image)
        dst = os.path.join(holdout_class_folder, image)
        shutil.move(src, dst)

    print(f"Moved {num_holdout} images from {class_name} to holdout.")

# Print summary
for folder, label in [(dataset_path, "Remaining"), (holdout_path, "Holdout")]:
    for class_name in sorted(os.listdir(folder)):
        class_folder = os.path.join(folder, class_name)
        if os.path.isdir(class_folder):
            count = len(os.listdir(class_folder))
            print(f"{label} in {class_name}: {count} images")

Moved 167 images from squirrel to holdout.
Moved 190 images from butterfly to holdout.
Moved 168 images from cow to holdout.
Moved 278 images from chicken to holdout.
Moved 130 images from elephant to holdout.
Moved 437 images from dog to holdout.
Moved 163 images from sheep to holdout.
Moved 150 images from cat to holdout.
Moved 433 images from spider to holdout.
Moved 236 images from horse to holdout.
Remaining in squirrel: 1509 images
Remaining in butterfly: 1711 images
Remaining in cow: 1512 images
Remaining in chicken: 2511 images
Remaining in elephant: 1172 images
Remaining in dog: 3940 images
Remaining in sheep: 1475 images
Remaining in cat: 1352 images
Remaining in spider: 3906 images
Remaining in horse: 2125 images
Holdout in squirrel: 167 images
Holdout in butterfly: 190 images
Holdout in cow: 168 images
Holdout in chicken: 278 images
Holdout in elephant: 130 images
Holdout in dog: 437 images
Holdout in sheep: 163 images
Holdout in cat: 150 images
Holdout in spider: 433 images
Holdout in horse: 236 images

In [5]:
import os

# Report the number of entries in each class folder, first for the main
# dataset and then for the holdout dataset.
sections = [
    ("📁 Main dataset structure:", "/content/animals10/raw-img"),
    ("\n📁 Holdout dataset structure:", "/content/animals10/holdout/raw-img"),
]

for heading, root in sections:
    print(heading)
    for class_name in os.listdir(root):
        class_folder = os.path.join(root, class_name)
        # Only directories count as classes; stray files are ignored
        if not os.path.isdir(class_folder):
            continue
        n_images = len(os.listdir(class_folder))
        print(f"{class_name}: {n_images} images")



📁 Main dataset structure:
squirrel: 1509 images
butterfly: 1711 images
cow: 1512 images
chicken: 2511 images
elephant: 1172 images
dog: 3940 images
sheep: 1475 images
cat: 1352 images
spider: 3906 images
horse: 2125 images

📁 Holdout dataset structure:
squirrel: 167 images
butterfly: 190 images
cow: 168 images
chicken: 278 images
elephant: 130 images
dog: 437 images
sheep: 163 images
cat: 150 images
spider: 433 images
horse: 236 images


Save Holdout to Google Drive

In [6]:
import shutil

# Pack the holdout directory into /content/holdout.zip
holdout_root = "/content/animals10/holdout"
shutil.make_archive(base_name="/content/holdout", format='zip', root_dir=holdout_root)

# Relocate the archive from the Colab disk into the Drive folder
local_archive = "/content/holdout.zip"
drive_archive = "/content/drive/MyDrive/IronHack/holdout.zip"
shutil.move(local_archive, drive_archive)
print("✅ Holdout saved to Google Drive.")

✅ Holdout saved to Google Drive.
