# Clone & Filter dataset

### Init

In [None]:
import fnmatch
import os
import shutil
import time

import tqdm

In [None]:
# Dataset subsets that can be cloned. For each subset:
#   "suffix": appended to the dataset name to build the destination folder name
#   "file":   filename prefix used to select the images/labels of the subset
# Edit the values below to match the naming scheme of your own dataset.
reference = {
    "small": {"suffix": ".S", "file": "209"},
    "mid": {"suffix": ".M", "file": "503"},
    "large": {"suffix": ".L", "file": "000"},
}

In [None]:
# Clone config files
def copy_config(src_folder, dest_folder):
    """
    Copies the files located directly inside src_folder to dest_folder.

    Subfolders of src_folder are skipped. Files are copied with shutil.copy2,
    so metadata is preserved and a file with the same name in dest_folder is
    overwritten. Errors are reported on stdout instead of being raised.

    Args:
        src_folder: The path to the source folder.
        dest_folder: The path to the destination folder. It is created when
            missing, but only after src_folder has been listed successfully.
    """

    try:
        # List the source first: a missing or unreadable source fails here,
        # before an empty destination folder would be left behind.
        filenames = os.listdir(src_folder)

        # Ensure destination folder exists
        os.makedirs(dest_folder, exist_ok=True)

        for filename in filenames:
            src_path = os.path.join(src_folder, filename)
            if not os.path.isfile(src_path):
                continue  # only plain files are copied; subfolders are skipped

            # copy2 also copies metadata; use shutil.copy to skip metadata.
            shutil.copy2(src_path, os.path.join(dest_folder, filename))
            print(f"Copied: {filename}")

        print("✅ Copying complete.")

    except Exception as e:
        print(f"❌ An error occurred: {e}")


In [None]:
# Copy filtered dataset images/labels
def copy_and_filter_folder(src_folder, dest_folder, pattern):
    """
    Recreates the folder tree of src_folder under dest_folder and copies only
    the files whose name starts with the given pattern.

    Every subdirectory of src_folder is created in dest_folder, even when none
    of its files match. Alerts the user when a folder or file already exists
    but *does not* overwrite. Errors are reported on stdout instead of being
    raised.

    :param src_folder: Path to the source folder. Must be an existing directory.
    :param dest_folder: Path to the destination folder.
    :param pattern: Filename prefix to keep (e.g., "209"). It is matched with
                    fnmatch after appending "*", so wildcards are allowed.
    """
    try:
        # os.walk() yields nothing for a missing folder, which would otherwise
        # report success and leave an empty destination behind.
        if not os.path.isdir(src_folder):
            raise FileNotFoundError(f"Source folder '{src_folder}' does not exist.")

        # Ensure destination folder exists
        if not os.path.exists(dest_folder):
            print(f"✓ Creating destination folder '{dest_folder}'.\n")
            os.makedirs(dest_folder)
        else:
            print(f"✓ Destination folder '{dest_folder}' already exists.\n")

        # Walk through the source folder
        for root, _, files in os.walk(src_folder):
            relative_path = os.path.relpath(root, src_folder)

            if relative_path == os.curdir:
                # Top level of the tree: this is dest_folder itself, which was
                # created/reported above. Joining with "." would produce a
                # "dest/." path and a misleading "already exists" warning.
                new_root = dest_folder
            else:
                new_root = os.path.join(dest_folder, relative_path)
                if not os.path.exists(new_root):
                    print(f"Creating subdirectory '{new_root}'")
                    os.makedirs(new_root)
                else:
                    print(f"❕Subdirectory '{new_root}' already exists.")
                    print("Make sure the data inside is relevant. Otherwise, just delete the folder and repeat the cloning process.")

            for file in files:
                # Prefix match: the trailing "*" accepts any remainder of the name.
                if fnmatch.fnmatch(file, pattern + "*"):
                    src_file = os.path.join(root, file)
                    dest_file = os.path.join(new_root, file)

                    if not os.path.exists(dest_file):
                        shutil.copy2(src_file, dest_file)  # copy metadata as well
                    else:
                        print(f"❗️File '{dest_file}' already exists. Skipping.")

            print(" ✓ Copying files complete.\n")
        print("✅ Copying dataset complete.")

    except Exception as e:
        print(f"❌ An error occurred: {e}")

## Importing from Drive

In [None]:
# Delete /content/sample_data (presumably Colab's default sample files; nothing in this notebook reads it).
!rm -rf /content/sample_data

In [None]:
# Mount Google Drive at /content/drive so the dataset stored under
# MyDrive/YOLO can be read with regular file operations.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Record the wall-clock start time. The timing cell at the end of the
# "Cloning" section subtracts it from its own timestamp to report how long
# the steps in between took.
start_time = time.time()

In [None]:
# List the YOLO folder on the mounted Drive to see which dataset folders are available.
!ls /content/drive/MyDrive/YOLO

3.5m.v3i.yolov8


In [None]:
#!cp -r /content/drive/MyDrive/YOLO/ /content

In [None]:
drive_path = '/content/drive/MyDrive/YOLO'

# os.listdir() returns entries in arbitrary order and includes hidden files
# (e.g. .DS_Store), so select the dataset deterministically: the first
# visible sub-directory in sorted order.
dataset_dirs = sorted(
    entry
    for entry in os.listdir(drive_path)
    if not entry.startswith('.') and os.path.isdir(os.path.join(drive_path, entry))
)
if not dataset_dirs:
    raise FileNotFoundError(f"No dataset folder found in '{drive_path}'.")

# Name of the dataset folder; also used to name the local destination folder.
model = dataset_dirs[0]

In [None]:
#os.chdir("/content/YOLO")
#model = os.listdir(".")[0]

In [None]:
# Full path of the selected dataset folder on the mounted Drive; this is the source of the copy.
src_folder = f"{drive_path}/{model}"

## Settings

In [None]:
# CHOOSE THE DATASET SUBSET YOU WANT TO WORK WITH
# The value must be one of the keys of the `reference` dict defined in the Init section.
choosen_data = 'small' # Options: small / mid / large

In [None]:
# Resolve the settings of the selected subset in one lookup:
#   suffix  -> appended to the dataset name to build the local destination folder
#   pattern -> filename prefix used to filter the images/labels to copy
suffix, pattern = (reference[choosen_data][key] for key in ('suffix', 'file'))
# Alternative source when the full dataset has been copied to /content/YOLO first:
#src_folder = f"/content/YOLO/{model}"
dest_folder = f"/content/YOLO/{model}{suffix}"

In [None]:
# Show the resolved settings so they can be checked before any file is copied.
print(f'🆗 SETTING PARAMETERS\n - Model: {model}\n - Origin: {src_folder}\n - Destination: {dest_folder}\n - Prefix: {pattern}…')

🆗 SETTING PARAMETERS
 - Model: 3.5m.v3i.yolov8
 - rigin: /content/drive/MyDrive/YOLO/3.5m.v3i.yolov8
 - Destination: /content/YOLO/3.5m.v3i.yolov8.S
 - Prefix: 209…


## Cloning

In [None]:
# Copy the files that sit directly in the dataset folder (e.g. data.yaml, READMEs); subfolders are skipped.
copy_config(src_folder, dest_folder)

Copied: data.yaml
Copied: README.dataset.txt
Copied: README.roboflow.txt
Copied: .DS_Store
✅ Copying complete.


In [None]:
# Recreate the dataset's folder tree in the destination, copying only files whose name starts with `pattern`.
copy_and_filter_folder(src_folder, dest_folder, pattern)

✅ Destination folder '/content/YOLO/3.5m.v3i.yolov8.S' already exists.
❕Subdirectory '/content/YOLO/3.5m.v3i.yolov8.S/.' already exists.
Make sure the data inside is relevant. Otherwise, just delete the folder and repeat the cloning process.
 ✓ Copying complete.

Creating subdirectory '/content/YOLO/3.5m.v3i.yolov8.S/valid'
 ✓ Copying complete.

Creating subdirectory '/content/YOLO/3.5m.v3i.yolov8.S/valid/labels'
 ✓ Copying complete.

Creating subdirectory '/content/YOLO/3.5m.v3i.yolov8.S/valid/images'
 ✓ Copying complete.

Creating subdirectory '/content/YOLO/3.5m.v3i.yolov8.S/train'
 ✓ Copying complete.

Creating subdirectory '/content/YOLO/3.5m.v3i.yolov8.S/train/labels'
 ✓ Copying complete.

Creating subdirectory '/content/YOLO/3.5m.v3i.yolov8.S/train/images'
 ✓ Copying complete.

✅ Copying complete.


In [None]:
# Stop the timer started in the "Importing from Drive" section and report
# the elapsed wall-clock time in seconds.
end_time = time.time()
elapsed_time = end_time - start_time
print(f"⏱️ Execution time: {elapsed_time:.2f} seconds")

⏱️ Execution time: 87.70 seconds


**Execution optimization tests:**
1.  Copying the whole dataset locally ⏱️ Execution time: 105.03 seconds
2.  From Drive to local (full verbose output) ⏱️ Execution time: 106.29 seconds
3.  From Drive to local (minimal verbose output) ⏱️ Execution time: 87.70 seconds

***Conclusions:***
*   **Option 1** is preferable if you need to test several subset combinations in the same session (it avoids downloading the data from the cloud more than once).
*   **Option 3** is best if you are only going to test one subset combination (it avoids downloading unnecessary data from the cloud).



In [None]:
# Removes complete local dataset
#!rm -rf /content/YOLO

In [None]:
# YOLO Finetuning test