In [1]:
!pip install super-gradients



In [2]:
!pip install datasets roboflow pybboxes pillow pyyaml scikit-learn



In [31]:

import os
import shutil
import zipfile
import json
import random
import yaml
from PIL import Image
from roboflow import Roboflow
from sklearn.model_selection import train_test_split
import pybboxes as pbx
import glob
from collections import defaultdict
import matplotlib.pyplot as plt
from getpass import getpass

from super_gradients.training import dataloaders
from super_gradients.training.dataloaders.dataloaders import coco_detection_yolo_format_train, coco_detection_yolo_format_val


In [26]:
base_dir = '/content/YoloNAS_Project/'
os.makedirs(base_dir, exist_ok=True)

In [5]:
def download_huggingface_dataset(save_dir):
    """
    Downloads the Hugging Face dataset and organizes it into images and labels directories.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Load the dataset
    ds = load_dataset("keremberke/forklift-object-detection", name="full")

    # Create directories for each split
    for split in ds.keys():
        split_path = os.path.join(save_dir, split)
        images_path = os.path.join(split_path, 'images')
        labels_path = os.path.join(split_path, 'labels')
        os.makedirs(images_path, exist_ok=True)
        os.makedirs(labels_path, exist_ok=True)

        # Process each example in the split
        for example in ds[split]:
            # Download and save the image
            image_url = example['image']
            image_filename = os.path.basename(image_url)
            image_path = os.path.join(images_path, image_filename)
            if not os.path.exists(image_path):
                !wget -q {image_url} -O {image_path}

            # Save annotations
            annotation_filename = os.path.splitext(image_filename)[0] + '.txt'
            label_path = os.path.join(labels_path, annotation_filename)
            with open(label_path, 'w') as f:
                for obj in example['objects']:
                    class_id = obj['category_id'] - 1  # Assuming category_id starts at 1
                    bbox = obj['bbox']  # [x, y, width, height]
                    # YOLO format requires normalized coordinates, which we'll handle later
                    f.write(f"{class_id} {bbox[0]} {bbox[1]} {bbox[2]} {bbox[3]}\n")

    print(f"Hugging Face dataset downloaded and saved to {save_dir}")

In [6]:
# # Define the path to save the Hugging Face dataset
# hf_save_dir = os.path.join(base_dir, 'huggingface_forklift')
# download_huggingface_dataset(hf_save_dir)

## Roboflow dataset

In [27]:
def download_roboflow_datasets(api_key, save_dir):
    """
    Downloads multiple Roboflow datasets and organizes them into separate directories.
    """
    rf = Roboflow(api_key=api_key)

    # Define datasets to download
    roboflow_datasets = [
        {
            "workspace": "hitsz",
            "project": "forklift-and-human",
            "version": "2"  # Changed to version 2 as per your working code
        },
        {
            "workspace": "paft",
            "project": "forklift-model",
            "version": "1"
        },
        {
            "workspace": "uqtr-2v1ej",
            "project": "forklift-9rs9h",
            "version": "1"
        }
    ]

    # Iterate over each dataset and download
    for ds_info in roboflow_datasets:
        workspace = ds_info["workspace"]
        project = ds_info["project"]
        version = ds_info["version"]

        print(f"Downloading Roboflow dataset: {workspace}/{project}/v{version}")
        project_rf = rf.workspace(workspace).project(project)
        dataset = project_rf.version(int(version)).download("yolov5")  # Ensure version is an integer

        # Move dataset to specific folder
        destination = os.path.join(save_dir, f"{project}_v{version}")
        os.makedirs(destination, exist_ok=True)
        for item in os.listdir(dataset.location):
            s = os.path.join(dataset.location, item)
            d = os.path.join(destination, item)
            if os.path.isdir(s):
                shutil.move(s, d)
            else:
                shutil.move(s, d)

    print(f"All Roboflow datasets downloaded and saved to {save_dir}")

In [29]:
def extract_zip_datasets(rf_dir):
    """
    Extracts all .zip files within the Roboflow directory.
    """
    zip_files = glob.glob(os.path.join(rf_dir, '*', '*.zip'))
    for zip_file in zip_files:
        print(f"Extracting {zip_file}")
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            # Extract to the same directory as the zip file
            extract_path = os.path.splitext(zip_file)[0]
            zip_ref.extractall(extract_path)
        print(f"Extracted to {extract_path}")

        # Optionally, remove the zip file after extraction
        os.remove(zip_file)
        print(f"Removed zip file: {zip_file}")

In [30]:
def merge_datasets(rf_dir, merged_dir):
    """
    Merges multiple Roboflow datasets into unified train, valid, and test directories.
    """
    os.makedirs(merged_dir, exist_ok=True)
    merged_images = os.path.join(merged_dir, 'images')
    merged_labels = os.path.join(merged_dir, 'labels')
    os.makedirs(merged_images, exist_ok=True)
    os.makedirs(merged_labels, exist_ok=True)

    # Define splits
    splits = ['train', 'valid', 'test']

    for split in splits:
        split_images_dest = os.path.join(merged_images, split)
        split_labels_dest = os.path.join(merged_labels, split)
        os.makedirs(split_images_dest, exist_ok=True)
        os.makedirs(split_labels_dest, exist_ok=True)

        # Iterate through each dataset
        roboflow_projects = os.listdir(rf_dir)
        for project in roboflow_projects:
            project_dir = os.path.join(rf_dir, project)

            # Determine the path based on whether 'images' is inside 'train', 'valid', 'test'
            project_split_images = os.path.join(project_dir, 'images', split)
            project_split_labels = os.path.join(project_dir, 'labels', split)

            # If not found, try alternative structure
            if not os.path.exists(project_split_images):
                project_split_images = os.path.join(project_dir, split, 'images')
            if not os.path.exists(project_split_labels):
                project_split_labels = os.path.join(project_dir, split, 'labels')

            if not os.path.exists(project_split_images):
                print(f"Warning: {project_split_images} does not exist.")
                continue
            if not os.path.exists(project_split_labels):
                print(f"Warning: {project_split_labels} does not exist.")
                continue

            # Copy images
            for img_file in os.listdir(project_split_images):
                if img_file.endswith(('.jpg', '.png', '.jpeg')):
                    src_img = os.path.join(project_split_images, img_file)
                    dest_img = os.path.join(split_images_dest, img_file)
                    if not os.path.exists(dest_img):
                        shutil.copy(src_img, dest_img)

            # Copy labels
            for label_file in os.listdir(project_split_labels):
                if label_file.endswith('.txt'):
                    src_label = os.path.join(project_split_labels, label_file)
                    dest_label = os.path.join(split_labels_dest, label_file)
                    if not os.path.exists(dest_label):
                        shutil.copy(src_label, dest_label)

    print(f"All datasets merged into {merged_dir}")

In [33]:
roboflow_api_key = getpass('Enter your Roboflow API Key: ')

# Define the path to save Roboflow datasets
rf_save_dir = os.path.join(base_dir, 'roboflow')
os.makedirs(rf_save_dir, exist_ok=True)

# Download Roboflow datasets
download_roboflow_datasets(roboflow_api_key, rf_save_dir)

# Extract all zip files in the Roboflow directory
extract_zip_datasets(rf_save_dir)

# Merge datasets into a unified directory
merged_dataset_dir = os.path.join(base_dir, 'merged_dataset')
os.makedirs(merged_dataset_dir, exist_ok=True)

merge_datasets(rf_save_dir, merged_dataset_dir)

All datasets merged into /content/YoloNAS_Project/merged_dataset


# Investigate dataset

In [34]:
def count_instances(merged_labels_dir, classes):
    """
    Counts the number of instances per class in each split.

    Parameters:
    - merged_labels_dir (str): Directory containing label files organized by split.
    - classes (list): List of class names.

    Returns:
    - stats (dict): Nested dictionary with counts per split and per class.
    """
    stats = {
        'train': defaultdict(int),
        'valid': defaultdict(int),
        'test': defaultdict(int)
    }

    splits = ['train', 'valid', 'test']

    for split in splits:
        split_labels_dir = os.path.join(merged_labels_dir, split)
        if not os.path.exists(split_labels_dir):
            print(f"Warning: {split_labels_dir} does not exist.")
            continue
        for label_file in os.listdir(split_labels_dir):
            if label_file.endswith('.txt'):
                label_path = os.path.join(split_labels_dir, label_file)
                with open(label_path, 'r') as lf:
                    lines = lf.readlines()
                for line in lines:
                    parts = line.strip().split()
                    if len(parts) < 5:
                        continue  # Invalid label format
                    class_id = int(parts[0])
                    class_name = classes[class_id] if class_id < len(classes) else "Unknown"
                    stats[split][class_name] += 1

    return stats

In [35]:
# Load updated class names from dataset.yaml
dataset_yaml_path = os.path.join(merged_dataset_dir, 'dataset.yaml')
with open(dataset_yaml_path, 'r') as f:
    dataset_yaml = yaml.safe_load(f)

classes = dataset_yaml['names']

# Count instances
stats = count_instances(merged_labels_dir, classes)

# Display statistics
for split in ['train', 'valid', 'test']:
    print(f"\n--- {split.capitalize()} Set ---")
    total = sum(stats[split].values())
    print(f"Total Instances: {total}")
    for class_name, count in stats[split].items():
        print(f" - {class_name}: {count}")

FileNotFoundError: [Errno 2] No such file or directory: '/content/YoloNAS_Project/merged_dataset/dataset.yaml'

In [8]:
def organize_datasets(rf_dir, merged_dir):
    """
    Merges multiple Roboflow datasets into a single directory with images and labels.
    """
    os.makedirs(merged_dir, exist_ok=True)
    merged_images = os.path.join(merged_dir, 'images')
    merged_labels = os.path.join(merged_dir, 'labels')
    os.makedirs(merged_images, exist_ok=True)
    os.makedirs(merged_labels, exist_ok=True)

    # Function to copy files
    def copy_files(src_images, src_labels):
        for img_file in os.listdir(src_images):
            if img_file.endswith(('.jpg', '.png', '.jpeg')):
                # Copy image
                src_img_path = os.path.join(src_images, img_file)
                dest_img_path = os.path.join(merged_images, img_file)
                if not os.path.exists(dest_img_path):
                    shutil.copy(src_img_path, dest_img_path)

                # Copy label
                label_file = os.path.splitext(img_file)[0] + '.txt'
                src_label_path = os.path.join(src_labels, label_file)
                dest_label_path = os.path.join(merged_labels, label_file)
                if os.path.exists(src_label_path) and not os.path.exists(dest_label_path):
                    shutil.copy(src_label_path, dest_label_path)

    # Iterate over each Roboflow project
    roboflow_projects = os.listdir(rf_dir)
    for project in roboflow_projects:
        project_dir = os.path.join(rf_dir, project)
        for split in ['train', 'valid', 'test']:
            split_dir = os.path.join(project_dir, split)
            if os.path.exists(split_dir):
                src_images = os.path.join(split_dir, 'images')
                src_labels = os.path.join(split_dir, 'labels')
                copy_files(src_images, src_labels)

    print(f"Datasets merged and saved to {merged_dir}")

In [9]:
# Define merged dataset directory
merged_dataset_dir = os.path.join(base_dir, 'merged')

# Organize datasets
organize_datasets(rf_save_dir, merged_dataset_dir)

Datasets merged and saved to /content/YoloNAS_Project/merged


In [10]:
def create_dataset(merged_dir, dataset_params, split_ratios=(0.7, 0.2, 0.1), move_files=False):
    """
    Create a dataset structure for training, validation, and test sets with images and YOLO labels.

    Parameters:
    - merged_dir (str): Directory containing merged images and labels.
    - dataset_params (dict): Dictionary defining the dataset structure and paths.
    - split_ratios (tuple): Ratios for splitting the dataset into (train, valid, test).
    - move_files (bool): Whether to move files instead of copying.
    """
    # Create directories for train, val, test
    for dir_path in [dataset_params['train_images_dir'], dataset_params['train_labels_dir'],
                     dataset_params['val_images_dir'], dataset_params['val_labels_dir'],
                     dataset_params['test_images_dir'], dataset_params['test_labels_dir']]:
        full_path = os.path.join(dataset_params['data_dir'], dir_path)
        os.makedirs(full_path, exist_ok=True)

    # Get list of image files
    images_dir = os.path.join(merged_dir, 'images')
    labels_dir = os.path.join(merged_dir, 'labels')
    image_files = [f for f in os.listdir(images_dir) if f.endswith(('.jpg', '.png', '.jpeg'))]

    # Shuffle images
    random.seed(42)
    random.shuffle(image_files)

    # Split indices
    train_end = int(len(image_files) * split_ratios[0])
    val_end = train_end + int(len(image_files) * split_ratios[1])

    train_files = image_files[:train_end]
    val_files = image_files[train_end:val_end]
    test_files = image_files[val_end:]

    # Function to copy or move files
    def copy_move_files(file_list, src_images, src_labels, dest_images, dest_labels):
        for img_file in file_list:
            src_img_path = os.path.join(src_images, img_file)
            dest_img_path = os.path.join(dest_images, img_file)
            src_label_path = os.path.join(src_labels, os.path.splitext(img_file)[0] + '.txt')
            dest_label_path = os.path.join(dest_labels, os.path.splitext(img_file)[0] + '.txt')

            # Copy or move image
            if move_files:
                shutil.move(src_img_path, dest_img_path)
            else:
                shutil.copy(src_img_path, dest_img_path)

            # Copy or move label if it exists
            if os.path.exists(src_label_path):
                if move_files:
                    shutil.move(src_label_path, dest_label_path)
                else:
                    shutil.copy(src_label_path, dest_label_path)
            else:
                print(f"Warning: Label file does not exist for image {img_file}")

    # Define source and destination directories
    src_images = images_dir
    src_labels = labels_dir

    dest_train_images = os.path.join(dataset_params['data_dir'], dataset_params['train_images_dir'])
    dest_train_labels = os.path.join(dataset_params['data_dir'], dataset_params['train_labels_dir'])
    dest_val_images = os.path.join(dataset_params['data_dir'], dataset_params['val_images_dir'])
    dest_val_labels = os.path.join(dataset_params['data_dir'], dataset_params['val_labels_dir'])
    dest_test_images = os.path.join(dataset_params['data_dir'], dataset_params['test_images_dir'])
    dest_test_labels = os.path.join(dataset_params['data_dir'], dataset_params['test_labels_dir'])

    # Copy/move files
    copy_move_files(train_files, src_images, src_labels, dest_train_images, dest_train_labels)
    copy_move_files(val_files, src_images, src_labels, dest_val_images, dest_val_labels)
    copy_move_files(test_files, src_images, src_labels, dest_test_images, dest_test_labels)

    print(f"Dataset created at {dataset_params['data_dir']} with splits:")
    print(f"Train: {len(train_files)} images")
    print(f"Validation: {len(val_files)} images")
    print(f"Test: {len(test_files)} images")

In [18]:
# Define label mapping
label_mapping = {
    'human': 'person'
}

# Standardize labels in the merged dataset
standardize_labels(
    merged_labels_dir=os.path.join(merged_dataset_dir, 'labels'),
    label_mapping=label_mapping,
    target_class_id=1  # Assuming 'person' has class ID 1
)

RecursionError: maximum recursion depth exceeded while calling a Python object

In [11]:
# Define dataset parameters
dataset_params = {
    'data_dir': os.path.join(base_dir, 'final_dataset'),
    'train_images_dir': 'train/images',
    'train_labels_dir': 'train/labels',
    'val_images_dir': 'valid/images',
    'val_labels_dir': 'valid/labels',
    'test_images_dir': 'test/images',
    'test_labels_dir': 'test/labels',
    'classes': ['forklift', 'person']  # Update based on actual classes in your datasets
}

# Define split ratios
split_ratios = (0.7, 0.2, 0.1)

# Create dataset
create_dataset(
    merged_dir=merged_dataset_dir,
    dataset_params=dataset_params,
    split_ratios=split_ratios,
    move_files=False  # Set to True if you want to move files instead of copying
)

Dataset created at /content/YoloNAS_Project/final_dataset with splits:
Train: 1977 images
Validation: 565 images
Test: 283 images


In [36]:
def create_yaml(dataset_dir, classes):
    """
    Creates a dataset.yaml file for YOLO-NAS.

    Parameters:
    - dataset_dir (str): Base directory of the dataset containing train/val/test splits.
    - classes (list): List of class names.
    """
    yaml_content = {
        'train': os.path.join(dataset_dir, 'train/images'),
        'val': os.path.join(dataset_dir, 'valid/images'),
        'test': os.path.join(dataset_dir, 'test/images'),  # Optional
        'nc': len(classes),
        'names': classes
    }

    yaml_path = os.path.join(dataset_dir, 'dataset.yaml')
    with open(yaml_path, 'w') as f:
        yaml.dump(yaml_content, f)

    print(f"dataset.yaml created at {yaml_path}")

In [37]:
# Create dataset.yaml
create_yaml(dataset_params['data_dir'], dataset_params['classes'])

NameError: name 'dataset_params' is not defined

# Dataloaders

In [15]:
# Example usage of custom data loader functions

train_data = coco_detection_yolo_format_train(
    dataset_params={
        'data_dir': dataset_params['data_dir'],
        'images_dir': dataset_params['train_images_dir'],
        'labels_dir': dataset_params['train_labels_dir'],
        'classes': dataset_params['classes']
    },
    dataloader_params={
        'batch_size':8,
        'num_workers':2
    }
)

val_data = coco_detection_yolo_format_val(
    dataset_params={
        'data_dir': dataset_params['data_dir'],
        'images_dir': dataset_params['val_images_dir'],
        'labels_dir': dataset_params['val_labels_dir'],
        'classes': dataset_params['classes']
    },
    dataloader_params={
        'batch_size':8,
        'num_workers':2
    }
)

test_data = coco_detection_yolo_format_val(
    dataset_params={
        'data_dir': dataset_params['data_dir'],
        'images_dir': dataset_params['test_images_dir'],
        'labels_dir': dataset_params['test_labels_dir'],
        'classes': dataset_params['classes']
    },
    dataloader_params={
        'batch_size':8,
        'num_workers':2
    }
)

[2024-09-28 12:27:44] INFO - detection_dataset.py - Dataset Initialization in progress. `cache_annotations=True` causes the process to take longer due to full dataset indexing.
Indexing dataset annotations: 100%|██████████| 1977/1977 [00:00<00:00, 4981.89it/s]
[2024-09-28 12:27:45] INFO - detection_dataset.py - Dataset Initialization in progress. `cache_annotations=True` causes the process to take longer due to full dataset indexing.
Indexing dataset annotations: 100%|██████████| 565/565 [00:00<00:00, 2205.46it/s]
[2024-09-28 12:27:46] INFO - detection_dataset.py - Dataset Initialization in progress. `cache_annotations=True` causes the process to take longer due to full dataset indexing.
