# Blood Cell Dataset Preparation Tutorial

This tutorial will guide you through the process of:

1. Setting up the blood cell dataset directory structure
2. Copying dataset files from Roboflow format
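3. Creating the data.yaml configuration file (note: none of the cells below generates this file, so it must be added to `Dataset/blood-cell/` separately; the verification step only checks that it exists)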
4. Creating training and validation text files
5. Verifying the dataset structure

## Required Dependencies

First, let's ensure we have all necessary packages installed:

In [None]:
!pip install pyyaml pathlib

## 1. Creating Directory Structure

We'll start by creating the necessary directory structure for the blood cell dataset. The structure will be:

```
Dataset/blood-cell/
├── images/
│   ├── train/
│   ├── val/
│   └── test/
└── labels/
    ├── train/
    ├── val/
    └── test/
```

In [None]:
import os
from pathlib import Path

def create_directory_structure():
    """Build the empty images/labels folder tree for the blood cell dataset.

    Every combination of ``images``/``labels`` with ``train``/``val``/``test``
    is created under ``Dataset/blood-cell`` (relative to the current working
    directory). Folders that already exist are left untouched.

    Returns:
        The dataset root as a string (``"Dataset/blood-cell"``).
    """
    base_dir = "Dataset/blood-cell"
    root = Path(base_dir)

    for kind in ("images", "labels"):
        for split in ("train", "val", "test"):
            # parents=True also creates Dataset/ and the root on first use.
            (root / kind / split).mkdir(parents=True, exist_ok=True)

    return base_dir

# Build the folder tree now and keep the returned root path for the message below.
base_dir = create_directory_structure()
print(f"Created directory structure in {base_dir}")

## 2. Copying Dataset Files

Now we'll copy the files from the Roboflow dataset format to our structured format. This includes:
- Training images and labels
- Validation images and labels
- Test images and labels

Note: The source dataset should be in the 'Dataset/blood-cell-6' directory with Roboflow's structure.

In [None]:
import shutil

def copy_dataset_files(source_base="Dataset/blood-cell-6", dest_base="Dataset/blood-cell"):
    """Copy a Roboflow export into the ``{images,labels}/<split>`` layout.

    Files are read from ``<source_base>/<split>/{images,labels}`` and copied to
    ``<dest_base>/{images,labels}/<split>``. The Roboflow split ``valid`` is
    stored as ``val`` in the destination; ``train`` and ``test`` keep their
    names.

    Args:
        source_base: Root folder of the Roboflow export.
        dest_base: Root folder of the restructured dataset. Destination
            folders are created when they do not exist yet.

    Source folders that are missing are reported and skipped. Only regular
    files are copied; sub-directories inside a source folder are ignored.
    """
    source_root = Path(source_base)
    dest_root = Path(dest_base)

    # Roboflow split name -> split name used in the destination layout.
    split_names = {"train": "train", "valid": "val", "test": "test"}

    for dir_name, dest_dir in split_names.items():
        for kind in ("images", "labels"):
            src_dir = source_root / dir_name / kind
            if not src_dir.is_dir():
                print(f"Skipping {dir_name} {kind}: {src_dir} not found")
                continue

            dst_dir = dest_root / kind / dest_dir
            # shutil.copy2 treats a non-existent destination as a *file* name,
            # so every copy would overwrite the same file (or fail outright)
            # unless the target directory really exists.
            dst_dir.mkdir(parents=True, exist_ok=True)

            print(f"Copying {dir_name} {kind}...")
            for src_file in src_dir.iterdir():
                # copy2 cannot copy directories; skip anything that is not a file.
                if src_file.is_file():
                    shutil.copy2(src_file, dst_dir / src_file.name)

# Run the copy with the defaults: from "Dataset/blood-cell-6" into "Dataset/blood-cell".
copy_dataset_files()

## 3. Creating Dataset Lists

Now we'll create text files listing all images in the training and validation sets. These files will be used during training to locate the images.

In [None]:
def create_dataset_lists(base_dir="Dataset/blood-cell"):
    """Write ``train.txt`` and ``val.txt`` listing the images of each split.

    Each list file is written into ``base_dir`` and contains one image path per
    line, sorted by file name. Paths are built from ``base_dir`` itself: a
    relative root yields ``./<base_dir>/images/<split>/<name>`` (so the default
    gives ``./Dataset/blood-cell/images/train/...``), an absolute root is
    written as-is without the ``./`` prefix.

    Args:
        base_dir: Dataset root that contains ``images/train`` and
            ``images/val``.

    Only regular files are listed; sub-directories inside a split folder are
    ignored.
    """
    root = Path(base_dir)
    # "./" only makes sense in front of a relative path.
    prefix = root.as_posix() if root.is_absolute() else f"./{root.as_posix()}"

    counts = {}
    for split in ("train", "val"):
        # Sorted so the generated files are reproducible between runs.
        images = sorted(p for p in (root / "images" / split).glob("*") if p.is_file())
        with open(root / f"{split}.txt", "w", encoding="utf-8") as f:
            f.writelines(f"{prefix}/images/{split}/{img.name}\n" for img in images)
        counts[split] = len(images)

    print(f"Created train.txt with {counts['train']} images")
    print(f"Created val.txt with {counts['val']} images")

# Write train.txt and val.txt for the default dataset root ("Dataset/blood-cell").
create_dataset_lists()

## 4. Verification

Finally, let's verify that our dataset is properly structured and all necessary files are in place:

In [None]:
def verify_dataset(base_dir="Dataset/blood-cell"):
    """Print a found/missing checklist for the expected dataset contents.

    Checked, in this order: the six ``images``/``labels`` split folders,
    the ``train.txt`` and ``val.txt`` list files, and ``data.yaml``. Each
    item is reported on its own line; nothing is returned or raised.

    Args:
        base_dir: Dataset root to inspect.
    """
    root = Path(base_dir)

    # Split folders: images/* first, then labels/*.
    for kind in ("images", "labels"):
        for split in ("train", "val", "test"):
            subdir = f"{kind}/{split}"
            if (root / subdir).exists():
                print(f"✅ Found directory: {subdir}")
            else:
                print(f"❌ Missing directory: {subdir}")

    # Image list files.
    for list_name in ("train.txt", "val.txt"):
        has_list = (root / list_name).exists()
        print(f"✅ Found file: {list_name}" if has_list else f"❌ Missing file: {list_name}")

    # Dataset configuration.
    has_config = (root / "data.yaml").exists()
    print("✅ Found data.yaml" if has_config else "❌ Missing data.yaml")

# Print the ✅/❌ checklist for the default dataset root ("Dataset/blood-cell").
verify_dataset()

## 5. Visualization

Let's visualize some images from the training and validation sets along with their bounding boxes in a 3x3 grid:

In [None]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import yaml

# Load class names from the blood cell args YAML file
def load_class_names(yaml_path="utils/args_blood.yaml"):
    """Return the ``names`` entry of a YAML configuration file.

    Args:
        yaml_path: Path of the YAML file to read; defaults to
            ``utils/args_blood.yaml``.

    Returns:
        Whatever is stored under the top-level ``names`` key. Its structure
        (list or mapping) is not checked here.

    Raises:
        KeyError: If the file is empty or has no top-level ``names`` key.
    """
    # Read as UTF-8 explicitly so non-ASCII class names do not depend on the
    # platform's default encoding.
    with open(yaml_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f) or {}  # an empty document parses to None

    try:
        return config['names']
    except KeyError:
        raise KeyError(f"'names' key not found in {yaml_path}") from None

# Function to generate random color for each class
def generate_class_colors(class_names):
    """Assign a random RGB colour to every class id.

    Args:
        class_names: Either a mapping ``class id -> name`` or an iterable of
            names whose position is the class id.

    Returns:
        A dict ``class id -> (r, g, b)`` with integer channels in 0..255. For
        a mapping the keys of ``class_names`` are used as class ids, so the
        result lines up with lookups by label-file class id; for any other
        iterable the ids are 0, 1, 2, ...
    """
    if isinstance(class_names, dict):
        class_ids = list(class_names)
    else:
        class_ids = [idx for idx, _ in enumerate(class_names)]

    # One randint draw of three channels per class, in class order.
    return {
        class_id: tuple(np.random.randint(0, 256, 3).tolist())
        for class_id in class_ids
    }

def plot_images_with_bboxes(dataset, num_images=9, class_names=None, class_colors=None):
    """Show a random sample of images from one split with their boxes drawn.

    Images are read from ``Dataset/blood-cell/images/<dataset>`` and labels
    from the matching ``.txt`` file in ``Dataset/blood-cell/labels/<dataset>``.
    Label lines are expected as ``class x_center y_center width height`` with
    coordinates normalised to the image size; blank lines and lines with a
    different number of fields are skipped.

    Args:
        dataset: Split folder name, e.g. ``'train'`` or ``'val'``.
        num_images: Maximum number of images to show. The grid has three
            columns and at least three rows, growing when more than nine
            images are requested.
        class_names: Mapping ``class id -> name`` or a sequence indexed by
            class id. Ids without an entry are labelled ``"Class <id>"``.
        class_colors: Mapping ``class id -> (r, g, b)`` or a sequence indexed
            by class id. Ids without an entry are drawn in green.
    """
    base_dir = "Dataset/blood-cell"
    image_dir = os.path.join(base_dir, "images", dataset)
    label_dir = os.path.join(base_dir, "labels", dataset)

    def as_lookup(value):
        # Accept None, a dict, or a list/tuple so .get() below always works.
        if value is None:
            return {}
        if isinstance(value, dict):
            return value
        return dict(enumerate(value))

    names = as_lookup(class_names)
    colors = as_lookup(class_colors)

    # Get all image files (case-insensitive suffix match) and randomly sample
    image_suffixes = ('.jpg', '.jpeg', '.png', '.bmp')
    all_images = [f for f in os.listdir(image_dir) if f.lower().endswith(image_suffixes)]
    sample_size = max(0, min(num_images, len(all_images)))
    selected_images = random.sample(all_images, sample_size)
    if not selected_images:
        print(f"No images found in {image_dir}")
        return

    # Three columns; keep the 3x3 layout for up to nine images and add rows
    # beyond that so every selected image has a panel.
    cols = 3
    rows = max(3, (len(selected_images) + cols - 1) // cols)
    _, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
    axes = axes.ravel()

    # Panels that will not receive an image should not show empty axes.
    for ax in axes[len(selected_images):]:
        ax.axis('off')

    for ax, img_name in zip(axes, selected_images):
        img_path = os.path.join(image_dir, img_name)
        label_path = os.path.join(label_dir, os.path.splitext(img_name)[0] + '.txt')

        # cv2.imread signals an unreadable file by returning None, not by raising.
        img = cv2.imread(img_path)
        if img is None:
            ax.set_title(f"{dataset}: {img_name} (unreadable)")
            ax.axis('off')
            continue
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img_height, img_width = img.shape[:2]

        # Draw bounding boxes if label file exists
        if os.path.exists(label_path):
            with open(label_path, 'r') as f:
                for line in f:
                    values = line.split()
                    if len(values) != 5:
                        # Blank line or a format other than "class x y w h".
                        continue
                    class_id = int(values[0])
                    x_center, y_center, width, height = map(float, values[1:])

                    # Convert normalized coordinates to pixel coordinates
                    x_center *= img_width
                    y_center *= img_height
                    width *= img_width
                    height *= img_height

                    # Calculate box corners
                    x_min = int(x_center - width / 2)
                    y_min = int(y_center - height / 2)
                    x_max = int(x_center + width / 2)
                    y_max = int(y_center + height / 2)

                    # Image is already RGB here, so colours are RGB tuples.
                    color = colors.get(class_id, (0, 255, 0))
                    cv2.rectangle(img, (x_min, y_min), (x_max, y_max), color, 2)

                    # Keep the caption inside the image for boxes touching the top edge.
                    class_name = str(names.get(class_id, f"Class {class_id}"))
                    text_origin = (x_min, max(y_min - 5, 15))
                    cv2.putText(img, class_name, text_origin, cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

        # Display image
        ax.imshow(img)
        ax.set_title(f"{dataset}: {img_name}")
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Load class names and colors
class_names = load_class_names()
class_colors = generate_class_colors(class_names)

# Plot training images
print("Training Images with Bounding Boxes:")
plot_images_with_bboxes('train', class_names=class_names, class_colors=class_colors)

# Plot validation images (the 'val' split, not the test split)
print("\nValidation Images with Bounding Boxes:")
plot_images_with_bboxes('val', class_names=class_names, class_colors=class_colors)


## Conclusion

You have now successfully:

1. Created the necessary directory structure for the blood cell dataset
2. Copied the dataset files from Roboflow format to our structured format
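3. Created the data.yaml configuration file (this file is not generated by the cells above; add it to `Dataset/blood-cell/` yourself if the verification step reported it missing)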
4. Created text files listing all training and validation images
5. Verified the dataset structure and format

The dataset is now ready to be used for training YOLO models. The directory structure should look like this:

```
Dataset/blood-cell/
├── images/
│   ├── train/  (contains training images)
│   ├── val/    (contains validation images)
│   └── test/   (contains test images)
├── labels/
│   ├── train/  (contains training labels)
│   ├── val/    (contains validation labels)
│   └── test/   (contains test labels)
├── data.yaml   (dataset configuration)
├── train.txt   (list of training image paths)
└── val.txt     (list of validation image paths)
```