In [1]:
import json
import os
import random
import shutil
from pathlib import Path

from datasets import load_dataset
from PIL import Image

In [2]:
# Inputs: the JSONL annotation file and the folder holding the source images
data_path = '/home/jupyter/novice/vlm.jsonl'
images_source_dir = Path("/home/jupyter/novice/images")

# Output root; images and labels each get a train/ and a val/ subfolder
# (the folders themselves are created in a later cell)
dataset_dir = Path("/home/jupyter/til-24-base/derrick/dataset")
train_images_dir, val_images_dir = dataset_dir / "images/train", dataset_dir / "images/val"
train_labels_dir, val_labels_dir = dataset_dir / "labels/train", dataset_dir / "labels/val"

In [3]:
# Make sure all four output folders exist; folders that are already
# present are left alone (exist_ok=True)
for directory in (
    train_images_dir,
    train_labels_dir,
    val_images_dir,
    val_labels_dir,
):
    os.makedirs(directory, exist_ok=True)

In [4]:
# Load the JSONL annotation file. The path comes from `data_path` so the
# location is configured in exactly one place instead of being repeated here.
dataset = load_dataset('json', data_files=data_path, split='train')

In [5]:
# Show the dataset summary (column names and row count)
print(dataset)

Dataset({
    features: ['image', 'annotations'],
    num_rows: 5000
})


In [6]:
# Hold out 20% of the rows for validation, using a fixed seed.
# Only two subsets are produced here; there is no separate test set.
train_val_test_split = dataset.train_test_split(seed=42, test_size=0.2)

# The held-out part is stored under the 'test' key; it is used as the validation set
train_dataset, val_dataset = train_val_test_split['train'], train_val_test_split['test']

In [7]:
# Show the training split summary (column names and row count)
print(train_dataset)

Dataset({
    features: ['image', 'annotations'],
    num_rows: 4000
})


In [8]:
# Show the validation split summary (column names and row count)
print(val_dataset)

Dataset({
    features: ['image', 'annotations'],
    num_rows: 1000
})


In [9]:
# Peek at the image filename stored in the second validation record
val_dataset['image'][1]

'image_1183.jpg'

In [10]:
# Caption -> class index lookup, filled in as captions are first encountered;
# class_counter holds the next index to hand out
class_map, class_counter = dict(), 0

In [11]:
# Define a function to convert bounding box format
def convert_bbox_to_yolo(bbox, img_width, img_height):
    """Turn a pixel-space box into normalized, center-based coordinates.

    The box is read as ``bbox[0], bbox[1]`` = one corner (x, y) and
    ``bbox[2], bbox[3]`` = extent along x and y; half of each extent is
    added to the corner to obtain the center.

    Args:
        bbox: Indexable with at least four numeric entries, as described above.
        img_width: Image width, used to normalize the x values.
        img_height: Image height, used to normalize the y values.

    Returns:
        Tuple ``(x_center, y_center, width, height)``, each divided by the
        corresponding image dimension.
    """
    left, top, box_w, box_h = bbox[0], bbox[1], bbox[2], bbox[3]
    return (
        (left + box_w / 2) / img_width,
        (top + box_h / 2) / img_height,
        box_w / img_width,
        box_h / img_height,
    )

In [12]:
def process_split(dataset, images_dir, labels_dir):
    """Copy one split's images and write a YOLO label file for each of them.

    For every record in ``dataset``:

    * the image named by ``obj['image']`` is copied from ``images_source_dir``
      into ``images_dir`` unless a file with that name is already there;
    * a ``<image name without extension>.txt`` file is written to
      ``labels_dir`` with one ``class_idx x_center y_center width height``
      line per annotation, unless that label file already exists.

    Every caption is registered in the module-level ``class_map`` (and
    ``class_counter`` advanced) regardless of whether the label file has to be
    written. This keeps ``class_map`` complete when the function is re-run in
    a fresh kernel against label files produced by an earlier run; indices are
    handed out in first-seen order, so a re-run over the same data in the same
    order reproduces the same mapping.

    Args:
        dataset: Iterable of records with an ``'image'`` filename and an
            ``'annotations'`` list whose items have ``'caption'`` and
            ``'bbox'`` entries.
        images_dir: Destination ``Path`` for the copied images.
        labels_dir: Destination ``Path`` for the label files.
    """
    global class_counter
    for obj in dataset:
        image_filename = obj['image']
        annotations = obj['annotations']

        # Register captions up front so the class map does not depend on
        # whether the label file below still needs to be created.
        for ann in annotations:
            caption = ann['caption']
            if caption not in class_map:
                class_map[caption] = class_counter
                class_counter += 1

        # Copy the image into the split folder (skipped if already present)
        image_path = images_source_dir / image_filename
        target_image_path = images_dir / image_filename
        if not target_image_path.exists():
            shutil.copy(image_path, target_image_path)

        # Existing label files are kept as they are
        label_file_path = labels_dir / f"{os.path.splitext(image_filename)[0]}.txt"
        if label_file_path.exists():
            continue

        # Only the dimensions are needed; the context manager closes the
        # underlying file instead of leaving one handle open per image.
        with Image.open(image_path) as img:
            img_width, img_height = img.size

        with open(label_file_path, 'w') as label_file:
            for ann in annotations:
                class_idx = class_map[ann['caption']]
                yolo_bbox = convert_bbox_to_yolo(ann['bbox'], img_width, img_height)
                label_file.write(f"{class_idx} " + " ".join(map(str, yolo_bbox)) + "\n")

In [13]:
# Build the image and label folders, training split first, then validation
process_split(dataset=train_dataset, images_dir=train_images_dir, labels_dir=train_labels_dir)
process_split(dataset=val_dataset, images_dir=val_images_dir, labels_dir=val_labels_dir)

In [29]:
def save_classmap(dataset):
    """Register every caption found in ``dataset`` in the global class map.

    Despite the name, nothing is written to disk here: the function only
    fills the module-level ``class_map`` (advancing ``class_counter``) with
    captions that are not in it yet, printing each newly added caption.

    Args:
        dataset: Iterable of records with an ``'annotations'`` list whose
            items have a ``'caption'`` entry.
    """
    global class_counter
    for obj in dataset:
        for ann in obj['annotations']:
            caption = ann['caption']
            if caption not in class_map:
                print(caption)
                class_map[caption] = class_counter
                class_counter += 1

In [30]:
# Add any captions still missing from class_map, training split first
save_classmap(dataset=train_dataset)
save_classmap(dataset=val_dataset)

In [14]:
# Write the caption -> index mapping, one "caption: index" entry per line
class_map_file = dataset_dir / "class_map.txt"
with class_map_file.open('w') as f:
    f.writelines(f"{caption}: {idx}\n" for caption, idx in class_map.items())

print("Dataset split and label creation completed successfully!")

Dataset split and label creation completed successfully!


In [15]:
# Create the YOLOv8 configuration file.
# The class names are listed in class-index order and serialized with
# json.dumps: a JSON array is also a valid YAML flow sequence, whereas a
# Python list repr is not guaranteed to be (for example when a caption
# contains quotes or backslashes).
config_content = f"""train: {train_images_dir}  # train images directory
val: {val_images_dir}  # validation images directory
nc: {len(class_map)}  # number of classes
names: {json.dumps(sorted(class_map, key=class_map.get))}  # class names
"""

config_path = dataset_dir / "model_config.yaml"
with config_path.open('w') as f:
    f.write(config_content)

print("Dataset split and label creation completed successfully!")
print("YOLOv8 configuration file created successfully!")

Dataset split and label creation completed successfully!
YOLOv8 configuration file created successfully!
