In [13]:
# 导入必要的库
import os
import random
import shutil
from pathlib import Path
from typing import List, Optional, Tuple

from tqdm import tqdm
def find_annotated_images(image_dir: str, annotation_dir: str, image_extensions: Optional[List[str]] = None) -> Tuple[List[Path], List[Path]]:
    """
    Partition the images of a directory into annotated and non-annotated ones.

    An image counts as annotated when ``annotation_dir`` contains a file named
    ``<image stem>.json``. Only regular files located directly inside
    ``image_dir`` are considered; sub-directories are not traversed.

    Args:
        image_dir (str): Directory containing the images.
        annotation_dir (str): Directory containing the ``.json`` annotation files.
        image_extensions (List[str], optional): Accepted image file extensions,
            including the leading dot. Matching is case-insensitive. Defaults to
            ['.jpg', '.tiff', '.jpeg', '.png', '.bmp'].

    Returns:
        Tuple[List[Path], List[Path]]: Paths of the annotated images and paths of
        the non-annotated images, each sorted by path.
    """
    # Build the default inside the call so no list object is shared across calls.
    if image_extensions is None:
        image_extensions = ['.jpg', '.tiff', '.jpeg', '.png', '.bmp']

    # File suffixes are lower-cased before the lookup, so the accepted
    # extensions must be lower-cased too or an entry like '.JPG' never matches.
    allowed_suffixes = {ext.lower() for ext in image_extensions}

    image_dir = Path(image_dir)
    annotation_dir = Path(annotation_dir)

    # iterdir() yields entries in arbitrary order; sorting makes the result
    # (and any seeded split computed from it later) reproducible.
    all_images = sorted(
        p for p in image_dir.iterdir()
        if p.suffix.lower() in allowed_suffixes and p.is_file()
    )

    annotated_images = []
    non_annotated_images = []

    for img_path in tqdm(all_images, desc="查找有标注和无标注的图片"):
        # The annotation shares the image's stem and uses the .json extension.
        label_path = annotation_dir / f"{img_path.stem}.json"
        if label_path.exists():
            annotated_images.append(img_path)
        else:
            non_annotated_images.append(img_path)

    return annotated_images, non_annotated_images

def split_data(annotated_images: List[Path], train_ratio: float = 0.8) -> Tuple[List[Path], List[Path]]:
    """
    Randomly split the annotated images into a training set and a validation set.

    The input list is left untouched: a copy is shuffled with the module-level
    ``random`` generator, so seeding ``random`` beforehand makes the split
    reproducible.

    Args:
        annotated_images (List[Path]): Paths of the annotated images.
        train_ratio (float, optional): Fraction of the images assigned to the
            training set, between 0 and 1 inclusive. Defaults to 0.8.

    Returns:
        Tuple[List[Path], List[Path]]: Training image paths and validation image
        paths. The training set holds ``int(len(annotated_images) * train_ratio)``
        items and the validation set holds the rest.

    Raises:
        ValueError: If ``train_ratio`` is outside the range [0, 1].
    """
    # A negative ratio would silently slice from the end of the list and a
    # ratio above 1 would hide a configuration mistake, so reject both.
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio}")

    # Shuffle a copy: random.shuffle works in place and would otherwise
    # reorder the list owned by the caller.
    shuffled = list(annotated_images)
    random.shuffle(shuffled)

    split_index = int(len(shuffled) * train_ratio)
    return shuffled[:split_index], shuffled[split_index:]

def _copy_images_with_labels(
    images: List[Path],
    images_dir: Path,
    labels_dir: Path,
    annotation_dir: Path,
    desc: str
) -> None:
    """Copy each image, plus its ``.json`` label when one exists, into the target directories."""
    for img_path in tqdm(images, desc=desc):
        shutil.copy(img_path, images_dir / img_path.name)
        label_path = annotation_dir / f"{img_path.stem}.json"
        if label_path.exists():
            shutil.copy(label_path, labels_dir / label_path.name)


def organize_data(
    image_dir: str,
    annotation_dir: str,
    output_dir: str,
    annotated_images: List[Path],
    non_annotated_images: List[Path],
    train_ratio: float = 0.8
) -> None:
    """
    Copy images and annotation files into a train/val directory layout.

    The annotated images are split with ``split_data``. The files are then
    copied (the originals stay in place) into::

        output_dir/train/images, output_dir/train/labels
        output_dir/val/images,   output_dir/val/labels
        output_dir/non_annotated_images

    Missing target directories are created. A summary of the target
    directories is printed at the end.

    Args:
        image_dir (str): Image directory path. Kept for interface compatibility;
            it is not read here because ``annotated_images`` and
            ``non_annotated_images`` already hold the image paths.
        annotation_dir (str): Directory containing the ``.json`` annotation files.
        output_dir (str): Root directory of the organized dataset.
        annotated_images (List[Path]): Paths of the annotated images.
        non_annotated_images (List[Path]): Paths of the non-annotated images.
        train_ratio (float, optional): Fraction of annotated images assigned to
            the training set. Defaults to 0.8.
    """
    annotation_dir = Path(annotation_dir)
    output_dir = Path(output_dir)

    train_images_dir = output_dir / 'train' / 'images'
    train_labels_dir = output_dir / 'train' / 'labels'
    val_images_dir = output_dir / 'val' / 'images'
    val_labels_dir = output_dir / 'val' / 'labels'
    non_annotated_images_dir = output_dir / 'non_annotated_images'

    # Create the target directories.
    for directory in (
        train_images_dir,
        train_labels_dir,
        val_images_dir,
        val_labels_dir,
        non_annotated_images_dir,
    ):
        directory.mkdir(parents=True, exist_ok=True)

    # Split a copy so the caller's list is never reordered by the shuffle.
    train_images, val_images = split_data(list(annotated_images), train_ratio)

    # Copy the training and validation sets together with their labels.
    _copy_images_with_labels(train_images, train_images_dir, train_labels_dir, annotation_dir, "复制训练集图片")
    _copy_images_with_labels(val_images, val_images_dir, val_labels_dir, annotation_dir, "复制验证集图片")

    # Non-annotated images have no label file, so only the image is copied.
    for img_path in tqdm(non_annotated_images, desc="复制无标注的图片"):
        shutil.copy(img_path, non_annotated_images_dir / img_path.name)

    print(f"训练集图片已复制到: {train_images_dir}")
    print(f"训练集标注文件已复制到: {train_labels_dir}")
    print(f"验证集图片已复制到: {val_images_dir}")
    print(f"验证集标注文件已复制到: {val_labels_dir}")
    print(f"无标注的图片已复制到: {non_annotated_images_dir}")


In [14]:
# Configure the input and output locations for this run.
image_dir = "/data/yan/track/TS-20241123222043114.avi_frames"          # source image directory
annotation_dir = "/data/yan/track/TS-20241123222043114.avi_frames/annotations"  # annotation file directory
output_dir = "./dataset"                                 # output dataset directory
train_ratio = 0.8                                        # fraction of annotated images used for training

# Find which images have an annotation file and which do not.
annotated_images, non_annotated_images = find_annotated_images(image_dir, annotation_dir)

print(f"找到 {len(annotated_images)} 张有标注的图片。")
print(f"找到 {len(non_annotated_images)} 张无标注的图片。")

# Copy the files into the train/val layout under output_dir.
organize_data(
    image_dir=image_dir,
    annotation_dir=annotation_dir,
    output_dir=output_dir,
    annotated_images=annotated_images,
    non_annotated_images=non_annotated_images,
    train_ratio=train_ratio
)


查找有标注和无标注的图片: 100%|██████████| 2184/2184 [00:00<00:00, 135462.19it/s]


找到 23 张有标注的图片。
找到 2161 张无标注的图片。


复制训练集图片: 100%|██████████| 18/18 [00:00<00:00, 795.55it/s]
复制验证集图片:   0%|          | 0/5 [00:00<?, ?it/s]

复制验证集图片: 100%|██████████| 5/5 [00:00<00:00, 718.25it/s]
复制无标注的图片: 100%|██████████| 2161/2161 [00:01<00:00, 1106.39it/s]

训练集图片已复制到: dataset/train/images
训练集标注文件已复制到: dataset/train/labels
验证集图片已复制到: dataset/val/images
验证集标注文件已复制到: dataset/val/labels
无标注的图片已复制到: dataset/non_annotated_images



