사용하지 않는 클래스 제거

In [None]:
import os

label_dirs = ["train/labels", "valid/labels", "test/labels"]

# Class IDs to remove from every label file.
# Dataset classes: calculus(0), caries(1) [kept], gingivitis(2), hypodontia(3),
# tooth_discolation(4), ulcer(5)
exclude_classes = {0, 2, 3, 4, 5}


def filter_labels(label_file):
    """Drop annotations whose class ID is in ``exclude_classes``, in place.

    Each non-blank line must start with an integer class ID. Lines whose ID
    is excluded are removed; all other lines are written back unchanged.
    Blank lines carry no annotation and are dropped as well.

    Args:
        label_file: Path of the label text file to rewrite.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
    """
    with open(label_file, 'r') as f:
        lines = f.readlines()

    kept_lines = []
    for line in lines:
        fields = line.split()
        # A blank line has no class ID; indexing it would raise IndexError.
        if not fields:
            continue
        if int(fields[0]) not in exclude_classes:
            kept_lines.append(line)

    # Overwrite the original file with the surviving annotations.
    with open(label_file, 'w') as f:
        f.writelines(kept_lines)

# Run the class filter over every .txt label file of each split directory.
for label_dir in label_dirs:
    for label_file in os.listdir(label_dir):
        # Skip anything that is not a label text file.
        if not label_file.endswith('.txt'):
            continue
        filter_labels(os.path.join(label_dir, label_file))

print("라벨 필터링 완료")


In [None]:
label_dirs = ["train/labels", "valid/labels", "test/labels"]

# Class ID mapping: original class ID -> new class ID.
# NOTE(review): the filtering step earlier in this file excludes classes
# {0, 2, 3, 4, 5} and keeps only class 1, so a mapping keyed on 2 matches
# nothing when that step has already run -- confirm whether {1: 0} is intended.
id_mapping = {
    2: 0
}


def adjust_label_ids(label_file):
    """Rewrite class IDs in a label file according to ``id_mapping``, in place.

    Lines whose integer class ID is a key of ``id_mapping`` get the mapped ID
    and are re-joined with single spaces; every other non-blank line is
    written back unchanged. Blank lines are dropped.

    Args:
        label_file: Path of the label text file to rewrite.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
    """
    with open(label_file, 'r') as f:
        lines = f.readlines()

    adjusted_lines = []
    for line in lines:
        elements = line.split()
        # A blank line has no class ID; indexing it would raise IndexError.
        if not elements:
            continue

        class_id = int(elements[0])
        if class_id in id_mapping:
            elements[0] = str(id_mapping[class_id])
            adjusted_lines.append(" ".join(elements) + "\n")
        else:
            adjusted_lines.append(line)

    with open(label_file, 'w') as f:
        f.writelines(adjusted_lines)

# Remap class IDs in every .txt label file of each split directory.
for label_dir in label_dirs:
    for label_file in os.listdir(label_dir):
        # Skip anything that is not a label text file.
        if not label_file.endswith('.txt'):
            continue
        adjust_label_ids(os.path.join(label_dir, label_file))

print("클래스 ID 조정 완료")


좌표 형식 검토

In [None]:
import os

label_dirs = ["train/labels", "valid/labels", "test/labels"]

# Running totals across all processed files; updated by count_coordinates().
segmentation_count = 0
bounding_box_count = 0


def count_coordinates(label_file):
    """Classify each line of a label file by its number of fields.

    A line with exactly 5 whitespace-separated fields (class ID + 4 values)
    increments the global ``bounding_box_count``. A line with an odd number
    of fields, at least 7 (class ID + three or more ``x y`` pairs),
    increments the global ``segmentation_count``. Blank lines and lines of
    any other length are not counted.

    Args:
        label_file: Path of the label text file to inspect.
    """
    global segmentation_count, bounding_box_count
    with open(label_file, 'r') as f:
        for line in f:
            field_count = len(line.split())

            # 5 fields -> bounding box
            if field_count == 5:
                bounding_box_count += 1
            # class ID plus an even number (>= 6) of coordinates -> polygon;
            # the smallest polygon, a triangle, has 7 fields.
            elif field_count >= 7 and field_count % 2 == 1:
                segmentation_count += 1

# Tally the annotation formats over every .txt label file of each split.
for label_dir in label_dirs:
    for label_file in os.listdir(label_dir):
        # Skip anything that is not a label text file.
        if not label_file.endswith('.txt'):
            continue
        count_coordinates(os.path.join(label_dir, label_file))

# Report the totals accumulated in the module-level counters.
print(f"Bounding Box 좌표 개수: {bounding_box_count}")
print(f"Segmentation 좌표 개수: {segmentation_count}")


Convert Segmentation Labels to Bounding Boxes (BB)

In [None]:
import os

label_dirs = ["train/labels", "valid/labels", "test/labels"]


def convert_segmentation_to_bbox(label_file):
    """Rewrite polygon annotations in a label file as bounding boxes, in place.

    Each non-blank line is a class ID followed by whitespace-separated floats.
    A line with an even number of coordinates, at least six (three ``x y``
    points), is treated as a polygon and replaced by
    ``class_id x_center y_center width height`` computed from the polygon's
    minimum and maximum x and y. A line with exactly four values is kept
    unchanged. Blank lines and lines of any other shape are dropped.

    Args:
        label_file: Path of the label text file to rewrite.

    Raises:
        ValueError: If a value after the class ID is not a valid float.
    """
    with open(label_file, 'r') as f:
        lines = f.readlines()

    converted_lines = []
    for line in lines:
        elements = line.split()
        # A blank line has no class ID; indexing it would raise IndexError.
        if not elements:
            continue

        class_id = elements[0]
        coordinates = list(map(float, elements[1:]))

        # Polygon: three or more (x, y) pairs. The threshold is 6, not 8, so
        # that triangles are converted instead of being silently discarded.
        if len(coordinates) >= 6 and len(coordinates) % 2 == 0:
            x_points = coordinates[0::2]
            y_points = coordinates[1::2]

            # Axis-aligned extent of the polygon.
            x_min, x_max = min(x_points), max(x_points)
            y_min, y_max = min(y_points), max(y_points)

            x_center = (x_min + x_max) / 2
            y_center = (y_min + y_max) / 2
            width = x_max - x_min
            height = y_max - y_min

            converted_lines.append(f"{class_id} {x_center} {y_center} {width} {height}\n")
        elif len(coordinates) == 4:
            # Already a bounding box: keep the line as written.
            converted_lines.append(line)

    with open(label_file, 'w') as f:
        f.writelines(converted_lines)

# Convert every .txt label file of each split directory.
for label_dir in label_dirs:
    for label_file in os.listdir(label_dir):
        # Skip anything that is not a label text file.
        if not label_file.endswith('.txt'):
            continue
        convert_segmentation_to_bbox(os.path.join(label_dir, label_file))

print("Segmentation -> Bounding Box 형식 변환 완료")


In [None]:
from collections import defaultdict

label_dirs = {
    "train": "train/labels",
    "valid": "valid/labels",
    "test": "test/labels"
}

# Per-split tally of annotation lines, keyed by class ID (kept as a string).
class_counts = {split: defaultdict(int) for split in label_dirs}


def count_labels(label_dir, split):
    """Count annotation lines per class for one split.

    Reads every ``.txt`` file in ``label_dir`` and, for each non-blank line,
    increments ``class_counts[split]`` under the line's first field.

    Args:
        label_dir: Directory containing the label text files.
        split: Key of ``class_counts`` that receives the tallies.
    """
    for label_file in os.listdir(label_dir):
        if not label_file.endswith('.txt'):
            continue
        with open(os.path.join(label_dir, label_file), 'r') as f:
            for line in f:
                fields = line.split()
                # A blank line has no class ID; indexing it would raise
                # IndexError.
                if fields:
                    class_counts[split][fields[0]] += 1

# Tally annotation lines for each split.
for split in label_dirs:
    label_dir = label_dirs[split]
    count_labels(label_dir, split)

# Print the per-class totals of each split, ordered by class ID string.
for split in class_counts:
    counts = class_counts[split]
    print(f"\n{split} 데이터셋 클래스별 개수:")
    for class_id in sorted(counts):
        count = counts[class_id]
        print(f"클래스 {class_id}: {count}개")


좌표 없는 데이터 삭제 (배경(Background) 이미지를 사용하려면 실행할 필요 없음)

In [None]:
import os

label_dirs = {
    "train": "train/labels",
    "valid": "valid/labels",
    "test": "test/labels"
}

# Number of zero-byte label files found per split.
empty_label_counts = dict.fromkeys(label_dirs, 0)


def count_empty_labels(label_dir, split):
    """Count the zero-byte ``.txt`` files in ``label_dir``.

    Each such file adds one to ``empty_label_counts[split]``.

    Args:
        label_dir: Directory containing the label text files.
        split: Key of ``empty_label_counts`` to increment.
    """
    for name in os.listdir(label_dir):
        if not name.endswith('.txt'):
            continue
        # A size of zero bytes means the file holds no annotations.
        if os.path.getsize(os.path.join(label_dir, name)) != 0:
            continue
        empty_label_counts[split] += 1

# Count empty label files for each split.
for split in label_dirs:
    label_dir = label_dirs[split]
    count_empty_labels(label_dir, split)

# Print the result for each split.
for split in empty_label_counts:
    count = empty_label_counts[split]
    print(f"{split} 데이터셋에서 라벨이 없는 이미지 개수: {count}개")


In [None]:
import os

# Image and label folders for the train, valid and test splits.
train_image_folder = "train/images"
train_label_folder = "train/labels"
valid_image_folder = "valid/images"
valid_label_folder = "valid/labels"
test_image_folder = "test/images"
test_label_folder = "test/labels"


def delete_empty_labels(image_folder, label_folder,
                        image_extensions=(".jpg", ".jpeg", ".png", ".bmp")):
    """Delete zero-byte label files together with their images.

    For every zero-byte ``.txt`` file in ``label_folder``, removes the label
    file and any file in ``image_folder`` that has the same base name and one
    of ``image_extensions``. Files without a ``.txt`` suffix are never
    touched. Prints one line per deleted label and a final total.

    Args:
        image_folder: Directory holding the images.
        label_folder: Directory holding the label text files.
        image_extensions: Image suffixes to look for next to each empty label.
    """
    deleted_files = 0

    for label_file in os.listdir(label_folder):
        # Only label text files are candidates; other empty files that happen
        # to live in the folder must not be deleted.
        if not label_file.endswith(".txt"):
            continue

        label_path = os.path.join(label_folder, label_file)
        if os.path.getsize(label_path) != 0:
            continue

        # Strip only the final suffix; str.replace would also rewrite a
        # ".txt" occurring in the middle of the name.
        stem = os.path.splitext(label_file)[0]
        for extension in image_extensions:
            image_path = os.path.join(image_folder, stem + extension)
            if os.path.exists(image_path):
                os.remove(image_path)
        os.remove(label_path)

        print(f"Deleted: {label_file} and corresponding image.")
        deleted_files += 1

    print(f"Total deleted files: {deleted_files}")

# Clean up each split in turn: train, then valid, then test.
delete_empty_labels(image_folder=train_image_folder, label_folder=train_label_folder)
delete_empty_labels(image_folder=valid_image_folder, label_folder=valid_label_folder)
delete_empty_labels(image_folder=test_image_folder, label_folder=test_label_folder)


In [None]:
# 클래스별 데이터 수 카운트

import os
from collections import defaultdict

label_dirs = {
    "train": "train/labels",
    "valid": "valid/labels",
    "test": "test/labels"
}

# Per-split tally of label files containing each class ID (kept as a string).
class_counts = {split: defaultdict(int) for split in label_dirs}


def count_data(label_dir, split):
    """Count, per class, how many label files contain that class.

    Reads every ``.txt`` file in ``label_dir``. Each class ID that appears
    in a file adds one to ``class_counts[split]`` for that file, no matter
    how many lines carry it. Blank lines are ignored.

    Args:
        label_dir: Directory containing the label text files.
        split: Key of ``class_counts`` that receives the tallies.
    """
    for label_file in os.listdir(label_dir):
        if not label_file.endswith('.txt'):
            continue
        with open(os.path.join(label_dir, label_file), 'r') as f:
            # The set keeps each class once per file; the `if fields` guard
            # skips blank lines, whose first field would raise IndexError.
            unique_classes = {
                fields[0] for fields in map(str.split, f) if fields
            }
        for class_id in unique_classes:
            class_counts[split][class_id] += 1

# Count, for each split, the label files that contain each class.
for split in label_dirs:
    label_dir = label_dirs[split]
    count_data(label_dir, split)

# Print the per-class file counts of each split, ordered by class ID string.
for split in class_counts:
    counts = class_counts[split]
    print(f"\n{split} 데이터셋 클래스별 데이터 수:")
    for class_id in sorted(counts):
        count = counts[class_id]
        print(f"클래스 {class_id}: {count}개")
