# Base dataset

In [None]:
import os
from glob import glob
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def analyze_dataset(BASE_DIR):
    """Print statistics and draw plots for each split of a YOLO-style dataset.

    For every split in ("train", "val", "test") the function globs images from
    ``{BASE_DIR}/images/{split}`` (``*.png`` and ``*.jpg``) and label files
    from ``{BASE_DIR}/labels/{split}`` (``*.txt``). Each non-blank label line
    is expected to hold exactly five whitespace-separated numbers:
    ``class_id x_center y_center width height``.

    Per split it prints the image and label file counts, the number of
    instances per class and the number of label files without any content,
    then shows four figures: a bar chart of the class distribution, a
    histogram of objects per label file, overlaid histograms of box widths
    and heights, and a KDE heatmap of the box centres.

    Blank lines are ignored. Lines that do not have exactly five numeric
    fields are skipped and their total is reported, so a single bad label
    file does not abort the whole analysis.

    Args:
        BASE_DIR: Root directory of the dataset (contains ``images/`` and
            ``labels/``).

    Returns:
        None. Results are printed and plotted.
    """
    for split in ("train", "val", "test"):
        print(f"\n=== Split: {split.upper()} ===")

        image_dir = f"{BASE_DIR}/images/{split}"
        image_paths = glob(f"{image_dir}/*.png") + glob(f"{image_dir}/*.jpg")
        label_paths = glob(f"{BASE_DIR}/labels/{split}/*.txt")
        print(f"Số ảnh: {len(image_paths)}")
        print(f"Số nhãn: {len(label_paths)}")

        class_counter = Counter()
        widths, heights = [], []
        x_centers, y_centers = [], []
        object_counts = []
        empty_labels = []
        malformed_lines = 0

        for label_file in label_paths:
            boxes, bad = _parse_label_file(label_file)
            malformed_lines += bad
            # A file counts as empty only when it has no content at all;
            # files holding nothing but malformed lines are reported separately.
            if not boxes and not bad:
                empty_labels.append(label_file)
            object_counts.append(len(boxes))
            for class_id, x, y, w, h in boxes:
                class_counter[class_id] += 1
                widths.append(w)
                heights.append(h)
                x_centers.append(x)
                y_centers.append(y)

        print("\n🔹 Phân bố class:")
        sorted_classes = sorted(class_counter.items())
        for cls, count in sorted_classes:
            print(f"  Class {cls}: {count} instances")

        # Bar chart of the class distribution (skipped when no object was read).
        if sorted_classes:
            class_ids = [cls for cls, _ in sorted_classes]
            class_counts = [count for _, count in sorted_classes]

            plt.figure(figsize=(10, 6))
            sns.barplot(x=class_ids, y=class_counts, color='skyblue')
            plt.title(f'Phân bố số lượng nhãn theo class ({split})')
            plt.xlabel('Class ID')
            plt.ylabel('Số lượng object')
            plt.xticks(rotation=45)
            plt.grid(True, axis='y', linestyle='--', alpha=0.7)
            plt.tight_layout()
            plt.show()

        print(f"\n🔹 Số file label rỗng: {len(empty_labels)}")
        if malformed_lines:
            print(f"🔹 Số dòng nhãn không hợp lệ (đã bỏ qua): {malformed_lines}")

        # Histogram of the number of objects per label file.
        plt.figure(figsize=(8, 6))
        plt.hist(object_counts, bins=20)
        plt.title(f'Histogram - Số object/ảnh ({split})')
        plt.xlabel('Số object')
        plt.ylabel('Số ảnh')
        plt.show()

        # Overlaid histograms of bounding-box widths and heights.
        plt.figure(figsize=(10, 6))
        plt.hist(widths, bins=50, alpha=0.5, label='Width')
        plt.hist(heights, bins=50, alpha=0.5, label='Height')
        plt.title(f'Histogram - Kích thước bbox ({split})')
        plt.xlabel('Normalized size')
        plt.ylabel('Frequency')
        plt.legend()
        plt.show()

        # Density heatmap of bounding-box centres (needs at least one box).
        if x_centers and y_centers:
            plt.figure(figsize=(8, 8))
            sns.kdeplot(x=x_centers, y=y_centers, cmap="Reds", fill=True, bw_adjust=0.5)
            plt.title(f'Heatmap vị trí bbox ({split})')
            plt.xlabel('x center')
            plt.ylabel('y center')
            plt.show()


def _parse_label_file(label_file):
    """Read one label file and return ``(boxes, malformed)``.

    ``boxes`` is a list of ``(class_id, x, y, w, h)`` tuples, one per valid
    line, with ``class_id`` as ``int`` and the rest as ``float``.
    ``malformed`` is the number of non-blank lines that were skipped because
    they did not contain exactly five numeric fields.
    """
    boxes = []
    malformed = 0
    # utf-8-sig transparently drops a leading BOM, which would otherwise make
    # the first class id unparsable.
    with open(label_file, 'r', encoding='utf-8-sig') as f:
        for raw_line in f:
            tokens = raw_line.split()
            if not tokens:
                # Blank or whitespace-only line: not an object, not an error.
                continue
            if len(tokens) != 5:
                malformed += 1
                continue
            try:
                # int(float(...)) accepts class ids written as "3" or "3.0".
                class_id = int(float(tokens[0]))
                x, y, w, h = map(float, tokens[1:])
            except (ValueError, OverflowError):
                malformed += 1
                continue
            boxes.append((class_id, x, y, w, h))
    return boxes, malformed


In [None]:
# Root of the dataset analyzed in this cell (a Kaggle input path);
# analyze_dataset globs images/<split> and labels/<split> beneath it.
BASE_DIR = "/kaggle/input/vietnamese-traffic-signs-detection-and-recognition/train_data"
# Print the statistics and show the plots for the train/val/test splits.
analyze_dataset(BASE_DIR)

# YOLO augmentation

In [None]:
# Root of the second dataset analyzed by this notebook (a Kaggle input path);
# analyze_dataset globs images/<split> and labels/<split> beneath it.
Yolo_aug_DIR = "/kaggle/input/vietnamses-traffic-sign-detection-augmentaion/dataset"
# Run the same per-split statistics and plots on this dataset.
analyze_dataset(Yolo_aug_DIR)