**Import libraries**

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
from collections import defaultdict
import copy
from collections import Counter

In [None]:
# Mount Google Drive under /content/drive so the dataset paths used below resolve.
# This only works inside Google Colab (the `google.colab` package is Colab-specific).
from google.colab import drive
drive.mount('/content/drive', force_remount= True)

Mounted at /content/drive


**Load image annotations**

In [None]:
# Load the tofu annotation JSON into `data_tofu`.
# Later cells read its 'images', 'annotations' and 'categories' keys.
with open('/content/drive/MyDrive/New_BBox_Dataset/coco_annotation/cleaned/left_out_cha_mieng/tofu_separated_final.json', 'r', encoding='utf-8') as f:
    data_tofu = json.load(f)

In [None]:
# Load the cha_ca annotation JSON into `data_cha_ca`.
# Later cells read its 'images', 'annotations' and 'categories' keys.
with open('/content/drive/MyDrive/New_BBox_Dataset/coco_annotation/cleaned/left_out_cha_mieng/cha_ca_phuc_separated_final.json', 'r', encoding='utf-8') as f:
    data_cha_ca = json.load(f)

In [None]:
# Load the suon annotation JSON into `data_suon`.
# Later cells read its 'images', 'annotations' and 'categories' keys.
with open('/content/drive/MyDrive/New_BBox_Dataset/coco_annotation/cleaned/left_out_cha_mieng/suon_separated_2_final.json', 'r', encoding='utf-8') as f:
    data_suon = json.load(f)

**Normalize `file_name` paths**

In [None]:
def remove_filename_prefix(coco_dict, prefix="Suon/"):
    """Strip a leading ``prefix`` from the ``file_name`` of every image entry.

    Only names that start with ``prefix`` are changed, and only that leading
    occurrence is removed. The dict is modified in place.

    Args:
        coco_dict: Dict with an ``'images'`` list whose entries carry a
            ``'file_name'`` string.
        prefix: Leading text to remove from matching file names.

    Returns:
        The same ``coco_dict`` object that was passed in.
    """
    prefix_len = len(prefix)
    renamed = 0

    for image in coco_dict['images']:
        original = image['file_name']
        if not original.startswith(prefix):
            continue

        stripped = original[prefix_len:]
        image['file_name'] = stripped
        renamed += 1

        # Echo only the first three renames to keep the log short.
        if renamed <= 3:
            print(f"Đã sửa: '{original}' -> '{stripped}'")

    print(f"--- Hoàn tất! Đã sửa đường dẫn cho {renamed} ảnh. ---")
    return coco_dict

# Strip the "Suon/" prefix from the file names in `data_suon` (modified in place).
# NOTE(review): the recorded output reports 0 renamed images, so for the file
# currently loaded this step changes nothing.
data_suon = remove_filename_prefix(data_suon, prefix="Suon/")

# Print the first file name as a sanity check (skipped when there are no images).
if data_suon['images']:
    print("Ví dụ sau khi sửa:", data_suon['images'][0]['file_name'])

--- Hoàn tất! Đã sửa đường dẫn cho 0 ảnh. ---
Ví dụ sau khi sửa: suon_cot_let_100.jpg


In [None]:
def remove_empty_images(coco_dict):
    """Drop every image entry that no annotation refers to.

    An image is kept when its ``'id'`` appears as the ``'image_id'`` of at
    least one entry in ``coco_dict['annotations']``. The ``'images'`` list of
    ``coco_dict`` is replaced in place and a before/after summary is printed.

    Args:
        coco_dict: Dict with ``'images'`` and ``'annotations'`` lists.

    Returns:
        The same ``coco_dict`` object, with ``'images'`` filtered.
    """
    referenced_ids = set()
    for annotation in coco_dict['annotations']:
        referenced_ids.add(annotation['image_id'])

    all_images = coco_dict['images']
    count_before = len(all_images)

    kept_images = []
    for image in all_images:
        if image['id'] in referenced_ids:
            kept_images.append(image)
    coco_dict['images'] = kept_images

    count_after = len(kept_images)

    print("Đã xử lý xong!")
    print(f"Trước khi lọc: {count_before} ảnh")
    print(f"Sau khi lọc:   {count_after} ảnh")
    print(f"Đã xóa:        {count_before - count_after} ảnh rỗng")

    return coco_dict

In [None]:
# Drop images without annotations from each of the three datasets.
# remove_empty_images filters the dict in place and returns it, so the
# reassignment keeps the same objects.
data_tofu = remove_empty_images(data_tofu)
data_cha_ca = remove_empty_images(data_cha_ca)
data_suon = remove_empty_images(data_suon)

Đã xử lý xong!
Trước khi lọc: 209 ảnh
Sau khi lọc:   209 ảnh
Đã xóa:        0 ảnh rỗng
Đã xử lý xong!
Trước khi lọc: 200 ảnh
Sau khi lọc:   200 ảnh
Đã xóa:        0 ảnh rỗng
Đã xử lý xong!
Trước khi lọc: 329 ảnh
Sau khi lọc:   329 ảnh
Đã xóa:        0 ảnh rỗng


**Concatenate datasets (suon_non -> suon_cot_let -> cha_cat_lat -> cha_mieng -> tofu_chien -> tofu_trang)**

In [None]:
def _collision_free_offset(highest_used_id, local_ids):
    """Return an offset that lifts every ID in ``local_ids`` above ``highest_used_id``.

    When nothing has been emitted yet (``highest_used_id`` is None) the IDs are
    kept as they are. Otherwise the offset is ``highest_used_id`` for batches
    whose smallest ID is >= 1, and is enlarged just enough for batches that
    contain ID 0 (or smaller) so that no ID is reused.
    """
    if highest_used_id is None:
        return 0
    lowest_local = min(local_ids, default=1)
    return highest_used_id + max(0, 1 - lowest_local)


def merge_list_of_coco_dicts(dict_list):
    """Merge several COCO-style dicts into one with consistent, unique IDs.

    Categories are unified by ``name``: the first time a name is seen it gets
    the next global ID (1, 2, 3, ... in order of first appearance). Image and
    annotation IDs of each dataset are shifted by a per-dataset offset so they
    never collide with IDs already emitted, including when a dataset uses
    0-based IDs. The input dicts are not modified; entries are shallow-copied.

    Args:
        dict_list: List of dicts with optional ``'images'``, ``'annotations'``,
            ``'categories'``, ``'info'`` and ``'licenses'`` keys.

    Returns:
        The merged dict, or ``{}`` when ``dict_list`` is empty. ``'info'`` and
        ``'licenses'`` are taken from the first dataset.
    """
    if not dict_list:
        return {}

    merged_dict = {
        'info': dict_list[0].get('info', {}),
        'licenses': dict_list[0].get('licenses', []),
        'images': [],
        'annotations': [],
        'categories': []
    }

    global_cat_name_to_id = {}

    # Highest image / annotation ID written to merged_dict so far (None = nothing yet).
    highest_img_id = None
    highest_ann_id = None

    print(f"Bắt đầu gộp {len(dict_list)} datasets...")

    for i, sub_dict in enumerate(dict_list):
        images = sub_dict.get('images', [])
        annotations = sub_dict.get('annotations', [])

        # Map this dataset's category IDs onto the shared, name-based IDs.
        local_to_global_cat_id = {}
        for cat in sub_dict.get('categories', []):
            name = cat['name']
            if name not in global_cat_name_to_id:
                new_cat = cat.copy()
                new_cat['id'] = len(global_cat_name_to_id) + 1
                global_cat_name_to_id[name] = new_cat['id']
                merged_dict['categories'].append(new_cat)

            local_to_global_cat_id[cat['id']] = global_cat_name_to_id[name]

        # Offsets are fixed per dataset before any entry is copied, so image
        # IDs and the annotations' image_id references shift by the same amount.
        img_offset = _collision_free_offset(highest_img_id, [img['id'] for img in images])
        ann_offset = _collision_free_offset(highest_ann_id, [ann['id'] for ann in annotations])

        for img in images:
            new_img = img.copy()
            new_img['id'] = img['id'] + img_offset
            merged_dict['images'].append(new_img)

            if highest_img_id is None or new_img['id'] > highest_img_id:
                highest_img_id = new_img['id']

        dropped = 0
        for ann in annotations:
            # Annotations whose category is not declared by their own dataset
            # cannot be mapped to a global category and are left out.
            if ann['category_id'] not in local_to_global_cat_id:
                dropped += 1
                continue

            new_ann = ann.copy()
            new_ann['id'] = ann['id'] + ann_offset
            new_ann['image_id'] = ann['image_id'] + img_offset
            new_ann['category_id'] = local_to_global_cat_id[ann['category_id']]
            merged_dict['annotations'].append(new_ann)

            if highest_ann_id is None or new_ann['id'] > highest_ann_id:
                highest_ann_id = new_ann['id']

        if dropped:
            print(f"    Cảnh báo: Bỏ qua {dropped} annotation có category_id không nằm trong 'categories'.")

        print(f" -> Đã gộp dict #{i+1}: Thêm {len(images)} ảnh.")

    return merged_dict

In [None]:
# Merge order matters: global category IDs are handed out in order of first
# appearance, so the categories of data_suon come first, then data_cha_ca,
# then data_tofu.
all_dicts = [data_suon, data_cha_ca, data_tofu]

final_dataset = merge_list_of_coco_dicts(all_dicts)

# Summary of the merged dataset.
print(f"Tổng categories: {len(final_dataset['categories'])}")
print(f"Tổng images: {len(final_dataset['images'])}")
print(f"Tổng annotations: {len(final_dataset['annotations'])}")

Bắt đầu gộp 3 datasets...
 -> Đã gộp dict #1: Thêm 329 ảnh.
 -> Đã gộp dict #2: Thêm 200 ảnh.
 -> Đã gộp dict #3: Thêm 209 ảnh.
Tổng categories: 6
Tổng images: 738
Tổng annotations: 3137


In [None]:
def analyze_coco_class_distribution(coco_dict):
    """Print a table of annotation and image counts per category.

    For every entry in ``coco_dict['categories']`` the table shows the number
    of annotations with that ``category_id`` and the number of distinct
    ``image_id`` values among them. Rows are ordered by annotation count,
    largest first; annotations with an undeclared category are ignored.

    Args:
        coco_dict: Dict with ``'categories'`` and ``'annotations'`` lists.

    Returns:
        None. The table is written to stdout.
    """
    names = {}
    ann_totals = {}
    images_per_category = {}

    for category in coco_dict['categories']:
        key = category['id']
        names[key] = category['name']
        ann_totals[key] = 0
        images_per_category[key] = set()

    for annotation in coco_dict['annotations']:
        key = annotation['category_id']
        owner = annotation['image_id']
        if key not in names:
            continue
        ann_totals[key] += 1
        images_per_category[key].add(owner)

    divider = "-" * 55
    print(f"{'ID':<5} | {'Class Name':<25} | {'Anns':<8} | {'Images':<8}")
    print(divider)

    # Stable sort on the negated count: ties keep category declaration order.
    ranking = sorted(names, key=lambda key: -ann_totals[key])

    grand_total = 0
    for key in ranking:
        image_total = len(images_per_category[key])
        print(f"{key:<5} | {names[key]:<25} | {ann_totals[key]:<8} | {image_total:<8}")
        grand_total += ann_totals[key]

    print(divider)
    print(f"Total Annotations: {grand_total}")
    print(f"Total Categories:  {len(names)}")

# Print the per-class annotation/image table for the merged dataset.
analyze_coco_class_distribution(final_dataset)

ID    | Class Name                | Anns     | Images  
-------------------------------------------------------
3     | cha_cat_lat               | 781      | 159     
5     | tofu_chien                | 744      | 117     
6     | tofu_trang                | 572      | 118     
4     | cha_mieng                 | 495      | 77      
1     | suon_non                  | 291      | 144     
2     | suon_cot_let              | 254      | 193     
-------------------------------------------------------
Total Annotations: 3137
Total Categories:  6


In [None]:
# Persist the merged dataset; ensure_ascii=False keeps non-ASCII characters
# readable in the file instead of \u escapes. This path is read again later
# as the master COCO file for the train/val/test subsets.
with open('/content/drive/MyDrive/New_BBox_Dataset/merged_coco_version/ver_6/merged_coco_v6_left_out_cha_mieng_fix_cha_ca.json', 'w', encoding='utf-8') as f:
    json.dump(final_dataset, f, ensure_ascii=False, indent=4)

**Split metadata dict to train/val/test**

In [None]:
import json
import pandas as pd
import random

def split_dataset_stratified_v5(list_cha_ca, list_tofu, list_suon, val_ratio=0.1, test_ratio=0.1, seed=42):
    """Split metadata items into train/val/test, stratified with fallbacks.

    Every item is assigned a stratification key, tried from most to least
    specific; a key is used only if at least 3 items share it:

    1. ``sub_class|difficulty|context``
    2. ``sub_class|difficulty``
    3. ``sub_class``

    ``sub_class`` is derived from the item's ``'id_anh'`` file name,
    ``difficulty`` from ``image_attributes['difficulty_level']`` and
    ``context`` from the first non-empty value among
    ``food_attributes['dish_context']``, ``['dish_variant']`` and
    ``['interaction_state']``.

    Each group is shuffled with its own ``random.Random(seed)`` (same order as
    seeding the module-level generator, but without touching global RNG
    state) and then cut into test / val / train slices. Groups of 1 go to
    train; groups of 2 give one item to train and one to val; larger groups
    get at least one val and one test item while always keeping at least one
    train item. A per-group table is printed.

    Args:
        list_cha_ca, list_tofu, list_suon: Lists of metadata dicts; each dict
            must have an ``'id_anh'`` key.
        val_ratio: Target fraction of each group placed in val.
        test_ratio: Target fraction of each group placed in test.
        seed: Seed for the per-group shuffle.

    Returns:
        ``{"train": [...], "val": [...], "test": [...]}`` containing the
        original item dicts. All three lists are empty when no items are given.
    """
    # Checked in order; the first keyword contained in the file name wins.
    # The last two spellings are aliases mapped to the canonical class name.
    keyword_to_class = {
        'cha_cat_lat': 'cha_cat_lat',
        'cha_mieng': 'cha_mieng',
        'tofu_trang': 'tofu_trang',
        'tofu_chien': 'tofu_chien',
        'suon_non': 'suon_non',
        'suon_cot_let': 'suon_cot_let',
        'suon_cotlet': 'suon_cot_let',
        'chaca_catlat': 'cha_cat_lat',
    }

    def get_subclass(filename):
        for keyword, canonical in keyword_to_class.items():
            if keyword in filename:
                return canonical
        return "unknown"

    records = []
    for item in list_cha_ca + list_tofu + list_suon:
        sub_class = get_subclass(item['id_anh'])

        # `or {}` also covers an attribute block that is present but null.
        img_attrs = item.get('image_attributes') or {}
        food_attrs = item.get('food_attributes') or {}

        difficulty = img_attrs.get('difficulty_level', 'Unknown_Diff')
        context = (
            food_attrs.get('dish_context') or
            food_attrs.get('dish_variant') or
            food_attrs.get('interaction_state') or
            'Unknown_Ctx'
        )

        records.append({
            'data': item,
            'key_level_1': f"{sub_class}|{difficulty}|{context}",
            'key_level_2': f"{sub_class}|{difficulty}",
            'key_level_3': f"{sub_class}",
        })

    train_set, val_set, test_set = [], [], []

    # An empty frame has no key columns, so bail out before building it.
    if not records:
        return {"train": train_set, "val": val_set, "test": test_set}

    df = pd.DataFrame(records)

    # Level 1 -> level 2 for keys shared by fewer than 3 items ...
    level_1_sizes = df['key_level_1'].map(df['key_level_1'].value_counts())
    df['final_key'] = df['key_level_1'].where(level_1_sizes >= 3, df['key_level_2'])

    # ... then -> level 3 for whatever is still too small after that.
    final_sizes = df['final_key'].map(df['final_key'].value_counts())
    df['final_key'] = df['final_key'].where(final_sizes >= 3, df['key_level_3'])

    # Group items by final key, keeping groups in order of first appearance.
    groups = {}
    for key, item in zip(df['final_key'], df['data']):
        groups.setdefault(key, []).append(item)

    print(f"{'Stratify Group (v5)':<50} | {'Total':<5} | {'Tr':<3} | {'Va':<3} | {'Te':<3}")
    print("-" * 80)

    for group_name, group_items in groups.items():
        total_count = len(group_items)

        # Fresh local generator per group: reproducible, no global side effect.
        random.Random(seed).shuffle(group_items)

        if total_count == 1:
            n_val, n_test = 0, 0
            train_set.extend(group_items)

        elif total_count == 2:
            n_val, n_test = 1, 0
            train_set.append(group_items[0])
            val_set.append(group_items[1])

        else:
            n_val = max(1, int(round(total_count * val_ratio)))
            n_test = max(1, int(round(total_count * test_ratio)))

            # Guarantee at least one train item even for large ratios:
            # shrink test first, then val.
            while total_count - n_val - n_test < 1:
                if n_test > 0:
                    n_test -= 1
                else:
                    n_val -= 1

            test_set.extend(group_items[:n_test])
            val_set.extend(group_items[n_test : n_test + n_val])
            train_set.extend(group_items[n_test + n_val :])

        n_train = total_count - n_val - n_test
        short_name = (group_name[:47] + '..') if len(group_name) > 49 else group_name

        print(f"{short_name:<50} | {total_count:<5} | {n_train:<3} | {n_val:<3} | {n_test:<3}")

    return {"train": train_set, "val": val_set, "test": test_set}

# Load the three metadata lists (one JSON file per food group) from Drive.
with open('/content/drive/MyDrive/Data MetaData/left_out_cha_mieng/cha_ca_v2_filter_2_final.json', 'r', encoding='utf-8') as f:
    list_cha_ca_raw = json.load(f)
with open('/content/drive/MyDrive/Data MetaData/left_out_cha_mieng/tofu_filter_final.json', 'r', encoding='utf-8') as f:
    list_tofu_raw = json.load(f)
with open('/content/drive/MyDrive/Data MetaData/left_out_cha_mieng/suon_v2_filter_final.json', 'r', encoding='utf-8') as f:
    list_suon_raw = json.load(f)

# Stratified split with the default 10% val / 10% test ratios.
datasets = split_dataset_stratified_v5(list_cha_ca_raw, list_tofu_raw, list_suon_raw)

# Size of each resulting split.
print(f"\nTổng ảnh Train: {len(datasets['train'])}")
print(f"Tổng ảnh Val:   {len(datasets['val'])}")
print(f"Tổng ảnh Test:  {len(datasets['test'])}")

Stratify Group (v5)                                | Total | Tr  | Va  | Te 
--------------------------------------------------------------------------------
cha_cat_lat|Medium|Surface_Topping_Arrangement     | 62    | 50  | 6   | 6  
cha_cat_lat|Easy|Surface_Topping_Arrangement       | 64    | 52  | 6   | 6  
cha_mieng|Medium|Shallow_Braised_Bathing           | 3     | 1   | 1   | 1  
cha_mieng|Medium|Solid_Dry_Stacked                 | 14    | 12  | 1   | 1  
cha_cat_lat|Easy|Solid_Dry_Stacked                 | 7     | 5   | 1   | 1  
cha_mieng|Easy|Solid_Dry_Stacked                   | 12    | 10  | 1   | 1  
cha_mieng|Hard|Solid_Dry_Stacked                   | 6     | 4   | 1   | 1  
cha_mieng|Easy|Shallow_Braised_Bathing             | 6     | 4   | 1   | 1  
cha_cat_lat|Hard|Surface_Topping_Arrangement       | 14    | 12  | 1   | 1  
cha_cat_lat                                        | 1     | 1   | 0   | 0  
cha_mieng                                          | 4     | 2   | 1   |

In [None]:
def check_annotation_balance(dataset_dict, dataset_name="Train"):
    """Print a header line with the number of items in a split.

    NOTE(review): this is a placeholder. The name suggests a per-class
    annotation balance check, but no such check is implemented; only the item
    count is reported. TODO: implement the balance check or remove the
    function.

    Args:
        dataset_dict: Sized collection of split items (anything ``len()`` accepts).
        dataset_name: Label shown in the printed header.

    Returns:
        None.
    """
    total_imgs = len(dataset_dict)

    print(f"--- Kiểm tra tập {dataset_name} ({total_imgs} ảnh) ---")

def verify_difficulty_distribution(dataset, name):
    """Print the count and percentage of each difficulty level in a split.

    The level is read from ``item['image_attributes']['difficulty_level']``;
    items lacking either key are counted as ``'Unknown'``. Levels are listed
    in the order they are first encountered.

    Args:
        dataset: Iterable of metadata dicts.
        name: Label of the split, shown in the printed header.

    Returns:
        None.
    """
    tally = {}
    for entry in dataset:
        level = entry.get('image_attributes', {}).get('difficulty_level', 'Unknown')
        tally[level] = tally.get(level, 0) + 1

    total = sum(tally.values())
    print(f"\nPhân phối độ khó tập {name} ({total} ảnh):")
    for level in tally:
        amount = tally[level]
        print(f"  - {level}: {amount} ({amount/total*100:.1f}%)")

# Run the difficulty-distribution check on each split.
verify_difficulty_distribution(datasets['train'], "TRAIN")
verify_difficulty_distribution(datasets['val'], "VAL")
verify_difficulty_distribution(datasets['test'], "TEST")


Phân phối độ khó tập TRAIN (585 ảnh):
  - Medium: 241 (41.2%)
  - Easy: 239 (40.9%)
  - Hard: 105 (17.9%)

Phân phối độ khó tập VAL (77 ảnh):
  - Medium: 32 (41.6%)
  - Easy: 31 (40.3%)
  - Hard: 14 (18.2%)

Phân phối độ khó tập TEST (76 ảnh):
  - Medium: 30 (39.5%)
  - Easy: 31 (40.8%)
  - Hard: 15 (19.7%)


In [None]:
# Persist the three metadata splits so later cells (or a fresh session) can
# reload them without recomputing the split.
train_json = datasets['train']
val_json = datasets['val']
test_json = datasets['test']

# json.dump is called with its defaults here, so non-ASCII characters are
# written as \u escapes.
with open('/content/drive/MyDrive/New_BBox_Dataset/train_test_split_version/left_out_cha_mieng_fix_cha_ca/train_split.json', 'w') as f: json.dump(train_json, f)
with open('/content/drive/MyDrive/New_BBox_Dataset/train_test_split_version/left_out_cha_mieng_fix_cha_ca/val_split.json', 'w') as f: json.dump(val_json, f)
with open('/content/drive/MyDrive/New_BBox_Dataset/train_test_split_version/left_out_cha_mieng_fix_cha_ca/test_split.json', 'w') as f: json.dump(test_json, f)

In [None]:
def load_and_combine_splits(train_path, val_path, test_path):
    """Read the train, val and test JSON files into a single dict.

    Files are read in the order train, val, test. A missing file or a file
    that is not valid JSON is reported on stdout and aborts the load.

    Args:
        train_path: Path of the train split JSON file.
        val_path: Path of the val split JSON file.
        test_path: Path of the test split JSON file.

    Returns:
        ``{"train": ..., "val": ..., "test": ...}`` with the parsed content of
        each file, or ``None`` if any file is missing or malformed.
    """
    sources = (
        ("Train", "train", train_path),
        ("Val", "val", val_path),
        ("Test", "test", test_path),
    )
    loaded = {"train": [], "val": [], "test": []}

    try:
        for label, key, path in sources:
            print(f"Đang đọc file {label}: {path}...")
            with open(path, 'r', encoding='utf-8') as handle:
                loaded[key] = json.load(handle)
    except FileNotFoundError as err:
        print(f"\nLỖI: Không tìm thấy file. Vui lòng kiểm tra lại đường dẫn.\nChi tiết: {err}")
        return None
    except json.JSONDecodeError as err:
        print(f"\nLỖI: File không đúng định dạng JSON.\nChi tiết: {err}")
        return None
    else:
        print("\n--- ĐÃ LOAD DỮ LIỆU THÀNH CÔNG ---")
        print(f"Số lượng Train: {len(loaded['train'])}")
        print(f"Số lượng Val:   {len(loaded['val'])}")
        print(f"Số lượng Test:  {len(loaded['test'])}")
        return loaded

In [None]:
# Same three files that were written by the save step above.
path_train = '/content/drive/MyDrive/New_BBox_Dataset/train_test_split_version/left_out_cha_mieng_fix_cha_ca/train_split.json'
path_val   = '/content/drive/MyDrive/New_BBox_Dataset/train_test_split_version/left_out_cha_mieng_fix_cha_ca/val_split.json'
path_test  = '/content/drive/MyDrive/New_BBox_Dataset/train_test_split_version/left_out_cha_mieng_fix_cha_ca/test_split.json'

# Rebinds `datasets` to the content read back from disk.
# NOTE(review): load_and_combine_splits returns None on a missing or malformed
# file, in which case the cells below that index `datasets` will fail.
datasets = load_and_combine_splits(path_train, path_val, path_test)

Đang đọc file Train: /content/drive/MyDrive/New_BBox_Dataset/train_test_split_version/left_out_cha_mieng_fix_cha_ca/train_split.json...
Đang đọc file Val: /content/drive/MyDrive/New_BBox_Dataset/train_test_split_version/left_out_cha_mieng_fix_cha_ca/val_split.json...
Đang đọc file Test: /content/drive/MyDrive/New_BBox_Dataset/train_test_split_version/left_out_cha_mieng_fix_cha_ca/test_split.json...

--- ĐÃ LOAD DỮ LIỆU THÀNH CÔNG ---
Số lượng Train: 585
Số lượng Val:   77
Số lượng Test:  76


**Split coco format data to train/val/test**

In [None]:
import json
import os
from collections import defaultdict

def create_coco_subsets(master_coco_path, split_datasets, output_dir):
    """Write one COCO JSON per split (train/val/test) from a master COCO file.

    For each split, the items' ``'id_anh'`` values are matched against the
    ``'file_name'`` of the master file's images. Matching images and all of
    their annotations are written to ``<output_dir>/<split>.json`` together
    with the master file's ``info``, ``licenses`` and ``categories``.

    Images are written in the order in which their names first appear in the
    split (duplicates are ignored), so repeated runs produce identical files.

    Args:
        master_coco_path: Path of the COCO JSON containing every image.
        split_datasets: Dict with ``'train'``, ``'val'`` and ``'test'`` lists
            of dicts, each dict having an ``'id_anh'`` file name.
        output_dir: Directory for the output files; created if missing.

    Returns:
        None. Progress and a warning about unmatched names are printed.
    """
    print(f"Đang đọc file gốc: {master_coco_path} ...")
    with open(master_coco_path, 'r', encoding='utf-8') as f:
        coco_data = json.load(f)

    # NOTE: if two images share a file_name, the later entry wins here.
    filename_to_img = {img['file_name']: img for img in coco_data['images']}

    img_id_to_anns = defaultdict(list)
    for ann in coco_data.get('annotations', []):
        img_id_to_anns[ann['image_id']].append(ann)

    categories = coco_data.get('categories', [])
    info = coco_data.get('info', {})
    licenses = coco_data.get('licenses', [])

    os.makedirs(output_dir, exist_ok=True)

    for phase in ['train', 'val', 'test']:
        print(f"--> Đang xử lý tập: {phase.upper()}")

        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # would make the output order depend on string hashing.
        target_filenames = list(dict.fromkeys(item['id_anh'] for item in split_datasets[phase]))

        new_images = []
        new_annotations = []
        missing_names = []

        for fname in target_filenames:
            img_info = filename_to_img.get(fname)
            if img_info is None:
                missing_names.append(fname)
                continue

            new_images.append(img_info)
            # .get avoids creating empty entries in the defaultdict.
            new_annotations.extend(img_id_to_anns.get(img_info['id'], []))

        missing_count = len(missing_names)
        if missing_count > 0:
            print(f"    Cảnh báo: Có {missing_count} ảnh trong tập {phase} không khớp tên trong file COCO.")
            print(f"    Ví dụ: {missing_names[:5]}")

        new_json = {
            "info": info,
            "licenses": licenses,
            "images": new_images,
            "annotations": new_annotations,
            "categories": categories
        }

        out_path = os.path.join(output_dir, f"{phase}.json")
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(new_json, f, ensure_ascii=False)

        print(f"    Đã lưu: {out_path}")
        print(f"    Số lượng: {len(new_images)} ảnh, {len(new_annotations)} labels.")

# Master file: the merged COCO dataset saved earlier in this notebook.
MASTER_COCO_FILE = '/content/drive/MyDrive/New_BBox_Dataset/merged_coco_version/ver_6/merged_coco_v6_left_out_cha_mieng_fix_cha_ca.json'
# Destination folder for train.json / val.json / test.json.
OUTPUT_FOLDER = "/content/drive/MyDrive/New_BBox_Dataset/train_test_split_version/ver_6/left_out_cha_mieng_fix_cha_ca/background_v5/"

# `datasets` is the metadata split reloaded from disk above.
create_coco_subsets(MASTER_COCO_FILE, datasets, OUTPUT_FOLDER)

Đang đọc file gốc: /content/drive/MyDrive/New_BBox_Dataset/merged_coco_version/ver_6/merged_coco_v6_left_out_cha_mieng_fix_cha_ca.json ...
--> Đang xử lý tập: TRAIN
    Đã lưu: /content/drive/MyDrive/New_BBox_Dataset/train_test_split_version/ver_6/left_out_cha_mieng_fix_cha_ca/background_v5/train.json
    Số lượng: 585 ảnh, 2449 labels.
--> Đang xử lý tập: VAL
    Đã lưu: /content/drive/MyDrive/New_BBox_Dataset/train_test_split_version/ver_6/left_out_cha_mieng_fix_cha_ca/background_v5/val.json
    Số lượng: 77 ảnh, 365 labels.
--> Đang xử lý tập: TEST
    Đã lưu: /content/drive/MyDrive/New_BBox_Dataset/train_test_split_version/ver_6/left_out_cha_mieng_fix_cha_ca/background_v5/test.json
    Số lượng: 76 ảnh, 323 labels.


**Add background images to the train split**

In [None]:
# Load the train subset written by create_coco_subsets; the background
# images are appended to this dict in the next step.
with open('/content/drive/MyDrive/New_BBox_Dataset/train_test_split_version/ver_6/left_out_cha_mieng_fix_cha_ca/background_v5/train.json', 'r', encoding='utf-8') as f:
    data_train = json.load(f)

In [None]:
def merge_coco_datasets(data_all, data_background):
    """Append the images and annotations of ``data_background`` to ``data_all``.

    Background entries are deep-copied and given fresh IDs that continue after
    the largest image / annotation ID already in ``data_all`` (starting at 0
    when ``data_all`` has none). Annotation ``image_id`` references are
    rewritten to the new image IDs.

    A background image whose ``file_name`` is already present in ``data_all``
    is skipped together with its annotations, so calling this function again
    with the same background does not create duplicates. Annotations that
    reference an image ID not found in ``data_background['images']`` are
    skipped with a warning and do not consume an annotation ID.

    NOTE(review): categories are not merged and ``category_id`` values are
    copied unchanged; this assumes both inputs share one category scheme
    (or the background has no annotations) -- confirm with the caller.

    Args:
        data_all: Target dict with ``'images'`` and ``'annotations'`` lists.
            Modified in place.
        data_background: Source dict; not modified.

    Returns:
        The same ``data_all`` object.
    """
    bg_imgs = copy.deepcopy(data_background.get('images', []))
    bg_anns = copy.deepcopy(data_background.get('annotations', []))

    max_img_id = max((img['id'] for img in data_all['images']), default=-1)
    max_ann_id = max((ann['id'] for ann in data_all['annotations']), default=-1)

    known_files = {img.get('file_name') for img in data_all['images']}
    known_files.discard(None)

    img_id_map = {}          # background image ID -> new ID in data_all
    already_present = set()  # background image IDs skipped as duplicates

    print(f"Đang merge {len(bg_imgs)} ảnh và {len(bg_anns)} annotations...")

    for img in bg_imgs:
        old_id = img['id']
        fname = img.get('file_name')

        if fname is not None and fname in known_files:
            already_present.add(old_id)
            continue

        max_img_id += 1
        img['id'] = max_img_id
        img_id_map[old_id] = max_img_id
        if fname is not None:
            known_files.add(fname)

        data_all['images'].append(img)

    for ann in bg_anns:
        old_img_id = ann['image_id']

        if old_img_id in img_id_map:
            # Assign the ID only for annotations that are kept, so the
            # sequence has no gaps.
            max_ann_id += 1
            ann['id'] = max_ann_id
            ann['image_id'] = img_id_map[old_img_id]
            data_all['annotations'].append(ann)
        elif old_img_id in already_present:
            # Belongs to an image that was already merged earlier.
            continue
        else:
            print(f"Warning: Bỏ qua annotation {ann['id']} vì không tìm thấy ảnh gốc ID {old_img_id}")

    if already_present:
        print(f"Bỏ qua {len(already_present)} ảnh đã tồn tại (trùng file_name).")

    print(f"Hoàn tất! Tổng số ảnh hiện tại: {len(data_all['images'])}")
    return data_all

# Load the background COCO file (the recorded run reports 45 images and 0 annotations).
with open('/content/drive/MyDrive/New_BBox_Dataset/coco_annotation/cleaned/background_v3.json', 'r') as f: data_background = json.load(f)

# Append the background images to the train set; data_train is modified in place.
data_train = merge_coco_datasets(data_train, data_background)

Đang merge 45 ảnh và 0 annotations...
Hoàn tất! Tổng số ảnh hiện tại: 634


In [None]:
# NOTE(review): this writes back to the same train.json that was loaded before
# the merge, so the file on Drive now includes the background images;
# re-running the load/merge cells will start from this already-merged file.
with open('/content/drive/MyDrive/New_BBox_Dataset/train_test_split_version/ver_6/left_out_cha_mieng_fix_cha_ca/background_v5/train.json', 'w') as f: json.dump(data_train, f)