In [None]:
# Install the notebook's dependencies: transformers straight from its GitHub repository,
# plus accelerate, bitsandbytes and qwen_vl_utils.
# NOTE(review): nothing here is version-pinned, so a later re-run may resolve different versions.
!pip install git+https://github.com/huggingface/transformers accelerate bitsandbytes qwen_vl_utils

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-ji_soyjg
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-ji_soyjg
  Resolved https://github.com/huggingface/transformers to commit 471d7ce9abbb3bc1b3bab673367378f9dbc3caac
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting qwen_vl_utils
  Downloading qwen_vl_utils-0.0.14-py3-none-any.whl.metadata (9.0 kB)
Collecting huggingface-hub<2.0,>=1.2.1 (from transformers==5.0.0.dev0)
  Downloading huggingface_hub-1.2.1-py3-none-any.whl.metadata (13 kB)
Collecting av (from qwen_vl_utils)
  Downloading av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata 

**LOAD DATASET**

In [None]:
# !gdown ...... -O /content/data.zip

In [None]:
%%capture
# Extract the dataset archive into /content/data; %%capture keeps unzip's file listing out of the notebook.
!unzip "/content/data.zip" -d "/content/data"

In [None]:
import json
from google.colab import drive
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive, forcing a remount if it is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


**Apply AI Annotation Assistance**

In [None]:
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from qwen_vl_utils import process_vision_info
import json
import os
from tqdm import tqdm

# --- 1. INPUT / OUTPUT CONFIGURATION ---
# Images are read from sub-folders of BASE_DATA_PATH; one JSON file per
# category is written under BASE_OUTPUT_PATH.
BASE_DATA_PATH = '/content/data/'
BASE_OUTPUT_PATH = '/content/drive/MyDrive/Data MetaData/'

# Parallel lists: entry i of img_folders is written to entry i of output_files
# (main() pairs them positionally with zip()).
img_folders = [
    'Suon',
    'Cha_Ca',
    'TOFU',
]
output_files = [
    'suon_metadata.json',
    'chaca_metadata.json',
    'tofu_metadata.json',
]

# Make sure the output directory exists before anything is written to it.
os.makedirs(BASE_OUTPUT_PATH, exist_ok=True)

# --- 2. LOAD MODEL ---
# 4-bit NF4 quantization with double quantization; computation runs in bfloat16.
print("Đang cấu hình quantization...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

print("Đang load Qwen2-VL-7B...")
# The model and its processor are loaded from the same checkpoint id.
_checkpoint = "Qwen/Qwen2-VL-7B-Instruct"
# NOTE(review): newer transformers releases spell `torch_dtype` as `dtype` —
# confirm against the version installed at the top of the notebook.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    _checkpoint,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation="sdpa",
    torch_dtype=torch.bfloat16,
)
processor = AutoProcessor.from_pretrained(_checkpoint)

print("Load model thành công!")

# Instruction prompt for the Tofu category.
_PROMPT_TOFU = """
Role: You are a Computer Vision Data Specialist for Vietnamese Cuisine (Target: Tofu - Dau Hu).
Objective: Analyze the image to extract TWO distinct layers of metadata: (1) Food-specific attributes and (2) General Image attributes.

INPUT: Image of Tofu.
OUTPUT: A single valid JSON object following the structure below.

### 1. FOOD_ATTRIBUTES (Domain Specific - Tofu)
Analyze the physical state of the tofu:
- Morphology:
  - "Block_Cube": Standard square/rectangular block (intact).

- Surface_Texture:
  - "Fried_Rough": Brown/Yellow skin, porous/bubbly texture.
  - "Matte_Porous": White/Ivory skin, matte/micro-porous texture.

- Dish_Variant (Contextual State):
  - "Plain_Dry_Item":
      Description: "Tofu blocks (either White or Fried) appearing alone on a plate, basket, or pan. No heavy sauce, no soup. Surface is relatively dry or just oily."
  - "Sauce_Topping_Complex":
      Description: "Tofu is coated with THICK sauce (Tomato/Soy), glazed with oil (Scallion Oil), or stuffed with meat. Liquid is viscous/sticky, not watery."
  - "Soup_Broth_Context":
      Description: "Tofu is submerged or floating in THIN/WATERY liquid (Broth/Soup/Bun). Usually accompanied by liquid, vegetables, or noodles."

### 2. IMAGE_ATTRIBUTES (General Vision - Standardized)
Analyze the photographic conditions (Must match standard project schema):
- Lighting_Condition:
  - "Natural_Daylight": Balanced white light.
  - "Artificial_Warm": Yellow/Orange tint (Indoor/Restaurant).
- Background_Complexity:
  - "Clean_Solid": Single color plate/background.
  - "Simple_Table": 1-2 items nearby.
  - "Cluttered_Messy": Busy scene (>3 items), hard to isolate.
- Occlusion_Level:
  - "None": Fully visible.
  - "Low": <20% covered (e.g., some scallions).
  - "Medium": 20-50% covered.
  - "High": >50% covered (hard to recognize).

### 3. PLANNING_META (Scenario Mapping)
Map the image to the Project CSV Plan based on the DOMINANT feature:
- "1_Baseline": High contrast, clear view (Fried on White / White on Dark).
- "2_Low_Contrast": Camouflaged (White Tofu on White Plate/Board).
- "3_Texture_Macro": Close-up focusing on pores or smoothness.
- "4_Wet_Sauce": Floating in soup, or covered in Tomato sauce.
- "5_Deformed_Cut": Broken pieces, cut dices, or physically damaged.
- "6_Interaction": Being fried in oil, held by chopsticks, or handled.
- "7_Specularity_Light": Extreme lighting, flash, or high reflection.

RESPONSE FORMAT: JSON ONLY. NO MARKDOWN.
"""

# Instruction prompt for the Suon (grilled pork rib) category; the only one
# of the three that spells out the expected JSON skeleton.
_PROMPT_SUON = """
Role: Computer Vision Specialist for Vietnamese Cuisine (Suon Nuong).
    Task: Analyze the image and extract metadata into a JSON object.

    ### KNOWLEDGE BASE (Use these criteria to decide):

    1. MORPHOLOGY (Shape):
       - "Planar_Loin_Structure": Flat slab, dense lean meat (Cot Let).
         * Variations: "Whole_Slab" (Curved rib bone on edge), "Sliced_Strips" (Uniform rectangular strips).
       - "Volumetric_Rib_Chunk": 3D blocky/tubular shape (Suon Non).
         * Internal_Details: "Embedded_Bone_Cartilage" (Center bone/cartilage), "Interspersed_Texture" (Marbled fat).

    2. SURFACE TEXTURE:
       - "Dry_Charred_Matte": Low reflection, visible grill marks, burnt edges.
       - "Oily_Glazed_Specular": High reflection, wet/sticky look (Honey/Scallion oil).

    3. DISH CONTEXT:
       - "Com_Tam_Assembly": Placed on broken rice.
       - "Isolated_Item": Alone on plate/grill.

    4. PLANNING META (Scenario Mapping):
       - "1_Geometry_Shape": Clear view of bone/cartilage.
       - "2_Com_Tam_Full_Set": Full context with Rice + Egg + Bi + Cha.
       - "3_Mo_Hanh_Occlusion": Covered by Scallion Oil.
       - "4_Grill_Marks_Char": Focus on char/burn marks.
       - "5_Glaze_Specularity": Focus on wet/shiny sauce.
       - "6_Lighting_Difficult": Bad lighting/shadows.
       - "7_Broken_Sliced": Cut pieces/deformed.

    ### REQUIRED OUTPUT FORMAT (JSON ONLY):
    Fill in the values based on the Knowledge Base above. Do NOT include descriptions.

    {
      "food_attributes": {
        "morphology": {
          "category": "...",
          "detail_variation": "..."
        },
        "surface_texture": "...",
        "dish_context": "..."
      },
      "image_attributes": {
        "lighting_condition": "Natural_Daylight OR Artificial_Warm",
        "background_complexity": "Clean_Solid OR Simple_Table OR Cluttered_Messy",
        "occlusion_level": "None OR Low OR Medium OR High"
      },
      "planning_meta": {
        "scenario_mapping": "..."
      }
    }
"""

# Instruction prompt for the Cha Ca (fish cake) category.
_PROMPT_CHACA = """
Role: You are a Computer Vision Data Specialist for Vietnamese Cuisine (Target: Cha Ca - Fish Cake).
Objective: Analyze the image to extract structured metadata, strictly adhering to the morphological dependency rules.

INPUT: Image of Cha Ca.
OUTPUT: A single valid JSON object following the structure below.

### 1. FOOD_ATTRIBUTES (Domain Specific)
Analyze the physical state of the food itself:

A. Morphology (Primary Classifier):
   - "Whole_Block": Intact piece, outer skin covers all sides (No meat visible).
   - "Sliced_Section": Cut piece exposing inner meat (High color contrast between dark skin and light meat).

B. Surface_Texture (Apply Priority Rule):
   CRITICAL LOGIC:
   1. IF Morphology is "Whole_Block" -> Texture MUST be "Wrinkled_Skin".
   2. IF Morphology is "Sliced_Section" -> You MUST ignore the outer skin and classify the EXPOSED INNER MEAT.

   - "Wrinkled_Skin": [VALID ONLY IF Whole_Block] Fried outer layer, brown/yellow, rough/leathery.
   - "Porous_Inner": [VALID ONLY IF Sliced_Section] The exposed inner meat has visible air holes, sponge-like texture (white/grey).
   - "Smooth_Cut": [VALID ONLY IF Sliced_Section] The exposed inner meat is solid, dense, and smooth with minimal pores (white/grey).

C. Interaction_State (Contextual Physics):
   - "Solid_Dry_Stacked":
       Description: "Item is on a dry surface (plate/cutting board) or held by hand. No liquid/sauce interaction."
       Visual_Cue: "Clean boundaries, distinct shadows on surface, items often overlapping or piled."
   - "Surface_Topping_Arrangement":
       Description: "Item matches 'Bun Ca' style. Placed ON TOP of a solid base (noodles), maintaining a specific geometric layout. Only the bottom touches the liquid."
       Visual_Cue: "High visibility, organized placement, distinct separation from the liquid level."
   - "Shallow_Braised_Bathing":
       Description: "Item is resting on the bottom of a container (pan/plate), surrounded by a THICK/SHALLOW sauce (not deep broth). Top surface is exposed but may be coated with sauce/oil."
       Visual_Cue: "Thick liquid texture (viscous), items are stationary, often garnished with sprinkles (scallions/dill) sticking to the wet surface."
   - "Partially_Submerged_Floating":
       Description: "Item is bobbing freely in DEEP, THIN liquid (broth/water). No solid base underneath."
       Visual_Cue: "Meniscus effect around edges, random orientation."
   - "Fully_Submerged_Deep":
       Description: "Item is completely under liquid."

### 2. IMAGE_ATTRIBUTES (General Vision)
Analyze the photographic conditions:
- Lighting_Condition:
   - "Natural_Daylight": Balanced white light.
   - "Artificial_Warm": Yellow/Orange tint (Indoor/Restaurant).

- Background_Complexity:
   - "Clean_Solid": Single color plate/background.
   - "Simple_Table": 1-2 items nearby.
   - "Cluttered_Messy": Busy scene (>3 items), hard to isolate.

- Occlusion_Level:
   - "None": Fully visible.
   - "Low": <20% covered (e.g., some scallions).
   - "Medium": 20-50% covered.
   - "High": >50% covered (hard to recognize).

### 3. PLANNING_META (Scenario Mapping)
Map the image to one of these groups based on the dominant feature:
- "1_Baseline": Clean background, standard view.
- "2_In_The_Wild": In soup/broth context.
- "3_Complex_BG": Cluttered table.
- "4_Occlusion": Herbs/Sauce covering object.
- "5_Interaction": Holding/Eating action.
- "6_Dense_Cluster": Piled up (Solid/Dry).
- "7_Lighting_Var": Difficult lighting.

RESPONSE FORMAT: JSON ONLY. NO MARKDOWN.
"""

# Ordered (substrings, prompt) pairs; the first entry with a matching
# substring wins, so the order here is significant.
_PROMPT_DISPATCH = (
    (("tofu",), _PROMPT_TOFU),
    (("suon",), _PROMPT_SUON),
    (("cha_ca", "chaca"), _PROMPT_CHACA),
)


def get_prompt_by_category(category_name):
    """Return the instruction prompt that matches a food category name.

    The match is a case-insensitive substring test on ``category_name``,
    tried in this order: "tofu", then "suon", then "cha_ca" / "chaca".

    Args:
        category_name: Category label, e.g. an image folder name.

    Returns:
        The prompt string for the first matching category; the Tofu prompt
        when nothing matches (fallback for unrecognised folder names).
    """
    key = category_name.lower()
    for markers, prompt in _PROMPT_DISPATCH:
        if any(marker in key for marker in markers):
            return prompt
    # Fallback prompt when the name matches no known category.
    return _PROMPT_TOFU

def _extract_json_object(raw_text):
    """Parse the outermost ``{...}`` span of a model reply as JSON.

    Markdown code-fence markers are removed first, then everything from the
    first '{' to the last '}' is handed to ``json.loads``.

    Returns:
        The parsed object, or None when the text has no '{' followed by a '}'.

    Raises:
        json.JSONDecodeError: if the extracted span is not valid JSON.
    """
    cleaned = raw_text.replace("```json", "").replace("```", "").strip()
    start = cleaned.find('{')
    end = cleaned.rfind('}')
    # Compare the raw indices: adding 1 to rfind() before testing for -1
    # would hide the "not found" case.
    if start == -1 or end < start:
        return None
    return json.loads(cleaned[start:end + 1])


def analyze_image_qwen(image_path, img_id, specific_prompt):
    """Ask the vision-language model to annotate one image and parse its JSON reply.

    Uses the module-level ``model`` and ``processor``.

    Args:
        image_path: Image reference placed in the chat message (passed on to
            ``process_vision_info``).
        img_id: Identifier stored under the "id_anh" key of the result.
        specific_prompt: Instruction text sent together with the image.

    Returns:
        The dict parsed from the model's reply with an extra "id_anh" key, or
        None when the reply contains no parseable JSON object (a message is
        printed in that case).
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": specific_prompt},
            ],
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the device the model was actually placed on (it is loaded with
    # device_map="auto") instead of assuming "cuda".
    inputs = inputs.to(model.device)

    # NOTE(review): `temperature` only matters when sampling is enabled —
    # confirm the checkpoint's generation config turns sampling on.
    generated_ids = model.generate(**inputs, max_new_tokens=512, temperature=0.1)

    # Drop the prompt tokens so only the newly generated reply is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    try:
        data = _extract_json_object(output_text)
    except json.JSONDecodeError as e:
        print(f"Lỗi xử lý {img_id}: {e}")
        return None

    if data is None:
        # Report the skip so images are not dropped from the output silently.
        print(f"Lỗi xử lý {img_id}: no JSON object found in model output")
        return None

    data["id_anh"] = img_id
    return data

def main():
    """Annotate every configured image folder and save one JSON file per folder.

    Pairs each entry of ``img_folders`` with the same-position entry of
    ``output_files``. For every folder that exists under ``BASE_DATA_PATH``
    and contains .jpg/.png/.jpeg files, each image is sent to
    ``analyze_image_qwen`` with the prompt chosen by
    ``get_prompt_by_category``; the non-empty results are dumped as a JSON
    list to ``BASE_OUTPUT_PATH``. Missing or image-less folders are skipped,
    and an exception on one image is printed without stopping the run.
    """
    for folder_name, output_filename in zip(img_folders, output_files):

        current_img_dir = os.path.join(BASE_DATA_PATH, folder_name)
        current_out_path = os.path.join(BASE_OUTPUT_PATH, output_filename)

        print(f"\n{'='*40}")
        print(f"Đang xử lý Folder: {folder_name}")
        print(f"Đường dẫn ảnh: {current_img_dir}")
        print(f"File lưu: {current_out_path}")

        # isdir (not exists): a plain file at this path would otherwise make
        # os.listdir raise and abort the remaining folders.
        if not os.path.isdir(current_img_dir):
            print(f"Không tìm thấy folder: {current_img_dir} -> Bỏ qua.")
            continue

        # os.listdir returns entries in arbitrary order; sort so the saved
        # JSON has the same order on every run.
        files = sorted(
            f for f in os.listdir(current_img_dir)
            if f.lower().endswith(('.jpg', '.png', '.jpeg'))
        )
        if not files:
            print(f"Folder {folder_name} rỗng -> Bỏ qua.")
            continue

        current_prompt = get_prompt_by_category(folder_name)

        results = []
        print(f"Bắt đầu quét {len(files)} ảnh trong {folder_name}...")

        for filename in tqdm(files, desc=f"Processing {folder_name}"):
            path = os.path.join(current_img_dir, filename)
            try:
                res = analyze_image_qwen(path, filename, current_prompt)
                if res:
                    results.append(res)
            except Exception as e:
                # Best effort: one failing image must not stop the whole folder.
                print(f"Lỗi file {filename}: {e}")

        with open(current_out_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4, ensure_ascii=False)
        print(f"Đã lưu xong {len(results)} kết quả vào {output_filename}")

    print("\nHOÀN TẤT TẤT CẢ CÁC FOLDER!")

# Run the annotation pipeline only when this code executes as the top-level module.
if __name__ == "__main__":
    main()