In [1]:
import pandas as pd
from tqdm import tqdm
import json


In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import json

# ============================
# 1. Load the tokenizer and model via Transformers
# ============================
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct-AWQ"

print("ƒêang load model Transformers...")

# Tokenizer and weights are both resolved from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)

# The single "" entry in device_map targets the root module, i.e. the whole
# model is placed on the "cuda" device.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    device_map={"": "cuda"},
)

# ============================
# 2. Data
# ============================
INPUT_FILE = "test_flow_reviews_1.xlsx"
COL_REVIEW = "reviewContent"
OUTPUT_FILE = "full_reviews_1.xlsx"

# Fixed aspect categories. Each entry becomes one output column, and the
# labeling loop only accepts a "canonical_aspect" value that is exactly one of
# these strings.
CANONICAL = [
    "Ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m",
    "Gi√° c·∫£",
    "V·∫≠n chuy·ªÉn & Giao h√†ng",
    "ƒê√≥ng g√≥i & Bao b√¨",
    "D·ªãch v·ª• & CSKH",
    "M√¥ t·∫£ & H√¨nh ·∫£nh",
    "L·ªói, b·∫£o h√†nh, h√†ng gi·∫£",
    "Tr·∫£i nghi·ªám s·ª≠ d·ª•ng",
    "Uy t√≠n & th√°i ƒë·ªô shop",
    "Khuy·∫øn m√£i & voucher"
]

df = pd.read_excel(INPUT_FILE)
# Keep only the rows at positions 3..7. The explicit .copy() makes the subset
# an independent frame, so adding the aspect columns below does not write to a
# slice of the loaded sheet (which triggers pandas' SettingWithCopyWarning).
df = df.iloc[3:8].copy()
for asp in CANONICAL:
    df[asp] = 2   # default = 2 = aspect not mentioned

# ============================
# 3. Prompt
# ============================
# System message placed ahead of every review by build_prompt(). Written in
# Vietnamese, it tells the model to extract the targets (aspects) a review
# praises or criticizes, normalize each one to one of the ten fixed canonical
# aspects listed below, label its sentiment, and answer with JSON only.
#
# The labeling loop compares "canonical_aspect" against CANONICAL and looks
# "sentiment" up in sentiment_map by exact string equality, so the aspect names
# and sentiment labels in this text must stay identical to those two tables.
#
# NOTE(review): the Vietnamese text in this file looks mis-encoded (it reads
# like UTF-8 decoded as Mac Roman) -- confirm the file's encoding before
# relying on this literal.
# NOTE(review): item 3 and the "exactly 10 keys" requirement describe numeric
# scores / a 10-key object, while the format example asks for a "targets" list
# with sentiment words; the parsing code follows the format example -- confirm
# which one is intended.
SYSTEM_PROMPT = """
B·∫°n l√† AI ph√¢n t√≠ch c·∫£m x√∫c cho ƒë√°nh gi√° s·∫£n ph·∫©m th∆∞∆°ng m·∫°i ƒëi·ªán t·ª≠ ti·∫øng Vi·ªát.

Nhi·ªám v·ª•:
1. Tr√≠ch xu·∫•t c√°c "target" (kh√≠a c·∫°nh) m√† ng∆∞·ªùi d√πng ƒëang khen/ch√™ trong c√¢u review.
2. Chu·∫©n h√≥a m·ªói target v·ªÅ m·ªôt trong C√ÅC NH√ìM C·ªê ƒê·ªäNH d∆∞·ªõi ƒë√¢y (canonical aspect):

- "Ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m"
- "Gi√° c·∫£"
- "V·∫≠n chuy·ªÉn & Giao h√†ng"
- "ƒê√≥ng g√≥i & Bao b√¨"
- "D·ªãch v·ª• & CSKH"
- "M√¥ t·∫£ & H√¨nh ·∫£nh"
- "L·ªói, b·∫£o h√†nh, h√†ng gi·∫£"
- "Tr·∫£i nghi·ªám s·ª≠ d·ª•ng"
- "Uy t√≠n & th√°i ƒë·ªô shop"
- "Khuy·∫øn m√£i & voucher"

3. G√°n score cho t·ª´ng target: "1": "t√≠ch c·ª±c", "-1": "ti√™u c·ª±c", "0": "trung l·∫≠p" ho·∫∑c "2": "kh√¥ng ƒë·ªÅ c·∫≠p".

Y√™u c·∫ßu:
- Ch·ªâ tr·∫£ v·ªÅ JSON h·ª£p l·ªá.
- Kh√¥ng gi·∫£i th√≠ch, kh√¥ng th√™m text ngo√†i JSON.
- Tr·∫£ v·ªÅ JSON g·ªìm ƒë√∫ng 10 keys.

Format JSON OUTPUT:

{
  "targets": [
    {
      "raw_target": "<target chi ti·∫øt, v√≠ d·ª•: 'pin', 'camera', 'ƒë·ªô tho·∫£i m√°i'>",
      "canonical_aspect": "<m·ªôt trong 10 nh√≥m canonical>",
      "sentiment": "<t√≠ch c·ª±c | ti√™u c·ª±c | trung l·∫≠p>"
    }
  ]
}
""".strip()


def build_prompt(review: str):
    """Assemble the chat-formatted prompt for a single review.

    The result is three consecutive turns delimited by ``<|im_start|>`` /
    ``<|im_end|>`` markers: a system turn carrying ``SYSTEM_PROMPT``, a user
    turn with the quoted review followed by a request for JSON, and an opened
    assistant turn for the model to complete.

    Args:
        review: Review text to insert into the user turn.

    Returns:
        The full prompt string.
    """
    system_turn = f"<|im_start|>system\n{SYSTEM_PROMPT}\n<|im_end|>\n"
    user_turn = f'<|im_start|>user\nReview: "{review}"\nH√£y tr·∫£ v·ªÅ JSON.\n<|im_end|>\n'
    # Left open on purpose: generation continues from here.
    assistant_header = "<|im_start|>assistant\n"
    return "".join([system_turn, user_turn, assistant_header])


# ============================
# 4. Label each row (with a tqdm progress bar)
# ============================

print("üî• B·∫Øt ƒë·∫ßu labeling b·∫±ng Transformers...\n")

# Sentiment label expected in the model's JSON -> integer code written into the
# aspect column. Labels that are not listed here fall back to 2.
sentiment_map = {
    "t√≠ch c·ª±c": 1,
    "ti√™u c·ª±c": -1,
    "trung l·∫≠p": 0,
    "kh√¥ng ƒë·ªÅ c·∫≠p": 2
}


def _parse_labels(generated: str, aspects, label_codes, default: int = 2) -> dict:
    """Turn the model's raw completion into ``{canonical_aspect: code}``.

    The text between the first ``{`` and the last ``}`` is parsed as JSON and
    its ``"targets"`` list is read. Malformed output never raises: anything
    that cannot be interpreted is skipped.

    Args:
        generated: Decoded completion returned by the model.
        aspects: Collection of accepted canonical aspect names.
        label_codes: Mapping from sentiment label to integer code.
        default: Code used when the sentiment label is missing or unknown.

    Returns:
        A dict with one entry per recognized aspect; empty when no usable JSON
        was found. If an aspect appears several times, the last one wins.
    """
    try:
        start = generated.index("{")
        end = generated.rindex("}") + 1
        data = json.loads(generated[start:end])
    except ValueError:
        # str.index/rindex raise ValueError when no brace is present, and
        # json.JSONDecodeError is a ValueError subclass. Catching only this
        # keeps KeyboardInterrupt and real bugs visible.
        return {}

    targets = data.get("targets") if isinstance(data, dict) else None
    if not isinstance(targets, list):
        return {}

    labels = {}
    for item in targets:
        if not isinstance(item, dict):
            continue
        aspect = item.get("canonical_aspect")
        if not isinstance(aspect, str) or aspect not in aspects:
            continue
        sentiment = item.get("sentiment")
        # Non-string sentiments (e.g. a nested list) may be unhashable and
        # cannot be looked up, so they get the default code.
        if isinstance(sentiment, str):
            labels[aspect] = label_codes.get(sentiment, default)
        else:
            labels[aspect] = default
    return labels


for idx in tqdm(df.index, desc="Labeling", unit="row"):

    review = str(df.at[idx, COL_REVIEW])
    prompt = build_prompt(review)

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    # Greedy decoding (do_sample=False). temperature only affects sampling, so
    # it is not passed.
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False,
    )

    # Decode only the newly generated tokens (everything after the prompt).
    input_length = inputs.input_ids.size(1)
    generated = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)

    # Aspect columns already hold the default 2 from initialization, so only
    # the aspects the model actually reported are overwritten.
    for aspect, code in _parse_labels(generated, CANONICAL, sentiment_map).items():
        df.at[idx, aspect] = code


# ============================
# 5. Save the labeled rows
# ============================

df.to_excel(OUTPUT_FILE, index=False)
print("File ƒë√£ l∆∞u:", OUTPUT_FILE)

  from .autonotebook import tqdm as notebook_tqdm


Đang load model Transformers...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
