# In [None]:
# -*- coding: utf-8 -*-
"""
Dish Type 2 Classifier (DashScope, single-label)
------------------------------------------------
This script classifies each dish into ONE AND ONLY ONE culinary-form/use category
("Dish type 2") using Alibaba Cloud DashScope (Qwen). It:
  1) Builds a constrained taxonomy and a deterministic priority order.
  2) Prompts the model to return exactly one label from the taxonomy in JSON.
  3) Validates/normalizes the model output to the taxonomy.
  4) Falls back to a lightweight keyword rule if the API is unavailable or output is invalid.

Inputs
------
An input .csv/.xlsx with columns (recommended):
  - index         : recipe id
  - recipename    : dish title
  - ingredients   : ingredients text (optional but recommended)
  - instructions  : preparation/cooking text (optional but recommended)

Outputs
-------
A file with one extra column 'dish_type_2' that contains the final single label.

Environment
-----------
Set your DashScope API key in the environment:
  export DASHSCOPE_API_KEY="sk-xxxx"

Dependencies
------------
  pip install pandas openpyxl dashscope tqdm

Usage
-----
  # CSV
  python dish_type_classifier_dashscope.py --in recipes.csv --out recipes_labeled.csv
  # Excel
  python dish_type_classifier_dashscope.py --in recipes.xlsx --sheet Sheet1 --out recipes_labeled.xlsx
"""

import os
import re
import time
import json
import argparse
import pandas as pd

try:
    from dashscope import Generation
    _HAS_DASHSCOPE = True
except Exception:
    _HAS_DASHSCOPE = False

try:
    from tqdm import tqdm
    _HAS_TQDM = True
except Exception:
    _HAS_TQDM = False


# -------------------------------
# 1) Taxonomy and fixed priority
# -------------------------------
# Closed label set: every prediction must be exactly one of these strings.
TAXONOMY = [
    "Beverages/Drinks",
    "Condiments/Seasonings/Sauces",
    "Soup",
    "Noodle-Based Dishes",
    "Salad",
    "Snacks/Desserts",
    "Side Dishes/Pickles",
    "Meat Dishes",
    "Seafood Dishes",
    "Vegetarian Dishes",
    "Other",
]

# Deterministic conflict-resolution priority (descending): when several
# categories could apply, the one that appears earliest in this list wins.
# The priority order is the same as the taxonomy order, so it is taken as an
# independent copy of that list.
PRIORITY = list(TAXONOMY)

# Keyword rules used ONLY by the offline fallback classifier.
# Maps a taxonomy label to its indicative keywords (all lowercase).
FALLBACK_RULES = {
    "Beverages/Drinks": [
        "drink", "juice", "tea", "coffee", "smoothie", "milkshake", "latte",
    ],
    "Condiments/Seasonings/Sauces": [
        "sauce", "dressing", "marinade", "paste", "chutney", "relish",
    ],
    "Soup": [
        "soup", "broth", "stew", "congee", "porridge",
    ],
    "Noodle-Based Dishes": [
        "noodle", "ramen", "udon", "soba", "vermicelli", "pho",
        "kwey teow", "kway teow",
    ],
    "Salad": [
        "salad", "coleslaw",
    ],
    "Snacks/Desserts": [
        "cookie", "cake", "dessert", "pudding", "pastry", "snack", "pie",
        "tart", "brownie", "mochi", "dumpling (sweet)",
    ],
    "Side Dishes/Pickles": [
        "pickle", "kimchi", "achar", "tsukemono", "takuan",
    ],
    # Protein-form buckets: consulted only when none of the form/use
    # categories above produced a hit.
    "Meat Dishes": [
        "beef", "pork", "lamb", "mutton", "goat", "bacon", "ham",
        "meatball", "meat balls",
    ],
    "Seafood Dishes": [
        "tuna", "fish", "shrimp", "prawn", "crab", "clam", "mussel",
        "squid", "oyster", "seafood",
    ],
    "Vegetarian Dishes": [
        "tofu", "tempeh", "vegetarian", "vegan", "meatless",
    ],
    # "Other" deliberately has no keywords; it is the final catch-all.
}


# -----------------------------------------
# 2) Prompt builder (single-label, JSON)
# -----------------------------------------
# System message sent with every request: pins the model to a single taxonomy
# label returned as a one-key JSON object.
SYSTEM_PROMPT = " ".join(
    (
        "You are a strict classifier.",
        "Return ONE AND ONLY ONE label from the taxonomy.",
        "Do not invent new labels.",
        "Output must be valid JSON with a single key 'dish_type_2'.",
    )
)

# User message template. `{title}`, `{ingredients}` and `{instructions}` are
# filled in by build_user_prompt(); the doubled braces in the JSON example are
# str.format escapes that render as single braces.
USER_TEMPLATE = """Classify the dish into ONE label from the taxonomy below.

Taxonomy (choose exactly one):
- Beverages/Drinks
- Condiments/Seasonings/Sauces
- Soup
- Noodle-Based Dishes
- Salad
- Snacks/Desserts
- Side Dishes/Pickles
- Meat Dishes
- Seafood Dishes
- Vegetarian Dishes
- Other

Dish title: {title}

Ingredients (optional): {ingredients}

Instructions (optional): {instructions}

Rules:
1) Return exactly one label from the taxonomy above.
2) Respond ONLY with a JSON object, e.g. {{"dish_type_2": "Salad"}}
"""

def build_user_prompt(title: str, ingredients: str = "", instructions: str = "") -> str:
    """Compose the user prompt with the dish text.

    Each field is converted to stripped text before being inserted into
    USER_TEMPLATE. Missing values (``None`` or a float NaN, which is what an
    empty CSV/Excel cell typically becomes) are rendered as an empty string
    instead of raising, and other non-string values are stringified.

    Args:
        title: Dish title.
        ingredients: Ingredients text (optional).
        instructions: Preparation/cooking text (optional).

    Returns:
        The fully formatted user prompt.
    """

    def _clean(value) -> str:
        if isinstance(value, str):
            return value.strip()
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value).strip()

    return USER_TEMPLATE.format(
        title=_clean(title),
        ingredients=_clean(ingredients),
        instructions=_clean(instructions),
    )


# ---------------------------------------------------
# 3) DashScope call and output normalization helpers
# ---------------------------------------------------
def normalize_label(label: str) -> str:
    """Map free-form model text to one of the taxonomy labels.

    Matching is case-insensitive and proceeds in two steps:
      1. Exact match against TAXONOMY after canonicalisation (whitespace
         collapsed, spaces around "/" removed, surrounding quotes and
         punctuation stripped), so "Soup." or "beverages / drinks" resolve.
      2. Alias match: the first alias (in the order listed below) that occurs
         in the text as a whole word, optionally pluralised with "s"/"es".
         Whole-word matching keeps "meat" from matching "meatless".

    Args:
        label: Raw label text produced by the model.

    Returns:
        A label from TAXONOMY; "Other" when the input is empty, not a string,
        or matches nothing.
    """
    if not isinstance(label, str) or not label.strip():
        return "Other"

    def _canon(text: str) -> str:
        text = re.sub(r"\s*/\s*", "/", text.strip().lower())
        text = re.sub(r"\s+", " ", text)
        return text.strip(" \"'`.,;:!*")

    norm = _canon(label)
    for lab in TAXONOMY:
        if norm == _canon(lab):
            return lab

    # Alias mapping (e.g. "beverage" -> "Beverages/Drinks"). Order matters:
    # the first alias found wins.
    aliases = {
        "beverage": "Beverages/Drinks",
        "drink": "Beverages/Drinks",
        "condiment": "Condiments/Seasonings/Sauces",
        "seasoning": "Condiments/Seasonings/Sauces",
        "sauce": "Condiments/Seasonings/Sauces",
        "soup": "Soup",
        "noodle": "Noodle-Based Dishes",
        "salad": "Salad",
        "dessert": "Snacks/Desserts",
        "snack": "Snacks/Desserts",
        "side dish": "Side Dishes/Pickles",
        "pickle": "Side Dishes/Pickles",
        "meat": "Meat Dishes",
        "seafood": "Seafood Dishes",
        "vegetarian": "Vegetarian Dishes",
    }
    for key, val in aliases.items():
        if re.search(rf"(?<![a-z]){re.escape(key)}(?:s|es)?(?![a-z])", norm):
            return val

    # Nothing recognisable: use the catch-all.
    return "Other"


def classify_with_dashscope(title: str, ingredients: str, instructions: str, model: str = "qwen-turbo") -> str:
    """Call DashScope Generation to classify; return a normalized taxonomy label.

    The model is asked for a JSON object ``{"dish_type_2": <label>}``. The
    reply is parsed as JSON directly; if that fails, the outermost ``{...}``
    span in the reply is tried (covers replies wrapped in prose or code
    fences). The extracted label is passed through normalize_label().

    Args:
        title: Dish title.
        ingredients: Ingredients text (may be empty).
        instructions: Preparation/cooking text (may be empty).
        model: DashScope model name.

    Returns:
        A label from the taxonomy.

    Raises:
        RuntimeError: dashscope is not importable, DASHSCOPE_API_KEY is not
            set, or the service reported a non-200 status.
        ValueError: the reply does not contain a JSON object with a string
            'dish_type_2' value. Raising (rather than returning "Other")
            lets the caller apply the keyword fallback to invalid output.
    """
    if not _HAS_DASHSCOPE:
        raise RuntimeError("dashscope is not installed or import failed.")

    api_key = os.getenv("DASHSCOPE_API_KEY")
    if not api_key:
        raise RuntimeError("DASHSCOPE_API_KEY not set in environment.")

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": build_user_prompt(title, ingredients, instructions)},
    ]

    # temperature=0.0 keeps the output as stable as possible across runs.
    resp = Generation.call(
        model=model,
        messages=messages,
        result_format="message",  # reply text is at output.choices[0].message.content
        temperature=0.0,
        api_key=api_key,
    )

    # Surface service-side failures explicitly instead of tripping over a
    # missing 'output' further down.
    status = resp.get("status_code")
    if status is not None and status != 200:
        raise RuntimeError(
            f"DashScope request failed (status={status}, "
            f"code={resp.get('code')!r}, message={resp.get('message')!r})."
        )

    output = resp.get("output") or {}
    choices = output.get("choices") or [{}]
    message = choices[0].get("message") or {}
    content = message.get("content") or ""
    if not isinstance(content, str):
        raise ValueError(f"Unexpected non-text model output: {content!r}")

    # Candidate JSON payloads: the whole reply first, then the outermost
    # brace-delimited span.
    candidates = [content]
    embedded = re.search(r'\{.*\}', content, flags=re.S)
    if embedded:
        candidates.append(embedded.group(0))

    for candidate in candidates:
        try:
            data = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if isinstance(data, dict) and isinstance(data.get("dish_type_2"), str):
            return normalize_label(data["dish_type_2"])

    raise ValueError(
        f"Model output is not a JSON object with a string 'dish_type_2': {content[:200]!r}"
    )


# ---------------------------------------------------------
# 4) Fallback rule-based single-label classifier (keywords)
# ---------------------------------------------------------
def fallback_single_label(title: str, ingredients: str, instructions: str) -> str:
    """Return ONE label using keyword rules and the deterministic PRIORITY.

    Keywords from FALLBACK_RULES are matched case-insensitively as whole
    words or phrases, optionally pluralised with "s"/"es". Whole-word matching
    prevents short keywords from firing inside unrelated words ("tea" in
    "teaspoon" or "steak", "pie" in "piece", "tart" in "start", "ham" in
    "hamburger"). The trade-off is that closed compounds such as "cheesecake"
    no longer match "cake".

    Form/use categories are checked first, in PRIORITY order; the protein
    buckets (meat, seafood, vegetarian) are consulted only if none of those
    matched.

    Args:
        title: Dish title.
        ingredients: Ingredients text.
        instructions: Preparation/cooking text.
        Non-string values (e.g. None or NaN from an empty cell) are ignored.

    Returns:
        A taxonomy label; "Other" when no keyword matches.
    """
    text = " ".join(
        part for part in (title, ingredients, instructions) if isinstance(part, str)
    ).lower()

    def _matches(label: str) -> bool:
        keywords = FALLBACK_RULES.get(label)
        if not keywords:
            return False
        alternation = "|".join(re.escape(kw) for kw in keywords)
        pattern = rf"(?<!\w)(?:{alternation})(?:s|es)?(?!\w)"
        return re.search(pattern, text) is not None

    protein_labels = ("Meat Dishes", "Seafood Dishes", "Vegetarian Dishes")
    deferred = set(protein_labels) | {"Other"}

    # First pass: form/use categories in priority order.
    for label in PRIORITY:
        if label not in deferred and _matches(label):
            return label

    # Second pass: protein-form buckets if no form/use category matched.
    for label in protein_labels:
        if _matches(label):
            return label

    return "Other"


# ------------------------
# 5) Batch classification
# ------------------------
def classify_df(df: pd.DataFrame,
                title_col: str = "recipename",
                ing_col: str = "ingredients",
                ins_col: str = "instructions",
                model: str = "qwen-turbo",
                sleep_s: float = 0.5) -> pd.DataFrame:
    """Classify each row into a single 'dish_type_2' label.

    For every row the DashScope classifier is tried first; if it raises for
    any reason, the keyword fallback is used for that row. The first such
    failure is reported once on stdout so a misconfigured API does not go
    unnoticed.

    Columns are read by name straight from the frame, so names that are not
    valid Python identifiers (e.g. "recipe name") work. Missing cells
    (None/NaN) are treated as empty text and other non-string cells are
    stringified.

    Args:
        df: Input table; it is not modified.
        title_col: Column holding the dish title.
        ing_col: Column holding the ingredients text.
        ins_col: Column holding the instructions text.
            A column that is absent from ``df`` is treated as all-empty.
        model: DashScope model name.
        sleep_s: Seconds to sleep after each successful API call.

    Returns:
        A copy of ``df`` with an added 'dish_type_2' column.
    """

    def _cell_text(value) -> str:
        if isinstance(value, str):
            return value
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    def _column_text(col: str) -> list:
        if col not in df.columns:
            return [""] * len(df)
        return [_cell_text(v) for v in df[col].tolist()]

    rows = zip(_column_text(title_col), _column_text(ing_col), _column_text(ins_col))
    iterator = tqdm(rows, total=len(df)) if _HAS_TQDM else rows

    results = []
    fallback_reported = False

    for title, ings, inst in iterator:
        used_fallback = False

        # Try DashScope; on any failure use the keyword rules for this row.
        try:
            label = classify_with_dashscope(title, ings, inst, model=model)
        except Exception as exc:
            used_fallback = True
            if not fallback_reported:
                fallback_reported = True
                print(
                    f"Warning: DashScope classification failed ({exc!r}); "
                    "using keyword fallback for rows where the API fails."
                )
            label = fallback_single_label(title, ings, inst)

        results.append(label)

        # Gentle pacing to avoid rate limits (only after a real API call).
        if not used_fallback and sleep_s > 0:
            time.sleep(sleep_s)

    out = df.copy()
    out["dish_type_2"] = results
    return out


# -------------
# 6) CLI entry
# -------------
def main():
    """Command-line entry point: read the input table, classify, write the result.

    The input is read with pandas as Excel when the path ends in ".xlsx",
    otherwise as CSV; the output format is chosen the same way from the
    output path. For Excel input without --sheet, the first sheet is used.
    """
    parser = argparse.ArgumentParser(description="Dish Type 2 single-label classifier (DashScope).")
    parser.add_argument("--in", dest="in_path", required=True, help="Input CSV/XLSX file.")
    parser.add_argument("--sheet", dest="sheet", default=None,
                        help="Sheet name if input is .xlsx (default: first sheet).")
    parser.add_argument("--out", dest="out_path", required=True, help="Output CSV/XLSX file.")
    parser.add_argument("--title-col", dest="title_col", default="recipename", help="Column for dish title.")
    parser.add_argument("--ing-col", dest="ing_col", default="ingredients", help="Column for ingredients.")
    parser.add_argument("--ins-col", dest="ins_col", default="instructions", help="Column for instructions.")
    parser.add_argument("--model", dest="model", default="qwen-turbo", help="DashScope model name.")
    parser.add_argument("--sleep", dest="sleep_s", type=float, default=0.5, help="Sleep seconds between API calls.")
    args = parser.parse_args()

    # Read input
    in_path = args.in_path
    if in_path.lower().endswith(".xlsx"):
        # sheet_name=None would make pandas return a dict of ALL sheets rather
        # than a DataFrame, so fall back to the first sheet (index 0).
        sheet = args.sheet if args.sheet is not None else 0
        df = pd.read_excel(in_path, sheet_name=sheet)
    else:
        df = pd.read_csv(in_path)

    # Classify
    labeled = classify_df(
        df,
        title_col=args.title_col,
        ing_col=args.ing_col,
        ins_col=args.ins_col,
        model=args.model,
        sleep_s=args.sleep_s
    )

    # Write output
    out_path = args.out_path
    if out_path.lower().endswith(".xlsx"):
        labeled.to_excel(out_path, index=False)
    else:
        labeled.to_csv(out_path, index=False)

    print(f"Saved: {out_path}")


if __name__ == "__main__":
    main()