In [None]:
"""
코랩용 기존 파이프라인 vs. 파인튜닝된 파이프라인 비교 스크립트
./adapters_dpo에 존재하는 어댑터를 기준으로 GPT-Score / 비GPT-Score를 비교.
random_persona_campaign.csv의 더미 데이터를 기준으로 평가함.
비교 문서는 adapter_comparison_2way_{timestamp}.md로 저장.
"""

In [1]:
# Sanity check: report whether the runtime exposes a CUDA device.
import torch
torch.cuda.is_available()

True

In [2]:
# Install the libraries used for adapter loading and inference.
# NOTE(review): versions are not pinned, so a later run may resolve different releases.
!pip install datasets peft trl bitsandbytes accelerate
!pip install -U transformers
# Record which transformers version ended up installed.
!pip show transformers

Name: transformers
Version: 4.57.3
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.12/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, sentence-transformers, trl


In [3]:
# Mount Google Drive under /content/drive (the default adapter directory lives there).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Show the working directory and its contents before cloning the repository.
import os
print(os.getcwd())
print(os.listdir())

/content
['.config', '.env', 'drive', '.ipynb_checkpoints', 'AmoRe_crm_generator', 'sample_data']


In [None]:
# Fetch the project and switch to the "jinhyeok" branch.
!git clone https://github.com/jjjh02/AmoRe_crm_generator.git
%cd AmoRe_crm_generator
!git checkout jinhyeok
!git branch
# Work from the finetuning directory; BASE_DIR is later taken from os.getcwd().
os.chdir("/content/AmoRe_crm_generator/finetuning")
print(os.getcwd())

/content/AmoRe_crm_generator/finetuning


In [6]:
# Load variables from a .env file into the process environment.
# Presumably this provides OPENAI_API_KEY, which _call_gpt reads — confirm the .env contents.
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
#!/usr/bin/env python3
import argparse
import csv
import json
import os
import re
import sys
import urllib.error
import urllib.request
from collections import Counter
from contextlib import contextmanager
from datetime import datetime, timezone


# Directory the notebook is currently running from.
BASE_DIR = os.getcwd()
# Parent of BASE_DIR; data/brand_stories.json and data/crm_goals.json are read from here.
PROJECT_DIR = os.path.abspath(os.path.join(BASE_DIR, ".."))
# Sibling "src" directory, added to sys.path by _import_pipeline_module.
SRC_DIR = os.path.abspath(os.path.join(BASE_DIR, "..", "src"))
# Default CSV of evaluation rows.
DEFAULT_CSV = os.path.join(BASE_DIR, "random_persona_campaign.csv")
# Default adapter directory (absolute path on the mounted Google Drive).
DEFAULT_ADAPTER1_DIR = "/content/drive/MyDrive/멋사/adapters_dpo_1_v2"
# Stage names addressed by the integer stage_index column.
STAGE_ORDER = ["Acquisition", "Activation", "Retention", "Revenue", "Referral"]
# Candidate labels in the order the two messages are handed to the GPT evaluator.
CANDIDATE_LABELS = ["raw", "adapter1"]

print(BASE_DIR, SRC_DIR)

def _log(message):
    print(message)


def _import_pipeline_module():
    """Import ``run_qwen_exaone_pipeline`` from SRC_DIR and return the module.

    SRC_DIR is pushed to the front of ``sys.path`` when it is not already listed.

    Raises:
        ImportError: If the module cannot be imported for any reason; the
            original exception is chained as the cause.
    """
    if sys.path.count(SRC_DIR) == 0:
        sys.path.insert(0, SRC_DIR)
    try:
        import run_qwen_exaone_pipeline
    except Exception as import_error:
        failure = "Failed to import main from ../src/run_qwen_exaone_pipeline.py"
        raise ImportError(failure) from import_error
    return run_qwen_exaone_pipeline


def _load_json(path):
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None


def _parse_bool(value):
    if isinstance(value, bool):
        return value
    if value is None:
        return False
    if isinstance(value, (int, float)):
        return bool(value)
    text = str(value).strip().lower()
    return text in {"1", "true", "yes", "y", "t"}


def _load_rows(csv_path):
    with open(csv_path, "r", newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            if not row:
                continue
            persona_raw = row.get("persona", "").strip()
            brand_raw = row.get("brand", "").strip()
            product_raw = row.get("product", "").strip()
            stage_raw = row.get("stage_index", "").strip()
            style_raw = row.get("style_index", "").strip()
            if not persona_raw or not brand_raw or not product_raw:
                continue
            if not stage_raw or not style_raw:
                continue
            try:
                persona = int(persona_raw)
                stage_index = int(stage_raw)
                style_index = int(style_raw)
            except ValueError:
                continue
            yield {
                "persona": persona,
                "brand": brand_raw,
                "product": product_raw,
                "stage_index": stage_index,
                "style_index": style_index,
                "is_event": _parse_bool(row.get("is_event", "")),
            }


def _get_stage_name(stage_index):
    if isinstance(stage_index, int) and 0 <= stage_index < len(STAGE_ORDER):
        return STAGE_ORDER[stage_index]
    return ""


def _get_crm_goal(crm_goals, stage_index, stage_name=None):
    if not isinstance(crm_goals, dict):
        return {}
    if stage_name and stage_name in crm_goals:
        return crm_goals.get(stage_name, {}) or {}
    stage_name = _get_stage_name(stage_index)
    if stage_name:
        return crm_goals.get(stage_name, {}) or {}
    return {}


def _get_brand_story(brand_stories, brand_name):
    if not isinstance(brand_stories, dict) or not brand_name:
        return {}
    if brand_name in brand_stories:
        return brand_stories.get(brand_name, {}) or {}
    for story in brand_stories.values():
        if str(story.get("name_en", "")).lower() == brand_name.lower():
            return story
    return {}


def _format_event(selected_event):
    if selected_event in (None, "", {}):
        return "없음"
    if isinstance(selected_event, dict):
        for key in ("title", "name", "event_name", "event"):
            if selected_event.get(key):
                return str(selected_event.get(key))
        return json.dumps(selected_event, ensure_ascii=False)
    return str(selected_event)


def _format_price(price):
    if price in (None, ""):
        return ""
    if isinstance(price, (int, float)):
        return f"{int(price):,}원"
    text = str(price).strip()
    if not text:
        return ""
    if text.replace(",", "").isdigit():
        return f"{int(text.replace(',', '')):,}원"
    return text


def _format_persona(persona_profile):
    if not isinstance(persona_profile, dict):
        return str(persona_profile or "")
    name = persona_profile.get("name", "")
    extras = []
    value_focus = persona_profile.get("value_focus")
    skin_type = persona_profile.get("skin_type")
    traits = persona_profile.get("traits")
    shopping_style = persona_profile.get("shopping_style")
    if value_focus:
        extras.append(str(value_focus))
    if skin_type:
        extras.append(str(skin_type))
    if traits:
        if isinstance(traits, list):
            extras.append(", ".join([str(t) for t in traits if t]))
        else:
            extras.append(str(traits))
    if shopping_style:
        extras.append(str(shopping_style))
    extra_text = ", ".join([e for e in extras if e])
    if name and extra_text:
        return f"{name} ({extra_text})"
    return name or extra_text


def _build_context_block(out, max_style_templates=3):
    """Build the "[컨텍스트]" text block shown to the evaluator and in the report.

    Args:
        out: Pipeline output dict; reads persona_profile, stage_name/stage_kr,
            brand, product_basic, product_query, objective, target_state,
            style_templates and selected_event.
        max_style_templates: Maximum number of style templates listed when
            style_templates is a list.

    Returns:
        Newline-joined bullet lines; empty fields are omitted, the event line
        is always present.
    """
    raw_basic = out.get("product_basic")
    product_basic = raw_basic if isinstance(raw_basic, dict) else {}

    brand = out.get("brand") or ""
    product_name = product_basic.get("name") or out.get("product_query") or ""
    if brand and product_name:
        brand_product = f"{brand} / {product_name}"
    else:
        brand_product = brand or product_name

    templates = out.get("style_templates") or []
    if isinstance(templates, list):
        templates = templates[:max_style_templates]

    # (label, value) pairs in display order; pairs with an empty value are dropped.
    labelled = (
        ("페르소나", _format_persona(out.get("persona_profile"))),
        ("단계", out.get("stage_name") or out.get("stage_kr") or ""),
        ("브랜드/제품", brand_product),
        ("가격", _format_price(product_basic.get("price"))),
        ("목표", out.get("objective") or ""),
        ("목표 상태", out.get("target_state") or ""),
    )

    lines = ["[컨텍스트]"]
    lines.extend(f"- {label}: {value}" for label, value in labelled if value)
    if templates:
        lines.append("- 스타일 템플릿:")
        lines.extend(f"  - {template}" for template in templates)
    event_text = _format_event(out.get("selected_event"))
    lines.append(f"- 이벤트: {event_text}")
    return "\n".join(lines).strip()


def _extract_message(out):
    exaone = out.get("exaone", {}) if isinstance(out, dict) else {}
    return exaone.get("result_raw") or ""


def _tokenize(text):
    if not text:
        return []
    return [t for t in re.split(r"\s+", str(text)) if len(t) > 1]


def _split_tokens(text):
    if not text:
        return []
    cleaned = re.sub(r"[^\w\uac00-\ud7a3]+", " ", str(text), flags=re.UNICODE)
    return [t for t in cleaned.split() if len(t) > 1]


def _extract_keywords(texts, max_terms=30):
    counter = Counter()
    for text in texts:
        for token in _split_tokens(text):
            if token.isdigit():
                continue
            counter[token] += 1
    if not counter:
        return []
    return [item for item, _ in counter.most_common(max_terms)]


def _coverage_score(message, out):
    total = 0
    hits = 0
    if not message:
        return 0.0

    brand = out.get("brand")
    if brand:
        total += 1
        if brand in message:
            hits += 1

    product_basic = out.get("product_basic") if isinstance(out.get("product_basic"), dict) else {}
    product_name = product_basic.get("name") or out.get("product_query") or ""
    if product_name:
        total += 1
        if product_name in message:
            hits += 1

    selected_event = _format_event(out.get("selected_event"))
    if selected_event and selected_event != "없음":
        total += 1
        if selected_event in message:
            hits += 1

    stage_terms = []
    for text in (out.get("stage_kr"), out.get("objective"), out.get("target_state")):
        stage_terms.extend(_tokenize(text))
    if stage_terms:
        total += 1
        if any(term in message for term in stage_terms):
            hits += 1

    return hits / total if total else 0.0


def _tone_match_score(message, brand_story):
    if not message or not isinstance(brand_story, dict):
        return 0.0
    tone_keywords = brand_story.get("tone_keywords") or []
    if not tone_keywords:
        return 0.0
    hits = sum(1 for kw in tone_keywords if kw and kw in message)
    return hits / len(tone_keywords)


def _style_match_score(message, style_templates, max_terms=30):
    if not message or not style_templates:
        return 0.0
    if not isinstance(style_templates, list):
        style_templates = [str(style_templates)]
    keywords = _extract_keywords(style_templates, max_terms=max_terms)
    if not keywords:
        return 0.0
    hits = sum(1 for kw in keywords if kw in message)
    return hits / len(keywords)


def _info_density(message, out):
    if not message:
        return 0.0
    persona = out.get("persona_profile") if isinstance(out.get("persona_profile"), dict) else {}
    product_basic = out.get("product_basic") if isinstance(out.get("product_basic"), dict) else {}
    context_texts = [
        out.get("brand"),
        product_basic.get("name"),
        out.get("product_query"),
        out.get("stage_kr"),
        out.get("objective"),
        out.get("target_state"),
        persona.get("value_focus"),
        persona.get("skin_type"),
    ]
    if isinstance(persona.get("traits"), list):
        context_texts.extend(persona.get("traits"))
    if persona.get("shopping_style"):
        context_texts.append(persona.get("shopping_style"))

    keywords = _extract_keywords([t for t in context_texts if t], max_terms=40)
    if not keywords:
        return 0.0
    message_tokens = _split_tokens(message)
    if not message_tokens:
        return 0.0
    hits = sum(1 for kw in keywords if kw in message)
    return hits / len(message_tokens)


def _repetition_stats(message):
    """Return ``(repeat_token_ratio, repeat_ngram_ratio)`` for a message.

    The token ratio is (tokens - unique tokens) / tokens. The n-gram ratio
    counts surplus occurrences of each 3-gram over the number of 3-grams and
    is reported as 0.0 for messages with fewer than six tokens.
    """
    tokens = _split_tokens(message)
    token_count = len(tokens)
    if token_count == 0:
        return 0.0, 0.0

    token_ratio = (token_count - len(set(tokens))) / token_count
    if token_count < 6:
        return token_ratio, 0.0

    window = 3
    trigram_total = token_count - window + 1
    trigram_counts = Counter(
        " ".join(tokens[start:start + window]) for start in range(trigram_total)
    )
    surplus = sum(count - 1 for count in trigram_counts.values() if count > 1)
    return token_ratio, surplus / trigram_total


def _length_target(stage_name):
    if stage_name == "Acquisition":
        return 60, 200
    if stage_name == "Activation":
        return 60, 200
    if stage_name == "Retention":
        return 60, 180
    if stage_name == "Revenue":
        return 60, 180
    if stage_name == "Referral":
        return 60, 160
    return 50, 220


def _length_ok(message, stage_name):
    if not message:
        return False
    min_len, max_len = _length_target(stage_name)
    return min_len <= len(message) <= max_len


def _forbidden_violations(message, crm_goal):
    if not message or not isinstance(crm_goal, dict):
        return 0
    forbidden = crm_goal.get("forbidden_context") or []
    if not forbidden:
        return 0
    hits = 0
    for term in forbidden:
        if term and term in message:
            hits += 1
    return hits


def _cta_present(message):
    if not message:
        return False
    cta_markers = [
        "지금", "확인", "구매", "신청", "참여",
        "클릭", "받기", "혜택", "할인", "쿠폰",
        "해보세요", "하세요", "둘러보기",
        "바로", "추천", "문의"
    ]
    return any(marker in message for marker in cta_markers)


def _score_message(message, base_out, brand_story, crm_goal, stage_name):
    """Compute every non-GPT metric for one message.

    Returns:
        Dict with keys "len", "cov", "tone", "style", "density", "rep_token",
        "rep_ngram", "len_ok", "forbidden" and "cta".
    """
    token_repeat, ngram_repeat = _repetition_stats(message)
    return {
        "len": len(message),
        "cov": _coverage_score(message, base_out),
        "tone": _tone_match_score(message, brand_story),
        "style": _style_match_score(message, base_out.get("style_templates")),
        "density": _info_density(message, base_out),
        "rep_token": token_repeat,
        "rep_ngram": ngram_repeat,
        "len_ok": _length_ok(message, stage_name),
        "forbidden": _forbidden_violations(message, crm_goal),
        "cta": _cta_present(message),
    }


def _call_gpt(context_block, messages):
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY is not set.")

    candidate_block = "\n\n".join(
        f"[{idx}]\n{msg if msg else '(빈 메시지)'}" for idx, msg in enumerate(messages)
    )

    system_prompt = (
        "너는 마케팅 문장 평가자다.\n"
        "목표는 전환 가능성이 더 높은 CRM 메시지를 고르는 것이다.\n\n"
        "다음 기준으로 후보를 비교하라:\n"
        "1. 수신자가 실제 행동(클릭/재구매)을 할 가능성\n"
        "2. persona와 구매 단계 적합성\n"
        "3. 상품·브랜드 핵심 장점 전달력\n"
        "4. 스타일 템플릿/이벤트 정보를 적절히 반영했는가\n"
        "5. 불필요한 장식 없이 명확한가\n\n"
        "가장 좋은 후보의 번호만 0 또는 1로 출력하라."
    )
    user_prompt = (
        "컨텍스트:\n"
        f"{context_block}\n\n"
        "후보:\n"
        f"{candidate_block}\n\n"
        "번호만 답해라."
    )

    payload = {
        "model": "gpt-5-nano",
        "input": [
            {
                "role": "system",
                "content": [{"type": "input_text", "text": system_prompt}],
            },
            {
                "role": "user",
                "content": [{"type": "input_text", "text": user_prompt}],
            },
        ],
    }

    request = urllib.request.Request(
        "https://api.openai.com/v1/responses",
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        method="POST",
    )

    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            data = json.loads(response.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        body = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"OpenAI API error {exc.code}: {body}") from exc

    output_text = _extract_response_text(data)
    match = re.search(r"-?\d+", str(output_text))
    if not match:
        raise ValueError(f"Invalid evaluator response: {output_text}")
    choice = int(match.group(0))
    if choice not in (0, 1):
        raise ValueError(f"Evaluator index out of range: {choice}")
    return choice


def _extract_response_text(data):
    if isinstance(data, dict):
        output_text = data.get("output_text")
        if isinstance(output_text, str) and output_text.strip():
            return output_text.strip()

        output = data.get("output")
        if isinstance(output, list):
            parts = []
            for item in output:
                if not isinstance(item, dict):
                    continue
                content = item.get("content", [])
                if isinstance(content, list):
                    for block in content:
                        if isinstance(block, dict) and isinstance(block.get("text"), str):
                            parts.append(block["text"])
                        elif isinstance(block, str):
                            parts.append(block)
                elif isinstance(content, str):
                    parts.append(content)
            if parts:
                return "".join(parts).strip()

    return ""


# Loaded correctors keyed by (model_name, adapter spec). The cache lives at
# module level because _patch_exaone defines a new subclass on every call; a
# class-local dict would start empty for each ``with`` block and the adapter
# model would be reloaded for every evaluated row.
_PATCHED_EXAONE_CACHE = {}


@contextmanager
def _patch_exaone(pipeline_module, adapter_path=None, adapter_paths=None):
    """Temporarily swap ``pipeline_module.ExaoneToneCorrector`` for an adapter-aware subclass.

    Inside the ``with`` block, correctors created through the pipeline module
    load the base model and then apply the given PEFT adapter(s). Loaded
    models are cached per (model name, adapter spec) and reused by later
    ``with`` blocks in the same session. The original class is restored on
    exit, including when the body raises.

    Args:
        pipeline_module: Module whose ``ExaoneToneCorrector`` attribute is replaced.
        adapter_path: Single adapter directory; used when ``adapter_paths`` is empty.
        adapter_paths: Sequence of adapter directories. Only the first two are
            applied; any further entries are ignored with a warning.
    """
    import tone_correction

    class PatchedExaoneToneCorrector(tone_correction.ExaoneToneCorrector):
        # Shared across all patched classes so cached models survive between rows.
        _cache = _PATCHED_EXAONE_CACHE

        def __init__(self, model_name="LGAI-EXAONE/EXAONE-4.0-1.2B"):
            if adapter_paths:
                key = (model_name, tuple(adapter_paths))
            else:
                key = (model_name, adapter_path)
            cached = self._cache.get(key)
            if cached:
                # Reuse the loaded model instead of running the parent constructor.
                self.device = cached["device"]
                self.model_name = model_name
                self.tokenizer = cached["tokenizer"]
                self.model = cached["model"]
                return
            super().__init__(model_name=model_name)
            if adapter_paths:
                self._apply_adapters(adapter_paths)
            elif adapter_path:
                self._apply_adapters([adapter_path])
            self._cache[key] = {
                "device": self.device,
                "tokenizer": self.tokenizer,
                "model": self.model,
            }

        def _apply_adapters(self, paths):
            """Wrap ``self.model`` with the first adapter and, if given, stack a second."""
            if not paths:
                return
            try:
                from peft import PeftModel
            except ImportError as exc:
                raise RuntimeError("peft is required to load adapters.") from exc

            if len(paths) > 2:
                _log(f"[WARN] Only the first two adapters are applied; ignoring {list(paths[2:])}.")

            self.model = PeftModel.from_pretrained(self.model, paths[0])
            if len(paths) == 1:
                try:
                    self.model.eval()
                except Exception:
                    pass
                return

            # Preferred route: fold the first adapter into the base weights and
            # wrap the merged model with the second adapter.
            merged = None
            try:
                merged = self.model.merge_and_unload()
            except Exception:
                merged = None

            if merged is not None:
                self.model = merged
                self.model = PeftModel.from_pretrained(self.model, paths[1])
            else:
                # Fallback: keep the first adapter attached and load the second beside it.
                try:
                    self.model.load_adapter(paths[1], adapter_name="adapter2")
                    try:
                        self.model.set_adapter(["default", "adapter2"])
                    except Exception:
                        self.model.set_adapter("adapter2")
                except Exception as exc:
                    # Still best-effort, but no longer silent: otherwise the
                    # comparison would quietly run with the first adapter only.
                    _log(
                        f"[WARN] Failed to load second adapter {paths[1]!r}: {exc!r}; "
                        "continuing with the first adapter only."
                    )

            try:
                self.model.eval()
            except Exception:
                pass

    original = pipeline_module.ExaoneToneCorrector
    pipeline_module.ExaoneToneCorrector = PatchedExaoneToneCorrector
    try:
        yield
    finally:
        pipeline_module.ExaoneToneCorrector = original


def _run_pipeline_main(pipeline_main, row):
    argv = [
        "run_qwen_exaone_pipeline.py",
        "--persona",
        str(row["persona"]),
        "--brand",
        row["brand"],
        "--product",
        row["product"],
        "--stage_index",
        str(row["stage_index"]),
        "--style_index",
        str(row["style_index"]),
        "--is_event",
        "1" if row.get("is_event", False) else "0",
    ]
    old_argv = sys.argv
    try:
        sys.argv = argv
        return pipeline_main()
    finally:
        sys.argv = old_argv


def _row_key(row):
    return "{persona}|{brand}|{product}|{stage}|{style}|{event}".format(
        persona=row.get("persona", ""),
        brand=row.get("brand", ""),
        product=row.get("product", ""),
        stage=row.get("stage_index", ""),
        style=row.get("style_index", ""),
        event=int(bool(row.get("is_event", False))),
    )


def _load_checkpoint(checkpoint_path, row_key_map):
    if not checkpoint_path or not os.path.exists(checkpoint_path):
        return [], set()
    results = []
    with open(checkpoint_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                continue
            row_key = item.get("row_key")
            if row_key and row_key in row_key_map:
                item["idx"] = row_key_map[row_key]
            if "idx" not in item:
                continue
            results.append(item)
    by_idx = {}
    for item in results:
        by_idx[item["idx"]] = item
    ordered = [by_idx[idx] for idx in sorted(by_idx)]
    return ordered, set(by_idx)


def _append_checkpoint(checkpoint_path, item):
    if not checkpoint_path:
        return
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)
    with open(checkpoint_path, "a", encoding="utf-8") as f:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")


def _count_wins(results):
    wins = {"raw": 0, "adapter1": 0}
    mapping = {
        "raw": "raw",
        "ad1": "adapter1",
        "adapter1": "adapter1",
    }
    for item in results:
        key = mapping.get(item.get("winner"))
        if key:
            wins[key] += 1
    return wins


def _write_report(out_path, summary, rows, max_examples):
    lines = []
    lines.append("# 어댑터 비교 리포트")
    lines.append("")
    lines.append(f"- CSV: {summary['csv']}")
    lines.append(f"- 어댑터1: {summary['adapter1']}")
    lines.append(f"- 샘플 수: {summary['samples']}")
    lines.append("- 표기: raw=기본 모델, ad1=어댑터1")
    lines.append("")
    lines.append("## 요약")
    lines.append("")
    for item in summary["metrics"]:
        lines.append(f"- {item}")
    lines.append("")
    lines.append("## 지표 설명")
    lines.append("")
    lines.append("- GPT 승자: gpt-5-nano가 동일 컨텍스트 기준으로 2개 후보 중 더 좋은 메시지를 선택한 결과.")
    lines.append("- 커버리지: 브랜드/제품/이벤트/스테이지 관련 용어가 메시지에 포함된 비율.")
    lines.append("- 톤 일치율: 브랜드 톤 키워드가 메시지에 포함된 비율.")
    lines.append("- 스타일 일치율: 스타일 템플릿에서 뽑은 키워드 포함 비율.")
    lines.append("- 정보 밀도: 컨텍스트 키워드 적중 수 / 메시지 토큰 수.")
    lines.append("- 반복 토큰 비율: (토큰 수 - 고유 토큰 수) / 토큰 수.")
    lines.append("- 반복 3-그램 비율: 반복된 3-그램 수 / 전체 3-그램 수.")
    lines.append("- 길이 적정: 스테이지별 권장 길이 범위 충족 여부.")
    lines.append("- 금지 맥락 위반율: forbidden_context 용어가 포함된 메시지 비율.")
    lines.append("- CTA 비율: CTA 키워드가 포함된 메시지 비율.")
    lines.append("")
    lines.append("## 샘플별 결과")
    lines.append("")
    lines.append(
        "| idx | persona | 브랜드/제품 | 스테이지 | 이벤트 | GPT 승자 | raw 길이 | ad1 길이 | raw 커버리지 | ad1 커버리지 | raw 톤 | ad1 톤 | raw 스타일 | ad1 스타일 | raw 밀도 | ad1 밀도 | raw 반복 토큰 | ad1 반복 토큰 | raw 반복 3g | ad1 반복 3g | raw 길이 적정 | ad1 길이 적정 | raw 금지 | ad1 금지 | raw CTA | ad1 CTA |"
    )
    lines.append(
        "| --- | --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | --- | ---: | ---: | --- | --- |"
    )
    for item in rows:
        lines.append(
            "| {idx} | {persona} | {brand_product} | {stage} | {event} | {winner} | {raw_len} | {ad1_len} | {raw_cov:.2f} | {ad1_cov:.2f} | {raw_tone:.2f} | {ad1_tone:.2f} | {raw_style:.2f} | {ad1_style:.2f} | {raw_density:.2f} | {ad1_density:.2f} | {raw_rep_token:.2f} | {ad1_rep_token:.2f} | {raw_rep_ngram:.2f} | {ad1_rep_ngram:.2f} | {raw_len_ok} | {ad1_len_ok} | {raw_forbidden} | {ad1_forbidden} | {raw_cta} | {ad1_cta} |".format(
                idx=item["idx"],
                persona=item["persona"],
                brand_product=item["brand_product"],
                stage=item["stage"],
                event=item["event"],
                winner=item["winner"],
                raw_len=item["raw_len"],
                ad1_len=item["adapter1_len"],
                raw_cov=item["raw_cov"],
                ad1_cov=item["adapter1_cov"],
                raw_tone=item["raw_tone"],
                ad1_tone=item["adapter1_tone"],
                raw_style=item["raw_style"],
                ad1_style=item["adapter1_style"],
                raw_density=item["raw_density"],
                ad1_density=item["adapter1_density"],
                raw_rep_token=item["raw_rep_token"],
                ad1_rep_token=item["adapter1_rep_token"],
                raw_rep_ngram=item["raw_rep_ngram"],
                ad1_rep_ngram=item["adapter1_rep_ngram"],
                raw_len_ok="yes" if item["raw_len_ok"] else "no",
                ad1_len_ok="yes" if item["adapter1_len_ok"] else "no",
                raw_forbidden=item["raw_forbidden"],
                ad1_forbidden=item["adapter1_forbidden"],
                raw_cta="yes" if item["raw_cta"] else "no",
                ad1_cta="yes" if item["adapter1_cta"] else "no",
            )
        )
    lines.append("")

    example_count = min(max_examples, len(rows))
    if example_count > 0:
        lines.append("## 예시")
        lines.append("")
        for item in rows[:example_count]:
            lines.append(f"### 샘플 {item['idx']}")
            lines.append("")
            lines.append("컨텍스트:")
            lines.append("")
            lines.append("```")
            lines.append(item["context"])
            lines.append("```")
            lines.append("")
            lines.append("Raw 메시지:")
            lines.append("")
            lines.append("```")
            lines.append(item["raw_message"] or "(빈 메시지)")
            lines.append("```")
            lines.append("")
            lines.append("Adapter1 메시지:")
            lines.append("")
            lines.append("```")
            lines.append(item["adapter1_message"] or "(빈 메시지)")
            lines.append("```")
            lines.append("")
            lines.append(f"GPT 승자: {item['winner']}")
            lines.append("")
    else:
        lines.append("## 예시")
        lines.append("")
        lines.append("예시가 없습니다 (max_examples가 0이거나 처리된 행이 없습니다).")
        lines.append("")

    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))




In [None]:
# Run options. Every option has a default, so the cell works without any
# command-line input.
parser = argparse.ArgumentParser()
parser.add_argument("--csv_path", default=DEFAULT_CSV)
parser.add_argument("--adapter1_path", default=DEFAULT_ADAPTER1_DIR)
parser.add_argument("--out_path", default=None)
parser.add_argument("--checkpoint_path", default=None)
parser.add_argument("--max_rows", type=int, default=None)
parser.add_argument("--max_examples", type=int, default=3)
parser.add_argument("--skip_llm_eval", action="store_true")
parser.add_argument("--max_style_templates", type=int, default=3)
# A notebook kernel's sys.argv carries the kernel's own arguments (for example
# "-f <connection file>"). parse_args() would reject them and exit, so collect
# unrecognised arguments separately and ignore them.
args, _unknown_args = parser.parse_known_args()

# Fail fast when either input is missing, before any model is loaded.
if not os.path.exists(args.csv_path):
    raise FileNotFoundError(f"CSV not found: {args.csv_path}")
if not os.path.exists(args.adapter1_path):
    raise FileNotFoundError(f"Adapter not found: {args.adapter1_path}")

# Pipeline entry point plus the two lookup files used for scoring.
# _load_json returns None for a missing file; the lookup helpers treat a
# non-dict value as "no data".
pipeline_module = _import_pipeline_module()
pipeline_main = pipeline_module.main
brand_stories = _load_json(os.path.join(PROJECT_DIR, "data", "brand_stories.json"))
crm_goals = _load_json(os.path.join(PROJECT_DIR, "data", "crm_goals.json"))

# Collect the rows to evaluate (1-based idx) and map each row key to its idx.
# When a key repeats, the first occurrence keeps the mapping.
rows = []
row_key_map = {}
for idx, row in enumerate(_load_rows(args.csv_path), start=1):
    if args.max_rows is not None and idx > args.max_rows:
        break
    rows.append(row)
    key = _row_key(row)
    if key in row_key_map:
        _log(f"[WARN] Duplicate row key at idx={idx}.")
    else:
        row_key_map[key] = idx

if not rows:
    raise RuntimeError("No rows to evaluate.")

# Resume support: reload results saved by an earlier run of this cell.
checkpoint_path = args.checkpoint_path or os.path.join(
    BASE_DIR, "adapter_comparison_2way_checkpoint.jsonl"
)
results, processed = _load_checkpoint(checkpoint_path, row_key_map)
if results:
    _log(f"Resume from checkpoint: {checkpoint_path} ({len(results)} rows)")

# Start the win tally from the checkpointed results.
wins = _count_wins(results)

# Evaluate each row: generate with the unpatched pipeline and with the
# adapter-patched pipeline, score both messages, then checkpoint the result.
for idx, row in enumerate(rows, start=1):
    if idx in processed:
        _log(f"[Row {idx}] skipped (checkpoint)")
        continue
    _log(
        "[Row {idx}] persona={persona} brand={brand} product={product} "
        "stage_index={stage_index} style_index={style_index} is_event={is_event}".format(
            idx=idx,
            persona=row["persona"],
            brand=row["brand"],
            product=row["product"],
            stage_index=row["stage_index"],
            style_index=row["style_index"],
            is_event=row.get("is_event", False),
        )
    )

    _log("  Running raw pipeline...")
    raw_out = _run_pipeline_main(pipeline_main, row)

    _log("  Running adapter1 pipeline...")
    with _patch_exaone(pipeline_module, adapter_path=args.adapter1_path):
        adapter1_out = _run_pipeline_main(pipeline_main, row)

    raw_message = _extract_message(raw_out)
    adapter1_message = _extract_message(adapter1_out)

    # Context and lookups come from the raw run's output and are shared by
    # both candidates.
    context_block = _build_context_block(raw_out, args.max_style_templates)
    stage_name = raw_out.get("stage_name") or _get_stage_name(row["stage_index"])
    crm_goal = _get_crm_goal(crm_goals, row["stage_index"], stage_name)
    brand_story = _get_brand_story(brand_stories, raw_out.get("brand"))

    # Candidate order is fixed: index 0 is the raw message, index 1 the
    # adapter message, matching CANDIDATE_LABELS.
    winner = "n/a"
    if not args.skip_llm_eval:
        choice = _call_gpt(context_block, [raw_message, adapter1_message])
        winner = CANDIDATE_LABELS[choice]
        wins[winner] += 1

    # Both messages are scored against raw_out, i.e. the same reference context.
    raw_metrics = _score_message(raw_message, raw_out, brand_story, crm_goal, stage_name)
    adapter1_metrics = _score_message(adapter1_message, raw_out, brand_story, crm_goal, stage_name)

    # The report and checkpoint store the short label ("ad1") for the adapter.
    winner_short = {"raw": "raw", "adapter1": "ad1"}.get(winner, "n/a")
    row_key = _row_key(row)

    result = {
        "idx": idx,
        "row_key": row_key,
        "persona": row["persona"],
        "brand_product": f"{row['brand']} / {row['product']}",
        "stage": raw_out.get("stage_name") or raw_out.get("stage_kr") or "",
        "event": _format_event(raw_out.get("selected_event")),
        "winner": winner_short,
        "raw_len": raw_metrics["len"],
        "adapter1_len": adapter1_metrics["len"],
        "raw_cov": raw_metrics["cov"],
        "adapter1_cov": adapter1_metrics["cov"],
        "raw_tone": raw_metrics["tone"],
        "adapter1_tone": adapter1_metrics["tone"],
        "raw_style": raw_metrics["style"],
        "adapter1_style": adapter1_metrics["style"],
        "raw_density": raw_metrics["density"],
        "adapter1_density": adapter1_metrics["density"],
        "raw_rep_token": raw_metrics["rep_token"],
        "adapter1_rep_token": adapter1_metrics["rep_token"],
        "raw_rep_ngram": raw_metrics["rep_ngram"],
        "adapter1_rep_ngram": adapter1_metrics["rep_ngram"],
        "raw_len_ok": raw_metrics["len_ok"],
        "adapter1_len_ok": adapter1_metrics["len_ok"],
        "raw_forbidden": raw_metrics["forbidden"],
        "adapter1_forbidden": adapter1_metrics["forbidden"],
        "raw_cta": raw_metrics["cta"],
        "adapter1_cta": adapter1_metrics["cta"],
        "context": context_block,
        "raw_message": raw_message,
        "adapter1_message": adapter1_message,
    }
    results.append(result)
    # Persist right away so an interrupted run can resume from this row.
    _append_checkpoint(checkpoint_path, result)


def _avg_metric(results, key):
    values = [r[key] for r in results if key in r]
    return sum(values) / len(values) if values else 0.0


# Present samples in ascending idx order (entries without an idx sort as 0).
results = sorted(results, key=lambda item: item.get("idx", 0))
# Denominator for the rate metrics below; 1 when there are no results so the
# divisions never hit zero.
total = len(results) or 1

# Prefixes of the per-candidate fields stored in every result dict.
_SUMMARY_CANDIDATES = ("raw", "adapter1")


def _candidate_means(rows, metric):
    """Map each candidate to the mean of its ``<candidate>_<metric>`` field."""
    means = {}
    for candidate in _SUMMARY_CANDIDATES:
        means[candidate] = _avg_metric(rows, f"{candidate}_{metric}")
    return means


def _candidate_rates(rows, denominator, metric, is_hit):
    """Map each candidate to (#rows where ``is_hit(row, field)`` is truthy) / denominator."""
    rates = {}
    for candidate in _SUMMARY_CANDIDATES:
        field = f"{candidate}_{metric}"
        hit_count = len([row for row in rows if is_hit(row, field)])
        rates[candidate] = hit_count / denominator
    return rates


def _field_is_truthy(row, field):
    """Hit when the field is present and truthy."""
    return row.get(field)


def _field_is_positive(row, field):
    """Hit when the field (defaulting to 0) is strictly greater than zero."""
    return row.get(field, 0) > 0


avg_cov = _candidate_means(results, "cov")
avg_tone = _candidate_means(results, "tone")
avg_style = _candidate_means(results, "style")
avg_density = _candidate_means(results, "density")
avg_rep_token = _candidate_means(results, "rep_token")
avg_rep_ngram = _candidate_means(results, "rep_ngram")
len_ok_rate = _candidate_rates(results, total, "len_ok", _field_is_truthy)
forbidden_rate = _candidate_rates(results, total, "forbidden", _field_is_positive)
cta_rate = _candidate_rates(results, total, "cta", _field_is_truthy)
avg_len = _candidate_means(results, "len")

# UTC timestamp that makes the default report file name unique per run.
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")

# An explicitly supplied args.out_path wins; otherwise write under BASE_DIR.
out_path = args.out_path
if not out_path:
    out_path = os.path.join(BASE_DIR, f"adapter_comparison_2way_{timestamp}.md")

# (template, per-candidate values) pairs, listed in the order they appear
# in the report summary.
_summary_line_specs = (
    ("평균 커버리지: raw {raw:.2f}, ad1 {ad1:.2f}", avg_cov),
    ("평균 톤 일치율: raw {raw:.2f}, ad1 {ad1:.2f}", avg_tone),
    ("평균 스타일 일치율: raw {raw:.2f}, ad1 {ad1:.2f}", avg_style),
    ("평균 정보 밀도: raw {raw:.2f}, ad1 {ad1:.2f}", avg_density),
    ("반복 토큰 비율: raw {raw:.2f}, ad1 {ad1:.2f}", avg_rep_token),
    ("반복 3-그램 비율: raw {raw:.2f}, ad1 {ad1:.2f}", avg_rep_ngram),
    ("길이 적정 비율: raw {raw:.2f}, ad1 {ad1:.2f}", len_ok_rate),
    ("금지 맥락 위반 비율: raw {raw:.2f}, ad1 {ad1:.2f}", forbidden_rate),
    ("CTA 비율: raw {raw:.2f}, ad1 {ad1:.2f}", cta_rate),
    ("평균 길이: raw {raw:.1f}, ad1 {ad1:.1f}", avg_len),
)

# The GPT win tally comes first and also records whether LLM evaluation was skipped.
_gpt_winner_line = "GPT 승자: raw {raw} / ad1 {ad1} (skip_llm_eval={skip})".format(
    raw=wins["raw"],
    ad1=wins["adapter1"],
    skip=args.skip_llm_eval,
)

summary = {
    "csv": args.csv_path,
    "adapter1": args.adapter1_path,
    "samples": len(results),
    "metrics": [_gpt_winner_line]
    + [
        template.format(raw=values["raw"], ad1=values["adapter1"])
        for template, values in _summary_line_specs
    ],
}

# Write the report and log where it was saved.
_write_report(out_path, summary, results, args.max_examples)
_log(f"Saved report: {out_path}")



In [14]:
# Override the report destination with a fixed path on the mounted Google Drive.
# The folder name is written as literal characters: the previous "\uba4\uc0ac"
# was a truncated \uXXXX escape (missing a hex digit) and does not parse.
out_path = "/content/drive/MyDrive/멋사/comparison_dpo/comparison_3way_stack_01.md"


'content/drive/MyDrive/멋사/comparison_dpo'

In [18]:
# Write the report again using the current value of out_path, then log the
# destination. Relies on summary, results and args already being defined in
# the kernel.
_write_report(out_path, summary, results, args.max_examples)
_log(f"Saved report: {out_path}")

['# Adapter Comparison Report', '', '- CSV: /content/AmoRe_crm_generator/finetuning/random_persona_campaign.csv', '- Adapter: /content/drive/MyDrive/멋사/adapters_dpo_2', '- Samples: 10', '', '## Summary', '', '- GPT wins: adapter 5 / base 5 (skip_llm_eval=False)', '- Avg coverage: adapter 0.33, base 0.24', '- Avg tone match: adapter 0.00, base 0.00', '- Avg style match: adapter 0.05, base 0.07', '- Avg info density: adapter 0.15, base 0.19', '- Repeat token ratio: adapter 0.06, base 0.04', '- Repeat 3-gram ratio: adapter 0.00, base 0.00', '- Length ok rate: adapter 0.00, base 0.10', '- Forbidden violation rate: adapter 0.00, base 0.00', '- CTA rate: adapter 1.00, base 0.80', '- Avg length: adapter 349.8, base 297.5', '', '## Per-sample Results', '', '| idx | persona | brand/product | stage | event | gpt winner | base len | adapter len | base cov | adapter cov | base tone | adapter tone | base style | adapter style | base dens | adapter dens | base rep tok | adapter rep tok | base rep 3g

In [16]:
# Display the summary dict as this cell's output.
summary

{'csv': '/content/AmoRe_crm_generator/finetuning/random_persona_campaign.csv',
 'adapter': '/content/drive/MyDrive/멋사/adapters_dpo_2',
 'samples': 10,
 'metrics': ['GPT wins: adapter 5 / base 5 (skip_llm_eval=False)',
  'Avg coverage: adapter 0.33, base 0.24',
  'Avg tone match: adapter 0.00, base 0.00',
  'Avg style match: adapter 0.05, base 0.07',
  'Avg info density: adapter 0.15, base 0.19',
  'Repeat token ratio: adapter 0.06, base 0.04',
  'Repeat 3-gram ratio: adapter 0.00, base 0.00',
  'Length ok rate: adapter 0.00, base 0.10',
  'Forbidden violation rate: adapter 0.00, base 0.00',
  'CTA rate: adapter 1.00, base 0.80',
  'Avg length: adapter 349.8, base 297.5']}