# BERT WWM Ext Baseline
This notebook fine-tunes hfl/chinese-bert-wwm-ext on the cleaned WeChat intent dataset so it stays in sync with the other training workflows.


## Notebook Overview
- Resolves project directories dynamically so relative paths work regardless of how the notebook is launched.
- Loads `telemarketing_intent_cn.jsonl` plus the optional `crosswoz.jsonl`, applies blacklist/min-sample filters, and balances label counts.
- Fine-tunes hfl/chinese-bert-wwm-ext with Hugging Face Trainer, tracking weighted F1/accuracy during training.
- Persists the model and label mapping artefacts under `assets/models/chinese_bert_baseline` for downstream services.


In [None]:
from __future__ import annotations

import json
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Raise the pandas display limits so frames inspected in this notebook show
# up to 120 rows and 50 columns before being elided.
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 50)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Absolute path of the kernel's current working directory; this is where the
# upward search for the project root starts.
NOTEBOOK_DIR = Path().resolve()


def find_project_root(start: Path) -> Path:
    """Locate the project root by walking upwards from ``start``.

    The root is the nearest directory (``start`` itself included) that
    contains an ``assets/models`` entry.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first directory on the path from ``start`` to the filesystem root
        that holds ``assets/models``.

    Raises:
        RuntimeError: If neither ``start`` nor any ancestor contains
            ``assets/models``.
    """
    root = next(
        (
            folder
            for folder in (start, *start.parents)
            if (folder / "assets" / "models").exists()
        ),
        None,
    )
    if root is None:
        raise RuntimeError("Could not find project root (assets/models missing)")
    return root


# --- Paths -----------------------------------------------------------------
# Everything is derived from the discovered project root so the notebook does
# not depend on the directory it was launched from.
PROJECT_ROOT = find_project_root(NOTEBOOK_DIR)
ASSETS_DIR = PROJECT_ROOT / "assets"
MODELS_DIR = ASSETS_DIR / "models"
DATA_DIR = MODELS_DIR / "few_shot_intent_sft" / "data"
BASELINE_DIR = MODELS_DIR / "chinese_bert_baseline"
CHECKPOINT_DIR = BASELINE_DIR / "checkpoints"

# Create the output directories up front (no error if they already exist).
BASELINE_DIR.mkdir(parents=True, exist_ok=True)
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

# Input files: the telemarketing set is mandatory, CrossWOZ is optional.
TELEMARKETING_DATA = DATA_DIR / "telemarketing_intent_cn.jsonl"
CROSSWOZ_DATA = DATA_DIR / "crosswoz.jsonl"

# --- Model / training hyperparameters --------------------------------------
MODEL_NAME = "hfl/chinese-bert-wwm-ext"
MAX_LENGTH = 128  # token length every example is truncated/padded to
BATCH_SIZE = 32  # per-device batch size for both training and evaluation
EPOCHS = 4
LEARNING_RATE = 2e-5
MIN_SAMPLES = 20  # intents with fewer rows than this are dropped
MAX_SAMPLES_PER_INTENT = 300  # larger intents are down-sampled to this many rows
USE_CROSSWOZ = True  # set False to train on the telemarketing data only
RANDOM_SEED = 42  # shared by set_seed, down-sampling, the split and the Trainer

set_seed(RANDOM_SEED)

print(f"Project root: {PROJECT_ROOT}")
print(f"Raw data directory: {DATA_DIR}")
print(f"Baseline artifacts: {BASELINE_DIR}")

Project root: D:\Files\Develop Projects\AI\Yuzuriha-Rin
Raw data directory: D:\Files\Develop Projects\AI\Yuzuriha-Rin\assets\models\few_shot_intent_sft\data
Baseline artifacts: D:\Files\Develop Projects\AI\Yuzuriha-Rin\assets\models\chinese_bert_baseline


In [3]:
# Intent labels excluded from training: prepare_dataset() drops every row whose
# label is in this set before the minimum-sample filter is applied.
BLACKLIST_INTENTS = {
    "查询类",
    "查询(产品信息)",
    "查询(价格)",
    "查询(优惠)",
    "查询(库存)",
    "查询(物流)",
    "查询(订单)",
    "查询(账户)",
    "查询(余额)",
    "实体(产品)",
    "实体(价格)",
    "实体(时间)",
    "实体(地点)",
    "实体(人名)",
    "实体(公司)",
    "实体识别",
    "产品推荐",
    "促销活动",
    "优惠信息",
    "下单",
    "支付",
    "退款",
    "投诉",
    "售后",
    "政治敏感",
    "污言秽语",
    "色情低俗",
    "暴力血腥",
    "违法犯罪",
    "广告营销",
    "诈骗信息",
    "肯定(没问题)",
    "否定(没有)",
    "转人工",
    "挂断电话",
    "保持通话",
    "重复",
    "澄清",
    "确认信息",
    "核实身份",
    "录音提示",
    "系统提示",
}

# Translates CrossWOZ labels into this dataset's intent names. In
# prepare_dataset(), CrossWOZ rows whose label is not a key here map to NaN
# and are dropped.
CROSSWOZ_INTENT_MAPPING = {
    "greet": "招呼用语",
    "thank": "礼貌用语",
    "bye": "结束用语",
}

In [4]:
def load_jsonl(file_path: Path) -> pd.DataFrame:
    """Read a JSON Lines intent file into a two-column DataFrame.

    Each non-blank line must be a JSON object with ``"text"`` and ``"label"``
    keys; any other keys are ignored. Surrounding whitespace is stripped from
    the text.

    Args:
        file_path: Location of the ``.jsonl`` file.

    Returns:
        A DataFrame with exactly the columns ``text`` and ``label``. The same
        schema is returned when the file is missing (a notice is printed) and
        when it exists but contains no records, so callers can rely on the
        columns being present.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON. The message is
            prefixed with the file path and 1-based line number.
        KeyError: If a record lacks ``"text"`` or ``"label"``.
    """
    columns = ["text", "label"]
    if not file_path.exists():
        print(f"Skipping missing dataset: {file_path}")
        return pd.DataFrame(columns=columns)

    rows = []
    with file_path.open("r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError as exc:
                # Re-raise the same exception type, adding where the bad line lives.
                raise json.JSONDecodeError(
                    f"{file_path}:{line_no}: {exc.msg}", exc.doc, exc.pos
                ) from exc
            rows.append({"text": item["text"].strip(), "label": item["label"]})

    # Passing `columns` keeps the schema stable even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)


def prepare_dataset() -> tuple[DatasetDict, dict[str, int], dict[int, str]]:
    """Assemble the training corpus and split it into train/test sets.

    Steps, in order:
      1. Load the telemarketing data (required) and, when ``USE_CROSSWOZ`` is
         set, the CrossWOZ data with its labels translated through
         ``CROSSWOZ_INTENT_MAPPING`` (unmapped rows are dropped).
      2. Remove rows whose label is in ``BLACKLIST_INTENTS``.
      3. Remove intents with fewer than ``MIN_SAMPLES`` rows.
      4. Down-sample intents with more than ``MAX_SAMPLES_PER_INTENT`` rows.
      5. Assign integer ids to the alphabetically sorted labels and make a
         stratified 80/20 split.

    Returns:
        A ``DatasetDict`` with ``"train"`` and ``"test"`` splits (columns
        ``text`` and integer ``label``), the ``label -> id`` mapping and the
        inverse ``id -> label`` mapping.

    Raises:
        FileNotFoundError: If the telemarketing file is missing or yields no rows.
    """
    print("Preparing datasets...")

    telemarketing = load_jsonl(TELEMARKETING_DATA)
    if telemarketing.empty:
        raise FileNotFoundError("telemarketing_intent_cn.jsonl is required")
    telemarketing["source"] = "telemarketing"
    sources: list[pd.DataFrame] = [telemarketing]

    if USE_CROSSWOZ:
        crosswoz = load_jsonl(CROSSWOZ_DATA)
        if not crosswoz.empty:
            # Labels without a mapping become NaN and are discarded.
            crosswoz["label"] = crosswoz["label"].map(CROSSWOZ_INTENT_MAPPING)
            crosswoz = crosswoz.dropna(subset=["label"]).copy()
            crosswoz["source"] = "crosswoz"
            sources.append(crosswoz)

    combined = pd.concat(sources, ignore_index=True)
    combined["text"] = combined["text"].astype(str)
    print(
        f"Combined {len(combined):,} rows from {combined['source'].nunique()} datasets"
    )

    allowed = combined[~combined["label"].isin(BLACKLIST_INTENTS)].copy()
    print(f"Blacklist filter: {len(combined):,} -> {len(allowed):,} rows")

    rows_per_intent = Counter(allowed["label"])
    kept_labels = {
        label for label, count in rows_per_intent.items() if count >= MIN_SAMPLES
    }
    frequent = allowed[allowed["label"].isin(kept_labels)].copy()
    print(
        f"Minimum sample filter keeps {len(kept_labels)} intents ({len(frequent):,} rows)"
    )

    def cap_rows(rows: pd.DataFrame) -> pd.DataFrame:
        """Down-sample ``rows`` when it exceeds the per-intent cap."""
        if len(rows) <= MAX_SAMPLES_PER_INTENT:
            return rows
        return rows.sample(n=MAX_SAMPLES_PER_INTENT, random_state=RANDOM_SEED)

    balanced = pd.concat(
        [cap_rows(frequent[frequent["label"] == label]) for label in sorted(kept_labels)],
        ignore_index=True,
    )
    print(
        f"Balanced dataset: {len(balanced):,} samples across {balanced['label'].nunique()} intents"
    )

    # Ids follow the sorted label order so the mapping is stable between runs.
    label2id = {
        label: idx for idx, label in enumerate(sorted(balanced["label"].unique()))
    }
    id2label = {idx: label for label, idx in label2id.items()}
    balanced["label_id"] = balanced["label"].map(label2id)

    train_df, test_df = train_test_split(
        balanced[["text", "label_id"]],
        test_size=0.2,
        random_state=RANDOM_SEED,
        stratify=balanced["label_id"],
    )

    def to_dataset(frame: pd.DataFrame) -> Dataset:
        """Convert one split into a ``Dataset`` with a ``label`` column."""
        tidy = frame.reset_index(drop=True).rename(columns={"label_id": "label"})
        return Dataset.from_pandas(tidy, preserve_index=False)

    train_ds = to_dataset(train_df)
    test_ds = to_dataset(test_df)

    print(f"Train split: {len(train_ds):,} | Test split: {len(test_ds):,}")
    return DatasetDict({"train": train_ds, "test": test_ds}), label2id, id2label

In [None]:
# Build the train/test splits and label mappings; the number of distinct
# intents determines the size of the classification head.
dataset, label2id, id2label = prepare_dataset()
num_labels = len(label2id)

# Tokenizer for the pretrained checkpoint named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)  # type: ignore


def tokenize_function(examples):
    """Tokenize a batch of examples to a fixed length.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw utterances.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts, with
        every sequence truncated and padded to ``MAX_LENGTH`` tokens.
    """
    fixed_length_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LENGTH,
    }
    return tokenizer(examples["text"], **fixed_length_options)


# Run tokenize_function over every split, processing examples in batches.
encoded_dataset = dataset.map(tokenize_function, batched=True)

# Load the pretrained checkpoint with a sequence-classification head sized to
# the number of intents, passing both label mappings along with it.
model = AutoModelForSequenceClassification.from_pretrained(  # type: ignore
    MODEL_NAME,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)


def compute_metrics(eval_pred):
    """Score predictions for the Trainer's evaluation loop.

    Args:
        eval_pred: Pair ``(scores, labels)``; ``scores`` holds one row of
            per-class scores per example and ``labels`` the true class ids.

    Returns:
        Dict with ``"accuracy"`` and ``"f1_weighted"`` (support-weighted F1).
    """
    scores, references = eval_pred
    # The predicted class is the highest-scoring column of each row.
    predicted_ids = np.argmax(scores, axis=1)
    accuracy = accuracy_score(references, predicted_ids)
    weighted_f1 = f1_score(references, predicted_ids, average="weighted")
    return {"accuracy": accuracy, "f1_weighted": weighted_f1}


# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the highest weighted F1 when training finishes.
# NOTE(review): checkpoint selection uses the same "test" split whose metrics
# are reported afterwards, so those numbers are likely optimistic; consider a
# separate validation split.
training_args = TrainingArguments(  # type: ignore
    output_dir=str(CHECKPOINT_DIR),
    eval_strategy="epoch",  # type: ignore
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",
    greater_is_better=True,
    report_to="none",
    logging_steps=50,
    seed=RANDOM_SEED,
)

# `processing_class` is the replacement for Trainer's deprecated `tokenizer`
# argument (deprecated since transformers 4.46, scheduled for removal in 5.0).
trainer = Trainer(  # type: ignore
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

# Fine-tune the model; the returned summary is kept for later inspection.
print("Starting training...")
train_result = trainer.train()

# Evaluate the model the trainer holds after training on the evaluation split.
print("Evaluating best checkpoint...")
final_metrics = trainer.evaluate()
print(f"Accuracy: {final_metrics['eval_accuracy']:.4f}")
print(f"Weighted F1: {final_metrics['eval_f1_weighted']:.4f}")

# Persist the model weights and tokenizer files side by side.
print("Saving baseline artefacts...")
trainer.save_model(str(BASELINE_DIR))
tokenizer.save_pretrained(str(BASELINE_DIR))

mapping_path = BASELINE_DIR / "intent_mapping.json"
intents_txt = BASELINE_DIR / "intents.txt"

# Machine-readable label mapping; ids are written as strings because JSON
# object keys must be strings.
with mapping_path.open("w", encoding="utf-8") as f:
    f.write(
        json.dumps(
            {
                "intent2id": label2id,
                "id2intent": {
                    str(intent_id): name for intent_id, name in id2label.items()
                },
            },
            ensure_ascii=False,
            indent=2,
        )
    )

# Human-readable, 1-based numbered list of the intents in sorted order.
with intents_txt.open("w", encoding="utf-8") as f:
    f.write("WeChat intent label list\n" + "=" * 50 + "\n")
    for idx, intent in enumerate(sorted(label2id), start=1):
        f.write(f"{idx:02d}. {intent}\n")

print(f"Artifacts stored in: {BASELINE_DIR}")
final_metrics

Preparing datasets...
Combined 20,279 rows from 2 datasets
Blacklist filter: 20,279 -> 19,287 rows
Minimum sample filter keeps 70 intents (19,149 rows)
Balanced dataset: 9,782 samples across 70 intents
Train split: 7,825 | Test split: 1,957


Map: 100%|██████████| 7825/7825 [00:00<00:00, 19429.37 examples/s]
Map: 100%|██████████| 1957/1957 [00:00<00:00, 23883.90 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-bert-wwm-ext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,2.308,1.629816,0.759325,0.732682
2,1.1106,0.955364,0.850281,0.841365
3,0.7505,0.760202,0.873786,0.867925
4,0.6235,0.69978,0.873786,0.868018


Evaluating best checkpoint...


Accuracy: 0.8738
Weighted F1: 0.8680
Saving baseline artefacts...
Artifacts stored in: D:\Files\Develop Projects\AI\Yuzuriha-Rin\assets\models\chinese_bert_baseline


{'eval_loss': 0.6997795104980469,
 'eval_accuracy': 0.8737864077669902,
 'eval_f1_weighted': 0.8680175319063638,
 'eval_runtime': 4.466,
 'eval_samples_per_second': 438.199,
 'eval_steps_per_second': 13.883,
 'epoch': 4.0}