# Intent Data Analysis
This notebook inspects the raw intent corpora powering the sticker-selection model. It focuses on understanding label coverage, pruning noisy intents, and producing the summary artifacts used during training.

## What you will find here
- Automatic project-path discovery so the notebook works from anywhere in the repo.
- Data ingestion for `telemarketing_intent_cn.jsonl` plus the optional `crosswoz.jsonl` corpus.
- Exploratory analysis (statistics, tables, charts) to inspect intent coverage and message length patterns.
- Filtering utilities (blacklist removal, minimum sample threshold, balanced sampling) to produce a clean dataset.
- Persistence of cleaned data and label reports under `assets/models/intent_predictor` for downstream notebooks.

In [None]:
from __future__ import annotations

import json
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import font_manager

# Raise pandas' display limits so the summary tables below are not truncated.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 50)

sns.set_theme(style="darkgrid", context="notebook")
plt.rcParams["figure.figsize"] = (11, 6)
plt.rcParams["axes.unicode_minus"] = False

# The intent labels drawn on the plot axes appear to be Chinese (the corpus is
# `telemarketing_intent_cn` and the blacklist entries are Chinese strings).
# DejaVu Sans and Arial carry no CJK glyphs, so with the Latin-only list those
# labels render as empty boxes. Prefer whichever CJK-capable fonts are actually
# installed, and keep the original Latin fonts as the fallback. Only installed
# fonts are listed to avoid a "font family not found" warning per missing name.
# This assignment stays after `sns.set_theme`, which applies its own font setting.
CJK_FONT_CANDIDATES = (
    "Noto Sans CJK SC",
    "Source Han Sans SC",
    "Microsoft YaHei",
    "PingFang SC",
    "SimHei",
    "WenQuanYi Micro Hei",
    "Arial Unicode MS",
)
_installed_fonts = {font.name for font in font_manager.fontManager.ttflist}
_cjk_fonts = [name for name in CJK_FONT_CANDIDATES if name in _installed_fonts]
plt.rcParams["font.family"] = [*_cjk_fonts, "DejaVu Sans", "Arial", "sans-serif"]

In [None]:
# Absolute, symlink-resolved current working directory; the project-root
# search starts from here.
NOTEBOOK_DIR = Path.cwd().resolve()

def find_project_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* that contains ``assets/models``.

    Args:
        start: Directory where the upward search begins; it is checked first.

    Returns:
        The first directory (``start`` itself or one of its parents, nearest
        first) for which ``<dir>/assets/models`` exists.

    Raises:
        RuntimeError: If neither ``start`` nor any parent contains ``assets/models``.
    """
    search_order = (start, *start.parents)
    root = next(
        (folder for folder in search_order if (folder / "assets" / "models").exists()),
        None,
    )
    if root is None:
        raise RuntimeError("Could not find project root (missing assets/models)")
    return root

# Every path below is anchored on the repository root located by walking up
# from the notebook's working directory.
PROJECT_ROOT = find_project_root(NOTEBOOK_DIR)
ASSETS_DIR = PROJECT_ROOT.joinpath("assets")
MODELS_DIR = ASSETS_DIR.joinpath("models")

# Inputs: raw JSONL corpora.
DATA_DIR = MODELS_DIR.joinpath("few_shot_intent_sft", "data")
TELEMARKETING_DATA = DATA_DIR.joinpath("telemarketing_intent_cn.jsonl")
CROSSWOZ_DATA = DATA_DIR.joinpath("crosswoz.jsonl")

# Outputs: the artifact directory is created up front so later writes cannot
# fail on a missing folder.
MODEL_DIR = MODELS_DIR.joinpath("intent_predictor")
MODEL_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DATA_PATH = MODEL_DIR.joinpath("clean_wechat_intents.parquet")
LABEL_COUNTS_PATH = MODEL_DIR.joinpath("label_distribution.csv")

# Tunables shared by the cells below.
RANDOM_SEED = 42  # seed passed to every DataFrame.sample call
MIN_SAMPLES = 20  # intents with fewer rows than this are dropped
MAX_SAMPLES_PER_INTENT = 300  # per-intent cap applied when balancing
USE_CROSSWOZ = True  # intended toggle for the optional CrossWOZ corpus

print(f"Project root: {PROJECT_ROOT}")
print(f"Raw data directory: {DATA_DIR}")
print(f"Artifacts will be stored in: {MODEL_DIR}")

In [None]:
# Intent labels removed before training. Per the notebook's own description
# these are intents that do not fit the WeChat chat use case. Matching is by
# exact string equality against the `label` column.
# NOTE(review): entries use ASCII parentheses; confirm the corpora do not use
# full-width parentheses for the same labels, or those rows will not match.
BLACKLIST_INTENTS = {
    # Query-type intents (product info, price, discount, stock, logistics,
    # order, account, balance).
    "查询类",
    "查询(产品信息)",
    "查询(价格)",
    "查询(优惠)",
    "查询(库存)",
    "查询(物流)",
    "查询(订单)",
    "查询(账户)",
    "查询(余额)",
    # Entity-type labels (product, price, time, place, person name, company,
    # entity recognition).
    "实体(产品)",
    "实体(价格)",
    "实体(时间)",
    "实体(地点)",
    "实体(人名)",
    "实体(公司)",
    "实体识别",
    # Commerce / customer-service intents (recommendation, promotion, discount
    # info, ordering, payment, refund, complaint, after-sales).
    "产品推荐",
    "促销活动",
    "优惠信息",
    "下单",
    "支付",
    "退款",
    "投诉",
    "售后",
    # Content-moderation categories (politically sensitive, profanity,
    # pornographic/vulgar, violence/gore, illegal activity, advertising, scams).
    "政治敏感",
    "污言秽语",
    "色情低俗",
    "暴力血腥",
    "违法犯罪",
    "广告营销",
    "诈骗信息",
    # Call-flow labels (affirm, negate, transfer to human, hang up, hold,
    # repeat, clarify, confirm info, verify identity, recording/system notices).
    "肯定(没问题)",
    "否定(没有)",
    "转人工",
    "挂断电话",
    "保持通话",
    "重复",
    "澄清",
    "确认信息",
    "核实身份",
    "录音提示",
    "系统提示",
}

# (name, path) pairs for every raw corpus to load; the name is stored in the
# `dataset` column of the merged frame.
RAW_DATASETS = [("telemarketing", TELEMARKETING_DATA)]
# CrossWOZ is optional: honor the USE_CROSSWOZ switch instead of always
# including the corpus regardless of its value.
if USE_CROSSWOZ:
    RAW_DATASETS.append(("crosswoz", CROSSWOZ_DATA))


def read_jsonl(path: Path) -> pd.DataFrame:
    """Parse a UTF-8 JSON Lines file into a DataFrame.

    Args:
        path: Location of the ``.jsonl`` file; each non-blank line must hold
            one JSON document.

    Returns:
        A DataFrame built from the parsed lines, in file order. Lines that are
        empty after stripping whitespace are skipped.
    """
    with path.open("r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        records = [json.loads(text) for text in stripped_lines if text]
    return pd.DataFrame(records)


def load_raw_dataset() -> pd.DataFrame:
    """Load every corpus listed in ``RAW_DATASETS`` into one DataFrame.

    Corpora whose file does not exist are skipped with a printed notice. Each
    loaded frame is tagged with a ``dataset`` column holding the corpus name.
    Rows without a ``text`` or ``label`` value are dropped, so a missing text
    is never stringified into the literal ``"nan"``.

    Returns:
        The concatenated rows of all available corpora, with ``text`` cast to
        ``str`` and a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If none of the listed files exists.
        KeyError: If the merged data lacks a ``text`` or ``label`` column.
    """
    frames = []
    for name, path in RAW_DATASETS:
        if not path.exists():
            print(f"Skipping {name}: {path} not found")
            continue
        df = read_jsonl(path)
        df["dataset"] = name
        frames.append(df)
    if not frames:
        raise FileNotFoundError("No datasets available. Please place jsonl files in assets/models/few_shot_intent_sft/data")
    merged = pd.concat(frames, ignore_index=True)

    # Fail here with a clear message rather than deep inside a later cell.
    missing_columns = [col for col in ("text", "label") if col not in merged.columns]
    if missing_columns:
        raise KeyError(f"Raw datasets are missing required column(s): {missing_columns}")

    # Drop incomplete rows *before* the str cast: astype(str) would otherwise
    # turn a missing text into the four-character string "nan".
    incomplete = merged[["text", "label"]].isna().any(axis=1)
    dropped = int(incomplete.sum())
    if dropped:
        print(f"Dropped {dropped:,} rows with missing text or label")
        merged = merged[~incomplete].reset_index(drop=True)

    merged["text"] = merged["text"].astype(str)
    return merged


def apply_filters(df: pd.DataFrame) -> pd.DataFrame:
    filtered = df[~df["label"].isin(BLACKLIST_INTENTS)].copy()
    label_counts = filtered["label"].value_counts()
    keep_labels = label_counts[label_counts >= MIN_SAMPLES].index
    filtered = filtered[filtered["label"].isin(keep_labels)].copy()
    filtered["text_length"] = filtered["text"].str.len()
    return filtered


def balance_dataset(df: pd.DataFrame, max_samples: int) -> pd.DataFrame:
    balanced_parts = []
    for label, group in df.groupby("label"):
        if len(group) > max_samples:
            sample = group.sample(n=max_samples, random_state=RANDOM_SEED)
        else:
            sample = group
        balanced_parts.append(sample)
    balanced = pd.concat(balanced_parts, ignore_index=True)
    balanced = balanced.sample(frac=1.0, random_state=RANDOM_SEED).reset_index(drop=True)
    balanced["text_length"] = balanced["text"].str.len()
    return balanced

In [None]:
# Load every available corpus into a single frame and report its size.
raw_df = load_raw_dataset()
n_rows = len(raw_df)
n_sources = raw_df["dataset"].nunique()
print(f"Loaded {n_rows:,} rows from {n_sources} datasets")
# Last expression of the cell: shows the first rows in the notebook output.
raw_df.head()

In [None]:
# Per-corpus overview: row count and number of distinct labels, largest
# corpus first.
per_dataset = raw_df.groupby("dataset")
raw_summary = per_dataset.agg(
    samples=("text", "count"),
    unique_labels=("label", "nunique"),
)
raw_summary = raw_summary.sort_values("samples", ascending=False)
raw_summary

### Clean and balance the dataset
1. Remove intents on the blacklist that do not fit the WeChat chat use case.
2. Drop intents with fewer than `MIN_SAMPLES` examples.
3. Cap large classes at `MAX_SAMPLES_PER_INTENT` samples to avoid bias toward frequent intents.
4. Shuffle to avoid unintentional ordering artifacts.

In [None]:
# Run the cleaning steps: blacklist / minimum-size filtering, then
# per-intent capping and shuffling.
filtered_df = apply_filters(raw_df)
balanced_df = balance_dataset(filtered_df, MAX_SAMPLES_PER_INTENT)

# Report the size of each stage so the effect of every step is visible.
for stage, frame in (("Filtered", filtered_df), ("Balanced", balanced_df)):
    print(f"{stage} dataset: {len(frame):,} rows · {frame['label'].nunique()} intents")

balanced_df.head()

In [None]:
# Character-length summary of the balanced messages.
length_stats = balanced_df["text_length"].describe()
print("Length (chars) stats:", length_stats, sep="\n")

# Rows per intent, most frequent first; reused by the plotting and
# persistence cells further down.
intent_counts = balanced_df["label"].value_counts()
intent_counts.iloc[:10]

### Label distribution (top 30 intents)
The bar plot highlights the top intents after filtering and balancing.

In [None]:
# Horizontal bar chart of the most frequent intents.
top_n = 30
# head() keeps the largest classes; the slice reverses them so the smallest
# of the selection is listed first.
plot_series = intent_counts.head(top_n)[::-1]
plt.figure(figsize=(10, 12))
# seaborn 0.13 deprecates `palette` without `hue` (slated for removal in
# 0.14). Mapping hue to the same categories keeps one colour per bar;
# dodge=False stops the bars from being offset within each category slot.
ax = sns.barplot(
    x=plot_series.values,
    y=plot_series.index,
    hue=plot_series.index,
    dodge=False,
    palette="viridis",
)
# The hue mapping duplicates the y-axis, so drop the legend if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
# Report the number of bars actually shown; fewer than top_n intents may exist.
plt.title(f"Top {len(plot_series)} intents by sample count")
plt.xlabel("Samples")
plt.ylabel("Intent")
plt.tight_layout()
plt.show()

### Text length overview
Character-length histograms and per-dataset box plots help us understand complexity and tailor truncation length.

In [None]:
# Histogram (with KDE overlay) of message lengths; the dashed red line marks
# the median.
plt.figure(figsize=(10, 5))
sns.histplot(balanced_df["text_length"], bins=40, kde=True, color="#3b82f6")
plt.axvline(balanced_df["text_length"].median(), color="red", linestyle="--", label="median")
plt.title("Message length distribution")
plt.xlabel("Characters per message")
plt.legend()
plt.tight_layout()
plt.show()

# Box plot of message length per source corpus.
plt.figure(figsize=(10, 5))
# seaborn 0.13 deprecates `palette` without `hue` (slated for removal in
# 0.14). Mapping hue to the x variable keeps one colour per corpus;
# dodge=False keeps each box centred on its category.
ax = sns.boxplot(
    data=balanced_df,
    x="dataset",
    y="text_length",
    hue="dataset",
    dodge=False,
    palette="Set2",
)
# The hue mapping duplicates the x-axis, so drop the legend if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
plt.title("Length spread per source dataset")
plt.xlabel("Dataset")
plt.ylabel("Characters")
plt.tight_layout()
plt.show()

### Dataset vs intent heatmap
The heatmap highlights which intents originate from which dataset, revealing potential coverage gaps.

In [None]:
# Count rows per (dataset, intent) pair, then spread intents across columns;
# pairs that never occur become 0.
pair_counts = balanced_df.groupby(["dataset", "label"]).size()
pivot = pair_counts.unstack(fill_value=0)

# Heatmap with one row per corpus and one column per intent.
heatmap_style = {
    "cmap": "YlGnBu",
    "cbar_kws": {"label": "Samples"},
    "linewidths": 0.5,
}
plt.figure(figsize=(12, 6))
sns.heatmap(pivot, **heatmap_style)
plt.title("Dataset contribution heatmap")
plt.xlabel("Intent")
plt.ylabel("Dataset")
plt.tight_layout()
plt.show()

### Persist processed data for reuse
Saving the balanced dataset speeds up repeated experiments and keeps `assets/models/intent_predictor` self-contained.

In [None]:
# Persist the balanced dataset and the per-intent sample counts.
balanced_df.to_parquet(PROCESSED_DATA_PATH, index=False)
# Name the index column explicitly: whether value_counts() names its index
# depends on the pandas version, and an unnamed index yields an empty header.
intent_counts.to_csv(LABEL_COUNTS_PATH, header=["samples"], index_label="label")

print(f"Processed dataset saved to: {PROCESSED_DATA_PATH}")
print(f"Label distribution saved to: {LABEL_COUNTS_PATH}")
# datetime.utcnow() is deprecated since Python 3.12 and returns a naive value.
# Take an aware UTC timestamp and render its offset as the same "Z" suffix.
timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
print(f"Timestamp: {timestamp}")

### Quick samples
Use this cell to inspect a few random rows whenever you re-run the notebook.

In [None]:
# Show up to five random rows. The count is clamped to the frame size because
# DataFrame.sample raises when asked for more rows than exist (without replacement).
balanced_df.sample(n=min(5, len(balanced_df)), random_state=RANDOM_SEED)