# CapsWriter-Offline 独立热词与纠错系统 (Portable Standalone)

本 Notebook 完整整合了以下核心逻辑，**逻辑分支与原始代码库完全对等**：
- **音素处理** (algo_phoneme)
- **相似度算法** (algo_calc)
- **FastRAG 加速检索** (rag_fast)
- **拼音纠错** (PhonemeCorrector, 包含 `similar_threshold` 相关逻辑)
- **纠错历史 RAG** (RectificationRAG)
- **调试工具** (Phoneme Debug)
- **LLM 集成** (Prompt Builder & Ollama Client)

数据准备和输出方式参照自 `hotword_system_demo.ipynb`。

In [1]:
import sys
import os
import re
import time
import json
import requests
import threading
import logging
from typing import List, Tuple, Dict, Set, Union, Literal, Optional, NamedTuple
from dataclasses import dataclass
from collections import defaultdict
from difflib import SequenceMatcher
from pathlib import Path

# 配置日志
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)

# 确保控制台输出 UTF-8
if sys.platform == 'win32':
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')

# --- 依赖库导入 ---
try:
    from pypinyin import pinyin, Style
except ImportError:
    pinyin = None; Style = None

try:
    import numpy as np
    HAS_NUMPY = True
except ImportError:
    HAS_NUMPY = False

try:
    from numba import njit
    HAS_NUMBA = True
except ImportError:
    HAS_NUMBA = False

## 1. 核心模型与音素处理 (algo_phoneme)

In [2]:
@dataclass(frozen=True, slots=True)
class Phoneme:
    value: str
    lang: Literal['zh', 'en', 'num', 'other']
    is_word_start: bool = False
    is_word_end: bool = False
    char_start: int = 0
    char_end: int = 0

    @property
    def is_tone(self) -> bool: return self.value.isdigit()
    @property
    def info(self) -> Tuple[str, str, bool, bool, bool, int, int]:
        return (self.value, self.lang, self.is_word_start, self.is_word_end, self.is_tone, self.char_start, self.char_end)

def normalize_text(text: str) -> str:
    res = []; prev = ''
    for c in text:
        if c.isalnum() or '\u4e00' <= c <= '\u9fff':
            if c.isupper() and prev.islower(): res.append(' ')
            elif c.isdigit() and prev.isalpha(): res.append(' ')
            elif prev.isdigit() and c.isalpha(): res.append(' ')
            res.append(c.lower()); prev = c
        else:
            if res and res[-1] != ' ': res.append(' ')
            prev = ''
    return "".join(res).strip()

def split_mixed_label(text: str) -> List[str]:
    tokens = []; s = text.lower()
    while s:
        if s[0] == ' ': s = s[1:]; continue
        m = re.match(r'[a-z]+', s)
        if m: tokens.append(m.group(0)); s = s[len(m.group(0)):]
        else:
            m = re.match(r'[0-9]+', s)
            if m: tokens.append(m.group(0)); s = s[len(m.group(0)):]
            else: tokens.append(s[0]); s = s[1:]
    return tokens

def get_phoneme_info(text: str, split_char: bool = True) -> List[Phoneme]:
    if not pinyin: return [Phoneme(c, 'zh', char_start=i, char_end=i+1) for i, c in enumerate(text)]
    seq = []; pos = 0
    while pos < len(text):
        c = text[pos]
        if '\u4e00' <= c <= '\u9fff':
            start = pos; pos += 1
            while pos < len(text) and '\u4e00' <= text[pos] <= '\u9fff': pos += 1
            frag = text[start:pos]
            try:
                pi = pinyin(frag, style=Style.INITIALS, strict=False)
                pf = pinyin(frag, style=Style.FINALS, strict=False)
                pt = pinyin(frag, style=Style.TONE3, neutral_tone_with_five=True)
                for i in range(min(len(frag), len(pi), len(pf), len(pt))):
                    idx = start + i; init, fin, tone = pi[i][0], pf[i][0], pt[i][0]
                    if init: seq.append(Phoneme(init, 'zh', is_word_start=True, char_start=idx, char_end=idx+1))
                    if fin: seq.append(Phoneme(fin, 'zh', is_word_start=not init, char_start=idx, char_end=idx+1))
                    if tone and tone[-1].isdigit(): seq.append(Phoneme(tone[-1], 'zh', is_word_end=True, char_start=idx, char_end=idx+1))
            except: 
                for i, char in enumerate(frag): seq.append(Phoneme(char, 'zh', is_word_start=True, is_word_end=True, char_start=start+i, char_end=start+i+1))
        elif 'a' <= c.lower() <= 'z' or '0' <= c <= '9':
            start = pos; pos += 1
            while pos < len(text):
                cur = text[pos]
                if not ('a' <= cur.lower() <= 'z' or '0' <= cur <= '9'): break
                if (text[pos-1].islower() and cur.isupper()) or (text[pos-1].isalpha() and cur.isdigit()) or (text[pos-1].isdigit() and cur.isalpha()): break
                pos += 1
            token = text[start:pos].lower(); lang = 'num' if token.isdigit() else 'en'
            if split_char:
                for i, char in enumerate(token): seq.append(Phoneme(char, lang, is_word_start=(i==0), is_word_end=(i==len(token)-1), char_start=start+i, char_end=start+i+1))
            else: seq.append(Phoneme(token, lang, is_word_start=True, is_word_end=True, char_start=start, char_end=pos))
        else: pos += 1
    return seq

def get_phoneme_seq(text: str) -> List[Phoneme]:
    normalized = normalize_text(text)
    seq = []
    for token in split_mixed_label(normalized):
        if re.match(r'^[a-z0-9]+$', token):
            lang = 'num' if token.isdigit() else 'en'
            seq.append(Phoneme(token, lang, is_word_start=True, is_word_end=True))
        elif len(token) == 1:
            if not pinyin: seq.append(Phoneme(token, 'zh', is_word_start=True, is_word_end=True))
            else:
                try:
                    pi = pinyin(token, style=Style.INITIALS, strict=False)
                    pf = pinyin(token, style=Style.FINALS, strict=False)
                    pt = pinyin(token, style=Style.TONE3, neutral_tone_with_five=True)
                    has_init = pi and pi[0] and pi[0][0]
                    if has_init: seq.append(Phoneme(pi[0][0], 'zh', is_word_start=True))
                    if pf and pf[0] and pf[0][0]: seq.append(Phoneme(pf[0][0], 'zh', is_word_start=not has_init))
                    tone = pt[0][0][-1] if pt[0][0][-1].isdigit() else '5'
                    seq.append(Phoneme(tone, 'zh', is_word_end=True))
                except: seq.append(Phoneme(token, 'zh', is_word_start=True, is_word_end=True))
        else: seq.append(Phoneme(token, 'zh', is_word_start=True, is_word_end=True))
    return seq

## 2. 相似度算法 (algo_calc)

In [3]:
SIMILAR_PHONEMES = [{'an', 'ang'}, {'en', 'eng'}, {'in', 'ing'}, {'ian', 'iang'}, {'uan', 'uang'}, {'z', 'zh'}, {'c', 'ch'}, {'s', 'sh'}, {'l', 'n'}, {'f', 'h'}, {'ai', 'ei'}]

def _lcs_length(s1: str, s2: str) -> int:
    m, n = len(s1), len(s2)
    if m < n: s1, s2 = s2, s1; m, n = n, m
    if n == 0: return 0
    prev = [0] * (n + 1); curr = [0] * (n + 1)
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            curr[j] = prev[j-1] + 1 if s1[i-1] == s2[j-1] else max(prev[j], curr[j-1])
        prev, curr = curr, prev
    return prev[n]

def _get_tuple_cost(t1: Tuple, t2: Tuple) -> float:
    if t1[1] != t2[1]: return 1.0
    if t1[0] == t2[0]: return 0.0
    if t1[1] == 'zh':
        pair = {t1[0], t2[0]}
        for s in SIMILAR_PHONEMES:
            if pair.issubset(s): return 0.5
    if t1[1] == 'en':
        lcs = _lcs_length(t1[0], t2[0])
        max_len = max(len(t1[0]), len(t2[0]))
        if max_len > 0: return 1.0 - (lcs / max_len)
    return 1.0

def fuzzy_substring_distance(seq1: Union[List[Phoneme], List[Tuple]], seq2: Union[List[Phoneme], List[Tuple]]) -> float:
    n, m = len(seq1), len(seq2)
    if n == 0: return 0.0
    if m == 0: return float(n)
    t1 = [p.info if isinstance(p, Phoneme) else p for p in seq1]
    t2 = [p.info if isinstance(p, Phoneme) else p for p in seq2]
    prev = [0.0] * (m + 1); curr = [0.0] * (m + 1)
    for i in range(1, n + 1):
        curr[0] = float(i)
        for j in range(1, m + 1):
            cost = _get_tuple_cost(t1[i-1], t2[j-1])
            curr[j] = min(prev[j]+1.0, curr[j-1]+1.0, prev[j-1]+cost)
        prev, curr = curr, prev
    return min(prev)

def fuzzy_substring_score(seq1, seq2) -> float:
    n = len(seq1)
    if n == 0: return 0.0
    dist = fuzzy_substring_distance(seq1, seq2)
    return max(0.0, 1.0 - (dist / n))

## 3. RAG 加速检索与核心算法 (rag_fast)

In [4]:
if HAS_NUMBA and HAS_NUMPY:
    @njit(cache=True)
    def _fuzzy_substring_numba(main, sub):
        n, m = len(sub), len(main)
        if n == 0 or m == 0: return float(n)
        dp = np.zeros((n+1, m+1), dtype=np.float32)
        for i in range(1, n+1): dp[i, 0] = float(i)
        for i in range(1, n+1):
            for j in range(1, m+1):
                cost = 0.0 if sub[i-1] == main[j-1] else 1.0
                dp[i, j] = min(dp[i-1, j]+1.0, dp[i, j-1]+1.0, dp[i-1, j-1]+cost)
        return np.min(dp[n, 1:])

class FastRAG:
    def __init__(self, threshold=0.6):
        self.threshold = threshold
        self.ph_to_id = {}; self.index = defaultdict(list); self.hotword_count = 0
    def _encode(self, phs: List[Phoneme]):
        ids = []
        for p in phs:
            if p.value not in self.ph_to_id: self.ph_to_id[p.value] = len(self.ph_to_id) + 1
            ids.append(self.ph_to_id[p.value])
        return np.array(ids, dtype=np.int32) if HAS_NUMPY else ids
    def add_hotwords(self, hotwords: Dict[str, List[Phoneme]]):
        for hw, phs in hotwords.items():
            if not phs: continue
            codes = self._encode(phs)
            for i in range(min(len(codes), 2)): self.index[codes[i]].append((hw, codes))
            self.hotword_count += 1
    def search(self, input_phs: List[Phoneme], top_k=10):
        if not input_phs: return []
        input_codes = self._encode(input_phs); unique = set(input_codes); candidates = []
        for c in unique: candidates.extend(self.index.get(c, []))
        seen = set(); results = []
        for hw, cands in candidates:
            if hw in seen or len(cands) > len(input_codes) + 3: continue
            seen.add(hw)
            if HAS_NUMBA and HAS_NUMPY: dist = _fuzzy_substring_numba(input_codes, cands)
            else: dist = self._python_dist(input_codes, cands)
            score = 1.0 - (dist / len(cands))
            if score >= self.threshold: results.append((hw, round(float(score), 3)))
        results.sort(key=lambda x: x[1], reverse=True)
        return results[:top_k]
    def _python_dist(self, main, sub):
        n, m = len(sub), len(main)
        dp = [[0.0] * (m+1) for _ in range(n+1)]
        for i in range(1, n+1): dp[i][0] = float(i)
        for i in range(1, n+1):
            for j in range(1, m+1):
                cost = 0.0 if sub[i-1] == main[j-1] else 1.0
                dp[i][j] = min(dp[i-1, j]+1.0, dp[i, j-1]+1.0, dp[i-1, j-1]+cost)
        return min(dp[n][1:])

## 4. 纠错系统逻辑 (hot_phoneme & hot_rectification)

In [5]:
class MatchResult(NamedTuple):
    start: int; end: int; score: float; hotword: str

class CorrectionResult(NamedTuple):
    text: str; matchs: List[Tuple[str, float]]; similars: List[Tuple[str, float]]

class PhonemeCorrector:
    def __init__(self, threshold: float = 0.7, similar_threshold: float = None):
        self.threshold = threshold
        self.similar_threshold = similar_threshold if similar_threshold is not None else threshold - 0.2
        self.hotwords: Dict[str, List[Phoneme]] = {}
        self.fast_rag = FastRAG(threshold=min(self.threshold, self.similar_threshold) - 0.1)
        self._lock = threading.Lock()
    def update_hotwords(self, text: str):
        lines = [l.strip() for l in text.splitlines() if l.strip() and not l.strip().startswith('#')]
        new_hotwords = {}
        for hw in lines:
            phs = get_phoneme_info(hw)
            if phs: new_hotwords[hw] = phs
        with self._lock:
            self.hotwords = new_hotwords
            self.fast_rag = FastRAG(threshold=min(self.threshold, self.similar_threshold) - 0.1)
            self.fast_rag.add_hotwords(new_hotwords)
        return len(new_hotwords)
    def load_hotwords_file(self, path: str):
        if os.path.exists(path):
            with open(path, 'r', encoding='utf-8') as f: return self.update_hotwords(f.read())
        return 0
    def _find_matches(self, fast_results, input_processed):
        matches = []; similars = []; input_len = len(input_processed)
        for hw, _ in fast_results:
            hw_phs = self.hotwords[hw]; hw_compare = [p.info[:5] for p in hw_phs]; target_len = len(hw_compare)
            if target_len > input_len: continue
            for i in range(input_len - target_len + 1):
                seg = input_processed[i : i + target_len]
                if seg[0][1] != 'en' and seg[0][0] != hw_compare[0][0]: continue
                if not seg[0][2]: continue
                is_end_ok = seg[-1][3] or (i+target_len < input_len and input_processed[i+target_len][1]=='zh' and input_processed[i+target_len][4])
                if not is_end_ok: continue
                score = fuzzy_substring_score(hw_compare, seg)
                m = MatchResult(seg[0][5], seg[-1][6], score, hw); similars.append(m)
                if score >= self.threshold: matches.append(m)
        seen = set(); sorted_sims = sorted(similars, key=lambda x: x.score, reverse=True)
        sims_final = [m for m in sorted_sims if m.score >= self.similar_threshold and not (m.hotword in seen or seen.add(m.hotword))]
        return matches, sims_final
    def _resolve_and_replace(self, text, matches):
        matches.sort(key=lambda x: (x.score, x.end - x.start), reverse=True)
        final = []; all_info = []; occupied = []; seen = set()
        for m in matches:
            if (m.hotword, m.score) not in seen: all_info.append((m.hotword, m.score)); seen.add((m.hotword, m.score))
            if m.score < self.threshold or any(not (m.end <= rs or m.start >= re) for rs, re in occupied): continue
            if text[m.start:m.end] != m.hotword: final.append(m)
            occupied.append((m.start, m.end))
        res = list(text); final.sort(key=lambda x: x.start, reverse=True)
        for m in final: res[m.start:m.end] = list(m.hotword)
        return "".join(res), [(m.hotword, m.score) for m in sorted(final, key=lambda x: x.start)], all_info
    def correct(self, text, k=10):
        in_phs = get_phoneme_info(text)
        if not in_phs: return CorrectionResult(text, [], [])
        with self._lock:
            fast_res = self.fast_rag.search(in_phs, top_k=100); processed = [p.info for p in in_phs]
            matches, sims = self._find_matches(fast_res, processed)
        nt, fhw, ainfo = self._resolve_and_replace(text, matches)
        return CorrectionResult(nt, fhw, [(m.hotword, m.score) for m in sims[:k]])

def _get_word_boundaries(text):
    bounds = []; i, n = 0, len(text)
    while i < n:
        if not (text[i].isalnum() or '\u4e00' <= text[i] <= '\u9fff'): i += 1; continue
        s = i
        if '\u4e00' <= text[i] <= '\u9fff': i += 1
        else:
            low = text[i].islower()
            while i < n and text[i].isalnum():
                if text[i].isupper() and low and i > s: break
                low = text[i].islower(); i += 1
        bounds.append((s, i, text[s:i]))
    return bounds

def extract_diff_fragments(w, r):
    wb = _get_word_boundaries(w); rb = _get_word_boundaries(r)
    m = SequenceMatcher(None, [b[2] for b in wb], [b[2] for b in rb]); frags = []
    for tag, i1, i2, j1, j2 in m.get_opcodes():
        if tag in ('replace', 'delete') and i2 > i1: frags.append(w[wb[i1][0]:wb[i2-1][1]])
        if tag in ('replace', 'insert') and j2 > j1: frags.append(r[rb[j1][0]:rb[j2-1][1]])
    return list(dict.fromkeys(frags))

class RectificationRAG:
    def __init__(self, threshold=0.5):
        self.threshold = threshold; self.records = []; self._lock = threading.Lock()
    def load_rectify_text(self, text):
        recs = []
        for block in text.split('---'):
            lines = [l.strip() for l in block.split('\n') if l.strip() and not l.strip().startswith('#')]
            if len(lines) >= 2:
                w, r = lines[0], lines[1]; frags = extract_diff_fragments(w, r) or [w]
                recs.append({'wrong': w, 'right': r, 'fphs': {f: get_phoneme_seq(f) for f in frags}})
        with self._lock: self.records = recs
    def load_rectify_file(self, path: str):
        if os.path.exists(path):
            with open(path, 'r', encoding='utf-8') as f: self.load_rectify_text(f.read())
    def search(self, text, top_k=5):
        in_phs = get_phoneme_seq(text); matches = []
        with self._lock:
            for rec in self.records:
                best = 0.0
                for fphs in rec['fphs'].values():
                    if not fphs: continue
                    score = fuzzy_substring_score(fphs, in_phs)
                    if score > best: best = score
                if best >= self.threshold: matches.append((rec['wrong'], rec['right'], round(best, 3)))
        return sorted(matches, key=lambda x: x[2], reverse=True)[:top_k]

## 5. 调试工具 (Phoneme Debug)

In [6]:
def get_phoneme_cost(p1: Phoneme, p2: Phoneme) -> float:
    if p1.lang != p2.lang: return 1.0
    if p1.value == p2.value: return 0.0
    if p1.lang == 'zh' and p2.lang == 'zh':
        pair = {p1.value, p2.value}
        for s in SIMILAR_PHONEMES: 
            if pair.issubset(s): return 0.5
    if p1.lang == 'en' and p2.lang == 'en':
        lcs_len = _lcs_length(p1.value, p2.value)
        max_len = max(len(p1.value), len(p2.value))
        return 1.0 - (lcs_len / max_len)
    return 1.0

def find_best_match(main_seq: List[Phoneme], sub_seq: List[Phoneme]) -> Tuple[float, int, int]:
    n, m = len(sub_seq), len(main_seq)
    if n == 0 or m == 0: return 0.0, 0, 0
    valid_starts = [j for j in range(m) if main_seq[j].is_word_start]
    dp = [[0.0] * (m + 1) for _ in range(n + 1)]
    for j in range(m + 1):
        if j not in valid_starts: dp[0][j] = float('inf')
    for i in range(1, n + 1): dp[i][0] = dp[i-1][0] + 1.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = get_phoneme_cost(sub_seq[i-1], main_seq[j-1])
            dp[i][j] = min(dp[i-1][j] + 1.0, dp[i][j-1] + 1.0, dp[i-1][j-1] + cost)
    min_dist, end_pos, best_start = float('inf'), 0, 0
    for j in range(1, m + 1):
        if dp[n][j] < min_dist:
            curr_i, curr_j = n, j
            while curr_i > 0:
                cost = get_phoneme_cost(sub_seq[curr_i-1], main_seq[curr_j-1])
                if curr_j > 0 and abs(dp[curr_i][curr_j] - (dp[curr_i-1][curr_j-1] + cost)) < 1e-9:
                    curr_i -= 1; curr_j -= 1
                elif abs(dp[curr_i][curr_j] - (dp[curr_i-1][curr_j] + 1.0)) < 1e-9: curr_i -= 1
                elif curr_j > 0 and abs(dp[curr_i][curr_j] - (dp[curr_i][curr_j-1] + 1.0)) < 1e-9: curr_j -= 1
                else: curr_i -= 1
            if curr_j in valid_starts: 
                min_dist = dp[n][j]; end_pos = j; best_start = curr_j
    return 1.0 - (min_dist / n), best_start, end_pos

def test_pair(input_text, hotword, split_char=True):
    print(f"--- Testing: '{input_text}' vs '{hotword}' ---")
    input_seq = get_phoneme_info(input_text, split_char=split_char)
    target_seq = get_phoneme_info(hotword, split_char=split_char)
    print(f"Input Seq: {[p.value for p in input_seq]}")
    print(f"Target Seq: {[p.value for p in target_seq]}")
    score, start, end = find_best_match(input_seq, target_seq)
    print(f"Score: {score:.4f}")
    if score > 0:
        matched_segment = input_seq[start:end]
        print(f"Matched Segment: {[p.value for p in matched_segment]}")
    print("\n")

## 6. LLM 集成 (Prompt Builder & Ollama Client)

In [7]:
class PromptBuilder:
    def __init__(self, system_prompt: str = "你是一个输入法纠错助。"):
        self.system_prompt = system_prompt
        self.prompt_prefix_hotwords = "热词列表："
        self.prompt_prefix_rectify = "纠错历史：\n"
        self.prompt_prefix_input = "用户输入："

    def build(self, user_content: str, hotwords: List[Tuple[str, float]] = None, rectify_matches: List[Tuple[str, str, float]] = None) -> List[Dict]:
        messages = [{"role": "system", "content": self.system_prompt}]
        context_parts = []
        if hotwords:
            words = [hw for hw, _ in hotwords]
            context_parts.append(f"{self.prompt_prefix_hotwords}[{', '.join(words)}]")
        if rectify_matches:
            lines = [self.prompt_prefix_rectify]
            for wrong, right, _ in rectify_matches:
                lines.append(f"- {wrong} => {right}")
            context_parts.append("\n".join(lines) + "\n\n") 
        context_str = "\n\n".join(context_parts)
        full_user_content = f"{context_str}{self.prompt_prefix_input}{user_content}"
        messages.append({"role": "user", "content": full_user_content})
        return messages

def ollama_chat(messages: List[Dict], model: str = "gemma3:4b", stream: bool = True):
    url = "http://localhost:11434/api/chat"
    payload = {"model": model, "messages": messages, "stream": stream}
    try:
        response = requests.post(url, json=payload, stream=stream)
        if not stream: return response.json().get('message', {}).get('content', '')
        full_res = ""
        for line in response.iter_lines():
            if line:
                chunk = json.loads(line)
                content = chunk.get('message', {}).get('content', '')
                full_res += content
                print(content, end="", flush=True)
                if chunk.get('done'): break
        print(); return full_res
    except Exception as e:
        print(f"\n[Error calling Ollama]: {e}"); return ""

## 7. 综合数据准备

In [14]:
# --- A. 数据准备 ---

hotwords_data = """
Claude
Bilibili
Microsoft
麦当劳
肯德基
# 这是一个注释
VsCode
七浦路
"""


rectify_data = """
# 纠错历史记录
# 用 --- 分段
# 每段两行：第一行是原句，第二行是修正结果

把那个锯子给我
把那个句子给我
---
cloud code is good
Claude Code is good
---
今天天其不错
今天天气不错
"""


test_cases_text = """
我想去吃买当劳和肯得鸡
喜欢刷Bili Bili
请把那个锯子发给我一下
今天天及真的很好
Hello klaude
I think klaud code is very good
我很喜欢 cloud
"""
cases = [l.strip() for l in test_cases_text.strip().split('\n') if l.strip()]

## 8. 系统初始化

In [9]:
# --- B. 系统初始化与数据加载 ---

# 初始化纠错器和检索器
corrector = PhonemeCorrector(threshold=0.8)
rectifier = RectificationRAG(threshold=0.5)

# 从字符串加载热词
corrector.update_hotwords(hotwords_data)
rectifier.load_rectify_text(rectify_data)

# 从文本文件加载热词
# corrector.load_hotwords_file("hot.txt")
# rectifier.load_rectify_file("hot-rectify.txt")

## 9. 执行综合纠错与 RAG 演示

In [15]:
# --- C. 执行综合纠错演示 ---
print("\n" + "="*50)
print("【 CapsWriter-Offline 综合纠错系统演示 】")
print("="*50)

for i, t in enumerate(cases):
    print(f"\nCase {i+1}: '{t}'")
    result = corrector.correct(t)
    print(f"  [纠错结果] {result.text}")
    if result.matchs: print(f"  [匹配热词] {result.matchs}")
    if result.similars: print(f"  [相似推荐] {result.similars}")
    rag_results = rectifier.search(t)
    if rag_results:
        print(f"  [RAG 相似历史]")
        for wrong, right, score in rag_results:
            print(f"    - '{wrong}' => '{right}' (相似度: {score:.3f})")


【 CapsWriter-Offline 综合纠错系统演示 】

Case 1: '我想去吃买当劳和肯得鸡'
  [纠错结果] 我想去吃麦当劳和肯德基
  [匹配热词] [('麦当劳', 0.8888888888888888), ('肯德基', 1.0)]
  [相似推荐] [('肯德基', 1.0), ('麦当劳', 0.8888888888888888)]
  [RAG 相似历史]
    - '今天天其不错' => '今天天气不错' (相似度: 0.667)

Case 2: '喜欢刷Bili Bili'
  [纠错结果] 喜欢刷Bilibili
  [匹配热词] [('Bilibili', 1.0)]
  [相似推荐] [('Bilibili', 1.0)]

Case 3: '请把那个锯子发给我一下'
  [纠错结果] 请把那个锯子发给我一下
  [RAG 相似历史]
    - '分段' => '把那个锯子给我' (相似度: 0.857)

Case 4: '今天天及真的很好'
  [纠错结果] 今天天及真的很好
  [RAG 相似历史]
    - '今天天其不错' => '今天天气不错' (相似度: 0.667)
    - '分段' => '把那个锯子给我' (相似度: 0.500)

Case 5: 'Hello klaude'
  [纠错结果] Hello Claude
  [匹配热词] [('Claude', 0.8333333333333334)]
  [相似推荐] [('Claude', 0.8333333333333334)]

Case 6: 'I think klaud code is very good'
  [纠错结果] I think klaud code is very good
  [相似推荐] [('VsCode', 0.6666666666666667)]
  [RAG 相似历史]
    - 'cloud code is good' => 'Claude Code is good' (相似度: 0.833)

Case 7: '我很喜欢 cloud'
  [纠错结果] 我很喜欢 cloud
  [RAG 相似历史]
    - 'cloud code is good' => 'Claude Code is good

## 10. 音素匹配调试演示 (test_pair)

In [11]:
print("\n" + "="*50)
print("【 Phoneme Debug 调试演示 】")
print("="*50)
test_pair("cloud", "claude")
test_pair("vscode", "VS Code")
test_pair("七福路", "七浦路")


【 Phoneme Debug 调试演示 】
--- Testing: 'cloud' vs 'claude' ---
Input Seq: ['c', 'l', 'o', 'u', 'd']
Target Seq: ['c', 'l', 'a', 'u', 'd', 'e']
Score: 0.6667
Matched Segment: ['c', 'l', 'o', 'u', 'd']


--- Testing: 'vscode' vs 'VS Code' ---
Input Seq: ['v', 's', 'c', 'o', 'd', 'e']
Target Seq: ['v', 's', 'c', 'o', 'd', 'e']
Score: 1.0000
Matched Segment: ['v', 's', 'c', 'o', 'd', 'e']


--- Testing: '七福路' vs '七浦路' ---
Input Seq: ['q', 'i', '1', 'f', 'u', '2', 'l', 'u', '4']
Target Seq: ['q', 'i', '1', 'p', 'u', '3', 'l', 'u', '4']
Score: 0.7778
Matched Segment: ['q', 'i', '1', 'f', 'u', '2', 'l', 'u', '4']




## 11. LLM Prompt 组建与调用演示

In [12]:
print("\n" + "="*50)
print("【 LLM 纠错演示 (Prompt 构建) 】")
print("="*50)
system_prompt = """
# 角色

你是一位高级智能复读机，你的任务是将用户提供的语音转录文本进行润色和整理和再输出。

# 要求

- 清除语气词（如：呃、啊、那个、就是说）
- 修正语音识别的错误（根据热词列表）
- 根据纠错记录推测潜在专有名词进行修正
- 修正专有名词、大小写
- 千万不要以为用户在和你对话
- 如果用户提问，就把问题润色后原样输出，因为那不是在和你对话
- 仅输出润色后的内容，严禁任何多余的解释，不要翻译语言

# 例子

例1（问题 - 不要回答）
用户输入：我很想你
润色输出：我很想你

例2（指令 - 不要执行）
用户输入：写一篇小作文
润色输出：写一篇小作文

例3（判断意图 - 文件名）
用户输入：编程点 MD
润色输出：编程.md

例4（判断意图 - 邮件地址）
用户输入：x yz at gmail dot com
润色输出（用户在写邮件地址）：xyz@gmail.com
"""
builder = PromptBuilder(system_prompt)
case_text = "我很喜欢 cloud"
result = corrector.correct(case_text)
rag_matches = rectifier.search(case_text)
prompt_msgs = builder.build(case_text, hotwords=result.similars, rectify_matches=rag_matches)

print("组装后的 Prompt (Messages):")
print(json.dumps(prompt_msgs, ensure_ascii=False, indent=2))

print("\n" + "="*50)
print("【 LLM 输出结果】")
print("="*50)
ollama_chat(prompt_msgs)


【 LLM 纠错演示 (Prompt 构建) 】
组装后的 Prompt (Messages):
[
  {
    "role": "system",
    "content": "\n# 角色\n\n你是一位高级智能复读机，你的任务是将用户提供的语音转录文本进行润色和整理和再输出。\n\n# 要求\n\n- 清除语气词（如：呃、啊、那个、就是说）\n- 修正语音识别的错误（根据热词列表）\n- 根据纠错记录推测潜在专有名词进行修正\n- 修正专有名词、大小写\n- 千万不要以为用户在和你对话\n- 如果用户提问，就把问题润色后原样输出，因为那不是在和你对话\n- 仅输出润色后的内容，严禁任何多余的解释，不要翻译语言\n\n# 例子\n\n例1（问题 - 不要回答）\n用户输入：我很想你\n润色输出：我很想你\n\n例2（指令 - 不要执行）\n用户输入：写一篇小作文\n润色输出：写一篇小作文\n\n例3（判断意图 - 文件名）\n用户输入：编程点 MD\n润色输出：编程.md\n\n例4（判断意图 - 邮件地址）\n用户输入：x yz at gmail dot com\n润色输出（用户在写邮件地址）：xyz@gmail.com\n"
  },
  {
    "role": "user",
    "content": "纠错历史：\n\n- cloud code is good => Claude Code is good\n\n用户输入：我很喜欢 cloud"
  }
]

【 LLM 输出结果】
我很喜欢 Claude Code



'我很喜欢 Claude Code\n'

## 12. 性能测试 (FastRAG)

In [13]:
print("\n" + "="*50)
print("【 性能测试 (FastRAG) 】")
print("="*50)
test_text = "我想去吃买当劳和肯得鸡, Hello klaude, 喜欢刷Bili Bili"
in_phs = get_phoneme_info(test_text)
if HAS_NUMBA:
    for _ in range(5): _ = corrector.fast_rag.search(in_phs)
start = time.time(); iterations = 1000
for _ in range(iterations): _ = corrector.fast_rag.search(in_phs)
elapsed = time.time() - start
print(f"测试文本: {test_text[:40]}...")
print(f"测试轮数: {iterations}")
print(f"平均耗时: {elapsed/iterations*1000:.3f}ms")
print(f"吞吐量: {iterations/elapsed:.1f} 次/秒")
print("="*50)


【 性能测试 (FastRAG) 】
测试文本: 我想去吃买当劳和肯得鸡, Hello klaude, 喜欢刷Bili Bili...
测试轮数: 1000
平均耗时: 0.028ms
吞吐量: 35124.9 次/秒
