In [3]:
# ===== Q4: data statistics helpers (no LaTeX output) =====
import os, re
from typing import List, Dict
from transformers import T5TokenizerFast

DATA_DIR = "data"  # directory holding the .nl / .sql files; change this if your path differs

def _read_lines(path: str) -> List[str]:
    """Return the non-blank lines of a UTF-8 text file.

    Each line is stripped of leading/trailing whitespace (including the
    newline); lines that are empty after stripping are dropped.
    """
    kept: List[str] = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            text = raw.strip()
            if text:
                kept.append(text)
    return kept

def _char_stats(lines: List[str]) -> Dict[str, float]:
    """Summarise string lengths (in characters) as mean / min / max.

    Returns a dict with keys "mean", "min" and "max". For an empty input
    all three are zero (mean as 0.0, min and max as 0).
    """
    sizes = list(map(len, lines))
    count = len(sizes)
    if count == 0:
        return {"mean": 0.0, "min": 0, "max": 0}
    average = sum(sizes) / count
    return {"mean": average, "min": min(sizes), "max": max(sizes)}

# Naive tokenisers used for the "before preprocessing" vocabulary size.
# Natural language: runs of ASCII letters, digits and underscores.
_NL_TOKEN_RE = re.compile(r"[A-Za-z0-9_]+")
# SQL: the same word tokens, or a run of operator/punctuation characters
# (so adjacent symbols such as ">=" form a single token).
# The hyphen is escaped on purpose: an unescaped "+-/" inside a character class
# is parsed as the range U+002B..U+002F ("+", ",", "-", ".", "/"). That range
# happens to cover the same characters, so the matched set is unchanged, but
# the explicit form no longer depends on that coincidence.
_SQL_TOKEN_RE = re.compile(r"[A-Za-z0-9_]+|[(),.*=<>!;'%+\-/]+")


def _vocab_size_simple(lines: List[str], kind: str) -> int:
    """Count distinct lower-cased tokens in ``lines`` using a naive regex tokeniser.

    Args:
        lines: Strings to tokenise.
        kind: "nl" to tokenise with ``_NL_TOKEN_RE`` or "sql" to tokenise
            with ``_SQL_TOKEN_RE``.

    Returns:
        The number of unique tokens found across all lines.

    Raises:
        ValueError: If ``kind`` is neither "nl" nor "sql" (raised even when
            ``lines`` is empty).
    """
    if kind == "nl":
        pattern = _NL_TOKEN_RE
    elif kind == "sql":
        pattern = _SQL_TOKEN_RE
    else:
        raise ValueError("kind must be 'nl' or 'sql'")

    # Collect straight into a set; only the number of distinct tokens matters.
    vocab = set()
    for text in lines:
        vocab.update(pattern.findall(text.lower()))
    return len(vocab)

def _token_len_stats_t5(lines: List[str], tokenizer: T5TokenizerFast) -> Dict[str, float]:
    """Summarise tokenised sequence lengths as mean / min / max.

    Lengths are taken from the tokenizer's "length" output with special
    tokens included, no padding and no truncation, so they are closer to
    what the model actually receives.

    Args:
        lines: Strings to tokenise as one batch.
        tokenizer: Callable tokenizer that accepts a list of strings and,
            with ``return_length=True``, returns a mapping with a "length"
            entry holding one length per input.

    Returns:
        A dict with keys "mean", "min" and "max"; all zero for empty input.
    """
    empty_stats = {"mean": 0.0, "min": 0, "max": 0}
    # Guard before tokenising: an empty batch is not guaranteed to be accepted
    # by the tokenizer, so the empty case must not depend on that call.
    if not lines:
        return empty_stats
    enc = tokenizer(lines, add_special_tokens=True, padding=False, truncation=False, return_length=True)
    lens = enc["length"]
    if not lens:
        return empty_stats
    return {"mean": sum(lens) / len(lens), "min": min(lens), "max": max(lens)}

def _token_vocab_size_t5(lines: List[str], tokenizer: T5TokenizerFast) -> int:
    """Count the distinct token ids produced by encoding every line.

    Each line is encoded on its own with ``add_special_tokens=False``, so
    special tokens are not counted towards the vocabulary size.
    """
    seen_ids = set()
    for text in lines:
        seen_ids.update(tokenizer.encode(text, add_special_tokens=False))
    return len(seen_ids)


In [4]:
# ===== Compute and print numbers only =====

def _load_split(filename: str) -> List[str]:
    """Read one data file from DATA_DIR via _read_lines."""
    return _read_lines(os.path.join(DATA_DIR, filename))


def _raw_summary(nl_lines: List[str], sql_lines: List[str]) -> dict:
    """Character-level stats and naive vocabulary sizes for one split."""
    return {
        "num_examples": len(nl_lines),
        "sent_chars": _char_stats(nl_lines),
        "sql_chars": _char_stats(sql_lines),
        "vocab_nl": _vocab_size_simple(nl_lines, "nl"),
        "vocab_sql": _vocab_size_simple(sql_lines, "sql"),
    }


def _token_summary(nl_lines: List[str], sql_lines: List[str], tok) -> dict:
    """Token-level length stats and token vocabulary sizes for one split."""
    return {
        "sent_tokens": _token_len_stats_t5(nl_lines, tok),
        "sql_tokens": _token_len_stats_t5(sql_lines, tok),
        "vocab_nl_tokens": _token_vocab_size_t5(nl_lines, tok),
        "vocab_sql_tokens": _token_vocab_size_t5(sql_lines, tok),
    }


# 1) Load the natural-language and SQL files for both splits.
train_nl = _load_split("train.nl")
dev_nl = _load_split("dev.nl")
train_sql = _load_split("train.sql")
dev_sql = _load_split("dev.sql")

# 2) Before preprocessing: character lengths and naive vocabulary sizes.
#    The example count is taken from the natural-language file of each split.
before_train = _raw_summary(train_nl, train_sql)
before_dev = _raw_summary(dev_nl, dev_sql)

# 3) After preprocessing: token-level statistics from the T5 tokenizer.
tokenizer = T5TokenizerFast.from_pretrained("google-t5/t5-small")

after_train = _token_summary(train_nl, train_sql, tokenizer)
after_dev = _token_summary(dev_nl, dev_sql, tokenizer)

# 4) Print (one decimal place for floats, so the numbers can be pasted directly)
def _fmt(x):
    """Format floats with one decimal place; render anything else with str()."""
    if not isinstance(x, float):
        return str(x)
    return f"{x:.1f}"

# Report section 1: statistics on the raw text. Each line shows the train value
# followed by the dev value. Means are passed through _fmt; counts and min/max
# values are interpolated as-is.
print("=== BEFORE preprocessing (chars / simple vocab) ===")
print(f"Number of examples: Train={before_train['num_examples']}  Dev={before_dev['num_examples']}")
print(f"Mean sentence length (chars): Train={_fmt(before_train['sent_chars']['mean'])}  Dev={_fmt(before_dev['sent_chars']['mean'])}")
print(f"Mean SQL query length (chars): Train={_fmt(before_train['sql_chars']['mean'])}  Dev={_fmt(before_dev['sql_chars']['mean'])}")
print(f"Min/Max sentence length (chars): Train={before_train['sent_chars']['min']} / {before_train['sent_chars']['max']}  "
      f"Dev={before_dev['sent_chars']['min']} / {before_dev['sent_chars']['max']}")
print(f"Min/Max SQL query length (chars): Train={before_train['sql_chars']['min']} / {before_train['sql_chars']['max']}  "
      f"Dev={before_dev['sql_chars']['min']} / {before_dev['sql_chars']['max']}")
print(f"Vocabulary size (natural language): Train={before_train['vocab_nl']}  Dev={before_dev['vocab_nl']}")
print(f"Vocabulary size (SQL): Train={before_train['vocab_sql']}  Dev={before_dev['vocab_sql']}")

# Report section 2: the token-level statistics from after_train / after_dev,
# in the same train-then-dev layout. The leading "\n" separates the sections.
print("\n=== AFTER preprocessing (T5 tokens) ===")
print(f"Mean sentence length (tokens): Train={_fmt(after_train['sent_tokens']['mean'])}  Dev={_fmt(after_dev['sent_tokens']['mean'])}")
print(f"Mean SQL query length (tokens): Train={_fmt(after_train['sql_tokens']['mean'])}  Dev={_fmt(after_dev['sql_tokens']['mean'])}")
print(f"Min/Max sentence length (tokens): Train={after_train['sent_tokens']['min']} / {after_train['sent_tokens']['max']}  "
      f"Dev={after_dev['sent_tokens']['min']} / {after_dev['sent_tokens']['max']}")
print(f"Min/Max SQL query length (tokens): Train={after_train['sql_tokens']['min']} / {after_train['sql_tokens']['max']}  "
      f"Dev={after_dev['sql_tokens']['min']} / {after_dev['sql_tokens']['max']}")
print(f"Vocabulary size (natural language, tokens): Train={after_train['vocab_nl_tokens']}  Dev={after_dev['vocab_nl_tokens']}")
print(f"Vocabulary size (SQL, tokens): Train={after_train['vocab_sql_tokens']}  Dev={after_dev['vocab_sql_tokens']}")


=== BEFORE preprocessing (chars / simple vocab) ===
Number of examples: Train=4225  Dev=466
Mean sentence length (chars): Train=62.2  Dev=61.8
Mean SQL query length (chars): Train=600.7  Dev=582.7
Min/Max sentence length (chars): Train=6 / 215  Dev=10 / 139
Min/Max SQL query length (chars): Train=73 / 1438  Dev=78 / 1370
Vocabulary size (natural language): Train=858  Dev=443
Vocabulary size (SQL): Train=540  Dev=340

=== AFTER preprocessing (T5 tokens) ===
Mean sentence length (tokens): Train=18.1  Dev=18.1
Mean SQL query length (tokens): Train=217.4  Dev=211.1
Min/Max sentence length (tokens): Train=3 / 60  Dev=4 / 44
Min/Max SQL query length (tokens): Train=26 / 511  Dev=31 / 503
Vocabulary size (natural language, tokens): Train=791  Dev=465
Vocabulary size (SQL, tokens): Train=555  Dev=395
