In [1]:
from pathlib import Path
from collections import Counter
from re import compile
from test_utility import ensure_data


def count_word_freq(path: Path) -> tuple[Counter, int, int]:
    """Count whitespace-separated words in a UTF-8 text file.

    Args:
        path: File to read. Every line is treated as a sequence of words
            separated by one or more whitespace characters.

    Returns:
        A 3-tuple ``(counter, total_words, total_chars)``:

        * ``counter`` maps each distinct word to its number of occurrences;
        * ``total_words`` is the sum of all occurrence counts;
        * ``total_chars`` is the sum of word lengths weighted by their
          counts (separating whitespace is not included).
    """
    f = Counter()
    with open(path, encoding='utf-8') as src:
        for line in src:
            # str.split() with no argument splits on runs of whitespace and
            # yields nothing for a blank line. Splitting the stripped line
            # with a regex instead returns [''] for blank lines, which would
            # count the empty string as a word and inflate the statistics.
            f.update(line.split())
    return f, sum(f.values()), sum(len(w) * n for w, n in f.items())


def count_corpus(train_path: Path, test_path: Path) -> tuple[float, float, float, float, float, float, float, float, float]:
    """Compute summary statistics for a training/test corpus pair.

    Both files are tallied with ``count_word_freq``.

    Args:
        train_path: Path of the training corpus.
        test_path: Path of the test corpus.

    Returns:
        Nine floats, in this order: for the training corpus, the character
        count, number of distinct words and total word count (each divided
        by 10000), then the average word length; the same four values for
        the test corpus; and finally the out-of-vocabulary rate in percent,
        i.e. the share of test word occurrences whose word never appears in
        the training corpus.

    Raises:
        ZeroDivisionError: If either corpus contains no words.
    """
    unit = 10000

    train_vocab, train_tokens, train_len = count_word_freq(train_path)
    test_vocab, test_tokens, test_len = count_word_freq(test_path)

    # Occurrences in the test corpus of words unseen during training.
    oov_tokens = sum(
        occurrences
        for word, occurrences in test_vocab.items()
        if word not in train_vocab
    )

    train_stats = (
        train_len / unit,
        len(train_vocab) / unit,
        train_tokens / unit,
        train_len / train_tokens,
    )
    test_stats = (
        test_len / unit,
        len(test_vocab) / unit,
        test_tokens / unit,
        test_len / test_tokens,
    )
    return train_stats + test_stats + (oov_tokens / test_tokens * 100,)


# Locate the SIGHAN 2005 bakeoff data set. ensure_data is a project helper;
# presumably it downloads/unpacks the archive when missing -- verify there.
# Its result is used below as a path-like object supporting the `/` operator.
sighan05 = ensure_data('icwb2-data', 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip')

# Table header: corpus name, then four training-set columns, four test-set
# columns, and the OOV rate, matching the order returned by count_corpus.
print('|语料库|字符数|词语种数|总词频|平均词长|字符数|词语种数|总词频|平均词长|OOV|')

row_format = '|%s|%.0f万|%.0f万|%.0f万|%.1f|%.0f万|%.0f万|%.0f万|%.1f|%.2f%%|'
training_dir = sighan05 / 'training'
gold_dir = sighan05 / 'gold'

for data in ('pku', 'msr', 'as', 'cityu'):
    # The 'as' gold file is named "*_testing_gold", the others "*_test_gold".
    gold_stem = 'testing_gold' if data == 'as' else 'test_gold'
    train_path = training_dir / f'{data}_training.utf8'
    test_path = gold_dir / f'{data}_{gold_stem}.utf8'
    stats = count_corpus(train_path, test_path)
    print(row_format % (data.upper(), *stats))

|语料库|字符数|词语种数|总词频|平均词长|字符数|词语种数|总词频|平均词长|OOV|
|PKU|183万|6万|111万|1.6|17万|1万|10万|1.7|5.75%|
|MSR|405万|9万|237万|1.7|18万|1万|11万|1.7|2.65%|
|AS|837万|14万|545万|1.5|20万|2万|12万|1.6|4.33%|
|CITYU|240万|7万|146万|1.7|7万|1万|4万|1.7|7.40%|
