In [1]:
import sentencepiece as spm
import os
import re

In [26]:
# Working directory for the whole pipeline: the raw corpus is read from here and
# the cleaned files, the SentencePiece model and the tokenized files are written here.
# The path is relative, so it is resolved against the kernel's current working directory.
data_dir = 'ML_tools/NLP/sentencepiece/data_dir/'
# Sanity check: as the last expression of the cell, this displays True/False.
os.path.exists(data_dir)

True

In [4]:
# Paths of the raw corpus files: Chinese side first, English side second.
train_dev_raw_zh, train_dev_raw_en = (
    os.path.join(data_dir, raw_name)
    for raw_name in ('train_dev.raw.zh', 'train_dev.raw.en')
)

In [9]:
def _read_head(path, limit=10):
    """Return the first ``limit`` lines of the text file at ``path``.

    Lines keep their trailing newline. The file is decoded as UTF-8 explicitly so
    the preview does not depend on the platform's default encoding.

    :param path: path of the text file to preview
    :param limit: maximum number of lines to return
    :return: list of at most ``limit`` strings
    """
    with open(path, 'r', encoding='utf-8') as handle:
        # zip() asks range() first, so it stops after `limit` lines without
        # reading the rest of the file.
        return [line for _, line in zip(range(limit), handle)]


print('----------------------------------粗略查看中英文情况------------------------------')
test_zh_list = _read_head(train_dev_raw_zh)
print('----------------------------------中文文字情况------------------------------')
# Each line still ends with '\n', so print() leaves a blank line after every sentence.
for each_zh in test_zh_list:
    print(each_zh)

test_en_list = _read_head(train_dev_raw_en)
print('----------------------------------英文文字情况------------------------------')
for each_en in test_en_list:
    print(each_en)

----------------------------------粗略查看中英文情况------------------------------
----------------------------------中文文字情况------------------------------
非常謝謝你，克里斯。能有這個機會第二度踏上這個演講台

真是一大榮幸。我非常感激。

這個研討會給我留下了極為深刻的印象，我想感謝大家 對我之前演講的好評。

我是由衷的想這麼說，有部份原因是因為 —— 我真的有需要!

請你們設身處地為我想一想！

我曾搭乘副總統專機八年。

現在我卻必須脫了鞋子才能上飛機!

讓我跟你們說一個很短的故事，你們就會明白我的日子是怎麼過的。

這是一個真實的故事 — 徹頭徹尾都是真實的。

在我跟我夫人蒂佩爾離開 —— 白宮 —— 後 我們從那什維爾的家開車到 東邊 50 英哩外的一個我們擁有的小農場 —

----------------------------------英文文字情况------------------------------
Thank you so much, Chris.

And it's truly a great honor to have the opportunity to come to this stage twice; I'm extremely grateful.

I have been blown away by this conference, and I want to thank all of you for the many nice comments about what I had to say the other night.

And I say that sincerely, partly because  I need that.

Put yourselves in my position.

I flew on Air Force Two for eight years.

Now I have to take off my shoes or boots to get on an airplane!

I'll tell you one quick story to illus

## step1 Q2B
中文语料中的标点符号、空格等字符多为全角字符，需要先将其转换为半角字符。

In [10]:
def strQ2B(ustring):
    """Convert full-width characters in ``ustring`` to their half-width forms.

    The ideographic space U+3000 becomes an ASCII space, and code points
    U+FF01..U+FF5E (full-width letters, digits and punctuation) are shifted
    down by 0xFEE0 onto their ASCII counterparts. All other characters are
    returned unchanged.

    :param ustring: a string (or an iterable of strings, which are concatenated)
    :return: the converted text as a single string
    """
    def _to_half_width(ch):
        code_point = ord(ch)
        if code_point == 0x3000:            # full-width space
            return chr(0x20)
        if 0xFF01 <= code_point <= 0xFF5E:  # full-width printable ASCII range
            return chr(code_point - 0xFEE0)
        return ch

    return ''.join(
        ''.join(_to_half_width(ch) for ch in piece)
        for piece in ustring
    )

## step2 去除符号 统一处理标点符号

In [18]:
def clean_s(s, lang):
    """
    Clean a single Chinese or English sentence.

    :param s: string, one sentence
    :param lang: string, 'zh' or 'en'; any other value skips the language-specific
                 steps and only collapses whitespace
    :return: string, the cleaned sentence
    """
    parenthesized = r"\([^()]*\)"  # a "(...)" group with no nested parentheses inside
    if lang == 'zh':
        s = strQ2B(s)                        # full-width -> half-width first
        s = re.sub(parenthesized, "", s)     # drop "(text)" annotations
        for dropped in (' ', '—', '_'):      # characters removed outright
            s = s.replace(dropped, '')
        for curly_quote in ('“', '”'):       # unify curly quotes to a straight quote
            s = s.replace(curly_quote, '"')
        # Keep punctuation, padded with spaces so each mark becomes its own token.
        s = re.sub('([。,;!?()\"~「」])', r' \1 ', s)
    elif lang == 'en':
        s = re.sub(parenthesized, "", s)     # drop "(text)" annotations
        s = s.replace('-', '')               # drop hyphens
        # Keep punctuation, padded with spaces so each mark becomes its own token.
        s = re.sub('([.,;!?()\"])', r' \1 ', s)
    # Trim and collapse every run of whitespace to a single space.
    return ' '.join(s.strip().split())

## step3 按照长度进行限制 进行清洗

In [29]:
def len_s(s, lang):
    """
    Measure the length of a sentence.

    :param s: string, the sentence text
    :param lang: language code; for 'zh' the length is the number of characters in
                 ``s`` (spaces included), for anything else it is the number of
                 whitespace-separated tokens
    :return: int, the sentence length
    """
    return len(s) if lang == 'zh' else len(s.split())

def clean(l1, l2, max_len = 1000, min_len = 1, ratio = 9):
    """
    Clean the raw parallel corpus and write the sentence pairs that survive filtering.

    Reads ``train_dev.raw.{l1}`` and ``train_dev.raw.{l2}`` from ``data_dir`` line by
    line in lockstep, cleans each side with ``clean_s`` and writes the kept pairs to
    ``train_dev.clean.{l1}`` / ``train_dev.clean.{l2}`` in the same directory.
    All files are read and written as UTF-8. If both clean files already exist the
    function prints a notice and does nothing.

    :param l1: language1
    :param l2: language2
    :param max_len: maximum sentence length; a pair is dropped if either side is longer
                    (values <= 0 disable the check)
    :param min_len: minimum sentence length; a pair is dropped if either side is shorter
                    (values <= 0 disable the check)
    :param ratio: a pair is dropped when the length of one side divided by the other
                  exceeds this value (values <= 0 disable the check)
    :return: None
    """
    raw_paths = [os.path.join(data_dir, 'train_dev.raw.{}'.format(lang)) for lang in (l1, l2)]
    # Build the output paths once with os.path.join so the existence check below and
    # the files actually written can never disagree (e.g. data_dir without a trailing '/').
    clean_paths = [os.path.join(data_dir, 'train_dev.clean.{}'.format(lang)) for lang in (l1, l2)]

    # Skip the whole function when both clean files are already present.
    if all(os.path.exists(path) for path in clean_paths):
        print('{} and {} clean file exists.'.format(l1, l2))
        return

    with open(raw_paths[0], 'r', encoding='utf-8') as l1_raw_file, \
            open(raw_paths[1], 'r', encoding='utf-8') as l2_raw_file, \
            open(clean_paths[0], 'w', encoding='utf-8') as l1_clean_file, \
            open(clean_paths[1], 'w', encoding='utf-8') as l2_clean_file:
        # zip() stops at the end of the shorter file, so a sentence without a
        # counterpart on the other side is never emitted.
        for s1, s2 in zip(l1_raw_file, l2_raw_file):
            s1 = clean_s(s1.strip(), l1)
            s2 = clean_s(s2.strip(), l2)
            s1_len = len_s(s1, l1)
            s2_len = len_s(s2, l2)
            if min_len > 0 and (s1_len < min_len or s2_len < min_len):
                continue  # too short
            if max_len > 0 and (s1_len > max_len or s2_len > max_len):
                continue  # too long
            if ratio > 0:
                # An empty side (possible only when min_len <= 0) has an unbounded
                # length ratio; drop the pair instead of dividing by zero.
                if s1_len == 0 or s2_len == 0:
                    continue
                if s1_len / s2_len > ratio or s2_len / s1_len > ratio:
                    continue  # lengths too different
            print(s1, file = l1_clean_file)
            print(s2, file = l2_clean_file)


In [30]:
# Produce train_dev.clean.zh / train_dev.clean.en using the default length and ratio limits.
clean(l1='zh', l2='en')

## step4 sentencepiece处理

In [39]:
def train_sentence_model(vocab_size, l1, l2):
    """
    Train one SentencePiece BPE model on the cleaned files of both languages.

    The trainer input is ``train_dev.clean.{l1}`` and ``train_dev.clean.{l2}`` from
    ``data_dir``. Training is skipped (with a printed notice) when the model file
    for this ``vocab_size`` already exists.

    :param vocab_size: size of the vocabulary to learn; also part of the model file name
    :param l1: language1
    :param l2: language2
    :return: None
    """
    # SentencePiece appends '.model' and '.vocab' to model_prefix. The prefix itself
    # already ends in '.model', so the trained model is 'spm{vocab_size}.model.model'.
    # That full name is the file that must be checked; checking the bare prefix
    # never matches and would retrain on every run.
    model_prefix = os.path.join(data_dir, 'spm{}.model'.format(vocab_size))
    model_file = model_prefix + '.model'
    if os.path.exists(model_file):
        print(os.path.basename(model_file) + ' exists.')
        return

    spm.SentencePieceTrainer.train(
        input=','.join([
            os.path.join(data_dir, 'train_dev.clean.{}'.format(l1)),
            os.path.join(data_dir, 'train_dev.clean.{}'.format(l2))
        ]),
        model_prefix=model_prefix,              # output path prefix
        vocab_size=vocab_size,                  # vocabulary size
        character_coverage=0.9995,              # share of characters covered; 0.9995 suits rich character sets such as Chinese
        model_type='bpe',                       # byte-pair encoding
        input_sentence_size=1000000,            # cap on the number of input sentences, passed as an integer
        shuffle_input_sentence=True,            # sample the input sentences randomly
        normalization_rule_name='nmt_nfkc_cf',  # text normalization rule
    )
# Train the shared zh/en BPE model with a vocabulary of 8000 entries.
train_sentence_model(vocab_size=8000, l1='zh', l2='en')

In [42]:
# Load the trained model. The name contains '.model' twice because the training
# prefix already ends in '.model' and SentencePiece appends another one.
spm_model = spm.SentencePieceProcessor(
    model_file=os.path.join(data_dir, 'spm{}.model.model'.format(8000)),
)
def transform_with_spm_model(l1, l2):
    """
    Tokenize the cleaned files of both languages with the loaded SentencePiece model.

    For each language, every line of ``train_dev.clean.{lang}`` is encoded into
    subword pieces with the module-level ``spm_model`` and written, pieces joined by
    single spaces, to ``train_dev.final.{lang}`` in ``data_dir``. Files are read and
    written as UTF-8. If both final files already exist the function prints a notice
    and does nothing.

    :param l1: language1
    :param l2: language2
    :return: None
    """
    if os.path.exists(os.path.join(data_dir, 'train_dev.final.{}'.format(l1))) \
            and os.path.exists(os.path.join(data_dir, 'train_dev.final.{}'.format(l2))):
        print('{} and {} final file exists.'.format(l1, l2))
        return

    for lang in (l1, l2):
        clean_path = os.path.join(data_dir, 'train_dev.clean.{}'.format(lang))
        final_path = os.path.join(data_dir, 'train_dev.final.{}'.format(lang))
        # Open the input first: if it is missing, no empty output file is left behind.
        with open(clean_path, 'r', encoding='utf-8') as clean_file, \
                open(final_path, 'w', encoding='utf-8') as final_file:
            for line in clean_file:
                tok = spm_model.encode(line.strip(), out_type=str)
                print(' '.join(tok), file=final_file)
# Write train_dev.final.zh / train_dev.final.en from the cleaned files.
transform_with_spm_model(l1='zh', l2='en')