In [1]:

import tqdm
import numpy as np
import codecs
#import jieba
from collections import Counter#计数器
import torch

import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence ,pack_padded_sequence,pad_packed_sequence

from torchtext.data.utils import get_tokenizer#分词器

In [2]:
from transformers import BertTokenizer, BertConfig, BertForMaskedLM, BertForNextSentencePrediction
 
from transformers import BertModel
 
model_name = 'bert-base-chinese'
MODEL_PATH = '/bert-base-chinese/'
 
 # a.通过词典导入分词器
tokenizer_zh = BertTokenizer.from_pretrained(model_name) 

In [3]:
UNK_IDX = 0 #未知
PAD_IDX = 1  #
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#DEBUG = True  

In [4]:
#构建词汇表
def build_dict(sentences, max_words = 50000):
    vocab = Counter(np.concatenate(sentences)).most_common(max_words)#最大单词数是50000
    word_to_id = {w[0]: index + 2 for index, w in enumerate(vocab)}
    word_to_id['UNK'] = UNK_IDX  #0
    word_to_id['PAD'] = PAD_IDX  #1
    id_to_word = {v: k for k, v in word_to_id.items()}
    return word_to_id,id_to_word

In [5]:
# 利用词典对原始句子编码 单词->数字

def encode(en_sentences, ch_sentences, en_wtoi, zh_wtoi, sort_by_len=True):
    
    out_en_sentences = [[en_wtoi.get(w, UNK_IDX) for w in sent] for sent in en_sentences]
    out_ch_sentences = [[zh_wtoi.get(w, UNK_IDX) for w in sent] for sent in ch_sentences]
        
    
    #返回w对应的值，否则返回UNK_IDX
    def len_argsort(seq):#按照长度进行排序
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))
       
    # 把中文和英文按照同样的顺序排序
    if sort_by_len:
        sorted_index = len_argsort(out_en_sentences)
        out_en_sentences = [out_en_sentences[i] for i in sorted_index]
        out_ch_sentences = [out_ch_sentences[i] for i in sorted_index]
        
    return out_en_sentences, out_ch_sentences

In [6]:

DIR_PATH="/mnt/seq2seq_att/en-zh"
with codecs.open(DIR_PATH+'/train.zh','r','utf-8') as f1:
    target_text=f1.read()
with codecs.open(DIR_PATH+'/train.en','r','utf-8') as f2:
    source_text=f2.read()
with codecs.open(DIR_PATH+'/test.zh','r','utf-8') as f3:
    test_target_text=f3.read()
with codecs.open(DIR_PATH+'/test.en','r','utf-8') as f4:
    test_source_text=f4.read()

In [7]:
####设置句子最大长度为80
max_length=80


from torchtext.data.utils import get_tokenizer#分词器
tokenizer_en = get_tokenizer('basic_english')#按空格进行分割
train_en = []
for sentence in tqdm.tqdm(source_text.split("\n")):
    text= tokenizer_en(sentence.lower())
    if len(text)>max_length:
        text=text[0:max_length]    
    train_en.append(["BOS"] + text+ ["EOS"])#小写
    

test_en = []
for sentence in tqdm.tqdm(test_source_text.split("\n")):
    text= tokenizer_en(sentence.lower())
    if len(text)>max_length:
        text=text[0:max_length]    
    test_en.append(["BOS"] + text + ["EOS"])#小写


train_zh = []
for sentence in tqdm.tqdm(target_text.split("\n")):
    #train_zh.append(["BOS"] + tokenizer_cn(sentence) + ["EOS"])

    text=tokenizer_zh.tokenize(sentence)
    if len(text)>max_length:
        text=text[0:max_length]
    train_zh.append(["BOS"] + text + ["EOS"])

test_zh = []
for sentence in tqdm.tqdm(test_target_text.split("\n")):
    text=tokenizer_zh.tokenize(sentence)
    if len(text)>max_length:
        text=text[0:max_length]
    test_zh.append(["BOS"] +text+ ["EOS"])
    
train_zh =train_zh[0:2897366] 


100%|██████████| 2897366/2897366 [01:02<00:00, 46367.17it/s]
100%|██████████| 4001/4001 [00:00<00:00, 55975.72it/s]
100%|██████████| 3781317/3781317 [19:31<00:00, 3226.76it/s] 
100%|██████████| 4001/4001 [00:01<00:00, 2939.16it/s]


In [9]:
###生成词典
en_wtoi, en_itow = build_dict(train_en)
zh_wtoi, zh_itow = build_dict(train_zh)

In [10]:
###整数编码
train_en_encode, train_zh_encode = encode(train_en, train_zh, en_wtoi, zh_wtoi)
test_en_encode, test_zh_encode = encode(test_en, test_zh, en_wtoi, zh_wtoi)

In [12]:
import numpy as np
train_en_encode_save=np.array(train_en_encode)
np.save('train_en_bert.npy',train_en_encode_save)

train_zh_encode_save=np.array(train_zh_encode)
np.save('train_zh_bert.npy',train_zh_encode_save)

test_zh_encode_save=np.array(test_zh_encode)
np.save('test_zh_bert.npy',test_zh_encode_save)

test_en_encode_save=np.array(test_en_encode)
np.save('test_en_bert.npy',test_en_encode_save)

  train_en_encode_save=np.array(train_en_encode)
  train_zh_encode_save=np.array(train_zh_encode)
  test_zh_encode_save=np.array(test_zh_encode)
  test_en_encode_save=np.array(test_en_encode)


In [13]:
np.save('en_wtoi_bert.npy',en_wtoi)

np.save('en_itow_bert.npy',en_itow)

np.save('zh_wtoi_bert.npy',zh_wtoi)

np.save('zh_itow_bert.npy',zh_itow)