In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import numpy as np
import pandas as pd

import pickle
import jieba
import h5py

SOS_token = 0
EOS_token = 1

In [None]:

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    #通过不断输入sentence（字符串的格式），构建词与下标的对应（词典），方便制作one-hot。
    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
            
    def save(self, path):
        with open(path,'wb') as f:
            pickle.dump([self.name,self.word2index, self.word2count, self.index2word, self.n_words],f)
    
    def load(self,path):
        with open(path,'rb') as f:
            name, self.word2index, self.word2count, self.index2word, self.n_words = pickle.load(f)
        if self.name != name:
            print('error: Name error------------------------------!')
            
            
##################################################################

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def normalizeChinese(s):
    try:
        s.encode("gb2312")
    except UnicodeEncodeError:
        return ' '
    s = re.sub(r"[~!@#$%^&* ]+",r' ', s)
    return s


#lang1 = 'zh'  lang2 = 'en'
#默认英文到中文
def readTrainLangs(lang1, lang2, reverse=True,fenci = False):
    print("Reading lines...")

    zh_lines = open('../data/train.%s'% lang1).read().strip().split('\n')
    #zh_lines = zh_lines[0:20]  #for test

    zh_data_list = []
    if fenci:
        #jieba 分词
        for line in zh_lines:
            seg_line = jieba.cut(line,cut_all=False)
            #dic = [seg for seg in seg_line]
            dic = ' '.join(seg_line)
            tmp = ' '
            for char in dic.split(' '):
                val = normalizeChinese(char)
                tmp += val+' '
            zh_data_list.append(tmp)
    else: #用空格按字分开
        for line in zh_lines:
            dic = ' '.join(line)
            tmp = ' '
            for char in dic.split(' '):
                val = normalizeChinese(char)##去除生僻词
                tmp += val+' '
            zh_data_list.append(tmp)

    en_lines = open('../data/train.%s'% lang2).read().strip().split('\n')
    #en_lines = en_lines[0:20]  #for test
    
    # Split every line into pairs and normalize
    #去掉一些标点符号
    en_data_list = [[normalizeString(s) for s in l.split('\t')] for l in en_lines]
    pairs = []
    if reverse:
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
        for en,zh in zip(en_data_list,zh_data_list):
            input_lang.addSentence(en[0])
            output_lang.addSentence(zh)
            pairs.append([en[0].encode('utf-8'),zh.encode('gb2312')])
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
        for en,zh in zip(en_data_list,zh_data_list):
            input_lang.addSentence(zh)
            output_lang.addSentence(en[0])
            pairs.append([zh.encode('gb2312'), en[0].encode('utf-8')])
            
    return input_lang, output_lang, pairs
##################################################

#这部分就是对数据进行处理的函数了，上面写的函数都会在这里被调用
#最后得到三个变量input_lang，output_lang分别是源语言和目标语言的类，包含它们各自的词典。
#pairs是一个列表，列表的元素是一个二元tuple，tuple里面的内容是一句源语言字符串，一句目标语言字符串。
def prepareData(lang1, lang2, reverse=True, fenci=False):
    input_lang, output_lang, pairs = readTrainLangs(lang1, lang2, reverse, fenci)
    print("Read %s sentence pairs" % len(pairs))
    print(pairs[0][0].decode('utf-8'),pairs[0][1].decode('gb2312'))
    #pairs = filterPairs(pairs)
    #print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
#     for pair in pairs:
#         input_lang.addSentence(pair[0].decode('utf-8'))
#         output_lang.addSentence(pair[1].decode('gb2312'))
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


In [None]:
inputLang, outputLang, pairs = prepareData('zh','en')

In [None]:
inputLang.save('../data/en_train.pkl')
outputLang.save('../data/zh_train.pkl')

h5 = h5py.File('../data/train_afterProcess.h5py','w')
h5.create_dataset('pairs',data=pairs,dtype = 'S400')
h5.close()

In [21]:
import dataProcess as dp
h5py_file = h5py.File('../data/train_afterProcess.h5py','r')
pairs = h5py_file['pairs']
print(pairs[0][0].decode('utf-8'))
print(pairs[0][1].decode('gb2312'))

inputlang = dp.Lang('en')
outputlang = dp.Lang('zh')
inputlang.load('../data/en_train.pkl')
outputlang.load('../data/zh_train.pkl')

print(inputlang.name,inputlang.n_words)
print(outputlang.name,outputlang.n_words)
#h5py_file.close()

a pair of red crowned cranes have staked out their nesting territory
 一 对 丹 顶 鹤 正 监 视 着 它 们 的 筑 巢 领 地 
error: Name error------------------------------!
error: Name error------------------------------!
en 388091
zh 6754


In [None]:
import numpy as np
tt = ['我是中国人'.encode('gb2312'),'sdfeow dsf df dsf '.encode('utf-8')]
h5 = h5py.File('../data/test1.h5py','w')
h5.create_dataset('pairs',data=tt,dtype = 'S400')
h5.close()

#  data analysis

In [10]:
count = inputlang.word2count
wordFreqLess10 = 0#保存词频小于某个阈值的词
wordFreqMore100 = 0
all_word_count=0
print('input english word count:' , inputlang.n_words)
for word in count:
    all_word_count += count[word]
    if count[word] <= 10:
        wordFreqLess10 +=count[word]
    if count[word] <= 10:
        wordFreqMore100 +=1
print('word count larger than 100:',inputlang.n_words - wordFreqMore100)
print('word count Less than 10:',wordFreqLess10)
print('rate of word count less than 10:', float(wordFreqLess10)/float(all_word_count))

input english word count: 388091
word count larger than 100: 82103
word count Less than 10: 677382
rate of word count less than 10: 0.005871136314350413


388089
388089
388091


In [22]:
print(len(inputlang.word2count))
print(len(inputlang.word2index))
print(len(inputlang.index2word))
all_en_words = inputlang.word2count.copy()
for word in all_en_words:
    if  all_en_words[word] <= 10:
        inputlang.word2count.pop(word)
        index = inputlang.word2index[word]
        inputlang.word2index.pop(word)
        inputlang.index2word.pop(index)
        
print(len(inputlang.word2count))
print(len(inputlang.word2index))
print(len(inputlang.index2word))




388089
388089
388091
82101
82101
82103


In [18]:
testdict = {'a':23,'b':2,'c':4}
test2 = {1:'d',34:'f',22:'t'}
print(test2)
test2.pop(34)
print(test2)
a = testdict.copy()
print(a)
for word in testdict:
    if testdict[word] < 3:
        a.pop(word)
print(a)

{1: 'd', 34: 'f', 22: 't'}
{1: 'd', 22: 't'}
{'b': 2, 'c': 4, 'a': 23}
{'c': 4, 'a': 23}


### 匹配生僻字与非法字符

In [None]:
import re

def is_rare_name(string):
    pattern = re.compile(u"[~!@#$%^&* ]")
    match = pattern.search(string)
    if match:
        return True
    try:
        string.encode("gb2312")
    except UnicodeEncodeError:
        return True
    return False

def normalizeChinese(s):
    try:
        s.encode("gb2312")
    except UnicodeEncodeError:
        return ' '
    s = re.sub(r"[~!@#$%^&* ]+",r' ', s)
    return s


In [None]:
str1 = '我 是 @ 中 国 人 ， 我 # 不 才 囧 怪。'
#str1 = 'I an # a boy'
l = [normalizeChinese(s) for s in str1.split(' ')]
print(l)
tmp = ' '
for s in str1.split(' '):
    val = normalizeChinese(s)
    tmp += val+' '
print(tmp)

In [None]:
result = re.search(r'.','I love FishC.com!')
print(result.group())

In [None]:
s = '我 是 @ 中 国 人 ， 我 # 不 囧 才 怪。'
s = re.sub(r"[~!@#$%^&* ]+",r' ', s)
print (s)