In [None]:
%matplotlib inline

from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import numpy as np
import pickle
import time
import math
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from html.parser import HTMLParser

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

In [None]:
# Detect once whether a CUDA device is usable. Later cells consult this flag
# before moving index tensors or the model onto the GPU.
use_cuda = torch.cuda.is_available()
print(use_cuda)

In [None]:
def readWord2vecDict(file_dir):
    """Load a pickled word-vector dictionary from disk.

    Args:
        file_dir: path of the pickle file to read.

    Returns:
        Whatever object was pickled into the file (the rest of the notebook
        uses it as a ``word -> vector`` mapping).
    """
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only use this on dictionaries produced by a trusted source.
    # The ``with`` block closes the file, so no explicit close() is needed.
    with open(file_dir, 'rb') as f:
        return pickle.load(f)


# Load the pickled English and Chinese word-vector dictionaries. A later cell
# adds 512-dimensional special-token vectors to both, so the stored vectors are
# presumably 512-dimensional as well (TODO confirm against the .bin files).
en_word2vecDict = readWord2vecDict("en_dict_512.bin")
zh_word2vecDict = readWord2vecDict("zh_dict_512.bin")

# print(en_word2vecDict["premiumization"])
# print(zh_word2vecDict["我"])

In [None]:
# ls = list(zh_word2vecDict.values())

# for key in zh_word2vecDict.keys():
#     zh_word2vecDict[key] = zh_word2vecDict[key] * 0.01
# ls = list(zh_word2vecDict.values())
# # ls = np.array(ls)
# min_=1
# max_=-1
# for i in range(len(ls)):
#     if min(ls[i]) < min_:
#         min_ = min(ls[i])
#     if max(ls[i]) > max_:
#         max_ = max(ls[i])
# print("after:")
# print(min_, max_)
    
    

# for key in en_word2vecDict.keys():
#     en_word2vecDict[key] = en_word2vecDict[key] * 0.01
# ls = list(en_word2vecDict.values())
# # ls = np.array(ls)
# min_=1
# max_=-1
# for i in range(len(ls)):
#     if min(ls[i]) < min_:
#         min_ = min(ls[i])
#     if max(ls[i]) > max_:
#         max_ = max(ls[i])
# print("before")
# print(min_, max_)

In [None]:
# Register fixed 512-dimensional vectors for the three special tokens in both
# dictionaries: SOS is all -1, UNK is all 0 and EOS is all +1.
for _vector_table in (en_word2vecDict, zh_word2vecDict):
    _vector_table["SOS"] = -np.ones(512)
    _vector_table["UNK"] = np.zeros(512)
    _vector_table["EOS"] = np.ones(512)
del _vector_table  # do not leave the loop variable behind in the notebook namespace

In [None]:
# Reserved vocabulary indices for the sentence boundary markers. They match the
# "SOS" / "EOS" entries of Lang.index2word (index 0 is used for "UNK").
SOS_token = -1
EOS_token = 1

class Lang:
    """Vocabulary of one language, limited to words that have a word vector.

    The constructor stores ``name`` and ``word2vecDict`` and creates:
    ``word2index`` (word -> integer id), ``word2count`` (word -> number of
    times it was added), ``index2word`` (id -> word, pre-filled with the
    special tokens SOS=-1, UNK=0, EOS=1) and ``n_words`` (the id that will be
    handed to the next new word; it starts at 3).
    """

    def __init__(self, name, word2vecDict):
        self.name = name
        self.word2vecDict = word2vecDict
        self.word2index = {}
        self.word2count = {}
        self.index2word = {-1: "SOS", 0: "UNK", 1: "EOS"}
        # Regular words are numbered from 3 upwards.
        self.n_words = 3

    def addSentence(self, sentence):
        """Add every space-separated token of ``sentence`` to the vocabulary."""
        for token in sentence.strip().split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Register ``word``, or bump its count if it is already known.

        Words that have no entry in ``word2vecDict`` are ignored entirely.
        """
        if word not in self.word2vecDict:
            return
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [None]:
class read_Langs(object):
    """Re-iterable reader for a tab-separated parallel corpus file.

    Iterating yields ``[source, target]`` lists, one per line that consists of
    exactly two tab-separated fields after stripping surrounding whitespace.
    Lines with any other number of fields are skipped.
    """

    def __init__(self,dir):
        # ``dir`` is the path of the corpus file (the name is kept for callers).
        self.dir = dir

    def __iter__(self):
        # The ``with`` block guarantees the file is closed once iteration
        # finishes (or the generator is discarded); the original left the
        # handle open.
        with open(self.dir, encoding = 'utf-8') as corpus:
            for line in corpus:
                fields = line.strip().split('\t')
                if len(fields) == 2:
                    yield fields
def readLangs():
    """Read the training pairs and create empty vocabularies for both sides.

    Returns:
        ``(input_lang, output_lang, pairs)`` where ``pairs`` is the list of
        two-field entries read from the file ``train_pair``, ``input_lang`` is
        an empty "en" vocabulary and ``output_lang`` an empty "zh" vocabulary.
    """
    pairs = list(read_Langs('train_pair'))
    return Lang("en", en_word2vecDict), Lang("zh", zh_word2vecDict), pairs

In [None]:
# A sentence is kept only when it has fewer than this many space-separated tokens.
MAX_LENGTH = 20

def filterPair(p):
    """Return True when both sentences of pair ``p`` are shorter than MAX_LENGTH tokens."""
    return all(len(p[side].split(' ')) < MAX_LENGTH for side in (0, 1))

def filterPairs(pairs):
    """Return the list of pairs that pass ``filterPair``, in their original order."""
    return list(filter(filterPair, pairs))

In [None]:
def prepareData():
    """Read the corpus, drop over-long pairs and build both vocabularies.

    Prints the number of kept pairs and the vocabulary sizes along the way.

    Returns:
        ``(input_lang, output_lang, pairs)`` with ``pairs`` already filtered
        by ``filterPairs`` and both vocabularies populated from those pairs.
    """
    source_vocab, target_vocab, all_pairs = readLangs()
    kept_pairs = filterPairs(all_pairs)
    print("Read %s sentence pairs" % len(kept_pairs))
    for entry in kept_pairs:
        source_vocab.addSentence(entry[0].strip())
        target_vocab.addSentence(entry[1].strip())
    print("Counted words:")
    for vocab in (source_vocab, target_vocab):
        print(vocab.name, vocab.n_words)
    print("Count of sentence pairs:")
    print(len(kept_pairs))
    return source_vocab, target_vocab, kept_pairs

# Build both vocabularies and the filtered sentence-pair list once; functions in
# later cells read these module-level names directly.
input_lang, output_lang, pairs = prepareData()



In [None]:
# with open("input_lang.lang", 'wb') as f:
#     word2vecDict = pickle.dump(input_lang, f)
# f.close()
# with open("output_lang.lang", 'wb') as f:
#     word2vecDict = pickle.dump(output_lang, f)
# f.close()
# with open("pairs.lang", 'wb') as f:
#     word2vecDict = pickle.dump(pairs, f)
# f.close()
        
# with open("input_lang.lang", 'rb') as f:
#     input_lang = pickle.load(f)
# f.close()
# with open("output_lang.lang", 'rb') as f:
#     output_lang = pickle.load(f)
# f.close()
# with open("pairs.lang", 'rb') as f:
#     pairs = pickle.load(f)
# f.close()

In [None]:
def embedding(input, lang, word2vecDict):  #input(1, seq_len)
    """Turn a row of word indices into a tensor of pre-trained word vectors.

    Args:
        input: index tensor of shape (1, seq_len); only row 0 is read.
        lang: object whose ``index2word`` maps each index to a word.
        word2vecDict: mapping word -> 512-dimensional vector; words missing
            from it fall back to the "UNK" entry.

    Returns:
        A (1, seq_len, 512) float tensor (wrapped in ``Variable``) on the CPU.
    """
    index_row = input.cpu().data.numpy()[0]
    seq_len = len(index_row)
    # Every slot is overwritten in the loop below, so no initialisation is needed.
    embeded = torch.FloatTensor(1, seq_len, 512)
    for position, index in enumerate(index_row):
        word = lang.index2word[index]
        vector_key = word if word in word2vecDict else "UNK"
        embeded[0, position] = torch.FloatTensor(word2vecDict[vector_key])
    return Variable(embeded)  #embedding(1, seq_len, embedding_size)

In [None]:
def indexesFromSentence(lang, sentence):
    """Map each space-separated token of ``sentence`` to its vocabulary index.

    Tokens that are not in ``lang.word2index`` are mapped to 0 (the UNK index).
    """
    lookup = lang.word2index
    tokens = sentence.strip().split(' ')
    return [lookup.get(token, 0) for token in tokens]


def variableFromSentence(lang, sentence):
    """Encode ``sentence`` as a 1-D LongTensor framed by SOS_token and EOS_token.

    The tensor is moved to the GPU when ``use_cuda`` is true.
    """
    token_ids = [SOS_token] + indexesFromSentence(lang, sentence) + [EOS_token]
    encoded = Variable(torch.LongTensor(token_ids))
    if use_cuda:
        return encoded.cuda()
    return encoded


def variablesFromPair(pair):
    """Encode a ``[source, target]`` sentence pair with the global vocabularies.

    Returns:
        ``(input_variable, target_variable)``: the source encoded with
        ``input_lang`` and the target encoded with ``output_lang``.
    """
    source_text, target_text = pair[0], pair[1]
    return (variableFromSentence(input_lang, source_text),
            variableFromSentence(output_lang, target_text))

In [None]:
def grad_multiply(x, scale):
    """Return ``x`` unchanged in the forward pass; scale its gradient by ``scale``."""
    return GradMultiply.apply(x, scale)


class GradMultiply(torch.autograd.Function):
    """Autograd function: identity forward, gradient multiplied by ``scale`` backward."""

    @staticmethod
    def forward(ctx, x, scale):
        ctx.scale = scale
        res = x.new(x)
        # ``mark_shared_storage`` only exists on old PyTorch releases; calling
        # it unconditionally raises AttributeError on current ones, so it is
        # invoked only when the context still provides it.
        if hasattr(ctx, 'mark_shared_storage'):
            ctx.mark_shared_storage((x, res))
        return res

    @staticmethod
    def backward(ctx, grad):
        # The second return value is the (non-existent) gradient w.r.t. ``scale``.
        return grad * ctx.scale, None

In [None]:
class Encoder(nn.Module):
    """Convolutional encoder over pre-trained English word vectors.

    The same ``self.conv`` module is applied in each of the ``n_layers``
    iterations, every time followed by a gated linear unit and a scaled
    residual connection.
    """

    def __init__(self):
        super(Encoder,self).__init__()
        self.embedding_size = 512
        self.hidden_size = 512
        self.n_layers = 13
        # linear1 is not used in forward(); it stays registered so the set of
        # parameters (and therefore the state_dict keys) is unchanged.
        self.linear1 = nn.Linear(self.embedding_size, self.hidden_size)
        self.out_channels = 2*self.hidden_size
        self.kernel = (3, self.hidden_size)
        # Symmetric padding along the sequence axis keeps seq_len unchanged.
        self.padding = (int((self.kernel[0]-1)/2), 0)
        #conv input(N_batches, Channels_in, Height_in, Width_in)
        self.conv = nn.Conv2d(1, self.out_channels, self.kernel, (1,1), self.padding)

    def forward(self, input):  #input is only a sequence of indexes of words
        """Encode one source sentence.

        Args:
            input: 1-D index tensor whose first element is the SOS index.

        Returns:
            ``(attn, out)``, both of shape (1, 1, seq_len, hidden_size):
            ``attn`` is the convolution stack output with its gradient scaled
            by ``1 / (2 * n_layers)``; ``out`` is ``attn`` plus the (scaled,
            dropped-out) input embeddings.
        """
        input = input[1:input.size(0)]  #delete the SOS_token
        input = input.unsqueeze(0)  #in input(seq_len), out input(1, seq_len) where 1 means batch size is 1
        input = embedding(input, input_lang, en_word2vecDict)  #out input(1, seq_len, embedding_size)
        # Follow the device of the model parameters instead of always calling
        # .cuda(), which crashed on machines without a GPU.
        if self.conv.weight.is_cuda:
            input = input.cuda()
        input = input*0.1
        input = F.dropout(input, p=0.1, training=self.training)
        input = input.unsqueeze(1) #out input(1, 1, seq_len, hidden_size)
        input_temp = input
        for _ in range(self.n_layers):
            input_ = input
            input = F.dropout(input, p=0.1, training=self.training)
            input = self.conv(input).transpose(1,3)  #out input(1, 1, seq_len, 2*hidden_size)
            input = F.glu(input, 3)  #out input(1, 1, seq_len, hidden_size)
            input = (input + input_)*math.sqrt(0.5)  # scaled residual connection
        input = grad_multiply(input, 1.0 / (2.0 * self.n_layers))  # scale gradients (this only affects backward, not forward)
        return input, input + input_temp

In [None]:
class Decoder(nn.Module):
    """Convolutional decoder with dot-product attention over the encoder output.

    The same ``self.conv`` module is applied in each of the ``n_layers``
    iterations. The input is padded on the left only, so position ``t`` of the
    convolution output never sees target positions after ``t``.
    """

    def __init__(self):
        super(Decoder,self).__init__()
        self.embedding_size = 512
        self.hidden_size = 512
        # The output layer size is taken from the global target vocabulary.
        self.dict_size = output_lang.n_words
        print(self.dict_size)
        self.n_layers = 13
        # linear1, linear2 and linear4 are not used in forward(); they stay
        # registered so the state_dict keys are unchanged.
        self.linear1 = nn.Linear(self.embedding_size, self.hidden_size)
        self.linear2 = nn.Linear(self.hidden_size, 2*self.hidden_size)
        self.linear3 = nn.Linear(self.hidden_size, self.dict_size)
        self.linear4 = nn.Linear(2*self.hidden_size, 2*self.hidden_size)
        self.in_channels = 1
        self.out_channels = 2*self.hidden_size
        self.kernel = (3, self.hidden_size)
        self.stride = (1, 1)
        self.padding = (0, 0)  #mannually pad, without using its inner auto padding
        self.conv = nn.Conv2d(self.in_channels, self.out_channels, self.kernel, self.stride, self.padding)

    def forward(self, target, enc_attn, enc_out):  #target is only a sequence of indexes of words
        """Score the next word for every target position.

        Args:
            target: 1-D index tensor; its last element (EOS) is dropped.
            enc_attn: encoder tensor used to compute the attention scores,
                shape (1, 1, sor_seq_len, hidden_size).
            enc_out: encoder tensor that is averaged with the attention
                weights, same shape as ``enc_attn``.

        Returns:
            Log-probabilities of shape (tar_seq_len, dict_size).
        """
        input = target[0:-1]  #delete the EOS_token
        input = input.unsqueeze(0)  #target(tar_seq_len), input(1, tar_seq_len) where 1 means batch size is 1
        input = embedding(input, output_lang, zh_word2vecDict)  #out input(1, tar_seq_len, embedding_size)
        # Follow the device of the model parameters instead of always calling
        # .cuda(), which crashed on machines without a GPU.
        if self.conv.weight.is_cuda:
            input = input.cuda()
        input = input*0.1
        input = F.dropout(input, p=0.1, training=self.training)
        input = input.unsqueeze(1) #out input(1, 1, tar_seq_len, hidden_size)
        input_temp = input
        left_pad = self.kernel[0]-1
        # Loop-invariant attention scaling: s * sqrt(1/s) with s = source length.
        attn_scale = enc_out.size(2) * math.sqrt(1.0/enc_out.size(2))
        for _ in range(self.n_layers):
            input_ = input
            input = F.dropout(input, p=0.1, training=self.training)
            # Pad kernel_size-1 zero rows at the start of the sequence axis.
            # F.pad allocates on the same device as ``input``, replacing the
            # hand-built zero tensor that was always moved to the GPU.
            input = F.pad(input, (0, 0, left_pad, 0))
            input = self.conv(input)  #out input(1, 2*hidden_size, tar_seq_len, 1)
            input = input.transpose(1, 3)  #out input(1, 1, tar_seq_len, 2*hidden_size)
            input = F.glu(input, 3)  #out input(1, 1, tar_seq_len, hidden_size)
            input__ = input
            input = (input + input_temp)*math.sqrt(0.5)
            input = torch.bmm(input.squeeze(0), enc_attn.squeeze(0).transpose(1, 2))  #out input(1, tar_seq_len, sor_seq_len)
            # Normalise over the source positions; dim=1 is what the implicit
            # default picked for a 2-D input, now stated explicitly.
            input = F.softmax(input.squeeze(0), dim=1).unsqueeze(0) #out input(1, tar_seq_len, sor_seq_len)
            input = torch.bmm(input, enc_out.squeeze(0))  #out input(1, tar_seq_len, hidden_size)
            input = input*attn_scale
            input = (input.unsqueeze(0) + input__)*math.sqrt(0.5) #out input(1, 1, tar_seq_len, hidden_size)
            input = (input + input_)*math.sqrt(0.5)
        input = input.squeeze(0).squeeze(0)  #out input(tar_seq_len, hidden_size)
        input = F.dropout(input, p=0.1, training=self.training)
        input = F.log_softmax(self.linear3(input), dim=1)  #out input(tar_seq_len, dict_size) has tar_seq_len choices of words
        return input

In [None]:
class NMTModel(nn.Module):
    """Encoder/decoder pair wrapped as a single module.

    Args:
        encoder: module called as ``encoder(source)``; must return a pair
            ``(attn, source_seq_out)``.
        decocer: module called as ``decocer(target, attn, source_seq_out)``.
            (The spelling is kept: it is both the parameter name and the
            attribute name under which the sub-module is registered.)
    """

    def __init__(self, encoder, decocer):
        super(NMTModel, self).__init__()
        self.encoder = encoder
        self.decocer = decocer

    def forward(self, source, target):
        """Run the encoder on ``source`` and return the decoder's output for ``target``."""
        attention_keys, encoded_source = self.encoder(source)
        return self.decocer(target, attention_keys, encoded_source)

In [None]:
def train(input_variable, target_variable, NMTModel, NMTModel_optimizer, criterion):
    """Run one optimisation step on a batch of sentence pairs.

    Args:
        input_variable: sequence of source index tensors.
        target_variable: sequence of target index tensors, aligned with
            ``input_variable``; element 0 of each target is skipped when the
            loss is computed.
        NMTModel: model called as ``NMTModel(source, target)``.
        NMTModel_optimizer: optimizer over the model parameters.
        criterion: loss called as ``criterion(output, target[1:])``.

    Returns:
        The summed loss divided by the number of pairs, as a Python number.

    Raises:
        ValueError: if the batch is empty (previously this surfaced as an
            AttributeError on ``int.backward``).
    """
    if len(input_variable) == 0:
        raise ValueError("train() needs at least one sentence pair")

    NMTModel_optimizer.zero_grad()

    loss = 0

    # The losses of all pairs are summed and back-propagated together.
    for i in range(len(input_variable)):
        out = NMTModel(input_variable[i], target_variable[i])
        loss += criterion(out, target_variable[i][1:target_variable[i].size(0)])
    loss.backward()

    NMTModel_optimizer.step()

    # ``loss.data[0]`` fails on 0-dim tensors in current PyTorch; use .item()
    # when available and fall back to the old indexing otherwise.
    total_loss = loss.item() if hasattr(loss, 'item') else loss.data[0]
    return total_loss/(len(input_variable))

In [None]:
def asMinutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Report elapsed time and the estimated time still remaining.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work already done (must be non-zero).

    Returns:
        ``'<elapsed> (- <remaining>)'`` with both parts formatted by
        ``asMinutes``; the remaining time is extrapolated linearly.
    """
    elapsed = time.time() - since
    remaining = elapsed / (percent) - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
def showPlot(points):
    """Plot ``points`` as a line chart with y-axis ticks every 0.2.

    The former ``plt.figure()`` call is gone: ``plt.subplots()`` already
    creates the figure, so the extra call only left an empty figure behind.
    """
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [None]:
def saveModel(model, model_path):
    """Write the model's ``state_dict`` to ``model_path``."""
    torch.save(model.state_dict(), model_path)

def loadModel(model, model_path, map_location=None):
    """Restore parameters saved by ``saveModel`` into ``model`` (in place).

    Args:
        model: module whose parameters are overwritten.
        model_path: checkpoint file written by ``saveModel``.
        map_location: forwarded to ``torch.load``. When left as ``None`` and
            no CUDA device is available, storages are kept on the CPU so a
            checkpoint saved from a GPU model can still be loaded.
    """
    if map_location is None and not torch.cuda.is_available():
        # The callable form keeps every storage where it was deserialised
        # (the CPU).
        map_location = lambda storage, location: storage
    model.load_state_dict(torch.load(model_path, map_location=map_location))

In [None]:
def trainIters(NMTModel, n_iters, print_every=1000, plot_every=100, save_every = 5000, learning_rate=0.01):
    """Train ``NMTModel`` for ``n_iters`` steps on random batches from ``pairs``.

    Each step samples 64 pairs (with replacement) from the global ``pairs``
    list and runs one ``train`` call with Nesterov-momentum SGD and NLL loss.

    Args:
        NMTModel: the model to train (switched to training mode).
        n_iters: number of optimisation steps.
        print_every: print timing and the average loss every this many steps.
        plot_every: record an averaged loss point every this many steps.
        save_every: save the model's state_dict every this many steps; the
            file name contains the current day of month and hour.
        learning_rate: SGD learning rate.
    """
    NMTModel.train(True)
    started_at = time.time()
    batch_size = 64
    plot_losses = []
    running_print_loss = 0  # reset every print_every steps
    running_plot_loss = 0  # reset every plot_every steps

    NMTModel_optimizer = optim.SGD(NMTModel.parameters(), lr=learning_rate, momentum=0.99,nesterov=True)
    criterion = nn.NLLLoss()

    for step in range(1, n_iters+1):
        sampled = [variablesFromPair(random.choice(pairs)) for _ in range(batch_size)]
        source_batch = [sample[0] for sample in sampled]
        target_batch = [sample[1] for sample in sampled]

        step_loss = train(source_batch, target_batch, NMTModel, NMTModel_optimizer, criterion)
        running_print_loss += step_loss
        running_plot_loss += step_loss

        if step % print_every == 0:
            mean_print_loss = running_print_loss / print_every
            running_print_loss = 0
            print("%s  (%d  %d%%)  %f" %(timeSince(started_at, step/n_iters), step, step/n_iters*100, mean_print_loss))

        if step % plot_every == 0:
            plot_losses.append(running_plot_loss / plot_every)
            running_plot_loss = 0

        if step % save_every == 0:
            timestamp = str(time.strftime('%d-%H',time.localtime(time.time())))
            saveModel(NMTModel, "./NMTModel_" + timestamp + ".mod")
    showPlot(plot_losses)

In [None]:
def weight_init(model):
    """Re-initialise every parameter of ``model`` from N(mean=0, std=0.01).

    Prints the parameter generator and the tensor type of each parameter, as
    the original did.
    """
    print(model.parameters())
    # ``torch.nn.init.normal`` is deprecated in favour of the in-place
    # ``normal_``; prefer the new name and fall back where it does not exist.
    init_normal = getattr(torch.nn.init, 'normal_', None) or torch.nn.init.normal
    for param in model.parameters():
        print(type(param.data))
        init_normal(param.data, 0, 0.01)

In [None]:
# Build the encoder/decoder pair and wrap them in a single NMTModel.
encoder = Encoder()
decoder = Decoder()
nmtmodel = NMTModel(encoder, decoder)
# weight_init(nmtmodel)

# Always-true guard: parameters are restored from a previously saved checkpoint
# rather than kept at their freshly constructed values.
if True:
    
    loadModel(nmtmodel, "./NMTModel_23-04.mod")

    
# Move the whole model to the GPU only when one was detected.
if use_cuda:
    nmtmodel = nmtmodel.cuda()

In [None]:
# trainIters(nmtmodel, n_iters = 40000, print_every=100, plot_every = 10, save_every = 5000, learning_rate = 0.0001)

In [None]:
def evaluate(model, input_sentence, max_output_tokens=100):
    """Greedily translate ``input_sentence`` and print/return the result.

    Decoding starts from the target sequence ``[SOS, EOS]``. In every round
    the whole target is re-decoded, the highest-scoring word at each position
    becomes the new output sentence, and the loop stops as soon as the word
    predicted at the last position is "EOS".

    Args:
        model: model called as ``model(input_variable, output_variable)``;
            it is switched to evaluation mode.
        input_sentence: space-separated source sentence.
        max_output_tokens: upper bound on the number of decoding rounds. The
            original loop had no bound and would never terminate if the model
            never predicted "EOS".

    Returns:
        The translated sentence as space-separated words (it may end with a
        trailing space when decoding stopped on "EOS").
    """
    model.train(False)
    input_variable, _ = variablesFromPair([input_sentence, ""])
    output_variable = Variable(torch.LongTensor([SOS_token, EOS_token]))
    # Keep the decoder input on the same device as the encoder input instead
    # of calling .cuda() unconditionally (which crashed without a GPU).
    if input_variable.is_cuda:
        output_variable = output_variable.cuda()

    output_sentence = ""
    for _ in range(max_output_tokens):
        out = model(input_variable, output_variable)
        # Index of the best-scoring word at every target position.
        indexes = out.data.topk(1, dim = 1)[1].transpose(0,1).cpu().numpy()[0]
        words = [output_lang.index2word[index] for index in indexes]
        # Every word except the last is followed by a space.
        output_sentence = "".join(word + " " for word in words[:-1])
        if words[-1] == "EOS":
            break
        output_sentence += words[-1]
        # Feed the current hypothesis back in as the next decoder input.
        input_variable, output_variable = variablesFromPair([input_sentence, output_sentence])
    print(output_sentence)
    return output_sentence
#     print(topi)
#     print(indexes)

In [None]:
# Smoke test: translate one hand-written sentence. It is lower-cased before
# being passed to evaluate().
input_sentence = "For Access To Guest the floors 23 and 24 , Use Key Card Is Located on floor 24 ."
evaluate(nmtmodel, input_sentence.lower())

In [None]:
class MyHTMLParser(HTMLParser):
    """Collect the text of ``<seg ...>`` elements into ``sentence_dict``.

    Each segment is stored under the value of its first attribute. Only the
    first text chunk directly following the opening tag is recorded.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.sentence_dict = {}
        # True between a <seg ...> start tag and its first text chunk.
        self.getdataflag = False
        self.sentence_dict_key = ""

    def handle_starttag(self, tag, attrs):
        # A <seg> without attributes has no key to store its text under; it is
        # skipped instead of raising IndexError on attrs[0].
        if tag == "seg" and attrs:
            self.getdataflag = True
            self.sentence_dict_key = attrs[0][1]
            # Register the segment right away so an empty <seg></seg> yields
            # an empty sentence.
            self.sentence_dict[self.sentence_dict_key] = ""

    def handle_endtag(self, tag):
        # Stop capturing at </seg>; otherwise an empty segment would pick up
        # whatever text follows its closing tag.
        if tag == "seg":
            self.getdataflag = False

    def handle_data(self, data):
        if self.getdataflag == True:
            self.sentence_dict[self.sentence_dict_key] = data
        self.getdataflag = False

In [None]:
# Read the SGM test file and collect its <seg> sentences, keyed by segment id.
# The ``with`` block closes the file handle, which the original left open.
with open("test_b.sgm",'r') as sgmfile:
    text = sgmfile.read()
parser = MyHTMLParser()
parser.feed(text)
en_sentences_dict = parser.sentence_dict.copy()

# Sanity check: show one sentence (raises KeyError if segment "999" is missing).
print(en_sentences_dict["999"])

In [None]:
def normalizeString(s):
    """Normalise an English sentence for the translator.

    Steps: surround ``, . ? ! :`` with spaces, replace every run of characters
    other than digits, ASCII letters and ``. ! , ' ?`` with a single space,
    then lower-case and strip the result.
    """
    # Regular expressions: put a space on both sides of each punctuation mark,
    # then drop the remaining special characters.
    for mark_pattern in (r"(\,)", r"(\.)", r"(\?)", r"(\!)", r"(\:)"):
        s = re.sub(mark_pattern, r" \1 ", s)
    cleaned = re.sub(r"[^\da-zA-Z.!,'?]+", r" ", s)
    return cleaned.lower().strip()
ss = "It's (df) df.,fdf@ !"
print(normalizeString(ss))

In [None]:
# Normalise every test sentence in place. The loop assumes the keys are the
# consecutive strings "1" .. "N"; a missing key raises KeyError.
for i in range(len(en_sentences_dict)):
    en_sentences_dict[str(i+1)] = normalizeString(en_sentences_dict[str(i+1)].strip()).strip()

In [None]:
def evaluateAll(model, test_sentences_dict):
    """Translate every test sentence and write one result per line to 'test_result'.

    Args:
        model: translation model passed through to ``evaluate``.
        test_sentences_dict: mapping whose keys are the consecutive strings
            "1" .. "N"; sentences are translated in that order.

    Each translation has every "UNK" marker and every space removed before it
    is written.
    """
    print(len(test_sentences_dict))
    # ``with`` closes the file even if a translation fails part-way. UTF-8 is
    # requested explicitly so writing Chinese text does not depend on the
    # platform's default encoding.
    with open('test_result','w', encoding='utf-8') as test_file:
        for i in range(len(test_sentences_dict)):
            print(i)
            out_sentence = evaluate(model, test_sentences_dict[str(i+1)]).strip() + "\n"
            out_sentence = out_sentence.replace('UNK', '').replace(' ', '')
            test_file.write(out_sentence)

In [None]:
# Translate every normalised test sentence and write the results to 'test_result'.
evaluateAll(nmtmodel, en_sentences_dict)