# NMT

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter
from tqdm import tqdm
from tensorflow.python.layers import core as layers_core
import tensorflow as tf
import tensorflow.compat.v1 as tfc
import tensorflow_addons as tfa

### Read data

In [3]:
# Read the parallel corpus. The file alternates lines: an English sentence on
# every odd line number, followed by its Chinese counterpart on the next line.
enline, chline = None, None
enlines, chlines = [], []
sentence_length = 10                 # length threshold used by the pair filter
file_path = '../data/word/segmented_train_seg_by_word.txt'
early_exist_line_num = 1000000       # stop after this many input lines
progress_step = 100000               # refresh the progress bar every N lines
i = 0

# The corpus contains Chinese text, so do not rely on the platform's default
# encoding (assumes the file is UTF-8 -- TODO confirm).
with open(file_path, 'r', encoding='utf-8') as lines, tqdm() as pbar:
    for i, line in enumerate(lines, start=1):
        if i % 2 == 1:
            # English side: lower-cased, whitespace-separated tokens.
            enline = line.lower().strip('\n').split()
            continue

        # Chinese side: drop the segmentation spaces and split into characters.
        chline = list(line.strip('\n').replace(' ', ''))

        # A pair is dropped only when BOTH sides exceed sentence_length.
        # NOTE(review): pairs with a single long side are kept; if the intent
        # is to cap every sentence at sentence_length this should be `or`.
        both_too_long = (len(enline) > sentence_length
                         and len(chline) > sentence_length)
        if not both_too_long:
            enlines.append(enline)
            chlines.append(chline)

        # tqdm.update() takes an increment, not an absolute position.
        if i % progress_step == 0:
            pbar.update(progress_step)

        # Checked for every pair (kept or filtered) so the cap is never missed.
        if i >= early_exist_line_num:
            break
     

3100000it [00:02, 1785037.29it/s]

In [4]:
# Number of sentence pairs kept by the length filter.
len(chlines)

301514

In [5]:
# Peek at the first four pairs: Chinese is split per character,
# English per whitespace-separated, lower-cased token.
print(chlines[:4])
print(enlines[:4])

[['一', '对', '二', '总', '不', '是', '好', '事', '，'], ['一', '对', '二', '胜', '。'], ['一', '对', '五', '年', '没', '见', '过', '的', '姐', '妹', '一', '场', '激', '烈', '的', '争', '吵', '？'], ['一', '对', '五', '百', '诶', '。']]
[['fighting', 'two', 'against', 'one', 'is', 'never', 'ideal', ','], ['deuces', 'the', 'winner', '.'], ['an', 'incredibly', 'emotional', 'fight', 'between', '2', 'sisters', '？'], ['one', 'against', '500', '.']]


In [29]:
def addchar(word2id, id2word, word):
    """Register `word` in a pair of forward/reverse vocabulary dicts.

    The word receives the next free integer id (the current vocabulary
    size). Both dicts are updated in place; nothing is returned.

    A word that is already registered is left untouched. Without this guard
    a repeated word would be rebound to a new id, leave a stale entry in
    `id2word`, and make the next new word share that same id.

    Args:
        word2id: dict mapping token -> id, mutated in place.
        id2word: dict mapping id -> token, mutated in place.
        word: the token to register.
    """
    if word in word2id:
        return
    new_id = len(word2id)  # avoids shadowing the builtin `id`
    word2id[word] = new_id
    id2word[new_id] = word

# Build the token <-> id tables for both languages.
ch2id, id2ch, en2id, id2en = {}, {}, {}, {}
most_common_k = 100000   # keep at most this many distinct tokens per language
specialchars = ['<eos>','<start>','<end>','<unk>','<pad>']

# Count real token occurrences. Counting over a set would give every token a
# count of 1, making most_common() an arbitrary, hash-seed-dependent order
# instead of a frequency ranking.
ch_counts = Counter(c for ch in chlines for c in ch)
en_counts = Counter(e for en in enlines for e in en)
chwords, enwords = set(ch_counts), set(en_counts)

# Ids are assigned in descending frequency order (ties keep first-seen order).
for word, _ in ch_counts.most_common(most_common_k):
    addchar(ch2id, id2ch, word)

for word, _ in en_counts.most_common(most_common_k):
    addchar(en2id, id2en, word)

# Special tokens take the ids after the regular vocabulary. A special token
# that already occurs literally in the corpus keeps its existing id.
for one in specialchars:
    if one not in ch2id:
        addchar(ch2id, id2ch, one)
    if one not in en2id:
        addchar(en2id, id2en, one)

In [7]:
# Vocabulary sizes, special tokens included: (English, Chinese).
len(en2id), len(ch2id)

(47028, 4770)

### Prepare data for English-to-Chinese translation

In [8]:
# Ids of the special tokens used while encoding.
unknown_en_id, eos_en_id = en2id['<unk>'], en2id['<eos>']
start_ch_id, end_ch_id, unknown_ch_id = ch2id['<start>'], ch2id['<end>'], ch2id['<unk>']

# English side: token ids followed by <eos>, plus the resulting length.
dat_x_in, dat_x_len = [], []
for en_sentence in enlines:
    id_en_sentence = [en2id.get(w, unknown_en_id) for w in en_sentence] + [eos_en_id]
    dat_x_in.append(id_en_sentence)
    dat_x_len.append(len(id_en_sentence))

# Chinese side: the same ids are emitted twice, once prefixed with <start>
# (dat_y_in) and once suffixed with <end> (dat_y_out); both have the length
# stored in dat_y_len.
dat_y_in, dat_y_out, dat_y_len = [], [], []
for ch_sentence in chlines:
    id_ch_sentence = [ch2id.get(w, unknown_ch_id) for w in ch_sentence]
    dat_y_in.append([start_ch_id, *id_ch_sentence])
    dat_y_out.append([*id_ch_sentence, end_ch_id])
    dat_y_len.append(1 + len(id_ch_sentence))

### Transform data

In [9]:
# Sanity check: map one encoded pair (index m) back to tokens via the
# reverse vocabularies.
m = 100
[id2en[i] for i in dat_x_in[m]], [id2ch[i] for i in dat_y_in[m]]

(['one', 'pair', 'of', 'elongated', 'canines', '.', '<eos>'],
 ['<start>', '一', '对', '延', '长', '犬', '齿', '。'])

In [30]:
# Build inputs and labels: every sentence pair contributes two examples,
# the Chinese sentence with label 0 followed by the English one with label 1.
data_x_in, data_y = [], []

for ch_sentence, en_sentence in zip(chlines, enlines):
    ch_id_sen = [ch2id.get(tok, unknown_ch_id) for tok in ch_sentence]
    en_id_sen = [en2id.get(tok, unknown_en_id) for tok in en_sentence]
    data_x_in.extend((ch_id_sen, en_id_sen))
    data_y.extend((0, 1))

# Right-pad every sequence to a common length with the English <pad> id.
data_x_in = tf.keras.preprocessing.sequence.pad_sequences(
    data_x_in,
    padding='post',
    value=en2id['<pad>'],
)
data_y = np.asarray(data_y)
    

### Build model

In [11]:
# Model / training hyper-parameters.
batch_size = 256  # examples per batch handed to data_generate
embedding_size = 100  # width of each embedding vector
# Number of rows in the embedding table.
# NOTE(review): sized from the English vocabulary only, while the classifier
# input also contains ids taken from ch2id -- confirm the shared id space is intended.
vocabulary_size = len(en2id)
num_units = 50  # size passed to the LSTM cell

In [32]:
# Build a TF1-style graph: embedding -> LSTM -> single-logit binary classifier.
tfc.reset_default_graph()
tfc.disable_eager_execution()
# Soft placement lets ops fall back to another device when '/gpu:1' is absent.
config = tfc.ConfigProto(log_device_placement=True, allow_soft_placement=True)
config.gpu_options.allow_growth = True
session = tfc.Session(config=config)

with tfc.device('/gpu:1'):
    # Default initializer for every variable created in this scope.
    initializer = tfc.random_uniform_initializer(-0.08, 0.08)
    tfc.get_variable_scope().set_initializer(initializer)

    x = tfc.placeholder('int32', [None, None])   # token ids, [batch, time]
    y = tfc.placeholder('int32', [None])         # one label per example
    x_len = tfc.placeholder('int32', [None])     # sequence lengths for the RNN
    learning_rate = tfc.placeholder(tfc.float32, shape=[])

    # embedding
    embedding_encoder = tfc.get_variable(
        'embedding_encoder', [vocabulary_size, embedding_size], dtype=tfc.float32)
    encode_embedding_inp = tfc.nn.embedding_lookup(embedding_encoder, x)

    # build rnn cell
    encoder_cell = tfc.nn.rnn_cell.BasicLSTMCell(num_units)
    encoder_outputs, encoder_states = tfc.nn.dynamic_rnn(
        encoder_cell,
        encode_embedding_inp,
        sequence_length=x_len,
        time_major=False,
        dtype=tfc.float32,
    )

    # loss
    # NOTE(review): index 0 of an LSTM state tuple is the cell state `c`;
    # the hidden output `h` is index 1 -- confirm `c` is the intended feature.
    model_logistic = tfc.layers.dense(encoder_states[0], 1)
    model_prediction = tfc.nn.sigmoid(model_logistic)
    # `loss` is the per-example loss vector; `loss1` is its batch mean.
    loss = tfc.nn.sigmoid_cross_entropy_with_logits(
        labels=tfc.cast(y, tfc.float32),
        logits=tfc.reshape(model_logistic, (-1,)),
    )
    loss1 = tfc.reduce_mean(loss)
    # Minimise the scalar mean. Minimising the vector would sum the gradients
    # over the batch and make the effective step size scale with batch size.
    optimizer = tfc.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss1)

Device mapping: no known devices.


  encoder_cell = tfc.nn.rnn_cell.BasicLSTMCell(num_units)
  model_logistic = tfc.layers.dense(encoder_states[0], 1)


In [21]:
# Initialise all graph variables. The initializer lives in the compat.v1
# namespace; TF 2.x has no top-level `tf.global_variables_initializer`.
session.run(tfc.global_variables_initializer())


### Training

In [33]:
def data_generate(data_x_in, data_y, batch_size = 1000):
    """Yield consecutive `(inputs, labels)` batches in their original order.

    Each batch holds `batch_size` items, except the last one, which holds
    whatever remains. Slice bounds are derived from `len(data_x_in)` for
    both sequences.
    """
    total = len(data_x_in)
    for start in range(0, total, batch_size):
        # Clamp the end of the window so the final batch stops at `total`.
        stop = min(start + batch_size, total)
        yield data_x_in[start:stop], data_y[start:stop]

In [None]:
# NOTE(review): this rebinds `loss`, which the model cell also uses for the
# per-example loss tensor; after this line `loss` is a plain Python list.
loss = []
# Presumably the initial learning rate ("begging" looks like a typo for
# "beginning") -- TODO confirm against the code that consumes it.
begging_learning_rate = 0.1
# Batch generator over the padded inputs and their labels.
gen = data_generate(data_x_in, data_y, batch_size)