In [1]:
import re
import warnings
warnings.filterwarnings("ignore")
from tensorflow.keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply, Reshape
from tensorflow.keras.layers import RepeatVector, Dense, Activation, Lambda, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model, Model
import keras.backend as K
import keras
import numpy as np
import random
import tqdm
import matplotlib.pyplot as plt
import string
from unicodedata import normalize
from keras.utils.vis_utils import model_to_dot, plot_model
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu

Using TensorFlow backend.


In [2]:
# load doc into memory
def load_doc(filename):
    """Read a UTF-8 text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

# split a loaded document into sentences
def to_pairs(doc):
    """Turn raw corpus text into a list of per-line field lists.

    Surrounding whitespace is stripped from the whole document, the text is cut
    on newlines, and every line is cut on tab characters.

    Args:
        doc: The complete corpus text.

    Returns:
        A list with one entry per line; each entry is the list of that line's
        tab-separated fields.
    """
    return [row.split('\t') for row in doc.strip().split('\n')]

def clean_data(lines):
    """Normalise every string of every pair and return the result as an array.

    Each string is decomposed with NFD and reduced to ASCII (dropping anything
    that cannot be encoded), split on whitespace, lower-cased, stripped of
    punctuation and non-printable characters, and finally filtered so that only
    purely alphabetic tokens remain. The surviving tokens are re-joined with
    single spaces.

    Args:
        lines: Iterable of pairs, each pair being an iterable of strings.

    Returns:
        numpy array built from the cleaned pairs (one row per pair).
    """
    # matches every character that is not in string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes all ASCII punctuation
    strip_punct = str.maketrans('', '', string.punctuation)

    def _clean_text(text):
        # Clean a single sentence string and return it as a space-joined string.
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
        tokens = []
        for raw in ascii_text.split():
            token = non_printable.sub('', raw.lower().translate(strip_punct))
            # drops empty tokens and tokens containing digits
            if token.isalpha():
                tokens.append(token)
        return ' '.join(tokens)

    return np.array([[_clean_text(text) for text in pair] for pair in lines])

In [3]:
# Read the raw corpus and split it into per-line lists of tab-separated fields.
# NOTE(review): the path is relative to the notebook's working directory, and the
# file is presumably an English-French sentence-pair dump -- confirm source/version.
filename = "fra.txt"
doc = load_doc(filename)  # whole file as a single string
pairs = to_pairs(doc)  # list of lists: one inner list of fields per line

In [4]:
# choose sample size (set n_train = None to use every pair)
n_train = 20000
# Slice the raw pairs *before* cleaning so that only the rows that are actually
# kept get normalised, instead of cleaning the whole corpus and discarding most of it.
clean_pairs = clean_data(pairs[:n_train])
input_texts = clean_pairs[:, 0]   # first field of every pair
target_texts = clean_pairs[:, 1]  # second field of every pair

# create word level input sequence
input_sequences = [text.split() for text in input_texts]
# create word level target sequence, each terminated by an end-of-sentence label
target_sequences = [text.split() + ['<eos>'] for text in target_texts]

In [5]:
# Longest tokenised sentence on each side, counted in tokens; these values are
# the padded lengths passed to text2sequences.
max_encoder_seq_length = max(map(len, input_sequences))
max_decoder_seq_length = max(map(len, target_sequences))

In [6]:
def text2sequences(max_len, lines):
    """Fit a fresh tokenizer on `lines` and return them as padded id sequences.

    Args:
        max_len: Length every sequence is padded to (padding is appended at the end).
        lines: The texts to fit the tokenizer on and to convert.

    Returns:
        A tuple `(padded_sequences, word_index)`: the output of `pad_sequences`
        and the fitted tokenizer's word-to-id mapping. Out-of-vocabulary words
        are represented by the '<unk>' token.
    """
    word_tokenizer = Tokenizer(oov_token='<unk>')
    word_tokenizer.fit_on_texts(lines)
    padded = pad_sequences(
        word_tokenizer.texts_to_sequences(lines),
        maxlen=max_len,
        padding='post',
    )
    return padded, word_tokenizer.word_index

In [7]:
# Hold out 20% of the sentence pairs; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    input_sequences,
    target_sequences,
    test_size=0.2,
    random_state=42,
)
# Each side gets its own vocabulary, fitted on the training portion only.
source_text_to_int, source_vocab_to_int = text2sequences(max_encoder_seq_length, X_train)
target_text_to_int, target_vocab_to_int = text2sequences(max_decoder_seq_length, y_train)

In [8]:
# Register id 0 (the value pad_sequences fills with) under the key 'pad' in both
# vocabularies, so that len(vocab) also counts the padding class.
for _vocab in (source_vocab_to_int, target_vocab_to_int):
    # A real corpus word spelled "pad" would already hold a non-zero id. Overwriting
    # it would silently alias that word with padding and leave len(vocab) one short
    # of the number of distinct ids, so fail loudly here instead. An existing
    # 'pad' -> 0 entry (this cell being re-run) is accepted.
    if _vocab.get('pad', 0) != 0:
        raise ValueError(
            "vocabulary already contains the word 'pad' with id %d; "
            "it cannot be reused as the padding token" % _vocab['pad']
        )
    _vocab['pad'] = 0

In [10]:
# Copy the padded id matrices into the arrays used from here on:
# X holds the source side, Y the target side.
X, Y = np.array(source_text_to_int), np.array(target_text_to_int)

In [11]:
# One-hot encode X and Y one padded sequence at a time; the number of classes is
# the size of the corresponding vocabulary.
Xoh = np.array([to_categorical(row, num_classes=len(source_vocab_to_int)) for row in X])
Yoh = np.array([to_categorical(row, num_classes=len(target_vocab_to_int)) for row in Y])

In [12]:
# Custom softmax function that can normalise along a chosen axis
def softmax(x, axis=1):
    """Softmax activation function.

    Args:
        x: Tensor with at least two dimensions.
        axis: Axis along which the values are normalised so that they sum to 1.

    Returns:
        Tensor of the same shape as `x`.

    Raises:
        ValueError: If `x` has fewer than two dimensions.
    """
    ndim = K.ndim(x)
    if ndim < 2:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')
    # The backend softmax normalises the last axis, so it is only a valid
    # shortcut for a 2-D tensor when the last axis is the one requested;
    # previously `axis` was silently ignored for 2-D input.
    if ndim == 2 and axis in (1, -1):
        return K.softmax(x)
    # Subtract the per-slice maximum before exponentiating for numerical stability.
    e = K.exp(x - K.max(x, axis=axis, keepdims=True))
    s = K.sum(e, axis=axis, keepdims=True)
    return e / s

In [27]:
# Input / output sequence lengths used by the layers below. They are taken from
# the padded lengths computed earlier rather than hard-coded, so they stay in
# step with the data when the sample size or the corpus changes.
Tx = max_encoder_seq_length
Ty = max_decoder_seq_length

(16000, 5)

In [13]:
# Layer objects created once at module level.
# NOTE(review): the name 'attention_weights' suggests these are the building blocks
# of an attention step shared across decoder time steps -- the code that calls them
# is not visible here, so confirm against the model definition.
repeator = RepeatVector(Tx)  # repeats its input Tx times
concatenator = Concatenate(axis=-1)  # joins its inputs along the last axis
densor_tanh = Dense(32, activation = "tanh")  # 32-unit dense layer, tanh activation
densor_relu = Dense(1, activation = "relu")  # single-unit dense layer, relu activation
activator = Activation(softmax, name='attention_weights')  # uses the custom softmax defined in this notebook (default axis=1)
dotor = Dot(axes = 1)  # dot product of two inputs taken over axis 1

NameError: name 'Tx' is not defined