In [1]:
import numpy as np
import pandas as pd

In [2]:
# limit memory usage
import resource 
  
def limit_memory(maxsize): 
    if maxsize > 16: 
        print("Max size was set to max 16 GB")
        maxsize = 16
    soft, hard = resource.getrlimit(resource.RLIMIT_AS) 
    maxsize = maxsize * 1024 * 1000000
    resource.setrlimit(resource.RLIMIT_AS, (maxsize, hard)) 

limit_memory(8)

## Read data

In [3]:
def read_data(file_name):    
    with open(file_name, "r") as f:
        lines = [l.upper().split("\t") for l in f.read().splitlines()]
    df = pd.DataFrame(lines, columns = ["id", "s1", "s2", "score"])
    df.score = df.score.astype(float)
    return df

def read_data_train(file_name):    
    with open(file_name, "r") as f:
        lines = [l.upper().split("\t")[:3] for l in f.read().splitlines()]
    df = pd.DataFrame(lines, columns = ["id", "s1", "s2"])
    return df

In this notebook I decided to use a sample with the size 30000 to speed up the calculations and to fit in memory. In parallel, I run the same notebook in Google Colab [here](https://drive.google.com/file/d/1QgfMsusHv9AeLa0-8DdiFnt8oN4_u_13/view?usp=sharing).  Unfortunatelly the notebook was disconnected from the server, so I did not get the results.

In [4]:
test_file_name = "opusparcus_v2/opusparcus_v2/en-test.txt"
train_file_name = "opusparcus_v2/opusparcus_v2/en-train-100K.txt"
dev_file_name = "opusparcus_v2/opusparcus_v2/en-dev.txt"
test_data = read_data(test_file_name)
train_data = read_data_train(train_file_name).iloc[:30000]
dev_data = read_data(dev_file_name)

In [5]:
train_data.head()

Unnamed: 0,id,s1,s2
0,EN-N7,JUMBY NOW WANTS TO BE BORN .,JUMBY WANT BIRTH .
1,EN-N8,IT WAS A DIFFICULT AND LONG DELIVERY .,THE DELIVERY WAS DIFFICULT AND LONG .
2,EN-N12,I LIKE TO BE BEAUTIFUL EVERYDAY .,I LIKE TO BE PRETTY EVERYDAY .
3,EN-N22,BERNADETTE WANTS A PRENUP .,BERNADETTE WANTS TO GET A PRENUP .
4,EN-N45,DON 'T SAY YOU DON 'T REMEMBER ME .,DON 'T TELL ME YOU DON 'T REMEMBER ME .


In [6]:
test_data.shape, train_data.shape, dev_data.shape

((1445, 4), (30000, 3), (1455, 4))

In [7]:
l1 = train_data.s1.apply(len)
l2 = train_data.s2.apply(len)
MAX_INPUT_SIZE = max(max(l1), max(l2))
print("The maximum lenght in the train dataset =", MAX_INPUT_SIZE)

The maximum lenght in the train dataset = 209


In [8]:
# decided to use only pheases 50-character long to avoid memmory error
train_data = train_data[(train_data.s1.apply(len)<=50)&(train_data.s2.apply(len)<=50)]
test_data = test_data[(test_data.s1.apply(len)<=50)&(test_data.s2.apply(len)<=50)]
dev_data = dev_data[(dev_data.s1.apply(len)<=50)&(dev_data.s2.apply(len)<=50)]

In [9]:
test_data.shape, train_data.shape, dev_data.shape

((1402, 4), (29851, 3), (1419, 4))

In [10]:
l1 = train_data.s1.apply(len)
l2 = train_data.s2.apply(len)
MAX_INPUT_SIZE = max(max(l1), max(l2))
print("The maximum lenght in the train dataset =", MAX_INPUT_SIZE)

The maximum lenght in the train dataset = 50


In [11]:
START = "s"
END = "e"
NONE = "n"

## Character-level data preparation

In [12]:
%%time 

MAX_INPUT_SIZE = MAX_INPUT_SIZE + 2

def gen_char_seq(s1, s2):
    x1 = s1 + NONE*(MAX_INPUT_SIZE-len(s1))
    s2_se = START + s2 + NONE*(MAX_INPUT_SIZE-len(s2)-2) + END
    return x1, s2_se[:-1], s2_se[1:]

def gen_char_seq_data(df):
    X1 = []
    X2 = []
    Y = []
    for v in df[["s1", "s2"]].values:
        # use the first sentence as input and another as output and vise versa
        x1,x2,y = gen_char_seq(v[1], v[0])
        X1.append(x1)
        X2.append(x2)
        Y.append(y)
        x1,x2,y = gen_char_seq(v[0], v[1])
        X1.append(x1)
        X2.append(x2)
        Y.append(y)
    return X1,X2,Y
        
        
char_X1, char_X2, char_Y = gen_char_seq_data(train_data)

CPU times: user 128 ms, sys: 2.47 ms, total: 131 ms
Wall time: 154 ms


In [13]:
char_X1[0:2], char_X2[0:2], char_Y[0:2]

(['JUMBY WANT BIRTH .nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn',
  'JUMBY NOW WANTS TO BE BORN .nnnnnnnnnnnnnnnnnnnnnnnn'],
 ['sJUMBY NOW WANTS TO BE BORN .nnnnnnnnnnnnnnnnnnnnnn',
  'sJUMBY WANT BIRTH .nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn'],
 ['JUMBY NOW WANTS TO BE BORN .nnnnnnnnnnnnnnnnnnnnnne',
  'JUMBY WANT BIRTH .nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnne'])

In [15]:
# Map characters to integers
# to avoid memory error, I used limmited number of symbols
chars = [' ', "'", ',', '-', '.',':', '?', 'A', 'B', 'C', 'D', 'E', 'F',
       'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
       'T', 'U', 'V', 'W', 'X', 'Y', 'Z'] + [START, END, NONE] 
mapping = dict((c, i) for i, c in enumerate(chars))
reverse_mapping = dict((c, i) for i, c in mapping.items())

print(mapping)

{' ': 0, "'": 1, ',': 2, '-': 3, '.': 4, ':': 5, '?': 6, 'A': 7, 'B': 8, 'C': 9, 'D': 10, 'E': 11, 'F': 12, 'G': 13, 'H': 14, 'I': 15, 'J': 16, 'K': 17, 'L': 18, 'M': 19, 'N': 20, 'O': 21, 'P': 22, 'Q': 23, 'R': 24, 'S': 25, 'T': 26, 'U': 27, 'V': 28, 'W': 29, 'X': 30, 'Y': 31, 'Z': 32, 's': 33, 'e': 34, 'n': 35}


In [16]:
def apply_map(X_str):
    X_int = []
    for i in X_str:
        X_int.append(list([mapping[char] if char in chars else mapping[NONE] for char in i]))
    return np.array(X_int)

encoder_input = apply_map(char_X1)
dencoder_input = apply_map(char_X2)
dencoder_output = apply_map(char_Y)

In [17]:
encoder_input.shape, dencoder_input.shape, dencoder_output.shape

((59702, 52), (59702, 51), (59702, 51))

In [18]:
%%time
# convert one hot encoding

encoder_input_bytes = np.zeros((encoder_input.shape[0], encoder_input.shape[1], len(mapping)),dtype="float32")
dencoder_input_bytes = np.zeros((dencoder_input.shape[0], dencoder_input.shape[1], len(mapping)),dtype="float32")
dencoder_output_bytes = np.zeros((dencoder_output.shape[0], dencoder_output.shape[1], len(mapping)),dtype="float32")

for i,row in enumerate(encoder_input):
    for j,number in enumerate(row):
        encoder_input_bytes[i,j,number] = 1.0

for i,row in enumerate(dencoder_input):
    for j,number in enumerate(row):
        dencoder_input_bytes[i,j,number] = 1.0

for i,row in enumerate(dencoder_output):
    for j,number in enumerate(row):
        dencoder_output_bytes[i,j,number] = 1.0

CPU times: user 3.69 s, sys: 408 ms, total: 4.1 s
Wall time: 4.1 s


## Build and train a model

Model structure is based on [this tutorial](https://keras.io/examples/nlp/lstm_seq2seq/)

In [19]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

latent_dim = 128
num_encoder_tokens = len(mapping)
num_decoder_tokens = len(mapping)

# Define an input sequence and process it.
encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
encoder = keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

Here I trained the model for 30 epochs with 30000 train sample. It took less than hour to finish training. In my Colaboratory notebook the training lasts for 62 epochs with all the data and lasted almost 12 hours before disconnection. The best score in Colab was almost 90%. Here I've got 88%. 

In [20]:
batch_size = 128 
epochs = 30

model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)
model.fit(
    [encoder_input_bytes, dencoder_input_bytes],
    dencoder_output_bytes,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fe74048d880>

In [29]:
model.save("s2s")

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: s2s/assets


In [30]:
model2 = keras.models.load_model("s2s")

In [31]:
# Construct the encoder and decoder.
encoder_inputs = model2.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = model2.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)

decoder_inputs = model2.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(latent_dim,), name="input_3")
decoder_state_input_c = keras.Input(shape=(latent_dim,), name="input_4")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model2.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model2.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

## Evaluate results

In [32]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, mapping[START]] = 1.0

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        
        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_mapping[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if sampled_char == END or len(decoded_sentence) > MAX_INPUT_SIZE:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0

        # Update states
        states_value = [h, c]
    return decoded_sentence

In [33]:
%%time
print("Input row:", train_data.iloc[0].s1)
print("True row:", train_data.iloc[0].s2)
print("Predicted row:", decode_sequence(encoder_input_bytes[0:1]))

Input row: JUMBY NOW WANTS TO BE BORN .
True row: JUMBY WANT BIRTH .
Predicted row: GET THE RIGHT , DARLING .nnnnnnnnnnnnnnnnnnnnnnnnne
CPU times: user 2.71 s, sys: 15.4 ms, total: 2.72 s
Wall time: 2.69 s


Check on all dev data

In [25]:
dev_char_X1, dev_char_X2, dev_char_Y = gen_char_seq_data(dev_data)

dev_encoder_input = apply_map(dev_char_X1)

dev_encoder_input_bytes = np.zeros((dev_encoder_input.shape[0], dev_encoder_input.shape[1], len(mapping)),dtype="float32")

for i,row in enumerate(dev_encoder_input):
    for j,number in enumerate(row):
        dev_encoder_input_bytes[i,j,number] = 1.0

In [34]:
%%time
dev_true = []
dev_preds = []

for i, s in enumerate(dev_char_Y):
    clean_s = "".join([c for c in s if c not in [START, END, NONE]])
    dev_true.append(clean_s)
    pred_s = decode_sequence(np.array([dev_encoder_input_bytes[i]]))
    clean_pred_s = "".join([c for c in pred_s if c not in [START, END, NONE]])
    dev_preds.append(clean_pred_s)

CPU times: user 1h 31min 34s, sys: 1min 8s, total: 1h 32min 42s
Wall time: 1h 30min 35s


In [40]:
list(zip(dev_true[:5], dev_preds[:5]))

[('300 HEAVY HORSE ?', "WE 'VE GOT TO GO ."),
 ('WE HAVE NO CHANCE .', 'ANYBODY ON THE ?'),
 ("WHEN 'D YOU LAST SEE HIM ?", 'WHEN WILL HE BE THAT ?'),
 ('WHEN WAS THE LAST TIME YOU SAW HIM ?', 'WHEN WILL HE BE THAT ?'),
 ('ANYONE WHO CAN VERIFY THAT ?', 'CAN I TALK TO YOU FOR A MINUTE ?')]

In [45]:
import rouge

def get_rouge(true, preds):
    def prepare_results(p, r, f):
        return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)
    
    evaluator = rouge.Rouge(metrics=['rouge-l'],
                           max_n=3,
                           limit_length=True,
                           length_limit=100,
                           length_limit_type='words',
                           apply_avg=True,
                           apply_best=False,
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)

    scores = evaluator.get_scores(true, preds)
    for metric, results in sorted(scores.items(), key=lambda x: x[0]):
        print(prepare_results(results['p'], results['r'], results['f']))

In [46]:
get_rouge(dev_true, dev_preds)

	rouge-l:	P: 18.37	R: 18.69	F1: 18.20


Check on 3-4 scored dev data

In [50]:
dev_char_X1, dev_char_X2, dev_char_Y = gen_char_seq_data(dev_data[(dev_data.score>3)&(dev_data.score<=4)])

dev_encoder_input = apply_map(dev_char_X1)

dev_encoder_input_bytes = np.zeros((dev_encoder_input.shape[0], dev_encoder_input.shape[1], len(mapping)),dtype="float32")

for i,row in enumerate(dev_encoder_input):
    for j,number in enumerate(row):
        dev_encoder_input_bytes[i,j,number] = 1.0

In [51]:
%%time
dev_true4 = []
dev_preds4 = []

for i, s in enumerate(dev_char_Y):
    clean_s = "".join([c for c in s if c not in [START, END, NONE]])
    dev_true4.append(clean_s)
    pred_s = decode_sequence(np.array([dev_encoder_input_bytes[i]]))
    clean_pred_s = "".join([c for c in pred_s if c not in [START, END, NONE]])
    dev_preds4.append(clean_pred_s)

CPU times: user 56min 24s, sys: 48.8 s, total: 57min 13s
Wall time: 57min 54s


In [54]:
list(zip(dev_true4[:5], dev_preds4[:5]))

[("WHEN 'D YOU LAST SEE HIM ?", 'WHEN WILL HE BE THAT ?'),
 ('WHEN WAS THE LAST TIME YOU SAW HIM ?', 'WHEN WILL HE BE THAT ?'),
 ('ANYONE WHO CAN VERIFY THAT ?', 'CAN I TALK TO YOU FOR A MINUTE ?'),
 ('CAN ANYONE CORROBORATE THAT ?', "ANYTHING THE BEGIN 'S TO THAT ?"),
 ("NOTHING 'S CHANGED .", "IT 'S NOTHING TO SAY .")]

In [55]:
get_rouge(dev_true4, dev_preds4)

	rouge-l:	P: 21.91	R: 22.09	F1: 21.64


The accuracy of the model is not good enough to say it works fine. The phases generated by NN contain real words, but sometimes they do not have sense. Also, the connection with input sentence is poor. But I trained a relatively small model, only for 30 epochs and on a train sample of 30000. I expect that if I trained larger model for more time and with all the data, the results would be better. Also I checked Rouge-L score on all the dev data and only with labeled score 3-4. As a result, the model perform better on better data: the Rouge-L is bigger with data labeled 3-4.