In [15]:
cd '/content/drive/MyDrive/vin/NLP/nmt_attention2'

/content/drive/MyDrive/vin/NLP/nmt_attention2


In [16]:
import tensorflow as tf

In [17]:
# Restore the `en` and `vi` tokenizers that were saved together in one pickle.
# NOTE(review): pickle.load can run arbitrary code embedded in the file, so
# only load a tokenizer pickle that comes from a trusted source.
import pickle

with open('tokenizer/tokenizer.pickle', 'rb') as f:
  data = pickle.load(f)

# The unpickled object is indexed by tokenizer name.
en_tokenizer = data['en_tokenizer']
vi_tokenizer = data['vi_tokenizer']

In [18]:
def preprocess_sentence(s):
  """Wrap a sentence in the `<s>` / `</s>` boundary markers.

  Leading and trailing whitespace is removed from `s` first; the markers are
  then attached with exactly one space between each marker and the text.

  Args:
    s: the sentence string to wrap.

  Returns:
    The string `'<s> ' + stripped sentence + ' </s>'`.
  """
  return ' '.join(('<s>', s.strip(), '</s>'))

In [19]:
def convert_text_to_sequences(sentence_array, lang_tokenizer):
  """Convert sentences to sequences and pad them at the end to a common length.

  Args:
    sentence_array: the sentences handed to `lang_tokenizer.texts_to_sequences`.
    lang_tokenizer: object providing `texts_to_sequences` (presumably a Keras
      `Tokenizer` -- confirm against the pickled tokenizers).

  Returns:
    The output of Keras `pad_sequences` called with `padding='post'` on the
    converted sequences.
  """
  id_sequences = lang_tokenizer.texts_to_sequences(sentence_array)
  # No maxlen is given, so the padded width is decided by pad_sequences itself.
  return tf.keras.preprocessing.sequence.pad_sequences(id_sequences, padding='post')

In [20]:
# Locations of the preprocessed text files, all under one root folder that is
# resolved relative to the current working directory.
processed_data_path = 'processed_text_data/'

# Training split.
processed_en_train_path = f'{processed_data_path}train/train.en'
processed_vi_train_path = f'{processed_data_path}train/train.vi'

# Development split.
processed_en_dev_path = f'{processed_data_path}dev/tst2012.en'
processed_vi_dev_path = f'{processed_data_path}dev/tst2012.vi'

# Test split.
processed_en_test_path = f'{processed_data_path}test/tst2013.en'
processed_vi_test_path = f'{processed_data_path}test/tst2013.vi'

# Vocabulary files.
processed_en_vocab_path = f'{processed_data_path}vocab/vocab.en'
processed_vi_vocab_path = f'{processed_data_path}vocab/vocab.vi'

In [22]:
def load_data(en_path, vi_path, min_len=1, max_len=50):
  """Read two line-aligned text files and keep sentence pairs of usable length.

  Line i of `en_path` is paired with line i of `vi_path`. A pair is kept only
  when both sentences contain between `min_len` and `max_len`
  whitespace-separated tokens (inclusive). Kept sentences are passed through
  `preprocess_sentence` before being returned.

  Args:
    en_path: path of the UTF-8 text file holding the `en` sentences, one per line.
    vi_path: path of the UTF-8 text file holding the `vi` sentences, one per line.
    min_len: smallest token count a sentence may have and still be kept.
    max_len: largest token count a sentence may have and still be kept.

  Returns:
    A tuple `(en, vi)` of two equally long lists of preprocessed sentences,
    where `en[i]` and `vi[i]` come from the same line number.
  """
  with open(en_path, 'r', encoding='UTF-8') as f:
    en_data = f.read().split('\n')
  with open(vi_path, 'r', encoding='UTF-8') as f:
    vi_data = f.read().split('\n')

  en = []
  vi = []
  # zip stops at the shorter file, so surplus lines in the longer one are ignored.
  for en_sent, vi_sent in zip(en_data, vi_data):
    # split() without a separator discards empty strings: a blank line (for
    # example the one produced by a trailing newline) counts as 0 tokens rather
    # than 1, and runs of consecutive spaces no longer inflate the count.
    en_len = len(en_sent.split())
    vi_len = len(vi_sent.split())
    if not (min_len <= en_len <= max_len and min_len <= vi_len <= max_len):
      continue
    en.append(preprocess_sentence(en_sent))
    vi.append(preprocess_sentence(vi_sent))
  return en, vi

In [None]:
# Load the train, dev and test sentence pairs with load_data's default length limits.
en_train, vi_train = load_data(en_path=processed_en_train_path, vi_path=processed_vi_train_path)
en_dev, vi_dev = load_data(en_path=processed_en_dev_path, vi_path=processed_vi_dev_path)
en_test, vi_test = load_data(en_path=processed_en_test_path, vi_path=processed_vi_test_path)

In [None]:
# Convert each `en` split to padded sequences with the `en` tokenizer.
en_train_tensor = convert_text_to_sequences(sentence_array=en_train, lang_tokenizer=en_tokenizer)
en_dev_tensor = convert_text_to_sequences(sentence_array=en_dev, lang_tokenizer=en_tokenizer)
en_test_tensor = convert_text_to_sequences(sentence_array=en_test, lang_tokenizer=en_tokenizer)

In [None]:
# Convert each `vi` split to padded sequences with the `vi` tokenizer.
vi_train_tensor = convert_text_to_sequences(sentence_array=vi_train, lang_tokenizer=vi_tokenizer)
vi_dev_tensor = convert_text_to_sequences(sentence_array=vi_dev, lang_tokenizer=vi_tokenizer)
vi_test_tensor = convert_text_to_sequences(sentence_array=vi_test, lang_tokenizer=vi_tokenizer)

In [None]:
# Write the three `en` splits into a single pickle file, keyed by variable name.
with open('sequences_data/en_data.pickle', 'wb') as handle:
    pickle.dump(
        dict(
            en_train_tensor=en_train_tensor,
            en_dev_tensor=en_dev_tensor,
            en_test_tensor=en_test_tensor,
        ),
        handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [None]:
# Write the three `vi` splits into a single pickle file, keyed by variable name.
with open('sequences_data/vi_data.pickle', 'wb') as handle:
    pickle.dump(
        dict(
            vi_train_tensor=vi_train_tensor,
            vi_dev_tensor=vi_dev_tensor,
            vi_test_tensor=vi_test_tensor,
        ),
        handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )