[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Masao-Taketani/JPN-EN-Transformer-translator/blob/master/notebooks/test/train.ipynb)

In [1]:
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 3.4MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.91


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
import sentencepiece as spm
import tensorflow as tf
from tensorflow.data.experimental import AUTOTUNE
from sklearn.model_selection import train_test_split

In [4]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64
jpn_sp_model = "drive/My Drive/deep_learning_models/JPN-EN-Transformer-translator/jpn_spm.model"
en_sp_model = "drive/My Drive/deep_learning_models/JPN-EN-Transformer-translator/en_spm.model"
jpn_sp = spm.SentencePieceProcessor()
en_sp = spm.SentencePieceProcessor()
jpn_sp.Load(jpn_sp_model)
en_sp.Load(en_sp_model)

True

In [11]:
def read_data(fpath):
  with open(fpath, "r") as f:
    return f.read()

def get_data(fpath):
  data = read_data(fpath)
  data_list = []
  for line in data.split("\n"):
    data_list.append(line)
  return data_list

def get_max_len_and_list(fpath):
  data = read_data(fpath)
  max_len = 0
  li = []
  for line in data.split("\n"):
    li.append(line)
    if max_len < len(line):
      max_len = len(line)
  return max_len, li

In [19]:
jpn_path = "drive/My Drive/deep_learning_models/JPN-EN-Transformer-translator/jpn_data.txt"
en_path = "drive/My Drive/deep_learning_models/JPN-EN-Transformer-translator/en_data.txt"
jpn_data = get_data(jpn_path)
en_data = get_data(en_path)
train_jpn, val_jpn, train_en, val_en = train_test_split(jpn_data, en_data, test_size=0.05)

print("train size:", len(train_jpn), "test size:", len(test_jpn))

train size: 142298 test size: 7490


In [74]:
### When you have tf.Tensor(string) and .numpy() method is used inside of the tf.py_function, 
it is converted to just a string. Not a numpy.
def encode(jpn, en):
  print("en", type(en.numpy()))
  jpn_enc = [jpn_sp.PieceToId("<s>")] + jpn_sp.EncodeAsIds(jpn.numpy()) + [jpn_sp.PieceToId("</s>")]
  en_enc = [en_sp.PieceToId("<s>")] + en_sp.EncodeAsIds(en.numpy()) + [en_sp.PieceToId("</s>")]
  return jpn_enc, en_enc

def tf_encode(jpn, en):
  print(jpn, en)
  result_jpn, result_en = tf.py_function(encode, [jpn, en], [tf.int64, tf.int64])
  result_jpn.set_shape([None])
  result_en.set_shape([None])
  return result_jpn, result_en

In [17]:
# test
t1, t2 = encode("こんにちは。今日は比較的涼しい日ですね。", "Hello. Today is a relatively cool day, isn't it?")
print(t1)
print(t2)

[1, 5, 101, 126, 10, 211, 6, 4, 2182, 3176, 7892, 314, 5439, 275, 93, 553, 4, 2]
[1, 4241, 4, 1344, 9, 8, 2767, 108, 1625, 103, 13, 9, 42, 10, 19, 29, 20, 2]


In [75]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_jpn, train_en))
train_dataset = train_dataset.map(tf_encode)
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(AUTOTUNE)

val_dataset = tf.data.Dataset.from_tensor_slices((val_jpn, val_en))
val_dataset = val_dataset.map(tf_encode)
val_dataset = val_dataset.padded_batch(BATCH_SIZE)

Tensor("args_0:0", shape=(), dtype=string) Tensor("args_1:0", shape=(), dtype=string)
Tensor("args_0:0", shape=(), dtype=string) Tensor("args_1:0", shape=(), dtype=string)
