In [None]:
import pandas as pd
from pathlib import Path
from numpy import array
import numpy as np
import string
#data_path = "Downloads/raw.tar/raw/raw_text.txt"
data_path = "Downloads/raw.tar/raw/jpn.txt"
# Read the whole corpus file into memory as one UTF-8 decoded string
lines = Path(data_path).read_text(encoding='utf-8')

def to_lines(text):
  """Turn the raw corpus text into a list of rows, one list of fields per line.

  The text is stripped of surrounding whitespace, cut on newline characters,
  and every resulting line is cut on tab characters.
  """
  return [row.split('\t') for row in text.strip().split('\n')]

data = to_lines(lines)
# Report how many rows (one per line of the corpus file) were parsed
print(len(data))

108941


In [None]:
# Preview the first ten parsed rows
print(data[:10])
# Convert the list of rows into a NumPy array so whole columns can be sliced
array_data = np.array(data)
#array_data = np.delete(array_data,2,1)
print(array_data.shape)

[['Go.', '行け。', 'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #7421985 (Ninja)'], ['Go.', '行きなさい。', 'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #7421986 (Ninja)'], ['Hi.', 'こんにちは。', 'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #373351 (tommy_san)'], ['Hi.', 'もしもし。', 'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #385517 (mookeee)'], ['Hi.', 'やっほー。', 'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #3480285 (arnab)'], ['Hi.', 'こんにちは！', 'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #3480287 (arnab)'], ['Run.', '走れ。', 'CC-BY 2.0 (France) Attribution: tatoeba.org #4008918 (JSakuragi) & #5955868 (tatoebane)'], ['Run.', '走って！', 'CC-BY 2.0 (France) Attribution: tatoeba.org #4008918 (JSakuragi) & #5955869 (tatoebane)'], ['Who?', '誰？', 'CC-BY 2.0 (France) Attribution: tatoeba.org #2083030 (CK) & #638666 (arihato)'], ['Wow!', 'すごい！', 'CC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #214733 (arihato)

In [None]:
#preprocessing our data
import unicodedata

# Build the ASCII removal set once instead of once per sentence.
_ASCII_PUNCTUATION = frozenset(string.punctuation)

def _normalize_sentence(sentence):
  """Return `sentence` lower-cased with ASCII and Unicode punctuation removed."""
  # string.punctuation only lists ASCII characters, so marks such as '。', '！'
  # and '？' are matched through their Unicode general category ('P*') instead.
  kept = [ch for ch in sentence
          if ch not in _ASCII_PUNCTUATION
          and not unicodedata.category(ch).startswith('P')]
  return ''.join(kept).lower()

#Removing punctuation and making all characters lower case (both text columns)
array_data[:,0] = [_normalize_sentence(s) for s in array_data[:,0]]
array_data[:,1] = [_normalize_sentence(s) for s in array_data[:,1]]

# Alias (not a copy) of the cleaned array; used later for the train/test split
smaller_data_set = array_data

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
def tokenization(lines):
  """Create a Tokenizer, fit it on the given texts, and return it."""
  fitted = Tokenizer()
  fitted.fit_on_texts(lines)
  return fitted

medium_size_set = array_data

# Fit one tokenizer per text column: column 0 is English, column 1 is the
# sample (source) language.
eng_data_tokenized = tokenization(medium_size_set[:,0])
sample_data_tokenized = tokenization(medium_size_set[:,1])
# NOTE(review): the Tokenizer is created with default settings, which split on
# spaces. Japanese is normally written without spaces, so whole sentences may be
# treated as single tokens (the very large sample vocabulary printed below
# suggests this) - confirm and consider character-level tokenization.

# Vocabulary sizes: number of distinct words plus one extra slot
eng_vocab_size = 1 + len(eng_data_tokenized.word_index)
sample_vocab_size = 1 + len(sample_data_tokenized.word_index)

# Sequence lengths (in tokens) passed to the padding step for each language
eng_length = 47
sample_length = 54
print(eng_vocab_size, sample_vocab_size)


12159 89247


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def encode_sequences(tokenizer, length, lines):
  """Integer-encode `lines` with `tokenizer` and pad each sequence to `length`.

  Padding is appended after the tokens (padding='post'); `length` is passed
  to pad_sequences as maxlen.
  """
  return pad_sequences(tokenizer.texts_to_sequences(lines),
                       maxlen=length, padding='post')

In [None]:
from sklearn.model_selection import train_test_split

#Splitting data into train and test set to be used by our model
train_set, test_set = train_test_split(smaller_data_set, test_size=0.02, random_state=42)

# Column 1 is the sample (source) language, column 0 is English
train_sample = encode_sequences(sample_data_tokenized, sample_length, train_set[:,1])
train_english = encode_sequences(eng_data_tokenized, eng_length, train_set[:,0])

# The test set must be encoded with the same tokenizers as the training set.
# (The previous name `japan_data_tokenized` is never defined in this notebook.)
test_sample = encode_sequences(sample_data_tokenized, sample_length, test_set[:,1])
test_english = encode_sequences(eng_data_tokenized, eng_length, test_set[:,0])

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, RepeatVector, GRU, Bidirectional, TimeDistributed, Attention
#building our NMT model to be used to predict our english translation of our sample text
def define_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
  """Assemble the encoder-decoder translation network (uncompiled).

  in_vocab      -- size of the input embedding table
  out_vocab     -- width of the per-step softmax output
  in_timesteps  -- accepted for symmetry; not used when building the layers
  out_timesteps -- how many times the encoder output is repeated for decoding
  units         -- embedding width and LSTM size (per direction)
  """
  layers = [
    # Index 0 is masked so padded positions are skipped
    Embedding(in_vocab, units,  mask_zero=True),
    # Encoder: only the final state is kept (return_sequences=False)
    Bidirectional(LSTM(units, return_sequences=False)),
    RepeatVector(out_timesteps),
    # Decoder: emits one vector per output step
    Bidirectional(LSTM(units, return_sequences=True)),
    TimeDistributed(Dense(out_vocab, activation='softmax')),
  ]
  return Sequential(layers)

In [None]:
from tensorflow.keras import optimizers
from tensorflow.keras import metrics
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
model = define_model(sample_vocab_size, eng_vocab_size, sample_length, eng_length, 512)
# Build with the actual padded source length (not an unrelated constant) so the
# summary reflects the input the model is trained on
model.build((None, sample_length))
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()
adam = optimizers.Adam(learning_rate=.001)
# Targets are integer word ids, hence the sparse categorical loss
model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

None


In [None]:
# Train the model; the English targets get a trailing axis of size 1.
# NOTE(review): in the logged run val_loss rises after epoch 2 while training
# loss keeps falling - consider stopping earlier.
fit_model = model.fit(
  train_sample,
  np.expand_dims(train_english, axis=-1),
  epochs=10,
  batch_size=100,
  validation_split = 0.20,
)


Epoch 1/10
[1m855/855[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5920s[0m 7s/step - accuracy: 0.8778 - loss: 1.2713 - val_accuracy: 0.8886 - val_loss: 0.7983
Epoch 2/10
[1m855/855[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5831s[0m 7s/step - accuracy: 0.8897 - loss: 0.7617 - val_accuracy: 0.8879 - val_loss: 0.7943
Epoch 3/10
[1m855/855[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5874s[0m 7s/step - accuracy: 0.8923 - loss: 0.7077 - val_accuracy: 0.8813 - val_loss: 0.8161
Epoch 4/10
[1m855/855[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5723s[0m 7s/step - accuracy: 0.8967 - loss: 0.6497 - val_accuracy: 0.8807 - val_loss: 0.8212
Epoch 5/10
[1m855/855[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5717s[0m 7s/step - accuracy: 0.9034 - loss: 0.5733 - val_accuracy: 0.8826 - val_loss: 0.8481
Epoch 6/10
[1m855/855[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5640s[0m 7s/step - accuracy: 0.9115 - loss: 0.4920 - val_accuracy: 0.8816 - val_loss: 0.8877
Epoch 7/10
[1m8

In [None]:
# Run the model on the first 2000 encoded test sentences.
# (Despite its name, `training_sample` holds held-out test data.)
training_sample = test_sample[:2000]
preds = model.predict(training_sample)


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 642ms/step


In [None]:
#these predictions are sequences of integers. We need to convert these integers to their corresponding words
import numpy as np
# Most probable word id at every output position
classes_x = preds.argmax(axis=-1)
# Compare the first predicted id sequence with the first encoded reference
print(classes_x[0], test_english[0])
def get_word(n, tokenizer):
  """Return the word whose index is n[0], or None when no word has that index.

  n         -- indexable whose first element is the integer word id
  tokenizer -- object exposing `word_index` (word -> id) and, optionally,
               `index_word` (id -> word)
  """
  # A dictionary lookup replaces the scan over the whole vocabulary. The reverse
  # map is derived from word_index when the tokenizer does not provide one.
  index_word = getattr(tokenizer, 'index_word', None) or {
    index: word for word, index in tokenizer.word_index.items()
  }
  return index_word.get(n[0])

[ 11  54  18 711   9 293 900   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0] [23 70  3 35  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0]


In [None]:
#Convert our prediction results into English sentences
# Reverse map (word id -> word), fetched once so each token costs one dict
# lookup instead of repeated scans over the whole vocabulary.
index_to_word = eng_data_tokenized.index_word
preds_text = []
for token_ids in classes_x:
  words = []
  for position, token_id in enumerate(token_ids):
    word = index_to_word.get(token_id)
    # A token equal to the one right before it is treated as a repeat
    repeated = position > 0 and token_id == token_ids[position - 1]
    # Unknown ids (e.g. 0) and repeats become empty strings, so the joined
    # text keeps one slot per output position
    if word is None or repeated:
      words.append('')
    else:
      words.append(word)
  preds_text.append(' '.join(words))

In [None]:
# Keep the first 2000 held-out rows so they line up with the 2000 predictions
test_sample_english = test_set[:2000]
# Column 0 of each row is the reference English sentence
pred_df = pd.DataFrame({'actual': test_sample_english[:,0], 'predicted': preds_text})
#Sample our results
pred_df.sample(15)

Unnamed: 0,actual,predicted
1179,i rang the bell and waited,he is me of this same he ...
435,lets try one more time,lets try again ...
1672,does anybody have a kleenex,does anyone have a tissue ...
13,i dozed off in class,he happened about this and gets ...
1590,we stayed at a farm house,there is clear of this same heart ...
1673,i get up early in the morning,im an riser ...
1317,im working on it,tom is me of same he ...
1826,i lost my glasses,ive lost my glasses ...
1268,ive forgotten your number,i dialed your telephone number ...
962,mary was wearing a navy blue skirt,this was the matter of heart ...


In [None]:
# Save the predictions and the matching test rows as CSV files for the
# separate BLEU score script
# NOTE(review): `numpy` is not referenced anywhere in this cell
import numpy
# `df` is bound twice: first to the predicted sentences ...
df = pd.DataFrame(preds_text)
df.to_csv("Downloads/raw.tar/raw/exppred512jpn.csv")
# ... then to the held-out rows those predictions correspond to
df = pd.DataFrame(test_sample_english)
df.to_csv("Downloads/raw.tar/raw/exptestsample512jpn.csv")