In [4]:
import pandas as pd
import nltk
nltk.download('punkt')
import numpy as np
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequencesqUF`q

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lucy1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
def load_data(path):
    input_file = path
    with open(input_file, "r") as f:
        data = f.read()
    return data.split('\n')

english_sentences = load_data(r'C:/Nullclass/data/english.txt')
french_sentences = load_data(r'C:/Nullclass/data/french.txt')

In [7]:
source_texts = english_sentences
target_texts = french_sentences

In [8]:
def clean_text(text):
    if isinstance(text, float) or isinstance(text, int):
        return ''
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator).lower()

In [9]:
def tokenize(x):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)
    return sequences, tokenizer

In [10]:
def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen=length, padding='post')

In [11]:
def preprocess(x,y):
    x=[clean_text(str(text)) for text in x]
    y=[clean_text(str(text)) for text in y]
    preprocess_x, x_tk=tokenize(x)
    preprocess_y, y_tk=tokenize(y)
    preprocess_x=pad(preprocess_x)
    preprocess_y=pad(preprocess_y)
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer = preprocess(source_texts,target_texts)

In [12]:
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 345


In [13]:
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Bidirectional,TimeDistributed,Dropout,GRU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy


def bidirectional_embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):

    model = Sequential()
    model.add(Embedding(english_vocab_size+1, 256, input_length=input_shape[1], input_shape=input_shape[1:]))
    model.add(Bidirectional(LSTM(256, return_sequences=True)))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size+1, activation='softmax')))

    model.compile(loss = sparse_categorical_crossentropy,
                  optimizer = Adam(0.005),
                  metrics = ['accuracy'])

    return model

tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2]))

embed_rnn_model = bidirectional_embed_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)

print(embed_rnn_model.summary())

embed_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)



  super().__init__(**kwargs)


None
Epoch 1/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m351s[0m 3s/step - accuracy: 0.5590 - loss: 2.1580 - val_accuracy: 0.8753 - val_loss: 0.4091
Epoch 2/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 3s/step - accuracy: 0.8829 - loss: 0.3756 - val_accuracy: 0.9351 - val_loss: 0.1986
Epoch 3/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m360s[0m 3s/step - accuracy: 0.9350 - loss: 0.2025 - val_accuracy: 0.9569 - val_loss: 0.1371
Epoch 4/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m374s[0m 3s/step - accuracy: 0.9558 - loss: 0.1394 - val_accuracy: 0.9668 - val_loss: 0.1070
Epoch 5/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m385s[0m 4s/step - accuracy: 0.9665 - loss: 0.1055 - val_accuracy: 0.9718 - val_loss: 0.0905
Epoch 6/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m356s[0m 3s/step - accuracy: 0.9720 - loss: 0.0875 - val_accuracy: 0.9763 - val_loss: 0.0774
Epoch 7/10
[1m10

<keras.src.callbacks.history.History at 0x1dd9aad8d70>

In [16]:
def logits_to_text(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

In [17]:
print("Prediciton:")
print(logits_to_text(embed_rnn_model.predict(tmp_x[:100])[0], french_tokenizer))

print("\nCorrect Translation:")
print(french_sentences[:1])

print('\nOriginal text:')
print(english_sentences[:1])

Prediciton:
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 655ms/step
new jersey est parfois calme pendant l automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]

Original text:
['new jersey is sometimes quiet during autumn , and it is snowy in april .']


In [18]:
embed_rnn_model.save('pretrained.keras')

In [19]:
print(embed_rnn_model.get_weights())

[array([[-0.00239147, -1.3862165 , -1.3040106 , ..., -0.50113237,
         0.02495333,  0.43675843],
       [ 0.15047267,  0.83134097, -0.1424689 , ..., -0.3543408 ,
        -0.63300884,  0.92535436],
       [ 0.1943003 ,  0.01013347, -0.0825605 , ...,  0.2122727 ,
         0.3016726 , -0.03523199],
       ...,
       [-0.27123454,  0.03688552,  0.17212953, ..., -0.2787341 ,
         0.6279647 ,  0.19578253],
       [-0.3075216 , -0.1625571 ,  0.06124796, ...,  0.10617936,
        -0.5926467 , -0.3868385 ],
       [-0.3873246 , -0.1927234 ,  0.12885958, ..., -0.37342831,
         0.0906743 , -0.3844197 ]], dtype=float32), array([[-0.02966088,  0.04311202, -0.08463567, ..., -0.00557168,
        -0.07933155, -0.1349866 ],
       [-0.06033138, -0.05540761,  0.01857993, ..., -0.00634943,
         0.03396818, -0.05120571],
       [ 0.07637933, -0.12657245,  0.05776694, ..., -0.04717188,
        -0.06075583,  0.03857426],
       ...,
       [-0.01963929, -0.0989877 ,  0.00030773, ...,  0.096