In [None]:
#STEP 1 : IMPORTS

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import tensorflow.keras.backend as K

from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, RepeatVector
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.layers import InputLayer, Embedding, LSTM, RepeatVector, TimeDistributed, Dense

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from tensorflow.keras.utils import to_categorical


In [None]:
#STEP 2 : FILE READING AND DATA COLLECTION.

In [None]:
# Location of the English/French sentence-pair CSV.
# NOTE: this is an absolute, machine-specific path; point it at your local copy.
DATA_PATH = 'C:/Users/user/Desktop/AI and Data Science Workshop/MyNLPModel/data/eng_-french.csv'
SAMPLE_SIZE = 500   # number of sentence pairs kept for this experiment
RANDOM_SEED = 42    # same seed value as the train/test split further down

# Seeding the sample makes a fresh "Restart & Run All" pick the same rows every time;
# without it the vocabulary, padded lengths and training data change on each run.
english_french = pd.read_csv(DATA_PATH).sample(SAMPLE_SIZE, random_state=RANDOM_SEED)
print(english_french.head())

                                  English words/sentences  \
2685                                         No one came.   
166319  I would think you have other things to keep yo...   
21561                                  We want a rematch.   
104888                    She left home after three days.   
15832                                   No one will know.   

                                   French words/sentences  
2685                                    Personne ne vint.  
166319          J'imagine que d'autres choses t'occupent.  
21561                          Nous voulons une revanche.  
104888  Elle est partie de chez elle au bout de trois ...  
15832                                  Personne ne saura.  


In [None]:
#STEP 3 : MAKING THE WORDS LOWERCASE

In [None]:
# Lowercase both text columns of the DataFrame (French first, then English).
for text_column in ('French words/sentences', 'English words/sentences'):
    english_french[text_column] = english_french[text_column].str.lower()

In [None]:
# Display the (rows, columns) shape of the sampled, lowercased DataFrame.
english_french.shape

In [None]:
#STEP 4 : TOKENIZING THE DATA

In [None]:
# One independent tokenizer per language, each constructed with default arguments.
english_tokenizer, french_tokenizer = Tokenizer(), Tokenizer()

In [None]:
# Fit each tokenizer on its own language column (English first, then French).
for language_tokenizer, text_column in (
    (english_tokenizer, 'English words/sentences'),
    (french_tokenizer, 'French words/sentences'),
):
    language_tokenizer.fit_on_texts(english_french[text_column])

In [None]:
# Encode every sentence with its language's fitted tokenizer:
# X holds the English sequences, y the French ones.
X, y = (
    english_tokenizer.texts_to_sequences(english_french['English words/sentences']),
    french_tokenizer.texts_to_sequences(english_french['French words/sentences']),
)

In [None]:
# Number of distinct entries in each word index: English on the first line, French on the second.
print(len(english_tokenizer.word_index), len(french_tokenizer.word_index), sep='\n')

In [None]:
#STEP 5 : CONVERTING SENTENCES TO SEQUENCES.

In [None]:
# X and y were already produced by texts_to_sequences right after the tokenizers
# were fitted; recomputing them here yielded the exact same values, so the
# duplicate calls are dropped and this cell only previews the lowercased frame.
english_french.head()

Unnamed: 0,English words/sentences,French words/sentences
2685,no one came.,personne ne vint.
166319,i would think you have other things to keep yo...,j'imagine que d'autres choses t'occupent.
21561,we want a rematch.,nous voulons une revanche.
104888,she left home after three days.,elle est partie de chez elle au bout de trois ...
15832,no one will know.,personne ne saura.


In [None]:
# Peek at the first five encoded sentences of each language.
print(f"English sequences sample: {X[:5]}")
print(f"French sequences sample: {y[:5]}")

In [None]:

# Longest encoded sentence per language; used below as the padding length.
max_eng_len = max(map(len, X))
max_fr_len = max(map(len, y))

In [None]:
#STEP 6 : PAD SEQUENCES.

In [None]:
# Pad at the end ('post') so every sequence of a language has that language's
# maximum length: English first, then French.
X_padded, y_padded = (
    pad_sequences(sequences, maxlen=target_len, padding='post')
    for sequences, target_len in ((X, max_eng_len), (y, max_fr_len))
)


In [None]:
# Report the array shapes produced by padding.
print(f"English padded shape: {X_padded.shape}")
print(f"French padded shape: {y_padded.shape}")

English padded shape: (500, 18)
French padded shape: (500, 19)


In [None]:
# One class per French word-index entry, plus one for id 0 (the padding value).
french_vocab_size = len(french_tokenizer.word_index) + 1
# One-hot encode each padded French sequence and stack the results into one array.
y_one_hot = np.array(
    [to_categorical(padded_row, num_classes=french_vocab_size) for padded_row in y_padded]
)

In [None]:
# Hold out 20% of the sentence pairs; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y_one_hot,
    random_state=42,
    test_size=0.2,
)

In [None]:
#STEP 10 : TRAINING THE MODEL

In [None]:
# Build the network as one ordered list of layers.
model = Sequential([
    InputLayer(input_shape=(max_eng_len,)),
    # +1 because word-index ids start at 1 and id 0 is the padding value.
    Embedding(input_dim=len(english_tokenizer.word_index) + 1, output_dim=128),
    LSTM(128),
    # Repeat the first LSTM's output once per French output position.
    RepeatVector(max_fr_len),
    LSTM(128, return_sequences=True),
    # Softmax over the French vocabulary (plus padding id) at every position.
    TimeDistributed(Dense(len(french_tokenizer.word_index) + 1, activation='softmax')),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Validation metrics are computed on the held-out split each epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
)



Epoch 1/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 190ms/step - accuracy: 0.5021 - loss: 6.5684 - val_accuracy: 0.6289 - val_loss: 4.0047
Epoch 2/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 63ms/step - accuracy: 0.6393 - loss: 3.3984 - val_accuracy: 0.6289 - val_loss: 2.9482
Epoch 3/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 76ms/step - accuracy: 0.6347 - loss: 2.8127 - val_accuracy: 0.6289 - val_loss: 3.0049
Epoch 4/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 62ms/step - accuracy: 0.6380 - loss: 2.7244 - val_accuracy: 0.6289 - val_loss: 2.9084
Epoch 5/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 62ms/step - accuracy: 0.6472 - loss: 2.5411 - val_accuracy: 0.6289 - val_loss: 2.8245
Epoch 6/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 65ms/step - accuracy: 0.6419 - loss: 2.4829 - val_accuracy: 0.6289 - val_loss: 2.7763
Epoch 7/10
[1m13/13[0m [32m━━

In [None]:
# Score the trained model on the held-out split and report both metrics.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.6389 - loss: 2.7549
Test Loss: 2.7587995529174805
Test Accuracy: 0.6394737362861633


In [None]:
#STEP 12 : DEFINING THE PREPROCESS_INPUT FUNCTION

In [None]:
# Preprocess input sentence
def preprocess_input(sentence, tokenizer, max_len):
    """Encode one raw sentence as a padded batch of token ids.

    The sentence is encoded with ``tokenizer.texts_to_sequences``, the same call
    that produced the training sequences, so inference input is split and
    filtered exactly like the training data. The earlier implementation used a
    different word splitter and wrote id 0 (the padding value) in place of any
    token missing from the vocabulary.

    Args:
        sentence: Raw input text; it is lowercased before encoding.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``. How words
            outside its vocabulary are treated is decided by the tokenizer's
            own configuration.
        max_len: Target length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The ``pad_sequences`` result for a single-sentence batch, padded at the
        end ('post').
    """
    # Wrap the sentence in a list: texts_to_sequences works on a batch of texts.
    token_ids = tokenizer.texts_to_sequences([sentence.lower()])
    return pad_sequences(token_ids, maxlen=max_len, padding='post')

In [None]:
#STEP 13 : DEFINING THE DECODE_SEQUENCE FUNCTION

In [None]:
def decode_sequence(encoded_seq, tokenizer):
    """Turn a sequence of token ids back into a space-separated string.

    Args:
        encoded_seq: Iterable of integer token ids.
        tokenizer: Object whose ``word_index`` maps each word to its id.

    Returns:
        The words for every id that is greater than 0 and present in the
        vocabulary, in their original order, joined by single spaces. All
        other ids are skipped.
    """
    # Invert the word -> id mapping so ids can be looked up directly.
    id_to_word = {token_id: word for word, token_id in tokenizer.word_index.items()}
    return ' '.join(
        id_to_word[token_id]
        for token_id in encoded_seq
        if token_id > 0 and token_id in id_to_word
    )

In [None]:
# STEP 14 : TRANSLATE USER INPUT

In [None]:
def translate_user_input(input_sentence, model, english_tokenizer, french_tokenizer, max_eng_len, max_fr_len):
    """Translate one English sentence with the given model and return the French text.

    Args:
        input_sentence: Raw English text to translate.
        model: Object with a ``predict`` method that is fed the encoded input.
        english_tokenizer: Tokenizer used to encode ``input_sentence``.
        french_tokenizer: Tokenizer used to decode the predicted ids.
        max_eng_len: Padding length for the encoded input.
        max_fr_len: Accepted for call compatibility; not used in this function.

    Returns:
        The decoded French sentence as a single string.
    """
    model_input = preprocess_input(input_sentence, english_tokenizer, max_eng_len)
    # Take the highest-scoring index along the last axis of the prediction.
    best_token_ids = np.argmax(model.predict(model_input), axis=-1)
    # Flatten to one flat run of ids before decoding.
    return decode_sequence(best_token_ids.flatten(), french_tokenizer)

In [None]:
#STEP 11 : CHECKING THE MODEL SUMMARY

In [None]:
# Print the layer-by-layer summary of the model defined and trained above.
model.summary()

In [None]:
#STEP 15 : RUNNING THE MODEL

In [None]:
# Ask for a sentence interactively, translate it, and show the result.
user_input = input("Enter an English sentence to translate: ")
translated_sentence = translate_user_input(
    input_sentence=user_input,
    model=model,
    english_tokenizer=english_tokenizer,
    french_tokenizer=french_tokenizer,
    max_eng_len=max_eng_len,
    max_fr_len=max_fr_len,
)
print(f"Translated to French: {translated_sentence}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Translated to French: je je
