In [1]:
#STEP 1 : IMPORTS

In [41]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import tensorflow.keras.backend as K

from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, RepeatVector, InputLayer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.utils import plot_model


from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from tensorflow.keras.utils import to_categorical



In [3]:
#STEP 2 : FILE READING AND DATA COLLECTION.

In [38]:
# Location of the English-French sentence-pair CSV; adjust for your machine.
DATA_PATH = 'C:/Users/user/Desktop/AI and Data Science Workshop/MyNLPModel/data/eng_-french.csv'
# Number of sentence pairs to keep, and the seed that makes the subset repeatable.
SAMPLE_SIZE = 4000
SAMPLE_SEED = 42

english_french = pd.read_csv(DATA_PATH)
# A fixed random_state makes every fresh run draw the same rows, so the
# vocabulary, sequence lengths and training curves below are reproducible.
# min() keeps the cell working when the file holds fewer rows than SAMPLE_SIZE.
english_french = english_french.sample(
    min(SAMPLE_SIZE, len(english_french)),
    random_state=SAMPLE_SEED,
)
english_french.head()

Unnamed: 0,English words/sentences,French words/sentences
87737,A little work won't kill you.,Un peu de travail ne te tuera pas.
122156,They make good use of their rooms.,Ils utilisent leurs chambres à bon escient.
27886,Bring me some water.,Apporte-moi de l'eau !
65244,What more could you want?,Que pourrais-tu vouloir de plus ?
81668,He didn't have a single pen.,Il ne disposait d'aucun stylo.


In [5]:
#STEP 3 : MAKING THE WORDS LOWERCASE

In [6]:
# Lower-case both text columns in place so casing never splits a word into two vocabulary entries.
english_french[['French words/sentences', 'English words/sentences']] = (
    english_french[['French words/sentences', 'English words/sentences']]
    .apply(lambda text_column: text_column.str.lower())
)

In [None]:
# Sanity check: display the (rows, columns) size of the sampled frame.
english_french.shape

(4000, 2)

In [8]:
#STEP 4 : TOKENIZING THE DATA

In [9]:
# One independent tokenizer per language, each with default settings.
english_tokenizer, french_tokenizer = Tokenizer(), Tokenizer()

In [10]:
# Build each language's vocabulary from its own column of the sampled frame.
french_tokenizer.fit_on_texts(
    english_french['French words/sentences']
)
english_tokenizer.fit_on_texts(
    english_french['English words/sentences']
)

In [11]:
#STEP 5 : CONVERTING SENTENCES TO SEQUENCES.

In [12]:
# Encode every sentence as a list of integer word ids:
# X holds the English inputs, y the French targets.
X, y = (
    english_tokenizer.texts_to_sequences(english_french['English words/sentences']),
    french_tokenizer.texts_to_sequences(english_french['French words/sentences']),
)
english_french.head()

Unnamed: 0,English words/sentences,French words/sentences
80498,you must study much harder.,tu dois étudier beaucoup plus.
14313,how's the family?,comment va la famille ?
86617,we leave tomorrow afternoon.,nous partons demain après-midi.
17076,what if i refuse?,et si je refuse ?
38661,what you say is true.,ce que tu dis est vrai.


In [13]:
# Peek at the first five encoded sentences of each language.
print(f"English sequences sample: {X[:5]}")
print(f"French sequences sample: {y[:5]}")

English sequences sample: [[2, 117, 535, 98, 1434], [536, 4, 304], [27, 172, 256, 537], [22, 70, 1, 981], [22, 2, 137, 6, 223]]
French sequences sample: [[9, 155, 1106, 100, 34], [67, 108, 11, 1718], [17, 1719, 333, 186, 554], [56, 50, 1, 1720], [13, 5, 9, 257, 12, 216]]


In [14]:
# Longest encoded sentence per language; used below as the padding width.
max_eng_len = max(map(len, X))
max_fr_len = max(map(len, y))

In [15]:
#STEP 6 : PAD SEQUENCES.

In [16]:
# Pad at the end ('post') so every sequence reaches its language's maximum length.
X_padded, y_padded = (
    pad_sequences(X, maxlen=max_eng_len, padding='post'),
    pad_sequences(y, maxlen=max_fr_len, padding='post'),
)

# Print shapes
print(f"Max length of English sequences: {max_eng_len}")
print(f"Max length of French sequences: {max_fr_len}")
print(f"Shape of X_padded: {X_padded.shape}")
print(f"Shape of y_padded: {y_padded.shape}")

Max length of English sequences: 32
Max length of French sequences: 31
Shape of X_padded: (4000, 32)
Shape of y_padded: (4000, 31)


In [17]:
#STEP 12 : DEFINING THE PREPROCESS_INPUT FUNCTION

In [18]:
def preprocess_input(sentence, english_tokenizer, max_eng_len):
    """Encode one raw English sentence as a padded id sequence for the model.

    The sentence is encoded with ``english_tokenizer.texts_to_sequences``, the
    same call that produced the training sequences, so the tokenizer's own
    text normalisation (e.g. punctuation handling) is applied consistently.
    A plain ``str.split()`` would leave punctuation attached ("water.") and
    miss the vocabulary entry for the word.

    Words the tokenizer does not know are handled by the tokenizer itself
    (dropped unless it was built with an ``oov_token``) rather than being
    written as id 0, which is the value used for padding.

    Args:
        sentence: The English sentence to encode.
        english_tokenizer: Tokenizer already fitted on the English corpus.
        max_eng_len: Length to pad the encoded sentence to.

    Returns:
        The result of ``pad_sequences`` for a single sentence: one row of
        ``max_eng_len`` ids, padded at the end.
    """
    # texts_to_sequences expects a list of texts, hence the one-element list.
    token_ids = english_tokenizer.texts_to_sequences([sentence])
    return pad_sequences(token_ids, maxlen=max_eng_len, padding='post')

In [19]:
# Report the padded array shapes once more before building the model.
print(f"English padded shape: {X_padded.shape}")
print(f"French padded shape: {y_padded.shape}")

English padded shape: (4000, 32)
French padded shape: (4000, 31)


In [20]:
#STEP 10 : TRAINING THE MODEL

In [46]:
# Vocabulary sizes. The +1 leaves room for id 0, which the padded sequences use.
eng_vocab_size = len(english_tokenizer.word_index) + 1
fr_vocab_size = len(french_tokenizer.word_index) + 1

# One-hot encode the French targets, as required by categorical_crossentropy.
y_padded_categorical = to_categorical(y_padded, num_classes=fr_vocab_size)

# Encoder-decoder: the first LSTM summarises the English sentence into a single
# vector, RepeatVector copies it once per French position, and the second LSTM
# plus the per-timestep softmax predict one French word id per position.
model = Sequential()
model.add(InputLayer(input_shape=(max_eng_len,)))
model.add(Embedding(input_dim=eng_vocab_size, output_dim=128))
model.add(LSTM(128))
model.add(RepeatVector(max_fr_len))
model.add(LSTM(128, return_sequences=True))
model.add(TimeDistributed(Dense(fr_vocab_size, activation='softmax')))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): 'accuracy' is computed over every target position, padding
# included, so it likely overstates translation quality on short sentences.
# The model is compiled once, before fit; compiling again afterwards would only
# discard the optimizer state built up during training.
history = model.fit(X_padded, y_padded_categorical, batch_size=64, epochs=20, validation_split=0.2)



Epoch 1/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 525ms/step - accuracy: 0.7168 - loss: 5.8882 - val_accuracy: 0.7827 - val_loss: 1.9161
Epoch 2/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 398ms/step - accuracy: 0.7887 - loss: 1.8025 - val_accuracy: 0.7827 - val_loss: 1.7110
Epoch 3/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 395ms/step - accuracy: 0.7851 - loss: 1.6343 - val_accuracy: 0.7827 - val_loss: 1.6179
Epoch 4/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 376ms/step - accuracy: 0.7860 - loss: 1.5612 - val_accuracy: 0.7880 - val_loss: 1.5959
Epoch 5/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 394ms/step - accuracy: 0.7930 - loss: 1.5292 - val_accuracy: 0.7883 - val_loss: 1.5851
Epoch 6/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 386ms/step - accuracy: 0.7923 - loss: 1.5217 - val_accuracy: 0.7883 - val_loss: 1.5852
Epoch 7/20
[1m50/50[

In [23]:
#STEP 13 : DEFINING THE DECODE_SEQUENCE FUNCTION

In [52]:
def decode_sequence(prediction, tokenizer):
    """Turn per-position score rows into a space-separated sentence.

    For each row of ``prediction`` the highest-scoring id is taken. Decoding
    stops at the first row whose best id is 0; every earlier id is looked up
    in ``tokenizer.index_word``.

    Args:
        prediction: Iterable of score rows, one row per output position.
        tokenizer: Object exposing an ``index_word`` mapping from id to word.

    Returns:
        The decoded words joined by single spaces ('' if nothing was decoded).
    """
    def _words_until_stop():
        for step_scores in prediction:
            best_id = np.argmax(step_scores)
            if best_id == 0:
                # Id 0 marks the end of the decoded sentence.
                return
            yield tokenizer.index_word[best_id]

    return ' '.join(_words_until_stop())


In [53]:
# STEP 14 : TRANSLATE USER INPUT

In [54]:
def translate_user_input(user_input, model, english_tokenizer, french_tokenizer, max_eng_len, max_fr_len):
    """Translate a single English sentence with the trained model.

    Args:
        user_input: The English sentence to translate.
        model: Object whose ``predict`` is called on the padded id batch.
        english_tokenizer: Tokenizer used to encode ``user_input``.
        french_tokenizer: Tokenizer passed to ``decode_sequence`` to map ids to words.
        max_eng_len: Length the encoded input is padded to.
        max_fr_len: Accepted for interface compatibility; not used here.

    Returns:
        The decoded French sentence as a string.
    """
    # Encode the sentence as a one-row batch, padded at the end.
    encoded_batch = pad_sequences(
        english_tokenizer.texts_to_sequences([user_input]),
        maxlen=max_eng_len,
        padding='post',
    )

    # Only one sentence was submitted, so take the first prediction.
    first_prediction = model.predict(encoded_batch)[0]

    return decode_sequence(first_prediction, french_tokenizer)

In [27]:
#STEP 11 : INSPECTING THE MODEL

In [55]:
# Print the model's layer-by-layer summary.
model.summary()

In [56]:
# Draw the layer graph to model_plot.png with shapes and layer names shown.
# Requires the `pydot` package (`pip install pydot`); without it only an install hint is printed.
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

You must install pydot (`pip install pydot`) for `plot_model` to work.


In [29]:
#STEP 15 : RUNNING THE MODEL

In [30]:
# Ask for a sentence, translate it with the trained model, and show the result.
user_input = input("Enter an English sentence to translate: ")
translated_sentence = translate_user_input(
    user_input=user_input,
    model=model,
    english_tokenizer=english_tokenizer,
    french_tokenizer=french_tokenizer,
    max_eng_len=max_eng_len,
    max_fr_len=max_fr_len,
)
print(f"Translated to French: {translated_sentence}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Translated to French: je ne pas
