In [1]:
#STEP ONE : IMPORTS

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt

import keras as K
from keras import pad_sequences, Sequential, Embedding, LSTM, Dense, TimeDistributed, RepeatVector, InputLayer, Tokenizer, categorical_crossentropy, to_categorical, plot_model, layers

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression



In [3]:
#STEP 2 : FILE READING AND DATA COLLECTION.

In [2]:
english_french = pd.read_csv('C:/Users/user/Desktop/AI and Data Science Workshop/MyNLPModel/data/eng_-french.csv')
english_french = english_french.sample(4000)
english_french.head()

Unnamed: 0,English words/sentences,French words/sentences
113053,Your father seems very friendly.,Ton père semble être très amical.
131231,They were abandoned by their mother.,Ils ont été abandonnés par leur mère.
77184,It happened quite recently.,C'est arrivé tout récemment.
130672,Refugees in Africa are seeking help.,Les réfugiés en Afrique recherchent de l'aide.
10786,I felt the same.,J'ai ressenti la même chose.


In [5]:
#STEP 3 : MAKING THE WORDS LOWERCASE

In [3]:
english_french['French words/sentences'] = english_french['French words/sentences'].str.lower()
english_french['English words/sentences'] = english_french['English words/sentences'].str.lower()

In [4]:
english_french.shape

(4000, 2)

In [5]:
#STEP 4 : TOKENIZING THE DATA

In [6]:
english_tokenizer = Tokenizer()
french_tokenizer = Tokenizer()

In [7]:
english_tokenizer.fit_on_texts(english_french['English words/sentences'])
french_tokenizer.fit_on_texts(english_french['French words/sentences'])

In [8]:
#STEP 5 : CONVERTING SENTENCES TO SEQUENCES.

In [9]:
X = english_tokenizer.texts_to_sequences(english_french['English words/sentences'])
y = french_tokenizer.texts_to_sequences(english_french['French words/sentences'])
english_french.head()

Unnamed: 0,English words/sentences,French words/sentences
113053,your father seems very friendly.,ton père semble être très amical.
131231,they were abandoned by their mother.,ils ont été abandonnés par leur mère.
77184,it happened quite recently.,c'est arrivé tout récemment.
130672,refugees in africa are seeking help.,les réfugiés en afrique recherchent de l'aide.
10786,i felt the same.,j'ai ressenti la même chose.


In [10]:
print("English sequences sample:", X[:5])
print("French sequences sample:", y[:5])

English sequences sample: [[21, 179, 388, 51, 977], [39, 46, 1432, 78, 245, 231], [17, 153, 256, 619], [978, 14, 1433, 20, 1434, 64], [1, 468, 4, 257]]
French sequences sample: [[76, 147, 320, 46, 60, 1096], [52, 80, 77, 1764, 78, 228, 204], [25, 192, 35, 842], [20, 1097, 21, 1765, 1098, 2, 428], [22, 1766, 8, 99, 81]]


In [11]:
max_eng_len = max(len(seq) for seq in X)
max_fr_len = max(len(seq) for seq in y)

In [15]:
#STEP 6 : PAD SEQUENCES.

In [12]:
X_padded = pad_sequences(X, maxlen=max_eng_len, padding='post')
y_padded = pad_sequences(y, maxlen=max_fr_len, padding='post')

# Print shapes
print("Max length of English sequences:", max_eng_len)
print("Max length of French sequences:", max_fr_len)
print("Shape of X_padded:", X_padded.shape)
print("Shape of y_padded:", y_padded.shape)

Max length of English sequences: 25
Max length of French sequences: 28
Shape of X_padded: (4000, 25)
Shape of y_padded: (4000, 28)


In [13]:
#STEP 12 : DEFINING THE PRE-PROCESS_INPUT FUNCTION

In [14]:
def preprocess_input(sentence, english_tokenizer, max_eng_len):
    tokens = sentence.lower().split()
    token_ids = [english_tokenizer.word_index.get(word, 0) for word in tokens]
    padded_token_ids = pad_sequences([token_ids], maxlen=max_eng_len, padding='post')
    return padded_token_ids

In [19]:
# Print shapes after padding
print("English padded shape:", X_padded.shape)
print("French padded shape:", y_padded.shape)

English padded shape: (4000, 32)
French padded shape: (4000, 31)


In [20]:
#STEP 10 : TRAINING THE MODEL

In [22]:
y_padded_categorical = to_categorical(y_padded, num_classes=len(french_tokenizer.word_index) + 1)

model = Sequential()
model.add(InputLayer(input_shape=(max_eng_len,)))
model.add(Embedding(input_dim=len(english_tokenizer.word_index) + 1, output_dim=128))
model.add(LSTM(128))
model.add(RepeatVector(max_fr_len))
model.add(LSTM(128, return_sequences=True))
model.add(TimeDistributed(Dense(len(french_tokenizer.word_index) + 1, activation='softmax')))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_padded, y_padded_categorical, batch_size=64, epochs=20, validation_split=0.2)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])





Epoch 1/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 439ms/step - accuracy: 0.6914 - loss: 5.9831 - val_accuracy: 0.7617 - val_loss: 2.0692
Epoch 2/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 433ms/step - accuracy: 0.7553 - loss: 2.0680 - val_accuracy: 0.7617 - val_loss: 1.8582
Epoch 3/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 399ms/step - accuracy: 0.7594 - loss: 1.8271 - val_accuracy: 0.7617 - val_loss: 1.7782
Epoch 4/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 485ms/step - accuracy: 0.7565 - loss: 1.7760 - val_accuracy: 0.7617 - val_loss: 1.7729
Epoch 5/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 612ms/step - accuracy: 0.7650 - loss: 1.7307 - val_accuracy: 0.7699 - val_loss: 1.7480
Epoch 6/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 412ms/step - accuracy: 0.7669 - loss: 1.7103 - val_accuracy: 0.7699 - val_loss: 1.7357
Epoch 7/20
[1m50/50[

In [23]:
#STEP 13 : DEFINING THE DECODE_SEQUENCE FUNCTION

In [16]:
def decode_sequence(prediction, tokenizer):
    translated_sentence = []
    
    for word_prob in prediction:
        word_id = np.argmax(word_prob)
        if word_id == 0:
            break
        translated_sentence.append(tokenizer.index_word[word_id])
    
    return ' '.join(translated_sentence)


In [53]:
# STEP 14 : TRANSLATE USER INPUT

In [17]:
def translate_user_input(user_input, model, english_tokenizer, french_tokenizer, max_eng_len, max_fr_len):
    # Tokenize and pad the input sequence
    input_seq = english_tokenizer.texts_to_sequences([user_input])
    input_seq = pad_sequences(input_seq, maxlen=max_eng_len, padding='post')

    # Predict the translation
    prediction = model.predict(input_seq)

    # Decode the prediction to get the translated sentence
    translated_sentence = decode_sequence(prediction[0], french_tokenizer)
    
    return translated_sentence

In [27]:
#STEP 11 : CHECKING THE DATA

In [18]:
model.summary()

In [None]:
model.layers

In [20]:
model.plot(history.history['accuracy'])
model.plot(history.history['val_accuracy'])
model.title('model accuracy')
model.ylabel('accuracy')
model.xlabel('epoch')
model.legend(['train', 'test'], loc='upper left')
model.show()

AttributeError: module 'pydot' has no attribute 'InvocationException'

In [None]:
model.plot(history.history['loss'])
model.plot(history.history['val_loss'])
model.title('model loss')
model.ylabel('loss')
model.xlabel('epoch')
model.legend(['train', 'test'], loc='upper left')
model.show()

In [29]:
#STEP 15 : RUNNING THE MODEL

In [None]:
user_input = input("Enter an English sentence to translate: ")
translated_sentence = translate_user_input(user_input, model, english_tokenizer, french_tokenizer, max_eng_len, max_fr_len)
print(f"Translated to French: {translated_sentence}")