In [5]:
#STEP ONE : IMPORTS

In [6]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import tensorflow.keras.backend as K

from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, RepeatVector, InputLayer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.losses import categorical_crossentropy

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from tensorflow.keras.utils import to_categorical



In [7]:
#STEP 2 : FILE READING AND DATA COLLECTION.

In [8]:
english_french = pd.read_csv('C:/Users/user/Desktop/AI and Data Science Workshop/MyNLPModel/data/eng_-french.csv')
english_french = english_french.sample(4000)
print(english_french.head()) 

                                  English words/sentences  \
164589  Please turn off the lights when you leave the ...   
78016                         She wore a beautiful dress.   
102220                    How do you know I didn't do it?   
70013                          If you don't eat, you die.   
155497      Don't you want your children to learn French?   

                                   French words/sentences  
164589  Éteins les lumières quand tu quittes la pièce,...  
78016                        Elle a porté une belle robe.  
102220          Comment sais-tu que je ne l'ai pas fait ?  
70013                      Si tu ne manges pas, tu meurs.  
155497  Ne voulez-vous pas que vos enfants apprennent ...  


In [9]:
#STEP 3 : MAKING THE WORDS LOWERCASE

In [10]:
english_french['French words/sentences'] = english_french['French words/sentences'].str.lower()
english_french['English words/sentences'] = english_french['English words/sentences'].str.lower()

In [11]:
english_french.shape

(4000, 2)

In [12]:
#STEP 4 : TOKENIZING THE DATA

In [13]:
english_tokenizer = Tokenizer()
french_tokenizer = Tokenizer()

In [14]:
english_tokenizer.fit_on_texts(english_french['English words/sentences'])
french_tokenizer.fit_on_texts(english_french['French words/sentences'])

In [15]:
#STEP 5 : CONVERTING SENTENCES TO SEQUENCES.

In [16]:
X = english_tokenizer.texts_to_sequences(english_french['English words/sentences'])
y = french_tokenizer.texts_to_sequences(english_french['French words/sentences'])
english_french.head()

Unnamed: 0,English words/sentences,French words/sentences
164589,please turn off the lights when you leave the ...,"éteins les lumières quand tu quittes la pièce,..."
78016,she wore a beautiful dress.,elle a porté une belle robe.
102220,how do you know i didn't do it?,comment sais-tu que je ne l'ai pas fait ?
70013,"if you don't eat, you die.","si tu ne manges pas, tu meurs."
155497,don't you want your children to learn french?,ne voulez-vous pas que vos enfants apprennent ...


In [17]:
print("English sequences sample:", X[:5])
print("French sequences sample:", y[:5])

English sequences sample: [[65, 237, 144, 4, 1449, 76, 2, 102, 4, 214], [30, 996, 5, 468, 529], [38, 10, 2, 31, 1, 48, 10, 9], [57, 2, 17, 133, 2, 615], [17, 2, 27, 26, 310, 3, 330, 157]]
French sequences sample: [[1725, 19, 1726, 113, 10, 1727, 9, 379, 89, 35, 187], [28, 17, 1728, 20, 567, 669], [65, 66, 10, 5, 1, 8, 162, 3, 30], [37, 10, 8, 1111, 3, 10, 1729], [8, 148, 4, 3, 5, 175, 221, 1730, 7, 1731]]


In [18]:
max_eng_len = max(len(seq) for seq in X)
max_fr_len = max(len(seq) for seq in y)

In [19]:
#STEP 6 : PAD SEQUENCES.

In [20]:
X_padded = pad_sequences(X, maxlen=max_eng_len, padding='post')
y_padded = pad_sequences(y, maxlen=max_fr_len, padding='post')

# Print shapes
print("Max length of English sequences:", max_eng_len)
print("Max length of French sequences:", max_fr_len)
print("Shape of X_padded:", X_padded.shape)
print("Shape of y_padded:", y_padded.shape)

Max length of English sequences: 27
Max length of French sequences: 28
Shape of X_padded: (4000, 27)
Shape of y_padded: (4000, 28)


In [21]:
# Print shapes after padding
print("English padded shape:", X_padded.shape)
print("French padded shape:", y_padded.shape)

English padded shape: (4000, 27)
French padded shape: (4000, 28)


In [22]:
#STEP 10 : TRAINING THE MODEL

In [23]:
model = Sequential()
model.add(InputLayer(input_shape=(max_eng_len,)))
model.add(Embedding(input_dim=len(english_tokenizer.word_index) + 1, output_dim=128))
model.add(LSTM(128))
model.add(RepeatVector(max_fr_len))
model.add(LSTM(128, return_sequences=True))
model.add(Dense(len(french_tokenizer.word_index) + 1, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])





In [24]:
model.summary()

In [25]:
#STEP 12 : DEFINING THE PRE-PROCESS_INPUT FUNCTION

In [26]:
def preprocess_input(sentence, english_tokenizer, max_eng_len):
    tokens = sentence.lower().split()
    token_ids = [english_tokenizer.word_index.get(word, 0) for word in tokens]
    padded_token_ids = pad_sequences([token_ids], maxlen=max_eng_len, padding='post')
    return padded_token_ids


In [27]:
#STEP 13 : DEFINING THE DECODE_SEQUENCE FUNCTION

In [38]:
def decode_sequence(input_seq):
    print(f"Input sequence to decode: {input_seq}")  # Debug statement
    decoded_sentence = []
    for idx in input_seq:
        if idx > 0 and idx in french_index_to_word:
            word = french_index_to_word[idx]
            decoded_sentence.append(word)
    translated_sentence = ' '.join(decoded_sentence)
    print(f"Decoded sentence: {translated_sentence}")  # Debug statement
    return translated_sentence


In [29]:
# STEP 14 : TRANSLATE USER INPUT

In [39]:
def translate_user_input(user_input, model, english_tokenizer, french_tokenizer, max_eng_len, max_fr_len):
    # Tokenize and pad the input sequence
    input_seq = english_tokenizer.texts_to_sequences([user_input])
    input_seq = pad_sequences(input_seq, maxlen=max_eng_len, padding='post')

    # Predict the translation
    prediction = model.predict(input_seq)
    
    # Decode the prediction to get the translated sentence
    translated_seq = np.argmax(prediction, axis=-1)
    translated_sentence = ""
    
    for word_id in translated_seq[0]:
        if word_id == 0:
            break
        translated_sentence += french_tokenizer.index_word[word_id] + ' '
    
    return translated_sentence.strip()


In [31]:
#STEP 11 : CHECKING THE DATA

In [40]:
model.summary()

In [33]:
#STEP 15 : RUNNING THE MODEL

In [42]:
user_input = input("Enter an English sentence to translate: ")
translated_sentence = translate_user_input(user_input, model, english_tokenizer, french_tokenizer, max_eng_len, max_fr_len)
print(f"Translated to French: {translated_sentence}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Translated to French: devait devait devait fermerais inventé inventé inventé inventé inventé inventé inventé inventé inventé inventé inventé inventé inventé inventé inventé fermerais fermerais fermerais fermerais fermerais fermerais fermerais fermerais fermerais
