In [1]:
#STEP 1 : IMPORTS

In [25]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import tensorflow.keras.backend as K

from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, RepeatVector
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.layers import InputLayer, Embedding, LSTM, RepeatVector, TimeDistributed, Dense, Attention, Concatenate, Input, Bidirectional

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from tensorflow.keras.utils import to_categorical


In [3]:
#STEP 2 : FILE READING AND DATA COLLECTION.

In [26]:
# Location of the parallel English/French corpus.
# NOTE(review): this is an absolute path on the author's machine; point it at
# your own copy of the CSV before running.
DATA_PATH = 'C:/Users/user/Desktop/AI and Data Science Workshop/MyNLPModel/data/eng_-french.csv'
SAMPLE_SIZE = 4000   # number of sentence pairs kept for the experiment
RANDOM_SEED = 42     # fixed so every run draws the same rows (and thus the same vocabulary)

full_corpus = pd.read_csv(DATA_PATH)
# Cap the sample at the corpus size so a smaller CSV does not raise ValueError.
english_french = full_corpus.sample(
    n=min(SAMPLE_SIZE, len(full_corpus)),
    random_state=RANDOM_SEED,
)
print(english_french.head())

                                  English words/sentences  \
80881                        Are you ready for Christmas?   
21864                                  Where is the book?   
51144                             We became good friends.   
51667                             Where did everybody go?   
163369  If I could, I would let every caged bird fly f...   

                                   French words/sentences  
80881                         Êtes-vous prête pour Noël ?  
21864                                  Où est le livret ?  
51144                            Nous devînmes bons amis.  
51667                     Où tout le monde est-il parti ?  
163369  Si je pouvais, je laisserais s'envoler libreme...  


In [5]:
#STEP 3 : MAKING THE WORDS LOWERCASE

In [27]:
# Normalise the case of both text columns in place so that "The" and "the"
# are treated as the same word downstream.
for column in ('French words/sentences', 'English words/sentences'):
    english_french[column] = english_french[column].str.lower()

In [28]:
# Sanity check: (number of sampled sentence pairs, number of columns).
english_french.shape

(4000, 2)

In [8]:
#STEP 4 : TOKENIZING THE DATA

In [29]:
# One independent tokenizer per language, both with the default settings,
# so each language gets its own word -> integer id vocabulary.
english_tokenizer, french_tokenizer = Tokenizer(), Tokenizer()

In [30]:
# Build each language's vocabulary from its own column of the sampled corpus.
for tokenizer, column in (
    (english_tokenizer, 'English words/sentences'),
    (french_tokenizer, 'French words/sentences'),
):
    tokenizer.fit_on_texts(english_french[column])

In [31]:
# Encode every sentence as a list of vocabulary ids:
# X holds the English (source) side, y the French (target) side.
X, y = (
    tokenizer.texts_to_sequences(english_french[column])
    for tokenizer, column in (
        (english_tokenizer, 'English words/sentences'),
        (french_tokenizer, 'French words/sentences'),
    )
)

In [32]:
# Vocabulary sizes: English on the first line, French on the second.
print(
    len(english_tokenizer.word_index),
    len(french_tokenizer.word_index),
    sep='\n',
)

2967
4513


In [13]:
#STEP 5 : CONVERTING SENTENCES TO SEQUENCES.

In [33]:
# X and y are already produced by the identical texts_to_sequences calls in
# the tokenizing step, so recomputing them here was redundant. Verify that the
# two sides are still aligned with the corpus instead.
assert len(X) == len(y) == len(english_french), \
    "each sentence pair must have exactly one English and one French sequence"
english_french.head()

Unnamed: 0,English words/sentences,French words/sentences
80881,are you ready for christmas?,êtes-vous prête pour noël ?
21864,where is the book?,où est le livret ?
51144,we became good friends.,nous devînmes bons amis.
51667,where did everybody go?,où tout le monde est-il parti ?
163369,"if i could, i would let every caged bird fly f...","si je pouvais, je laisserais s'envoler libreme..."


In [34]:
# Peek at the first five encoded sentences of each language.
print(f"English sequences sample: {X[:5]}")
print(f"French sequences sample: {y[:5]}")

English sequences sample: [[26, 2, 145, 20, 640], [107, 6, 4, 169], [24, 1004, 77, 258], [107, 38, 297, 45], [59, 1, 88, 1, 74, 104, 190, 1441, 791, 792, 393]]
French sequences sample: [[41, 4, 424, 23, 705], [90, 13, 8, 1755], [17, 1756, 859, 333], [90, 35, 8, 119, 13, 10, 580], [42, 1, 425, 1, 1757, 1758, 1759, 73, 18, 1111, 19, 1112]]


In [36]:
# Longest sentence (in tokens) per language; these become the padded widths.
max_eng_len = max(map(len, X))
max_fr_len = max(map(len, y))

In [75]:
#STEP 6 : PAD SEQUENCES.

In [37]:
# Right-pad ('post') every sequence to the longest sentence of its language
# so each side becomes one rectangular integer array.
X_padded = pad_sequences(X, padding='post', maxlen=max_eng_len)
y_padded = pad_sequences(y, padding='post', maxlen=max_fr_len)

# Report the padded widths and the resulting array shapes.
print(f"Max length of English sequences: {max_eng_len}")
print(f"Max length of French sequences: {max_fr_len}")
print(f"Shape of X_padded: {X_padded.shape}")
print(f"Shape of y_padded: {y_padded.shape}")

Max length of English sequences: 22
Max length of French sequences: 26
Shape of X_padded: (4000, 22)
Shape of y_padded: (4000, 26)


In [19]:
# Confirm the array shapes once padding has been applied.
print(f"English padded shape: {X_padded.shape}")
print(f"French padded shape: {y_padded.shape}")

English padded shape: (500, 16)
French padded shape: (500, 22)


In [22]:
#STEP 10 : BUILDING AND COMPILING THE MODEL

In [44]:
# Encoder-decoder translation model, declared as a single layer stack.
# Both vocabulary sizes get +1 because the tokenizer ids start at 1 and the
# sequences are padded with 0.
model = Sequential([
    InputLayer(input_shape=(max_eng_len,)),
    # Encoder: embed the English ids, then condense the sentence into one vector.
    Embedding(input_dim=len(english_tokenizer.word_index) + 1, output_dim=128),
    LSTM(128),
    # Decoder: feed that vector to every French position and emit one
    # probability distribution over the French vocabulary per position.
    RepeatVector(max_fr_len),
    LSTM(128, return_sequences=True),
    TimeDistributed(Dense(len(french_tokenizer.word_index) + 1, activation='softmax')),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [None]:
# summary is a method: without the call parentheses the cell only shows the
# bound-method repr instead of the layer table.
model.summary()

In [25]:
#STEP 12 : DEFINING THE PRE-PROCESS_INPUT FUNCTION

In [46]:
def preprocess_input(sentence):
    """Encode one English sentence as a padded id sequence for the model.

    The sentence is run through the fitted ``english_tokenizer`` so that it
    receives the same lower-casing and punctuation filtering as the training
    sentences. A plain ``str.split`` would leave punctuation attached
    (e.g. ``"book?"``), so such words would never match the vocabulary.

    Args:
        sentence: Raw English text.

    Returns:
        Integer array of shape ``(1, max_eng_len)``, zero-padded on the right.
        Words missing from the vocabulary are skipped, because the tokenizer
        is created without an ``oov_token``.
    """
    token_ids = english_tokenizer.texts_to_sequences([sentence])
    return pad_sequences(token_ids, maxlen=max_eng_len, padding='post')

In [27]:
#STEP 13 : DEFINING THE DECODE_SEQUENCE FUNCTION

In [52]:
# Define decode_sequence function
def decode_sequence(input_seq):
    """Turn a sequence of French vocabulary ids back into a sentence.

    The id -> word mapping is read from ``french_tokenizer.index_word``; the
    previously referenced ``french_index_to_word`` is not defined in this
    notebook and raised ``NameError`` on a fresh kernel.

    Args:
        input_seq: Iterable of integer ids (one sentence, not a batch).

    Returns:
        The decoded words joined by single spaces. Ids that are not positive
        (0 is the padding id) or not in the vocabulary are skipped.
    """
    print(f"Input sequence to decode: {input_seq}")  # Debug statement
    index_to_word = french_tokenizer.index_word
    decoded_sentence = [
        index_to_word[idx]
        for idx in input_seq
        if idx > 0 and idx in index_to_word
    ]
    translated_sentence = ' '.join(decoded_sentence)
    print(f"Decoded sentence: {translated_sentence}")  # Debug statement
    return translated_sentence


In [29]:
# STEP 14 : TRANSLATE USER INPUT

In [48]:
def translate_user_input(user_input, model, english_tokenizer, french_tokenizer, max_eng_len, max_fr_len):
    """Translate one English sentence by taking the most probable word per position.

    Args:
        user_input: Raw English sentence.
        model: Model whose ``predict`` returns per-position scores over the
            French vocabulary for a batch of padded id sequences.
        english_tokenizer: Fitted tokenizer used to encode ``user_input``.
        french_tokenizer: Fitted tokenizer whose ``index_word`` maps ids to words.
        max_eng_len: Width the encoded input is padded to.
        max_fr_len: Not used in the body; kept so existing calls keep working.

    Returns:
        The predicted French words joined by spaces, cut at the first
        position whose predicted id is 0 (the padding id).
    """
    # Encode the sentence as a one-row batch, right-padded to the model width.
    encoded = pad_sequences(
        english_tokenizer.texts_to_sequences([user_input]),
        maxlen=max_eng_len,
        padding='post',
    )

    # Highest-scoring vocabulary id at each output position of that single row.
    best_ids = np.argmax(model.predict(encoded), axis=-1)[0]

    words = []
    for word_id in best_ids:
        if word_id == 0:
            # Padding id: treat it as the end of the sentence.
            break
        words.append(french_tokenizer.index_word[word_id])

    return ' '.join(words).strip()


In [31]:
#STEP 11 : CHECKING THE MODEL ARCHITECTURE

In [49]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [33]:
#STEP 15 : RUNNING THE MODEL

In [None]:
# Ask for a sentence interactively, translate it with the model built above,
# and show the result.
user_input = input("Enter an English sentence to translate: ")
translated_sentence = translate_user_input(
    user_input=user_input,
    model=model,
    english_tokenizer=english_tokenizer,
    french_tokenizer=french_tokenizer,
    max_eng_len=max_eng_len,
    max_fr_len=max_fr_len,
)
print(f"Translated to French: {translated_sentence}")