In [1]:
#STEP ONE : IMPORTS

In [106]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import tensorflow.keras.backend as K

from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, RepeatVector
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.losses import categorical_crossentropy

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from tensorflow.keras.utils import to_categorical



In [3]:
#STEP 2 : FILE READING AND DATA COLLECTION.

In [80]:
english_french = pd.read_csv('C:/Users/user/Desktop/AI and Data Science Workshop/MyNLPModel/data/eng_-french.csv')
english_french = english_french.sample(4000)
print(english_french.head()) 

                               English words/sentences  \
42567                           I've run out of money.   
158190  Tell Tom that he needs to wear a tie tomorrow.   
156947   Who's that cute guy I saw you with yesterday?   
85670                     There is no reason to shout.   
38529                            What a big boy he is!   

                                   French words/sentences  
42567                     Je suis tombé à court d'argent.  
158190  Dites à Tom qu'il lui faut porter une cravate ...  
156947  Qui est ce mignon garçon avec lequel je t'ai v...  
85670                       Il n'y a pas besoin de crier.  
38529                          Quel grand garçon il est !  


In [5]:
#STEP 3 : MAKING THE WORDS LOWERCASE

In [81]:
english_french['French words/sentences'] = english_french['French words/sentences'].str.lower()
english_french['English words/sentences'] = english_french['English words/sentences'].str.lower()

In [82]:
english_french.shape

(4000, 2)

In [83]:
#STEP 4 : TOKENIZING THE DATA

In [84]:
english_tokenizer = Tokenizer()
french_tokenizer = Tokenizer()

In [85]:
english_tokenizer.fit_on_texts(english_french['English words/sentences'])
french_tokenizer.fit_on_texts(english_french['French words/sentences'])

In [13]:
#STEP 5 : CONVERTING SENTENCES TO SEQUENCES.

In [86]:
X = english_tokenizer.texts_to_sequences(english_french['English words/sentences'])
y = french_tokenizer.texts_to_sequences(english_french['French words/sentences'])
english_french.head()

Unnamed: 0,English words/sentences,French words/sentences
42567,i've run out of money.,je suis tombé à court d'argent.
158190,tell tom that he needs to wear a tie tomorrow.,dites à tom qu'il lui faut porter une cravate ...
156947,who's that cute guy i saw you with yesterday?,qui est ce mignon garçon avec lequel je t'ai v...
85670,there is no reason to shout.,il n'y a pas besoin de crier.
38529,what a big boy he is!,quel grand garçon il est !


In [87]:
print("English sequences sample:", X[:5])
print("French sequences sample:", y[:5])

English sequences sample: [[83, 386, 68, 12, 118], [75, 8, 7, 14, 329, 3, 630, 5, 469, 193], [533, 7, 631, 756, 1, 280, 2, 32, 186], [60, 6, 53, 632, 3, 987], [19, 5, 281, 305, 14, 6]]
French sequences sample: [[1, 21, 668, 6, 490, 190], [429, 6, 14, 42, 65, 131, 560, 20, 837, 191], [33, 13, 12, 838, 315, 41, 669, 1, 179, 561, 164], [11, 126, 16, 3, 105, 2, 1718], [132, 180, 315, 11, 1719]]


In [88]:
max_eng_len = max(len(seq) for seq in X)
max_fr_len = max(len(seq) for seq in y)

In [75]:
#STEP 6 : PAD SEQUENCES.

In [89]:
X_padded = pad_sequences(X, maxlen=max_eng_len, padding='post')
y_padded = pad_sequences(y, maxlen=max_fr_len, padding='post')

# Print shapes
print("Max length of English sequences:", max_eng_len)
print("Max length of French sequences:", max_fr_len)
print("Shape of X_padded:", X_padded.shape)
print("Shape of y_padded:", y_padded.shape)

Max length of English sequences: 22
Max length of French sequences: 22
Shape of X_padded: (4000, 22)
Shape of y_padded: (4000, 22)


In [19]:
# Print shapes after padding
print("English padded shape:", X_padded.shape)
print("French padded shape:", y_padded.shape)

English padded shape: (500, 16)
French padded shape: (500, 22)


In [22]:
#STEP 10 : TRAINING THE MODEL

In [90]:
model = Sequential()
model.add(InputLayer(input_shape=(max_eng_len,)))
model.add(Embedding(input_dim=len(english_tokenizer.word_index) + 1, output_dim=128))
model.add(LSTM(128))
model.add(RepeatVector(max_fr_len))
model.add(LSTM(128, return_sequences=True))
model.add(Dense(len(french_tokenizer.word_index) + 1, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])





In [91]:
model.summary()

In [25]:
#STEP 12 : DEFINING THE PRE-PROCESS_INPUT FUNCTION

In [92]:
def preprocess_input(sentence, english_tokenizer, max_eng_len):
    tokens = sentence.lower().split()
    token_ids = [english_tokenizer.word_index.get(word, 0) for word in tokens]
    padded_token_ids = pad_sequences([token_ids], maxlen=max_eng_len, padding='post')
    return padded_token_ids


In [27]:
#STEP 13 : DEFINING THE DECODE_SEQUENCE FUNCTION

In [93]:
def decode_sequence(input_seq, model, french_tokenizer, max_fr_len):
    # Encode the input as state vectors.
    states_value = model.layers[2].predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = french_tokenizer.word_index['<start>']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = model.layers[4].predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = french_tokenizer.index_word.get(sampled_token_index, '')

        if sampled_char == '<end>' or len(decoded_sentence.split()) > max_fr_len:
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_char

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        states_value = [h, c]

    return decoded_sentence.strip()



In [29]:
# STEP 14 : TRANSLATE USER INPUT

In [94]:
def translate_user_input(input_sentence, model, english_tokenizer, french_tokenizer, max_eng_len, max_fr_len):
    encoded_input = preprocess_input(input_sentence, english_tokenizer, max_eng_len)
    translated_sentence = decode_sequence(encoded_input, model, french_tokenizer, max_fr_len)
    return translated_sentence


In [31]:
#STEP 11 : CHECKING THE DATA

In [95]:
model.summary()

In [96]:
#STEP 15 : RUNNING THE MODEL

In [107]:
user_input = input("Enter an English sentence to translate: ")
print(f"User Input: {user_input}")
translated_sentence = translate_user_input(user_input, model, english_tokenizer, french_tokenizer, max_eng_len, max_fr_len)
print(f"Translated to French: {translated_sentence}")

User Input: hello


AttributeError: 'RepeatVector' object has no attribute 'predict'