In [63]:
#STEP ONE : IMPORTS

In [22]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import tensorflow.keras.backend as K

from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, RepeatVector
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.losses import categorical_crossentropy

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from tensorflow.keras.utils import to_categorical


In [65]:
#STEP 2 : FILE READING AND DATA COLLECTION.

In [23]:
english_french = pd.read_csv('C:/Users/user/Desktop/AI and Data Science Workshop/MyNLPModel/data/eng_-french.csv')
english_french = english_french.sample(500)
print(english_french.head()) 

                          English words/sentences  \
71366                  The post office is closed.   
83995                I'm afraid I have to go now.   
146715  How many people do you think there'll be?   
3068                                 You are big.   
95784              His schedule has been changed.   

                                French words/sentences  
71366                             La poste est fermée.  
83995     J'ai bien peur de devoir y aller maintenant.  
146715  Combien de personnes pensez-vous qu'il y aura?  
3068                                 Vous êtes grands.  
95784                      Son planning a été modifié.  


In [67]:
#STEP 3 : MAKING THE WORDS LOWERCASE

In [24]:
english_french['French words/sentences'] = english_french['French words/sentences'].str.lower()
english_french['English words/sentences'] = english_french['English words/sentences'].str.lower()

In [4]:
english_french.shape

(500, 2)

In [69]:
#STEP 4 : TOKENIZING THE DATA

In [5]:
english_tokenizer = Tokenizer()
french_tokenizer = Tokenizer()

In [25]:
english_tokenizer.fit_on_texts(english_french['English words/sentences'])
french_tokenizer.fit_on_texts(english_french['French words/sentences'])

In [73]:
#STEP 5 : CONVERTING SENTENCES TO SEQUENCES.

In [26]:
X = english_tokenizer.texts_to_sequences(english_french['English words/sentences'])
y = french_tokenizer.texts_to_sequences(english_french['French words/sentences'])
english_french.head()

Unnamed: 0,English words/sentences,French words/sentences
71366,the post office is closed.,la poste est fermée.
83995,i'm afraid i have to go now.,j'ai bien peur de devoir y aller maintenant.
146715,how many people do you think there'll be?,combien de personnes pensez-vous qu'il y aura?
3068,you are big.,vous êtes grands.
95784,his schedule has been changed.,son planning a été modifié.


In [75]:
#STEP 6 : PAD SEQUENCES.

In [45]:
max_len_english = max(len(seq) for seq in X)
max_len_french = max(len(seq) for seq in y)

X_padded = pad_sequences(X, maxlen=max_len_english, padding='post')
y_padded = pad_sequences(y, maxlen=max_len_french, padding='post')

In [77]:
#STEP 7 : CREATING WORD INDEX MAPPING AND CREATING REVERSE INDEX MAPPING.

In [44]:
english_word_to_index = english_tokenizer.word_index
french_word_to_index = french_tokenizer.word_index

english_index_to_word = {index: word for word, index in english_word_to_index.items()}
french_index_to_word = {index: word for word, index in french_word_to_index.items()}

In [29]:
print("English word-to-index mapping:")
print(dict(list(english_word_to_index.items())[:5]))
print("\nFrench word-to-index mapping:")
print(dict(list(french_word_to_index.items())[:5]))

English word-to-index mapping:
{'you': 1, 'i': 2, 'to': 3, 'the': 4, 'a': 5}

French word-to-index mapping:
{'je': 1, 'de': 2, 'vous': 3, 'pas': 4, 'que': 5}


In [79]:
#TEST 8 : SPLITTING THE DATA INTO TRAIN AND TEST SETS.

In [30]:
max_len = max(max_len_english, max_len_french)
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_padded, test_size=0.2, random_state=42)

In [81]:
#STEP 9 : DEFINING THE MODEL

In [38]:
#STEP 10 : TRAINING THE MODEL

In [43]:
model = Sequential()

model.add(Embedding(input_dim=(len(english_word_to_index)), output_dim=500))
model.add(LSTM(500))
model.add(Dense(len(french_word_to_index) + 1, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))


Epoch 1/10


ValueError: Arguments `target` and `output` must have the same shape. Received: target.shape=(None, 20), output.shape=(None, 1822)

In [39]:
#STEP 12 : DEFINING THE PRE-PROCESS_INPUT FUNCTION

In [14]:
def preprocess_input(sentence):
    tokens = word_tokenize(sentence.lower())
    token_ids = [english_word_to_index.get(word, 0) for word in tokens]  # Use 0 for unknown words
    padded_token_ids = pad_sequences([token_ids], maxlen=max_len, padding='post')
    print(f"Preprocessed input: {padded_token_ids}")  # Debug statement
    return padded_token_ids

In [40]:
#STEP 13 : DEFINING THE DECODE_SEQUENCE FUNCTION

In [15]:
def decode_sequence(input_seq):
    print(f"Input sequence to decode: {input_seq}")  # Debug statement
    decoded_sentence = []
    for idx in input_seq:
        if idx > 0 and idx in french_index_to_word:
            word = french_index_to_word[idx]
            decoded_sentence.append(word)
    translated_sentence = ' '.join(decoded_sentence)
    print(f"Decoded sentence: {translated_sentence}")  # Debug statement
    return translated_sentence

In [41]:
# STEP 14 : TRANSLATE USER INPUT

In [16]:
def translate_user_input(input_sentence):
    encoded_input = preprocess_input(input_sentence)
    predicted_output = model.predict(encoded_input)
    predicted_indices = np.argmax(predicted_output, axis=-1)
    print(f"Predicted indices: {predicted_indices}")  # Debug statement
    translated_sentence = decode_sequence(predicted_indices.flatten())
    return translated_sentence

In [85]:
#STEP 11 : CHECKING THE DATA

In [42]:
model.summary()

In [None]:
#STEP 15 : RUNNING THE MODEL

In [40]:
user_input = input("Enter an English sentence to translate: ")
translated_sentence = translate_user_input(user_input)
print(f"Translated to French: {translated_sentence}")

Preprocessed input: [[1327    0    1   27    3  336    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Predicted indices: [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
Input sequence to decode: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Decoded sentence: 
Translated to French: 
