<a href="https://colab.research.google.com/github/MahdiFaourr/MahdiFaourr/blob/main/English_to_French_translate_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the repository that provides the "machine_learning_certification/"
# directory from which the English and French CSV files are read later on.
# NOTE: the leading "!" is IPython/Colab shell syntax; this line only runs
# inside a notebook and is not valid in a plain Python script.
!git clone https://github.com/zaka-ai/machine_learning_certification.git
# Import necessary libraries and functions.
import pandas as pd
import numpy as np
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, TimeDistributed, Dense, Bidirectional
import nltk
from nltk.tokenize import word_tokenize
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

# Download the tokenizer data that word_tokenize needs.
# Older NLTK releases load the pickled 'punkt' models, while recent releases
# look for the 'punkt_tab' resource instead and raise LookupError if it is
# missing, so both are fetched to keep the notebook working on either version.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

# Locations of the two sentence files inside the cloned repository.
path_to_English_data = "machine_learning_certification/Challenge 7/en.csv"
path_to_French_data = "machine_learning_certification/Challenge 7/fr.csv"

# Load each language into its own DataFrame.
# NOTE(review): read_csv treats the first row as a header by default; if the
# CSV files have no header row, the first sentence of each file ends up as a
# column name instead of data -- confirm, and pass header=None if so.
data_English = pd.read_csv(path_to_English_data)
data_French = pd.read_csv(path_to_French_data)

# Put the two frames side by side (aligned on the row index) and give the
# resulting columns readable names.
df = pd.concat([data_English, data_French], axis="columns")
df.columns = ["English", "French"]

# Translation table that deletes every character in string.punctuation.
# It is built once at import time so that applying the cleaner to a whole
# DataFrame column does not rebuild the table for every row.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def punctuation_remover(text: str) -> str:
    """Return *text* with all ``string.punctuation`` characters removed.

    Only the ASCII punctuation characters listed in ``string.punctuation``
    are deleted; nothing is inserted in their place, so apostrophes and
    hyphens inside a word simply vanish (``"l'automne"`` -> ``"lautomne"``).
    Whitespace and all other characters are left untouched.

    Args:
        text: The sentence to clean.

    Returns:
        The cleaned sentence.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from both sentence columns in place.
for _sentence_column in ('English', 'French'):
    df[_sentence_column] = df[_sentence_column].apply(punctuation_remover)

# Split every cleaned sentence into a list of word tokens, stored in a new
# "<language> Tokenize" column next to the cleaned text.
for _sentence_column, _token_column in (
    ('English', 'English Tokenize'),
    ('French', 'French Tokenize'),
):
    df[_token_column] = df[_sentence_column].apply(word_tokenize)

# Collect the distinct tokens of each language across all sentences.
# The comparison is case-sensitive because the tokens are used as produced
# by word_tokenize.
unique_english_words = {
    word for sentence in df['English Tokenize'] for word in sentence
}
unique_french_words = {
    word for sentence in df['French Tokenize'] for word in sentence
}

# Vocabulary sizes.
number_unique_english_words = len(unique_english_words)
number_unique_french_words = len(unique_french_words)

print("The Number of unique English words is:", number_unique_english_words)
print("The Number of unique French words is:", number_unique_french_words)

# Build one Keras tokenizer per language. num_words caps how many of the
# most frequent words are kept when sentences are converted to indices.
tokenizer_En = Tokenizer(num_words=10000)
tokenizer_Fr = Tokenizer(num_words=1000)

# Learn each vocabulary from the already-tokenized sentences.
tokenizer_En.fit_on_texts(df['English Tokenize'])
tokenizer_Fr.fit_on_texts(df['French Tokenize'])

# Model inputs: English sentences as integer sequences, zero-padded at the
# end to a fixed length of 30.
english_sequences = tokenizer_En.texts_to_sequences(df['English Tokenize'])
x_train = pad_sequences(english_sequences, maxlen=30, padding='post')

# Model targets: French sentences encoded and padded the same way.
french_sequences = tokenizer_Fr.texts_to_sequences(df['French Tokenize'])
y_train = pad_sequences(french_sequences, maxlen=30, padding='post')

# Create a sequence-to-sequence model
def create_model(input_vocab_size: int = 200,
                 output_vocab_size: int = 345,
                 sequence_length: int = 30):
    """Build and compile the English-to-French translation network.

    The network embeds each input word index, runs a bidirectional LSTM and a
    second LSTM over the sequence, and applies per-time-step dense layers
    ending in a softmax over the output vocabulary.

    Args:
        input_vocab_size: Number of distinct input indices, including the
            padding index 0. Every index fed to the model must be smaller
            than this value.
        output_vocab_size: Number of distinct target indices, including the
            padding index 0; this is the width of the final softmax.
        sequence_length: Length of the padded input sequences.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(input_dim=input_vocab_size, output_dim=60,
                        input_length=sequence_length))
    model.add(Bidirectional(LSTM(60, return_sequences=True)))
    model.add(LSTM(75, return_sequences=True))
    model.add(TimeDistributed(Dense(40, activation='relu')))
    model.add(TimeDistributed(Dense(45, activation='relu')))
    model.add(TimeDistributed(Dense(output_vocab_size, activation='softmax')))
    # Targets are integer word indices (not one-hot vectors), hence the
    # sparse variant of categorical cross-entropy.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


def _vocabulary_size(tokenizer) -> int:
    """Return how many distinct indices *tokenizer* can emit, counting index 0."""
    fitted_size = len(tokenizer.word_index) + 1  # +1 for the padding index 0
    if not tokenizer.num_words:
        return fitted_size
    # texts_to_sequences drops every word whose index is >= num_words.
    return min(tokenizer.num_words, fitted_size)


# Size the model from the fitted tokenizers and the padded training data
# instead of relying on hard-coded constants, so that no word index can fall
# outside the embedding table or the softmax output.
model = create_model(
    input_vocab_size=_vocabulary_size(tokenizer_En),
    output_vocab_size=_vocabulary_size(tokenizer_Fr),
    sequence_length=x_train.shape[1],
)
model.summary()

# Train the model
model.fit(x_train, y_train, validation_split=0.2, epochs=8, batch_size=32, shuffle=True, verbose=1)

# Function to translate English to French
def English_to_French_translator(input_text):
    """Translate an English sentence to French with the trained model.

    The sentence goes through the same preprocessing as the training data:
    punctuation removal, word tokenization, conversion to integer indices
    with ``tokenizer_En`` and zero-padding at the end to 30 positions.

    Args:
        input_text: The English sentence to translate.

    Returns:
        The predicted French sentence as a single string.
    """
    # Reuse the shared cleaner so inference matches training preprocessing.
    tokens = word_tokenize(punctuation_remover(input_text))
    # texts_to_sequences expects a batch, hence the one-element list.
    sequences = tokenizer_En.texts_to_sequences([tokens])
    padded_input_sequences = pad_sequences(sequences, maxlen=30, padding='post')
    predictions = model.predict(padded_input_sequences)
    # Pick the most probable French word index at every time step.
    translated_sequences = np.argmax(predictions, axis=-1)
    translated_text = tokenizer_Fr.sequences_to_texts(translated_sequences)
    # The batch holds a single sentence, so joining yields that sentence.
    return ''.join(translated_text)

# Smoke-test the translator on one sample sentence and show the result.
input_text = "she is driving the big truck"
translated_text = English_to_French_translator(input_text)
print("Translated text in French:", translated_text)

# Serialize the trained Keras model to model.pkl with pickle.
# NOTE(review): only the model is written here; the two tokenizers used by
# the translator are not saved by this block.
with open("model.pkl", mode="wb") as model_file:
    pickle.Pickler(model_file).dump(model)

