In [22]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import os
import numpy as np

In [2]:
# Dataset locations, relative to the notebook's working directory.
# Built with os.path.join so the separator is correct on every OS; a literal
# backslash (as in r"Data\Hebrew") is only a path separator on Windows.
hebrew_folder_path = os.path.join("Data", "Hebrew")
english_folder_path = os.path.join("Data", "English")
dataset_path = os.path.join("Data", "Modified")

In [70]:
def combine_csvs(path):
    """Read every CSV file directly inside `path` and stack them into one DataFrame.

    Files are visited in sorted name order so the resulting row order is
    reproducible (os.listdir makes no ordering guarantee). Sub-directories
    and entries without a ``.csv`` extension are skipped instead of being
    handed to ``pd.read_csv``.

    Args:
        path: Directory that contains the CSV files.

    Returns:
        A DataFrame holding the rows of all files, concatenated with a fresh
        0..n-1 index. An empty DataFrame if the directory has no CSV files.
    """
    frames = []
    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)
        if not (os.path.isfile(file_path) and file.lower().endswith(".csv")):
            continue
        frames.append(pd.read_csv(file_path))

    if not frames:
        # pd.concat raises ValueError on an empty list; keep returning an
        # empty frame for an empty directory.
        return pd.DataFrame({})

    # Concatenate once at the end: growing a frame inside the loop re-copies
    # all previously loaded rows for every file.
    return pd.concat(frames, ignore_index=True)

# Merge every CSV under dataset_path into a single frame; later cells read
# its 'Hebrew' and 'English' columns. The bare `df` renders a preview.
df = combine_csvs(dataset_path)
df

Unnamed: 0,Hebrew,English,Safe,Hebrew_st,Hebrew_e,English_st,English_e
0,", ג'ול ורנס כתב פעם","Jules Verne once wrote,",True,"00:01:05,990","00:01:07,920","00:01:06,109","00:01:07,444"
1,",שים שתי ספינות בים הפתוח""","""Put two ships in the open sea",True,"00:01:07,920","00:01:10,420","00:01:08,027","00:01:10,989"
2,",ללא רוח או שפל"", .""מתישהו, הן ייפגשו","without wind or tide,, they will come together.""",True,"00:01:10,420","00:01:13,760","00:01:11,156","00:01:13,575"
3,".ככה הוריי הכירו, .כמו שתי ספינות שנועדו להיפגש","That's how my parents met., Like two ships des...",True,"00:01:35,850","00:01:40,520","00:01:36,181","00:01:40,268"
4,".זה בסדר, זה בסדר",It's okay.,True,"00:01:54,400","00:01:55,600","00:01:54,282","00:01:55,283"
...,...,...,...,...,...,...,...
5383,ומה,And what...,True,"02:10:34,342","02:10:35,809","02:10:34,661","02:10:36,046"
5384,סיזר,Caesar...,True,"02:10:38,176","02:10:39,509","02:10:38,415","02:10:39,917"
5385,עשה,did...,True,"02:10:42,743","02:10:43,676","02:10:43,086","02:10:44,179"
5386,.עבורנו,for us.,True,"02:10:45,609","02:10:47,176","02:10:45,839","02:10:47,341"


In [6]:
def tokenize(lang):
    """Fit a word-level tokenizer on `lang` and encode it as padded id sequences.

    Args:
        lang: Iterable of sentences (strings) in a single language.

    Returns:
        (tensor, tokenizer): `tensor` is the 2-D array produced by
        pad_sequences, zero-padded at the end of each row to the longest
        sentence; `tokenizer` is the fitted Keras Tokenizer holding the
        vocabulary.
    """
    # filters='' turns off the Tokenizer's default character filtering, so
    # punctuation attached to a word stays part of that token.
    lang_tokenizer = Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    id_sequences = lang_tokenizer.texts_to_sequences(lang)
    padded = pad_sequences(id_sequences, padding='post')
    return padded, lang_tokenizer

In [8]:
# Hebrew is the source (model input) language, English the target; each
# language gets its own tokenizer and its own padded id matrix.
input_tensor, inp_lang_tokenizer = tokenize(df['Hebrew'])
target_tensor, targ_lang_tokenizer = tokenize(df['English'])

In [13]:
# Hold out 20% of the sentence pairs for validation. The fixed random_state
# keeps the split identical across kernel restarts, so training runs are
# comparable and validation rows never move into the training set between runs.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42
)

# Train

In [39]:
# Training hyper-parameters
BATCH_SIZE = 64
BUFFER_SIZE = len(input_tensor_train)  # shuffle buffer covers the whole training set
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE
embedding_dim = 256
units = 1024
# +1 because tokenizer word indices start at 1; id 0 is the padding value.
vocab_inp_size = 1 + len(inp_lang_tokenizer.word_index)
vocab_tar_size = 1 + len(targ_lang_tokenizer.word_index)

# Build the Encoder-Decoder model, one layer at a time.
model = tf.keras.Sequential()
# Encoder: embed the source ids, then a bidirectional LSTM that returns only
# its final output (no return_sequences).
model.add(tf.keras.layers.Embedding(vocab_inp_size, embedding_dim))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units)))
# Bridge: repeat the encoder output once per target time step.
model.add(tf.keras.layers.RepeatVector(target_tensor_train.shape[1]))
# Decoder: bidirectional LSTM emitting one vector per step, followed by a
# softmax over the target vocabulary at every step.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units, return_sequences=True)))
model.add(tf.keras.layers.Dense(vocab_tar_size, activation='softmax'))

# Targets are integer ids (not one-hot), hence the sparse cross-entropy.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [40]:
# Create a tf.data dataset
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

# Train the model
model.fit(dataset, epochs=220, validation_data=(input_tensor_val, target_tensor_val), steps_per_epoch=steps_per_epoch)

Epoch 1/220
Epoch 2/220
Epoch 3/220
Epoch 4/220
Epoch 5/220
Epoch 6/220
Epoch 7/220
Epoch 8/220
Epoch 9/220
Epoch 10/220
Epoch 11/220
Epoch 12/220
Epoch 13/220
Epoch 14/220
Epoch 15/220
Epoch 16/220
Epoch 17/220
Epoch 18/220
Epoch 19/220
Epoch 20/220
Epoch 21/220
Epoch 22/220
Epoch 23/220
Epoch 24/220
Epoch 25/220
Epoch 26/220
Epoch 27/220
Epoch 28/220
Epoch 29/220
Epoch 30/220
Epoch 31/220
Epoch 32/220
Epoch 33/220
Epoch 34/220
Epoch 35/220
Epoch 36/220
Epoch 37/220
Epoch 38/220
Epoch 39/220
Epoch 40/220
Epoch 41/220
Epoch 42/220
Epoch 43/220
Epoch 44/220
Epoch 45/220
Epoch 46/220
Epoch 47/220
Epoch 48/220
Epoch 49/220
Epoch 50/220
Epoch 51/220
Epoch 52/220
Epoch 53/220
Epoch 54/220
Epoch 55/220
Epoch 56/220
Epoch 57/220
Epoch 58/220
Epoch 59/220
Epoch 60/220
Epoch 61/220
Epoch 62/220
Epoch 63/220
Epoch 64/220
Epoch 65/220
Epoch 66/220
Epoch 67/220
Epoch 68/220
Epoch 69/220
Epoch 70/220
Epoch 71/220
Epoch 72/220
Epoch 73/220
Epoch 74/220
Epoch 75/220
Epoch 76/220
Epoch 77/220
Epoch 78

<keras.src.callbacks.History at 0x23953a3f730>

In [82]:
def translate(sentence):
    """Translate one source-language sentence with the trained model.

    Args:
        sentence: A single sentence as a string, in the language the input
            tokenizer was fitted on.

    Returns:
        The decoded target-language text: the highest-scoring token at each
        output position, joined back into a string by the target tokenizer.
    """
    # Encode exactly like the training data: word ids, zero-padded at the end
    # to the training input width.
    encoded = inp_lang_tokenizer.texts_to_sequences([sentence])
    encoded = pad_sequences(encoded, padding='post', maxlen=input_tensor.shape[1])

    # Single-sentence batch, so take the first (only) prediction: one score
    # row per output position.
    step_scores = model.predict(encoded)[0]

    # Greedy decoding: best-scoring vocabulary id at every position.
    best_ids = np.argmax(step_scores, axis=-1).tolist()

    # Map the ids back to words.
    return targ_lang_tokenizer.sequences_to_texts([best_ids])[0]

# Example translation: smoke-test the trained model on a short Hebrew phrase
# and print the decoded English output.
print(translate("די כבר"))

don't knows. see?
