In [3]:
import pandas as pd


file_path = '/content/translatedataset.csv'
data = pd.read_csv(file_path)


print(data.head())


                                                  de  \
0  Ursprünglich war die Schulhofsanierung sogar s...   
1  Von daher werden sie gegen ihren Ex-Coach sich...   
2  Sie sind nicht alle erfahrene Rennfahrer, sond...   
3  In seinem Brief macht Snowden den deutschen Be...   
4  Ein Anwohner im Bischof-Freundorfer-Weg meldet...   

                                                  en  
0  The school yard renovation was originally plan...  
1  Consequently, they will be particularly motiva...  
2  They're not all experienced racers, but people...  
3  The letter extends an offer to cooperate with ...  
4  The residents of the Bischof-Freundorfer-Weg r...  


In [4]:

print(data.info())

print(data.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3003 entries, 0 to 3002
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   de      3003 non-null   object
 1   en      3003 non-null   object
dtypes: object(2)
memory usage: 47.0+ KB
None
                                                       de  \
count                                                3003   
unique                                               3003   
top     Ursprünglich war die Schulhofsanierung sogar s...   
freq                                                    1   

                                                 en  
count                                          3003  
unique                                         3001  
top     However, speaking the truth is not a crime.  
freq                                              2  


In [5]:

print(data.isnull().sum())

data = data.dropna()



de    0
en    0
dtype: int64


In [6]:

source_texts = data['de']
target_texts = data['en']

for source, target in zip(source_texts.head(), target_texts.head()):
    print(f"Source: {source}\nTarget: {target}\n")


Source: Ursprünglich war die Schulhofsanierung sogar schon in den Jahren 2008/2009 geplant, doch hohe unplanmäßige Ausgaben brachten eine Verschiebung.
Target: The school yard renovation was originally planned back in 2008/2009, however, high unplanned expenses meant that the work had to be pushed back.

Source: Von daher werden sie gegen ihren Ex-Coach sicher ganz besonders motiviert sein.
Target: Consequently, they will be particularly motivated playing against their former coach.

Source: Sie sind nicht alle erfahrene Rennfahrer, sondern Leute, die auf der Suche nach Spannung und Abenteuer sind sowie nach einem erreichbaren Weg zu Weltklasse-Veranstaltungen.
Target: They're not all experienced racers, but people looking for excitement and adventure, and an achievable path towards world-class events.

Source: In seinem Brief macht Snowden den deutschen Behörden ein Angebot der Zusammenarbeit, „wenn die Schwierigkeiten rund um die humanitäre Situation gelöst wurden“.
Target: The lette

In [7]:
processed_file_path = '/content/processed_translatedataset.csv'
data.to_csv(processed_file_path, index=False)


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split


file_path = '/content/translatedataset.csv'
data = pd.read_csv(file_path)


data = data.dropna()

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


source_texts = train_data['de'].tolist()
target_texts = train_data['en'].tolist()


In [9]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


source_tokenizer = Tokenizer()
target_tokenizer = Tokenizer()


source_tokenizer.fit_on_texts(source_texts)
target_tokenizer.fit_on_texts(target_texts)


source_sequences = source_tokenizer.texts_to_sequences(source_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)


max_source_len = max(len(seq) for seq in source_sequences)
max_target_len = max(len(seq) for seq in target_sequences)

source_padded = pad_sequences(source_sequences, maxlen=max_source_len, padding='post')
target_padded = pad_sequences(target_sequences, maxlen=max_target_len, padding='post')


X_train = source_padded
y_train = target_padded


In [10]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Dropout
from tensorflow.keras.optimizers import Adam

vocab_size_source = len(source_tokenizer.word_index) + 1
vocab_size_target = len(target_tokenizer.word_index) + 1
embedding_dim = 256
hidden_units = 512


encoder_inputs = Input(shape=(max_source_len,))
encoder_embedding = Embedding(vocab_size_source, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]


decoder_inputs = Input(shape=(max_target_len,))
decoder_embedding = Embedding(vocab_size_target, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_target, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])


history = model.fit([X_train, y_train], y_train, epochs=10, batch_size=64, validation_split=0.2)


Epoch 1/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m272s[0m 9s/step - accuracy: 0.6226 - loss: 5.2514 - val_accuracy: 0.7059 - val_loss: 2.3493
Epoch 2/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m306s[0m 8s/step - accuracy: 0.7213 - loss: 2.1431 - val_accuracy: 0.7278 - val_loss: 2.1340
Epoch 3/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 8s/step - accuracy: 0.7283 - loss: 2.0253 - val_accuracy: 0.7281 - val_loss: 2.0528
Epoch 4/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 8s/step - accuracy: 0.7373 - loss: 1.8619 - val_accuracy: 0.7283 - val_loss: 1.9848
Epoch 5/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 8s/step - accuracy: 0.7360 - loss: 1.8132 - val_accuracy: 0.7495 - val_loss: 1.9403
Epoch 6/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 8s/step - accuracy: 0.7487 - loss: 1.7648 - val_accuracy: 0.7504 - val_loss: 1.9001
Epoch 7/10
[1m31/31[0m [32m━━━━

In [13]:

model.save('/content/translation_model.keras')


In [14]:
from tensorflow.keras.models import load_model


model = load_model('/content/translation_model.keras')


  saveable.load_own_variables(weights_store.get(inner_path))


In [15]:
optimizer = Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])


checkpoint_path = '/content/translation_model_checkpoint.keras'


checkpoint = ModelCheckpoint(filepath=checkpoint_path,
                             monitor='val_loss',
                             save_best_only=True,
                             save_weights_only=False,
                             verbose=1)


history = model.fit(
    [X_train, y_train],
    y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
    callbacks=[checkpoint]
)


test_loss, test_accuracy = model.evaluate([X_test, y_test], y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

Epoch 1/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8s/step - accuracy: 0.8020 - loss: 1.4283
Epoch 1: val_loss improved from inf to 1.69373, saving model to /content/translation_model_checkpoint.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 8s/step - accuracy: 0.8020 - loss: 1.4284 - val_accuracy: 0.7944 - val_loss: 1.6937
Epoch 2/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8s/step - accuracy: 0.7980 - loss: 1.4359
Epoch 2: val_loss improved from 1.69373 to 1.68760, saving model to /content/translation_model_checkpoint.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m283s[0m 9s/step - accuracy: 0.7981 - loss: 1.4353 - val_accuracy: 0.7960 - val_loss: 1.6876
Epoch 3/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8s/step - accuracy: 0.8040 - loss: 1.4016
Epoch 3: val_loss improved from 1.68760 to 1.68173, saving model to /content/translation_model_checkpoint.keras
[1m31/31[0m [3

NameError: name 'X_test' is not defined

In [17]:

model_save_path = '/content/translation_model_final.keras'


model.save(model_save_path)
print(f'Model saved to {model_save_path}')


Model saved to /content/translation_model_final.keras


In [18]:
from tensorflow.keras.models import load_model


model_load_path = '/content/translation_model_final.keras'


model = load_model(model_load_path)
print(f'Model loaded from {model_load_path}')


Model loaded from /content/translation_model_final.keras


In [19]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam


optimizer = Adam(learning_rate=0.00005)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])


checkpoint_path = '/content/translation_model_checkpoint.keras'


checkpoint = ModelCheckpoint(filepath=checkpoint_path,
                             monitor='val_loss',
                             save_best_only=True,
                             save_weights_only=False,
                             verbose=1)


history = model.fit(
    [X_train, y_train],
    y_train,
    epochs=1,
    batch_size=64,
    validation_split=0.2,
    callbacks=[checkpoint]
)


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8s/step - accuracy: 0.8146 - loss: 1.3079
Epoch 1: val_loss improved from inf to 1.63920, saving model to /content/translation_model_checkpoint.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m276s[0m 9s/step - accuracy: 0.8146 - loss: 1.3082 - val_accuracy: 0.8065 - val_loss: 1.6392


In [20]:

model_save_path = '/content/translation_model_final.keras'


model.save(model_save_path)
print(f'Model saved to {model_save_path}')


Model saved to /content/translation_model_final.keras


In [22]:
checkpoint = ModelCheckpoint(filepath=checkpoint_path,
                             monitor='val_loss',
                             save_best_only=True,
                             save_weights_only=False,
                             verbose=1)


history = model.fit(
    [X_train, y_train],
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
    callbacks=[checkpoint]
)

Epoch 1/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8s/step - accuracy: 0.8157 - loss: 1.2974
Epoch 1: val_loss improved from inf to 1.63557, saving model to /content/translation_model_checkpoint.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 8s/step - accuracy: 0.8157 - loss: 1.2978 - val_accuracy: 0.8070 - val_loss: 1.6356
Epoch 2/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8s/step - accuracy: 0.8142 - loss: 1.3082
Epoch 2: val_loss improved from 1.63557 to 1.63266, saving model to /content/translation_model_checkpoint.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m271s[0m 9s/step - accuracy: 0.8142 - loss: 1.3081 - val_accuracy: 0.8075 - val_loss: 1.6327
Epoch 3/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7s/step - accuracy: 0.8161 - loss: 1.2945
Epoch 3: val_loss improved from 1.63266 to 1.62995, saving model to /content/translation_model_checkpoint.keras
[1m31/31[0m [32m━

In [23]:

model_save_path = '/content/translation_model_final.keras'

model.save(model_save_path)
print(f'Model saved to {model_save_path}')

Model saved to /content/translation_model_final.keras
