In [2]:
import pandas as pd
import numpy as np
from huggingface_hub.keras_mixin import keras
from sentencepiece import SentencePieceProcessor
from transformers import BertTokenizer, TFAutoModelForSeq2SeqLM, TFMarianMTModel, MarianTokenizer
import tensorflow as tf

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [3]:

eng_hung = pd.read_csv('/Users/tylerglaze/Documents/PSU/AI 574/LanguageDetector/SenPairs-Eng-Hung.tsv', sep='\t', header=None)
eng_hung = eng_hung.rename(columns={1: 'eng', 3: 'hung'})
eng_hung.drop([0, 2], axis=1, inplace=True)
eng_hung.head()
eng_hung.to_csv('eng_hung.csv', index=False)


In [4]:

print("TensorFlow version:", tf.__version__)
print("Built with Apple Silicon support:", tf.config.list_physical_devices('GPU'))

TensorFlow version: 2.19.0
Built with Apple Silicon support: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [5]:
eng_hung.head()

Unnamed: 0,eng,hung
0,I have to go to sleep.,Aludni kell mennem.
1,Muiriel is 20 now.,Muriel immár 20 éves.
2,Muiriel is 20 now.,Muiriel most 20 éves.
3,"The password is ""Muiriel"".",A jelszó: Muiriel.
4,"The password is ""Muiriel"".","A jelszó ""Muriel""."


In [6]:
eng_hung.head()
eng_hung.dropna()
# drop duplicates from each column
eng_hung = eng_hung.drop_duplicates(subset=['eng'])
eng_hung = eng_hung.drop_duplicates(subset=['hung'])
eng_hung.dropna()

Unnamed: 0,eng,hung
0,I have to go to sleep.,Aludni kell mennem.
1,Muiriel is 20 now.,Muriel immár 20 éves.
3,"The password is ""Muiriel"".",A jelszó: Muiriel.
5,I was in the mountains.,A hegyekben voltam.
6,You're in better shape than I am.,"Jobb formában vagy, mint én."
...,...,...
175841,"If he was a dictator, you'd be sucking up to him.","Ha diktátor lenne ő, benyalnád magad hozzá."
175842,How can we calculate the Earth's speed if we d...,Honnan tudjuk kiszámolni a Föld haladási sebes...
175843,How we doing today?,Hogy vagyunk ma?
175844,Mary can do a split.,Mari meg tudja csinálni a spárgát.


In [53]:
from transformers import TFAutoModelForSeq2SeqLM, MarianTokenizer, create_optimizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np

def train_translation_model(model_name, data_df, src_col, tgt_col, save_path):
    '''
    This function accepts a model name and a data frame and returns a trained model.
    :param model_name: the name of the marian model
    :param data_df: cleaned data frame of english and hungarian
    :param src_col: the name of the column in the df that contains the english sentence
    :param tgt_col: the name of the column in the df that contains the hungarian sentence
    :param save_path: the path to save the model
    :return:
    '''


    # Load model & tokenizer
    model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)

    # Sample down for quicker training during testing, comment out for full dataset
    data_df = data_df.sample(frac=0.5)

    # Train/validation split, (90, 10)
    train_df, val_df = train_test_split(data_df, test_size=0.1)

    # Tokenize input/output
    def tokenize(df):
        '''
        This function accepts a data frame and returns tokenized inputs and labels
        :param df:
        :return:
        '''
        # Tokenize inputs
        inputs = tokenizer(df[src_col].tolist(), return_tensors='tf', padding='max_length',
                           truncation=True, max_length=64)
        # Tokenize targets
        targets = tokenizer(df[tgt_col].tolist(), return_tensors='tf', padding='max_length',
                            truncation=True, max_length=64)
        # Create labels
        labels = targets['input_ids'].numpy()
        # Replace pad token with -100
        labels[labels == tokenizer.pad_token_id] = -100
        # Return inputs and labels as tensors
        return inputs, tf.convert_to_tensor(labels)

    # Tokenize inputs and labels
    train_inputs, train_labels = tokenize(train_df)
    # Tokenize inputs and labels
    val_inputs, val_labels = tokenize(val_df)

    # Optimizer setup
    # Steps per epoch
    steps_per_epoch = len(train_df) // 16
    # Define number of training steps
    num_train_steps = steps_per_epoch * 5
    # Define optimizer, use Adam, learning rate of 2e-5, no warmup
    optimizer, _ = create_optimizer(init_lr=3e-5, num_train_steps=num_train_steps, num_warmup_steps=0)

    # Compile model
    model.compile(optimizer=optimizer)

    # Train
    model.fit(
        # Input, labels, batch size, epochs
        x={ 'input_ids': train_inputs['input_ids'], 'attention_mask': train_inputs['attention_mask'] },
        y=train_labels,
        validation_data=(
            { 'input_ids': val_inputs['input_ids'], 'attention_mask': val_inputs['attention_mask'] },
            val_labels
        ),
        batch_size=32,
        epochs=10
    )

    # Save the model
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    return model, tokenizer

In [54]:
model, tokenizer = train_translation_model(
    model_name='Helsinki-NLP/opus-mt-en-hu',
    data_df=eng_hung,
    src_col='eng',
    tgt_col='hung',
    save_path='translation_model-en-hu'
)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hu.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):
<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x5e8604230>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/ops/map_fn.py", line 497, in map_fn
    _, r_a = while_loop.while_loop(  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/ops/while_loop.py", line 488, in while_loop
    loop_vars = body(*loop_vars)  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/ops/while_loop.py", line 479, in <lambda>
    body = lambda i, lv: (i + 1, orig_body(*lv))  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/ops/map_fn.py", line 495, in compute
    return (i + 1, tas)  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/util/tf_shoul

Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):
<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x5e8604230>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/ops/map_fn.py", line 497, in map_fn
    _, r_a = while_loop.while_loop(  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/ops/while_loop.py", line 488, in while_loop
    loop_vars = body(*loop_vars)  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/ops/while_loop.py", line 479, in <lambda>
    body = lambda i, lv: (i + 1, orig_body(*lv))  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/ops/map_fn.py", line 495, in compute
    return (i + 1, tas)  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/util/tf_shoul

Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):
<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x3d004ad80>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/ops/map_fn.py", line 497, in map_fn
    _, r_a = while_loop.while_loop(  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/ops/while_loop.py", line 488, in while_loop
    loop_vars = body(*loop_vars)  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/ops/while_loop.py", line 479, in <lambda>
    body = lambda i, lv: (i + 1, orig_body(*lv))  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/ops/map_fn.py", line 495, in compute
    return (i + 1, tas)  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/util/tf_shoul

Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):
<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x3d004ad80>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/ops/map_fn.py", line 497, in map_fn
    _, r_a = while_loop.while_loop(  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/ops/while_loop.py", line 488, in while_loop
    loop_vars = body(*loop_vars)  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/ops/while_loop.py", line 479, in <lambda>
    body = lambda i, lv: (i + 1, orig_body(*lv))  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/ops/map_fn.py", line 495, in compute
    return (i + 1, tas)  File "/Users/tylerglaze/tf-macos-env/lib/python3.12/site-packages/tensorflow/python/util/tf_shoul

Epoch 1/10


E0000 00:00:1745203986.723000 3073522 meta_optimizer.cc:967] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/AssignAddVariableOp.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62521]]}


In [7]:
# load model and tokenizer
model = TFAutoModelForSeq2SeqLM.from_pretrained('translation_model-en-hu')
tokenizer = MarianTokenizer.from_pretrained('translation_model-en-hu')

2025-04-21 01:09:33.400564: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2025-04-21 01:09:33.400597: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2025-04-21 01:09:33.400604: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
I0000 00:00:1745212173.400974 3462343 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1745212173.401019 3462343 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at translation_model-en-hu.
If your task is similar to the task the model of the checkpoint was trained on, yo

In [8]:
def translate_text(text, model, tokenizer, max_length=64, num_beams=5):
    '''
    This function accepts text and returns a translation using a trained model.
    :param text: The text to translate
    :param model: The trained model to use for translation
    :param tokenizer: The tokenizer to use for tokenization that was trained with the model
    :param max_length: the maximum length of the translation
    :param num_beams: the number of beams to use in the beam search
    :return:the translated text
    '''
    # Tokenize the input text
    input_ids = tokenizer(text, return_tensors='tf', padding=True, truncation=True, max_length=max_length)['input_ids']

    # Generate prediction
    outputs = model.generate(input_ids, max_length=max_length, num_beams=num_beams, early_stopping=True)

    # Decode and return the translated text
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text

In [9]:
# Test out the model on a sample sentence
sample_sentence = "How many people like cats?"
translated = translate_text(sample_sentence, model, tokenizer)
print("Translation:", translated)

Translation: Hányan szeretik a macskákat?


In [58]:
#sample down for testing
shortened_eng_hung = eng_hung.sample(frac=0.0001, random_state=25)
# Create a list of source and true translations
source_sentences = shortened_eng_hung['eng'].tolist()
true_translations = shortened_eng_hung['hung'].tolist()
# define batch size
batch_size = 32
# Initialize a list to store predicted translations
predicted_translations = []

In [10]:

def translate_batch(texts, model, tokenizer, max_length=64, num_beams=5):
    '''
    This function accepts a list of texts and returns a list of translations using a trained model.
    :param texts: the texts to translate
    :param model: the trained model
    :param tokenizer: the trained tokenizer
    :param max_length: the maximum length of the translation
    :param num_beams: the number of beams to use in the beam search
    :return: decoded translations
    '''
    # Tokenize batch
    inputs = tokenizer(texts, return_tensors='tf', padding=True, truncation=True, max_length=max_length)

    # Generate predictions, set to outputs
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True
    )

    # Decode outputs
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


In [11]:
import pandas as pd
from tqdm import tqdm

# Sample down for testing
shortened_eng_hung = eng_hung.sample(frac=0.002, random_state=26)

source_sentences = shortened_eng_hung['eng'].tolist()
batch_size = 32

# Initialize empty DataFrame
translation_results = pd.DataFrame(columns=["source", "translations"])

# Process in batches
for i in tqdm(range(0, len(source_sentences), batch_size)):
    batch = source_sentences[i:i+batch_size]
    #use the translate batch method
    translated_batch = translate_batch(batch, model, tokenizer)

    # scoped DataFrame from the batch
    batch_df = pd.DataFrame({
        "source": batch,
        "translations": translated_batch
    })

    # Push results to Translation Results
    translation_results = pd.concat([translation_results, batch_df], ignore_index=True)

100%|██████████| 9/9 [11:40<00:00, 77.83s/it]


In [12]:
translation_results

Unnamed: 0,source,translations
0,Stop ordering me around.,Ne rendelgess már!
1,Why did Tom go to Australia?,Tom miért ment Ausztráliába?
2,Did you think you could fool me?,"Azt gondoltad, hogy becsaphatsz?"
3,I see someone.,Látok valakit.
4,Did you find what you needed?,"Megtaláltad, ami kellett?"
...,...,...
267,The enemy has broken through the castle gate.,Az ellenség betörte a vár kaput.
268,I have a lot of pictures.,Sok képem van.
269,We must make up for lost time.,Be kell pórolnunk az elveszett időt.
270,This is the only guidebook that was recommende...,"Ez az egyetlen útikönyv, amit ajánlottak nekem..."


In [13]:
# Add the reference translations to the DataFrame
shortened_eng_hung = shortened_eng_hung.reset_index(drop=True)
translation_results["reference"] = shortened_eng_hung["hung"]

# Calculate BLEU Score
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# Prepare BLEU inputs
references = [[ref.split()] for ref in translation_results["reference"].tolist()]
hypotheses = [pred.split() for pred in translation_results["translations"].tolist()]

# Compute smoothed BLEU score
smoothie = SmoothingFunction().method4
bleu_score = corpus_bleu(references, hypotheses, smoothing_function=smoothie)

# Print and save BLEU score
print(f"Smoothed BLEU score: {bleu_score:.4f}")

with open("bleu_score.txt", "w") as f:
    f.write(f"Smoothed Corpus BLEU score: {bleu_score:.4f}")

Smoothed BLEU score: 0.3723


In [14]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import random

# Sampling 20 random rows from translation_results
sampled_df = translation_results.sample(n=20, random_state=42).reset_index(drop=True)

# Smoothing function for sentence-level BLEU
smoothie = SmoothingFunction().method4

# Iterate and compute
for i, row in sampled_df.iterrows():
    reference = row["reference"].split()
    prediction = row["translations"].split()

    bleu = sentence_bleu([reference], prediction, smoothing_function=smoothie)

    print(f"BLEU: {bleu:.4f}")
    print(f"Source: {row['source']}")
    print(f"Prediction: {row['translations']}")
    print(f"Reference: {row['reference']}")
    print("-" * 50)

BLEU: 0.2118
Source: I caught a carp in a net.
Prediction: Pontyot fogtam hálóban.
Reference: Pontyot fogtam hálóval.
--------------------------------------------------
BLEU: 0.0726
Source: I never really liked them.
Prediction: Sosem szerettem őket valóban.
Reference: Sosem kedveltem őket igazán.
--------------------------------------------------
BLEU: 1.0000
Source: Tell me where she lives.
Prediction: Mondd meg, hol lakik!
Reference: Mondd meg, hol lakik!
--------------------------------------------------
BLEU: 1.0000
Source: My house is your house.
Prediction: Az én házam a te házad.
Reference: Az én házam a te házad.
--------------------------------------------------
BLEU: 0.0000
Source: Shit stinks.
Prediction: Bűzlik a szar.
Reference: A szar büdös.
--------------------------------------------------
BLEU: 1.0000
Source: Whose number is this?
Prediction: Kinek a száma ez?
Reference: Kinek a száma ez?
--------------------------------------------------
BLEU: 0.0000
Source: Do you h