In [None]:
!pip install datasets

In [6]:
!pip install pyarrow



In [1]:
import pandas as pd
import csv
# Load the dataset
df = pd.read_csv("sentence_level_data-30k.csv") 

# Save the English column as data.en
df['English'].to_csv("data.en", index=False, header=False) #, quoting=csv.QUOTE_NONE, escapechar='\\'

# Save the Dutch column as data.nl
df['Dutch'].to_csv("data.nl", index=False, header=False) #, quoting=csv.QUOTE_NONE, escapechar='\\'

print("Files created successfully: data.en and data.nl")

Files created successfully: data.en and data.nl


In [7]:
# Load the English and Dutch data
with open("data.en", "r", encoding="utf-8") as en_file:
    en_lines = en_file.readlines()

with open("data.nl", "r", encoding="utf-8") as nl_file:
    nl_lines = nl_file.readlines()

# Find the maximum length of a line in either file
max_en_length = max(len(line.strip()) for line in en_lines)
max_nl_length = max(len(line.strip()) for line in nl_lines)

# Output the results
print(f"Maximum number of characters in a single line in data.en: {max_en_length}")
print(f"Maximum number of characters in a single line in data.nl: {max_nl_length}")

# Get the overall max length from both files
max_length = max(max_en_length, max_nl_length)
print(f"Maximum length between both files: {max_length}")

Maximum number of characters in a single line in data.en: 2256
Maximum number of characters in a single line in data.nl: 1789
Maximum length between both files: 2256


In [1]:
from datasets import Dataset

# Load your data files
with open("data.en", "r", encoding="utf-8") as f:
    english_sentences = [line.strip() for line in f]

with open("data.nl", "r", encoding="utf-8") as f:
    dutch_sentences = [line.strip() for line in f]

# Verify both files have the same number of lines
assert len(english_sentences) == len(dutch_sentences), "Mismatched number of lines in data.en and data.nl"

# Create a dictionary with the parallel data for translation
data = {"translation": [{"en": src, "nl": tgt} for src, tgt in zip(english_sentences, dutch_sentences)]}

# Create the Hugging Face dataset
dataset = Dataset.from_dict(data)

# Check the first example to ensure it's loaded correctly
print(dataset[0])

  from .autonotebook import tqdm as notebook_tqdm


{'translation': {'en': 'This is our precious Auburn.', 'nl': 'Dit is onze dierbare Auburn.'}}


In [2]:
dataset.save_to_disk("en-nl-pairs.arrow")

Saving the dataset (1/1 shards): 100%|██████████| 30005/30005 [00:00<00:00, 2085509.84 examples/s]


In [3]:
from datasets import load_from_disk

# Load the dataset from the directory where your arrow files are located
dataset = load_from_disk("en-nl-pairs.arrow")

In [2]:
print(dataset[0])

{'translation': {'en': 'This is our precious Auburn.', 'nl': 'Dit is onze dierbare Auburn.'}}


In [4]:
from transformers import MarianTokenizer

# Load the tokenizer
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-nl")

# Define the tokenization function
def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["nl"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=1024, truncation=True, padding="max_length")
    return model_inputs

# Apply the preprocessing
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 30005/30005 [00:12<00:00, 2329.53 examples/s]


In [5]:
# Save the tokenized dataset to a directory
tokenized_dataset.save_to_disk("tokenized_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 30005/30005 [00:00<00:00, 84807.27 examples/s] 


In [6]:
from datasets import load_from_disk

# Load the dataset from the directory where your arrow files are located
tokenized_dataset = load_from_disk("tokenized_dataset")

In [7]:
# Split the dataset into training and temporary datasets (80% train, 20% temp)
split_data = tokenized_dataset.train_test_split(test_size=0.2)

# Now split the temporary dataset into validation and test sets (50% val, 50% test)
temp_dataset = split_data["test"]
val_test_split = temp_dataset.train_test_split(test_size=0.5)

# Assign the datasets
train_dataset = split_data["train"]
val_dataset = val_test_split["train"]
test_dataset = val_test_split["test"]

In [8]:
# Save the train dataset to a directory
train_dataset.save_to_disk("train_dataset_hf")
# Save the validation dataset to a directory
val_dataset.save_to_disk("validation_dataset_hf")
# Save the test dataset to a directory
test_dataset.save_to_disk("test_dataset_hf")

Saving the dataset (1/1 shards): 100%|██████████| 24004/24004 [00:00<00:00, 55776.73 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3000/3000 [00:00<00:00, 59549.70 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3001/3001 [00:00<00:00, 41012.37 examples/s]


In [9]:
from datasets import load_from_disk

# Load the dataset from the directory where your arrow files are located
train_dataset = load_from_disk("train_dataset_hf")
validation_dataset = load_from_disk("validation_dataset_hf")
test_dataset = load_from_disk("test_dataset_hf")

In [17]:
import datasets
print(isinstance(train_dataset, datasets.Dataset))

True


In [10]:
from transformers import MarianTokenizer

# Load the tokenizer
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-nl")

def preprocess_function(examples):
    # Flatten the translation dictionary
    model_inputs = tokenizer(examples['translation']['en'], padding='max_length', truncation=True, max_length=512)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['translation']['nl'], padding='max_length', truncation=True, max_length=512)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply the preprocessing function to the dataset
validating_dataset = validation_dataset.map(preprocess_function, remove_columns=["translation"])

Map: 100%|██████████| 3000/3000 [00:01<00:00, 2407.91 examples/s]


In [11]:
from transformers import MarianTokenizer

# Load the tokenizer
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-nl")

def preprocess_function(examples):
    # Flatten the translation dictionary
    model_inputs = tokenizer(examples['translation']['en'], padding='max_length', truncation=True, max_length=512)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['translation']['nl'], padding='max_length', truncation=True, max_length=512)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply the preprocessing function to the dataset
training_dataset = train_dataset.map(preprocess_function, remove_columns=["translation"])

Map: 100%|██████████| 24004/24004 [00:10<00:00, 2395.84 examples/s]


In [12]:
from transformers import MarianTokenizer

# Load the tokenizer
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-nl")

def preprocess_function(examples):
    # Flatten the translation dictionary
    model_inputs = tokenizer(examples['translation']['en'], padding='max_length', truncation=True, max_length=512)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['translation']['nl'], padding='max_length', truncation=True, max_length=512)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply the preprocessing function to the dataset
testing_dataset = test_dataset.map(preprocess_function, remove_columns=["translation"])

Map: 100%|██████████| 3001/3001 [00:01<00:00, 2259.70 examples/s]


In [13]:
# Save the train dataset to a directory
validating_dataset.save_to_disk("validation_dataset_hf_2transform")

Saving the dataset (1/1 shards): 100%|██████████| 3000/3000 [00:00<00:00, 171836.67 examples/s]


In [14]:
# Save the train dataset to a directory
training_dataset.save_to_disk("train_dataset_hf_2transform")

Saving the dataset (1/1 shards): 100%|██████████| 24004/24004 [00:00<00:00, 224482.77 examples/s]


In [15]:
# Save the train dataset to a directory
testing_dataset.save_to_disk("test_dataset_hf_2transform")

Saving the dataset (1/1 shards): 100%|██████████| 3001/3001 [00:00<00:00, 159042.57 examples/s]


In [None]:
!pip install wandb

In [3]:
import torch
print("PyTorch version:", torch.__version__)  # Check PyTorch version
print("CUDA available:", torch.cuda.is_available())  # Check if CUDA is available
print("CUDA version:", torch.version.cuda)  # CUDA version used by PyTorch
print("Number of CUDA devices:", torch.cuda.device_count())  # Number of available CUDA devices

PyTorch version: 2.2.0+cu121
CUDA available: True
CUDA version: 12.1
Number of CUDA devices: 1


In [2]:
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print(transformers.__version__)

4.45.2


In [6]:
from datasets import load_from_disk

# Load the dataset from the directory where your arrow files are located
train_dataset = load_from_disk("train_dataset_hf_2transform")
validation_dataset = load_from_disk("validation_dataset_hf_2transform")
test_dataset = load_from_disk("test_dataset_hf_2transform")

In [4]:
print(train_dataset[0])

{'input_ids': [47460, 161, 2, 643, 2, 337, 2, 0, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027, 67027,

In [None]:
import wandb
from transformers import MarianMTModel, TrainingArguments, Trainer

# Load the MarianMT model
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-nl")
# Define training arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="translation_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    fp16=True,
    logging_steps=500,
    dataloader_num_workers=3,  # Number of subprocesses to use for data loading.,
    save_total_limit=2,  # Keep only the last 2 checkpoints.,
    weight_decay=0.01,
    save_steps=1000,

    logging_dir="translation_logs"
)

# Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset
)

# Train the model
trainer.train()

In [None]:
!pip install transformers==4.46.2

In [4]:
import datasets
print(datasets.__version__)

3.0.1


In [None]:
!pip install evaluate


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
# from datasets import load_metric
from tqdm import tqdm
import evaluate
from datasets import Dataset

# Load your fine-tuned model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained('fine_tuned_en_nl_translation_model')
tokenizer = AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-nl')

# Load your test dataset
test_dataset = Dataset.load_from_disk('C:\\Users\\Simeon\\Desktop\\UNI\\YEAR 3\\SEM 1\\Software Engineering\\Project\\Task 2\\Fine-tuning\\test')

# Ensure the model is in evaluation mode
model.eval()

# Setup the metric (e.g., BLEU or ROUGE)
metric = evaluate.load("sacrebleu")  # BLEU is common for translation tasks

# Function to generate predictions and calculate metric scores
def evaluate_model(model, tokenizer, test_dataset, metric):
    results = []
    references = []
    predictions = []
    
    for example in tqdm(test_dataset):
        input_ids = torch.tensor(example['input_ids']).unsqueeze(0)
        attention_mask = torch.tensor(example['attention_mask']).unsqueeze(0)
        
        with torch.no_grad():
            # Generate the translation
            output_ids = model.generate(input_ids, attention_mask=attention_mask)
        
        # Decode the prediction and references
        prediction = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        reference = tokenizer.decode(example['labels'], skip_special_tokens=True)
        
        # Add to lists for metric calculation
        predictions.append(prediction)
        references.append([reference])  # SacreBLEU expects a list of references per prediction

    # Calculate the metric
    metric_result = metric.compute(predictions=predictions, references=references)
    return metric_result

# Run the evaluation
metric_result = evaluate_model(model, tokenizer, test_dataset, metric)

# Print the results
print("Evaluation Result:", metric_result)


Downloading builder script: 100%|██████████| 8.15k/8.15k [00:00<?, ?B/s]
100%|██████████| 3001/3001 [55:16<00:00,  1.11s/it]  


Evaluation Result: {'score': 49.401016267244536, 'counts': [39038, 29251, 23318, 18957], 'totals': [58544, 55543, 52556, 49592], 'precisions': [66.68147034708937, 52.66370199665124, 44.367912322094526, 38.225923536054204], 'bp': 1.0, 'sys_len': 58544, 'ref_len': 55808}


In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Define the directory where the model and tokenizer are stored (adjust if needed)
model_dir = 'C:\\Users\\Simeon\\Desktop\\UNI\\YEAR 3\\SEM 1\\Software Engineering\\Project\\Task 2\\Fine-tuning\\fine_tuned_en_nl_translation_model'

# Load your fine-tuned model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)  # Replace with the correct model directory if different
tokenizer = AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-nl')  # Or use your custom tokenizer if saved locally
print(type(model))
print(type(tokenizer))



  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


<class 'transformers.models.marian.modeling_marian.MarianMTModel'>
<class 'transformers.models.marian.tokenization_marian.MarianTokenizer'>


In [7]:
import time
start = time.time()
# Sample text for translation (adjust this as needed)
text="Meet Bella, a gentle, affectionate 4-year-old Labrador mix with a heart as warm as her golden fur. Bella is a loyal companion who’s as happy lounging by your side as she is on a walk in the park. She’s friendly with other dogs and loves meeting new people, making her a wonderful addition to any family. Bella is house-trained, understands basic commands, and has a calm demeanor that’s perfect for quieter households, though she’s always up for some playtime and belly rubs. Recently given a clean bill of health by the vet, she’s all set to find her forever home. Bella’s ideal family would be one that enjoys cuddles and can provide her with daily strolls to satisfy her curious nose. If you’re looking for a devoted friend with endless love to give, Bella might just be the perfect match. Come meet her and see for yourself!"
# Tokenize the input text
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

input_length = inputs["input_ids"].shape[1]
length_penaltyy = 1 + 2.75*(input_length / 100)
print("Input tokens:", input_length)
print("Input Text:", text)
print()
# Ensure the model is in evaluation mode
model.eval()

# Generate translation (output will be token ids)
with torch.no_grad():
    translated_ids = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], num_beams=4, length_penalty=length_penaltyy) #num_beams=3, length_penalty=30

# Decode the generated ids to text
translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)

# Print the translated text
print("Translated Text:", translated_text)

end = time.time()
print()
print("Time:", end-start)

Input tokens: 213
Input Text: Meet Bella, a gentle, affectionate 4-year-old Labrador mix with a heart as warm as her golden fur. Bella is a loyal companion who’s as happy lounging by your side as she is on a walk in the park. She’s friendly with other dogs and loves meeting new people, making her a wonderful addition to any family. Bella is house-trained, understands basic commands, and has a calm demeanor that’s perfect for quieter households, though she’s always up for some playtime and belly rubs. Recently given a clean bill of health by the vet, she’s all set to find her forever home. Bella’s ideal family would be one that enjoys cuddles and can provide her with daily strolls to satisfy her curious nose. If you’re looking for a devoted friend with endless love to give, Bella might just be the perfect match. Come meet her and see for yourself!

Translated Text: Maak kennis met Bella, een zachtaardige, aanhankelijke 4-jarige Labradormix met een hart dat zo warm is als haar gouden vac

In [None]:
#how to check a model's max capacity
print(model.config.max_position_embeddings)

In [None]:
#checking token length
from transformers import AutoTokenizer

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-nl')

# Sample sentence
sample_text = "Meet this sweet, loving elderly bulldog who’s recovering from a recent paw surgery. Despite his age, he’s full of gentle affection and enjoys cuddles. He’s currently in rehabilitation, taking slow, steady steps towards recovery. This resilient pup is looking for a cozy home to share his love."

# Tokenize the text and print token count
tokens = tokenizer.encode(sample_text, add_special_tokens=True)
print("Number of tokens:", len(tokens))

# Decode tokens to check how they match the original text
decoded_text = tokenizer.decode(tokens, skip_special_tokens=True)
print("Decoded text:", decoded_text)

In [115]:
import os
print(os.getcwd())

C:\Users\Simeon\Desktop\UNI\YEAR 3\SEM 1\Software Engineering\Project\Task 2\Fine-tuning


In [125]:
import urllib3
import requests

In [None]:
!pip install --upgrade --force-reinstall requests


In [146]:
!python -m unittest unit_testing_translation.py

Translation is consistent across different inferences.
Translation successfull, but emoji not handled correctly.
Empty input not handled correctly.
Model loaded correctly.
Tokenizer loaded correctly.
Model can translate longer sequences - translation has valid length.
Translation successful.


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
.......
----------------------------------------------------------------------
Ran 7 tests in 28.349s

OK
