In [None]:
!pip install datasets

In [6]:
!pip install pyarrow



In [9]:
from datasets import Dataset

def _read_stripped_lines(path):
    """Return every line of the UTF-8 text file at ``path`` with surrounding whitespace removed."""
    with open(path, "r", encoding="utf-8") as handle:
        return [raw_line.strip() for raw_line in handle]


# The two files are read line by line; line i of one is paired with line i of the other.
english_sentences = _read_stripped_lines("SE4CSAI_2021_Practical/EN-NL/data/data.en")
dutch_sentences = _read_stripped_lines("SE4CSAI_2021_Practical/EN-NL/data/data.nl")

# Pairing by position only makes sense when both files have the same length.
assert len(english_sentences) == len(dutch_sentences), "Mismatched number of lines in data.en and data.nl"

# One "translation" column whose entries are {"en": ..., "nl": ...} records.
sentence_pairs = [
    {"en": english, "nl": dutch}
    for english, dutch in zip(english_sentences, dutch_sentences)
]
data = {"translation": sentence_pairs}

# Wrap the records in a Hugging Face dataset.
dataset = Dataset.from_dict(data)

# Quick sanity check: show the first record.
print(dataset[0])

{'translation': {'en': '"All citizens of the euro area will have to learn a new unit of account language and how to recognise the new euro coins and banknotes.', 'nl': '"Alle burgers van de eurozone zullen met de nieuwe munteenheid moeten leren omgaan en de nieuwe euromunststukken en -biljetten kunnen herkennen.'}}


In [20]:
# Persist the raw sentence pairs so later sessions can reload them instead of re-reading the text files.
dataset.save_to_disk(dataset_path="en-nl-pairs.arrow")

Saving the dataset (0/3 shards):   0%|          | 0/3528196 [00:00<?, ? examples/s]

In [None]:
!pip uninstall -y urllib3 boto3 botocore

In [None]:
!pip install "urllib3<2"

In [None]:
!pip install boto3 botocore

In [4]:
import boto3
import botocore
import urllib3

# Report which versions of the three packages the kernel actually imported.
installed_versions = (
    ("boto3 version:", boto3.__version__),
    ("botocore version:", botocore.__version__),
    ("urllib3 version:", urllib3.__version__),
)
for label, version in installed_versions:
    print(label, version)


boto3 version: 1.35.54
botocore version: 1.35.54
urllib3 version: 1.26.20


In [2]:
from datasets import load_from_disk

# Reload the sentence-pair dataset from the directory it was saved to.
dataset = load_from_disk(dataset_path="en-nl-pairs.arrow")

In [3]:
# Sanity check: show the first record of the dataset.
print(dataset[0])

{'translation': {'en': '"All citizens of the euro area will have to learn a new unit of account language and how to recognise the new euro coins and banknotes.', 'nl': '"Alle burgers van de eurozone zullen met de nieuwe munteenheid moeten leren omgaan en de nieuwe euromunststukken en -biljetten kunnen herkennen.'}}


In [4]:
from transformers import MarianTokenizer

# Tokenizer of the Helsinki-NLP/opus-mt-en-nl checkpoint; preprocess_function reads it as a global.
tokenizer = MarianTokenizer.from_pretrained(
    pretrained_model_name_or_path="Helsinki-NLP/opus-mt-en-nl"
)

# Define the tokenization function
def preprocess_function(examples):
    """Tokenize a batch of EN->NL pairs into model inputs and loss-ready labels.

    Args:
        examples: A batch whose ``"translation"`` entry is a list of
            ``{"en": ..., "nl": ...}`` dicts.

    Returns:
        The tokenizer output for the English sources (truncated/padded to 128
        tokens), with a ``"labels"`` entry holding the Dutch target ids in
        which every padding id has been replaced by -100.
    """
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["nl"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True, padding="max_length")

    # The targets are padded to a fixed length here, so the pad ids would
    # otherwise count as real tokens in the training loss. -100 is the label
    # value the cross-entropy loss ignores by default.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs

# Tokenize the whole dataset, handing preprocess_function batches of examples rather than single rows.
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)

Map: 100%|██████████| 3528196/3528196 [15:01<00:00, 3912.03 examples/s]


In [6]:
# Persist the tokenized examples so the tokenization pass does not have to be repeated.
tokenized_dataset.save_to_disk(dataset_path="tokenized_dataset")

Saving the dataset (14/14 shards): 100%|██████████| 3528196/3528196 [00:30<00:00, 115587.18 examples/s]


In [1]:
from datasets import load_from_disk

# Reload the tokenized dataset from the directory it was saved to.
tokenized_dataset = load_from_disk(dataset_path="tokenized_dataset")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fixed seed so the train/validation/test partition is the same on every run;
# without it each execution shuffles differently and the saved splits cannot
# be reproduced.
SPLIT_SEED = 42

# Split the dataset into training and temporary datasets (80% train, 20% temp)
split_data = tokenized_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Now split the temporary dataset into validation and test sets (50% val, 50% test)
temp_dataset = split_data["test"]
val_test_split = temp_dataset.train_test_split(test_size=0.5, seed=SPLIT_SEED)

# Assign the datasets
train_dataset = split_data["train"]
val_dataset = val_test_split["train"]
test_dataset = val_test_split["test"]

# The three splits together must account for every example exactly once.
assert len(train_dataset) + len(val_dataset) + len(test_dataset) == len(tokenized_dataset), \
    "train/validation/test sizes do not add up to the full dataset"

In [4]:
# Write each split to its own directory: train, then validation, then test.
split_targets = (
    (train_dataset, "train_dataset_hf"),
    (val_dataset, "validation_dataset_hf"),
    (test_dataset, "test_dataset_hf"),
)
for split, target_dir in split_targets:
    split.save_to_disk(target_dir)

Saving the dataset (12/12 shards): 100%|██████████| 2822556/2822556 [06:24<00:00, 7331.79 examples/s] 
Saving the dataset (2/2 shards): 100%|██████████| 352820/352820 [00:13<00:00, 26586.74 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 352820/352820 [00:10<00:00, 34624.37 examples/s]


In [1]:
from datasets import load_from_disk

# Reload the training and validation splits from the directories they were saved to.
train_dataset = load_from_disk(dataset_path="train_dataset_hf")
validation_dataset = load_from_disk(dataset_path="validation_dataset_hf")

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.18.5-py3-none-win_amd64.whl.metadata (9.7 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=2.0.0 (from wandb)
  Downloading sentry_sdk-2.17.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-win_amd64.whl.metadata (10 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading wandb-0.18.5-py3-none-win_amd64.whl (15.4 MB)
   ---------------------------------------- 0.0/15.4 MB ? eta -:--:--
   --- ------------------------------------ 1.3/15.4

In [11]:
!pip uninstall -y torch

Found existing installation: torch 2.5.1









Uninstalling torch-2.5.1:

You can safely remove it manually.







  Successfully uninstalled torch-2.5.1


In [8]:
!pip install torch==2.2.0 --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch==2.2.0
  Using cached https://download.pytorch.org/whl/cu121/torch-2.2.0%2Bcu121-cp310-cp310-win_amd64.whl (2454.8 MB)
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 2.2.2+cu121
    Uninstalling torch-2.2.2+cu121:
      Successfully uninstalled torch-2.2.2+cu121
Successfully installed torch-2.2.0+cu121


  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.20.1 requires torch==2.5.1, but you have torch 2.2.0+cu121 which is incompatible.
autoawq 0.2.6 requires torch==2.3.1, but you have torch 2.2.0+cu121 which is incompatible.
autoawq-kernels 0.0.7 requires torch==2.3.1, but you have torch 2.2.0+cu121 which is incompatible.
eole 0.0.2 requires torch<2.4,>=2.3, but you have torch 2.2.0+cu121 which is incompatible.


In [3]:
import torch
# Report the installed PyTorch build and the CUDA support it sees.
cuda_report = (
    ("PyTorch version:", torch.__version__),
    ("CUDA available:", torch.cuda.is_available()),
    ("CUDA version:", torch.version.cuda),  # CUDA version used by PyTorch
    ("Number of CUDA devices:", torch.cuda.device_count()),
)
for label, value in cuda_report:
    print(label, value)

PyTorch version: 2.2.0+cu121
CUDA available: True
CUDA version: 12.1
Number of CUDA devices: 1


In [None]:
import wandb
from transformers import MarianMTModel, TrainingArguments, Trainer

# Start from the Helsinki-NLP/opus-mt-en-nl MarianMT checkpoint.
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-nl")

# Settings for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="translation_model",
    logging_dir="translation_logs",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=True,
    # Data loading
    dataloader_num_workers=4,  # number of subprocesses used for data loading
    # Evaluation, logging and checkpointing cadence
    evaluation_strategy="epoch",
    logging_steps=500,
    save_steps=1000,
    save_total_limit=2,  # keep only the last 2 checkpoints
)

# Wire the model, settings and the train/validation splits together.
trainer = Trainer(
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    args=training_args,
    model=model,
)

# Run the fine-tuning loop.
trainer.train()



Epoch,Training Loss,Validation Loss


