In [2]:
from os import linesep, read
from datasets import Dataset, load_metric
import datasets
from datasets.dataset_dict import DatasetDict
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np
import torch


# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so that
# moving the model to `device` does not fail on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(device)
# print(torch.cuda.get_device_name(device))


def create_dataset(englishScript, germanScript):
    """Build a translation dict from two line-aligned UTF-8 text files.

    Line ``i`` of ``englishScript`` is paired with line ``i`` of
    ``germanScript``; surrounding whitespace is stripped from every line.

    Args:
        englishScript: Path to the file holding the English sentences.
        germanScript: Path to the file holding the German sentences.

    Returns:
        ``{"translation": [{"en": ..., "de": ...}, ...]}`` with one entry per
        line pair, in file order.

    Raises:
        ValueError: If the two files do not contain the same number of lines.
            A length mismatch means the corpus is no longer aligned, so pairing
            by index would produce wrong sentence pairs.
    """
    def _read_stripped_lines(path):
        # Read every line of `path`, dropping leading/trailing whitespace.
        with open(path, mode='r', encoding='utf-8') as file_in:
            return [line.strip() for line in file_in]

    lines_en = _read_stripped_lines(englishScript)
    lines_de = _read_stripped_lines(germanScript)

    if len(lines_en) != len(lines_de):
        raise ValueError(
            f"Parallel files are not aligned: {englishScript!r} has "
            f"{len(lines_en)} lines but {germanScript!r} has {len(lines_de)}."
        )

    return {
        "translation": [
            {"en": en, "de": de} for en, de in zip(lines_en, lines_de)
        ]
    }


# Load the three splits. Each call takes the English file first and the German
# file second.
translation_dict_train = create_dataset(
    "./prep/train.de-en.en", "./prep/train.de-en.de")
translation_dict_test = create_dataset(
    "./prep/test.de-en.en", "./prep/test.de-en.de")
# The English side must come from the ".en" file; passing the ".de" file for
# both arguments would turn every validation pair into German -> German.
translation_dict_valid = create_dataset(
    "./prep/valid.de-en.en", "./prep/valid.de-en.de")


# books = load_dataset("opus_books", "en-fr")
# dataset = load_dataset("text", data_files={
#                        "train": "./prep/train.de-en.en", "validation": "./prep/train.de-en.de"})

# Turn each raw translation dict into a `Dataset`, keeping one module-level
# name per split.
dataset_train, dataset_test, dataset_valid = (
    Dataset.from_dict(raw_split)
    for raw_split in (
        translation_dict_train,
        translation_dict_test,
        translation_dict_valid,
    )
)

# Bundle the splits under the keys used further down ("train", "test",
# "validation") and show the resulting structure.
dataset_final = DatasetDict(
    train=dataset_train,
    test=dataset_test,
    validation=dataset_valid,
)

print(dataset_final)


# Checkpoint to fine-tune, its tokenizer, and the metric used for scoring.
model_checkpoint = "Helsinki-NLP/opus-mt-en-de"
metric = load_metric("sacrebleu")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


# Only checkpoints whose name contains "mbart" get explicit language codes set
# on the tokenizer.
if "mbart" in model_checkpoint:
    tokenizer.src_lang, tokenizer.tgt_lang = "en-XX", "de-DE"

# The listed T5 checkpoints get a task prefix prepended to every source
# sentence; any other checkpoint uses the sentence as-is.
prefix = (
    "translate English to German: "
    if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]
    else ""
)

# Keys into each {"en": ..., "de": ...} pair, and truncation limits (in tokens)
# passed to the tokenizer for the source and target side.
source_lang = "en"
target_lang = "de"
max_input_length = 128
max_target_length = 128


def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: A batch whose ``"translation"`` entry is a list of dicts
            keyed by ``source_lang`` and ``target_lang``.

    Returns:
        The tokenizer output for the (prefixed) source sentences, with the
        target token ids stored under ``"labels"``.
    """
    pairs = examples["translation"]

    # Source side: prepend the task prefix and truncate to the input limit.
    model_inputs = tokenizer(
        [prefix + pair[source_lang] for pair in pairs],
        max_length=max_input_length,
        truncation=True,
    )

    # Target side: tokenized inside the tokenizer's target context.
    # NOTE(review): `as_target_tokenizer` is reported as deprecated in newer
    # transformers releases — confirm against the installed version.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            [pair[target_lang] for pair in pairs],
            max_length=max_target_length,
            truncation=True,
        )

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs


# Tokenize every split in batches.
tokenized_dataset = dataset_final.map(preprocess_function, batched=True)

# Load the pretrained seq2seq model and place it on the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

# The collator receives both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


batch_size = 16
# Name the output directory after the last path component of the checkpoint.
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    # Mixed precision needs a CUDA device, so only request it when one exists.
    # NOTE(review): the recorded run logs an identical loss (10.9699) at every
    # step; check whether fp16 is overflowing for this checkpoint.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=4,
)



def postprocess_text(preds, labels):
    """Strip whitespace from decoded texts and shape them for the metric.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: the stripped predictions as a flat list,
        and each stripped reference wrapped in its own one-element list.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for one evaluation pass.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. When
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        A dict with ``"bleu"`` (the metric's ``"score"``) and ``"gen_len"``
        (mean count of non-pad tokens per prediction), rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # -100 cannot be decoded, so swap it for the pad id before decoding.
    # Labels always needed this; predictions are sanitised the same way because
    # they may also come back padded with -100 (the upstream translation
    # example applies the identical replacement).
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip whitespace and wrap each reference in a list, as the metric expects.
    decoded_preds, decoded_labels = postprocess_text(
        decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds,
                          references=decoded_labels)

    # Generated length = number of non-pad tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]

    result = {"bleu": bleu["score"], "gen_len": np.mean(prediction_lens)}
    return {k: round(v, 4) for k, v in result.items()}


# Wire the model, arguments, data and metric callback into the trainer.
# Training uses the "train" split; evaluation uses the "validation" split.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Start fine-tuning.
trainer.train()

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 153348
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 6750
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 6970
    })
})




Map:   0%|          | 0/153348 [00:00<?, ? examples/s]



Map:   0%|          | 0/6750 [00:00<?, ? examples/s]

Map:   0%|          | 0/6970 [00:00<?, ? examples/s]

  0%|          | 0/9585 [00:00<?, ?it/s]

{'loss': 10.9699, 'learning_rate': 1.8956703182055298e-05, 'epoch': 0.05}
{'loss': 10.9699, 'learning_rate': 1.791340636411059e-05, 'epoch': 0.1}
{'loss': 10.9699, 'learning_rate': 1.6870109546165886e-05, 'epoch': 0.16}
{'loss': 10.9699, 'learning_rate': 1.582681272822118e-05, 'epoch': 0.21}
{'loss': 10.9699, 'learning_rate': 1.4783515910276475e-05, 'epoch': 0.26}
{'loss': 10.9699, 'learning_rate': 1.374021909233177e-05, 'epoch': 0.31}
{'loss': 10.9699, 'learning_rate': 1.2696922274387064e-05, 'epoch': 0.37}
{'loss': 10.9699, 'learning_rate': 1.1653625456442358e-05, 'epoch': 0.42}
{'loss': 10.9699, 'learning_rate': 1.0610328638497653e-05, 'epoch': 0.47}
{'loss': 10.9699, 'learning_rate': 9.567031820552947e-06, 'epoch': 0.52}
{'loss': 10.9699, 'learning_rate': 8.523735002608243e-06, 'epoch': 0.57}
{'loss': 10.9699, 'learning_rate': 7.480438184663538e-06, 'epoch': 0.63}
{'loss': 10.9699, 'learning_rate': 6.437141366718832e-06, 'epoch': 0.68}
{'loss': 10.9699, 'learning_rate': 5.393844548

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
