<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/ml801/nllb-200-ten-thousand-data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Dataset Preprocessing

In [1]:
!pip install datasets
!apt install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 9 not upgraded.


In [2]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login (renders a widget in the notebook).
# NOTE(review): the push_to_hub calls further down presumably rely on the
# credentials stored by this step — confirm before running non-interactively.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import load_dataset

# Load the full parallel corpus; the cells below sample fixed-size subsets
# from its 'train' split.
dataset = load_dataset("HamdanXI/arb-eng-parallel")

In [26]:
# Shuffle once with a fixed seed and take nested prefixes of the result, so
# the 10k subset is contained in the 100k subset, which is contained in the
# 1M subset.
shuffled_train = dataset['train'].shuffle(seed=42)

one_mill_data = shuffled_train.select(range(1000000))
one_hundred_thousand_data = shuffled_train.select(range(100000))
ten_thousand_data = shuffled_train.select(range(10000))

In [None]:
# Upload each unsplit subset to the Hub under its own repository name.
one_mill_data.push_to_hub("arb-eng-parallel-1mill")
one_hundred_thousand_data.push_to_hub("arb-eng-parallel-100k")
ten_thousand_data.push_to_hub("arb-eng-parallel-10k")

In [None]:
from datasets import DatasetDict

def splitting_dataset(dataset, seed=42):
  """Split a dataset into 80% train, 10% validation and 10% test.

  Args:
    dataset: Object exposing ``train_test_split(test_size=..., seed=...)``
      whose result can be indexed with ``'train'`` and ``'test'``.
    seed: Seed forwarded to both splits so that re-running the notebook
      yields the same partition. Defaults to 42, the seed this notebook
      already uses for shuffling.

  Returns:
    A ``DatasetDict`` with ``'train'``, ``'validation'`` and ``'test'``
    entries.
  """
  # First cut: hold out 20% of the rows.
  train_test = dataset.train_test_split(test_size=0.2, seed=seed)
  # Second cut: halve the held-out rows into validation and test.
  test_val = train_test['test'].train_test_split(test_size=0.5, seed=seed)

  return DatasetDict({
    'train': train_test['train'],
    'validation': test_val['train'],
    'test': test_val['test']})

In [None]:
# Replace each flat subset with its train/validation/test DatasetDict.
# NOTE(review): the names are rebound in place, so re-running this cell feeds
# the already-split objects back into splitting_dataset — confirm that is
# acceptable, or bind the results to new names.
one_mill_data = splitting_dataset(one_mill_data)
one_hundred_thousand_data = splitting_dataset(one_hundred_thousand_data)
ten_thousand_data = splitting_dataset(ten_thousand_data)

In [None]:
# Upload the split versions to the Hub under separate "-splitted" repository names.
one_mill_data.push_to_hub("arb-eng-parallel-1mill-splitted")
one_hundred_thousand_data.push_to_hub("arb-eng-parallel-100k-splitted")
ten_thousand_data.push_to_hub("arb-eng-parallel-10k-splitted")

## Fine-tuning NLLB-200

In [3]:
!pip install transformers torch
!pip install transformers[torch]



In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# Tokenizer and model are loaded from the same checkpoint, so name it once.
checkpoint = "facebook/nllb-200-distilled-600M"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [5]:
from datasets import load_dataset

# Load the 10k subset with its train/validation/test splits from the Hub, so
# the fine-tuning section does not need the preprocessing cells to be re-run.
ten_thousand_data = load_dataset("HamdanXI/arb-eng-parallel-10k-splitted")

In [6]:
def max_length_calculator(examples, column_name):
    """Return the largest token count among the texts of one column.

    Every entry of ``examples[column_name]`` is encoded with the notebook-level
    ``tokenizer``; the length of the longest encoding is returned.
    """
    token_counts = [len(tokenizer.encode(text)) for text in examples[column_name]]
    return max(token_counts)

# Longest tokenized sentence per split and per language.
train_split = ten_thousand_data['train']
validation_split = ten_thousand_data['validation']

max_length_train_english = max_length_calculator(train_split, 'english')
max_length_train_arabic = max_length_calculator(train_split, 'arabic')

max_length_validate_english = max_length_calculator(validation_split, 'english')
max_length_validate_arabic = max_length_calculator(validation_split, 'arabic')

# Overall maximum across the four measurements.
highest_length = max(
    max_length_train_english,
    max_length_train_arabic,
    max_length_validate_english,
    max_length_validate_arabic,
)

In [7]:
def tokenize_function(example, src_lang="eng_Latn", tgt_lang="arb_Arab"):
    """Tokenize one English/Arabic pair into encoder inputs and decoder labels.

    The Arabic side is passed as ``text_target`` so that the tokenizer encodes
    it in target mode, i.e. tagged with ``tgt_lang`` rather than ``src_lang``.
    Tokenizing it like a source sentence would tag every Arabic label sequence
    with the source-language code.

    Args:
        example: Mapping with an ``"english"`` and an ``"arabic"`` entry.
        src_lang: Language code assigned to ``tokenizer.src_lang``.
        tgt_lang: Language code assigned to ``tokenizer.tgt_lang``.

    Returns:
        Dict with ``"input_ids"`` and ``"attention_mask"`` for the source text
        and ``"labels"`` holding the target token ids, exactly as returned by
        the tokenizer (no tensor conversion, no padding).
    """
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    # Both sides are truncated to the longest sequence measured earlier;
    # padding is left to the data collator.
    tokenized = tokenizer(
        example["english"],
        text_target=example["arabic"],
        truncation=True,
        max_length=highest_length,
    )

    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "labels": tokenized["labels"],
    }

# Tokenize the loaded dataset by calling tokenize_function through map
# (no batching option is passed here).
tokenized_datasets = ten_thousand_data.map(tokenize_function)
# Collator handed to the Trainer below to assemble batches from the
# tokenized examples.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [10]:
from transformers import TrainingArguments
from transformers import Trainer

# Single-epoch run that uploads to the Hub; set_save then sets the
# checkpoint-saving interval to 10000 steps.
training_args = TrainingArguments(
    output_dir="nllb-200-ten-thousand-data",
    num_train_epochs=1,
    push_to_hub=True,
).set_save(steps=10000)

# Wire model, arguments, data splits, collator and tokenizer together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

Step,Training Loss
