<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/ml801/preprocess-and-finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Dataset Preprocessing

In [None]:
!pip install datasets
!apt install git-lfs

In [None]:
from huggingface_hub import notebook_login

# Log this session in to the Hugging Face Hub; the push_to_hub calls further
# down upload datasets (presumably needs a token with write access - confirm).
notebook_login()

In [None]:
from datasets import load_dataset

# Load the Arabic-English parallel corpus; later cells sample from its 'train'
# split and read its 'arabic' and 'english' columns.
dataset = load_dataset("HamdanXI/arb-eng-parallel")

In [26]:
# Shuffle once with a fixed seed and slice the same permutation three times.
# Every subset is a prefix of the same ordering, so the 10k rows are contained
# in the 100k rows, which are contained in the 1M rows.
shuffled_train = dataset['train'].shuffle(seed=42)

one_mill_data = shuffled_train.select(range(1_000_000))
one_hundred_thousand_data = shuffled_train.select(range(100_000))
ten_thousand_data = shuffled_train.select(range(10_000))

In [None]:
# Upload each unsplit subset to its own Hub repository.
for subset, repo_name in (
    (one_mill_data, "arb-eng-parallel-1mill"),
    (one_hundred_thousand_data, "arb-eng-parallel-100k"),
    (ten_thousand_data, "arb-eng-parallel-10k"),
):
    subset.push_to_hub(repo_name)

In [None]:
from datasets import DatasetDict

def splitting_dataset(dataset, seed=42):
  """Split a dataset into 80% train, 10% validation and 10% test.

  Args:
    dataset: Object exposing ``train_test_split`` (a ``datasets.Dataset``).
    seed: Seed forwarded to both ``train_test_split`` calls so the same
      input always yields the same three splits. Pass ``None`` to get a
      different random split on every call.

  Returns:
    A ``DatasetDict`` with the keys ``'train'``, ``'validation'`` and
    ``'test'``.
  """
  # First cut: hold out 20% of the rows.
  train_test = dataset.train_test_split(test_size=0.2, seed=seed)
  # Second cut: halve the held-out 20% into validation and test (10% each).
  holdout = train_test['test'].train_test_split(test_size=0.5, seed=seed)

  return DatasetDict({
    'train': train_test['train'],
    'validation': holdout['train'],
    'test': holdout['test'],
  })

In [None]:
# Replace each flat subset with its train/validation/test DatasetDict.
# Note: the names are rebound, so re-running only this cell would pass the
# already-split objects back into splitting_dataset; re-run the subset
# selection cell first.
one_mill_data, one_hundred_thousand_data, ten_thousand_data = (
    splitting_dataset(subset)
    for subset in (one_mill_data, one_hundred_thousand_data, ten_thousand_data)
)

In [None]:
# Upload each split subset to its own "-splitted" Hub repository.
for split_subset, repo_name in (
    (one_mill_data, "arb-eng-parallel-1mill-splitted"),
    (one_hundred_thousand_data, "arb-eng-parallel-100k-splitted"),
    (ten_thousand_data, "arb-eng-parallel-10k-splitted"),
):
    split_subset.push_to_hub(repo_name)

## Fine-tuning NLLB-200

In [None]:
!pip install transformers torch

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# One checkpoint name shared by tokenizer and model so the two cannot drift apart.
MODEL_CHECKPOINT = "facebook/nllb-200-distilled-600M"

# NLLB is multilingual and marks each sequence with a language code, so declare
# the translation direction up front: English source, Modern Standard Arabic target.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_CHECKPOINT,
    src_lang="eng_Latn",
    tgt_lang="arb_Arab",
)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [32]:
def max_length_calculator(examples, column_name):
    """Return the largest token count among the texts in one column.

    Args:
        examples: Mapping-like object (e.g. a dataset split) where
            ``examples[column_name]`` yields the texts to measure.
        column_name: Name of the text column.

    Returns:
        The maximum ``len(tokenizer.encode(text))`` over all usable texts, or
        0 when the column holds no usable text.
    """
    # Skip None and empty strings: preprocess_function drops such rows anyway,
    # and passing None to the tokenizer would raise. `default=0` keeps an
    # empty column from raising ValueError in max().
    return max(
        (len(tokenizer.encode(text)) for text in examples[column_name] if text),
        default=0,
    )

# Longest tokenized sequence per (split, column), measured on the train and
# validation splits only.
split_column_max_lengths = {
    (split_name, column_name): max_length_calculator(ten_thousand_data[split_name], column_name)
    for split_name in ('train', 'validation')
    for column_name in ('english', 'arabic')
}

max_length_train_english = split_column_max_lengths[('train', 'english')]
max_length_train_arabic = split_column_max_lengths[('train', 'arabic')]

max_length_validate_english = split_column_max_lengths[('validation', 'english')]
max_length_validate_arabic = split_column_max_lengths[('validation', 'arabic')]

# Overall cap used as the tokenizer's max_length during preprocessing.
max_length = max(split_column_max_lengths.values())

In [38]:
# Inspect the split names, columns and row counts before tokenizing.
ten_thousand_data

DatasetDict({
    train: Dataset({
        features: ['arabic', 'english'],
        num_rows: 8000
    })
    validation: Dataset({
        features: ['arabic', 'english'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['arabic', 'english'],
        num_rows: 1000
    })
})

In [37]:
# NLLB is multilingual: the tokenizer marks each sequence with a language code.
# Set both sides explicitly so the Arabic labels are tagged as Arabic rather
# than with whatever target language the tokenizer was loaded with.
tokenizer.src_lang = "eng_Latn"
tokenizer.tgt_lang = "arb_Arab"


def preprocess_function(examples):
    """Tokenize one batch of English -> Arabic pairs for seq2seq training.

    Args:
        examples: Batch dict supplied by ``map(..., batched=True)`` holding
            parallel ``'english'`` and ``'arabic'`` lists.

    Returns:
        A mapping with ``input_ids``, ``attention_mask`` and ``labels``, one
        entry per kept pair. A pair is dropped as a whole when either side is
        None or empty, so sources and targets stay aligned. Because the row
        count can shrink, the caller must pass ``remove_columns`` to ``map``.
    """
    # Filter the two columns together; filtering them independently would
    # shift every later sentence onto the wrong translation.
    pairs = [
        (source, target)
        for source, target in zip(examples['english'], examples['arabic'])
        if source is not None and source != "" and target is not None and target != ""
    ]
    if not pairs:  # Nothing usable in this batch
        return {'input_ids': [], 'attention_mask': [], 'labels': []}

    inputs = [source for source, _ in pairs]
    targets = [target for _, target in pairs]

    # `text_target` tokenizes the targets in target-language mode and stores
    # their ids under "labels"; it replaces the deprecated
    # `as_target_tokenizer()` context manager.
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=max_length,
        truncation=True,
        padding=True,
    )

    # Padding positions in the labels must not contribute to the loss;
    # -100 is the index the loss function ignores.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_token_id else token_id for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs


# The raw text columns are removed so that a batch may return fewer rows than
# it received (pairs with a missing side are dropped).
tokenized_datasets = ten_thousand_data.map(
    preprocess_function,
    batched=True,
    remove_columns=ten_thousand_data['train'].column_names,
)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

TypeError: ignored