In [7]:
!pip install -U transformers datasets evaluate rouge-score accelerate



In [8]:
import numpy as np
import torch
from datasets import load_dataset
import evaluate


from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

In [9]:
# Checkpoint to fine-tune; change if needed.
MODEL_NAME = "facebook/nllb-200-distilled-600M"
# Source and target share the same language code (Khmer in, Khmer out).
SRC_LANG = TGT_LANG = "khm_Khmr"


In [10]:
# Token budgets applied when truncating the input text and the summary.
MAX_SOURCE_LENGTH, MAX_TARGET_LENGTH = 256, 128

In [11]:
# Read the local CSV through the generic "csv" loader.
dataset = load_dataset(path="csv", data_files="Summarization.csv")

Generating train split: 0 examples [00:00, ? examples/s]

In [12]:
# Hold out 10% of the rows for evaluation (fixed seed for a repeatable split).
# The split is stored under its own name instead of rebinding `dataset`, so
# re-running this cell always splits the full loaded data rather than an
# already-split subset.
splits = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = splits["train"]
eval_dataset = splits["test"]

In [13]:
# Tokenizer configured with the source/target language codes, followed by the
# pretrained seq2seq model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, src_lang=SRC_LANG, tgt_lang=TGT_LANG)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [14]:
# Memory-saving setup for training: switch off the generation cache in the
# model config and turn on gradient checkpointing.
model.config.use_cache = False
model.gradient_checkpointing_enable()

In [15]:
def preprocess(batch):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        batch: Mapping with a "text" entry (source documents) and a
            "summary" entry (reference summaries).

    Returns:
        The tokenizer output for the source texts, truncated to
        MAX_SOURCE_LENGTH, with an added "labels" entry holding the token ids
        of the summaries truncated to MAX_TARGET_LENGTH.
    """
    model_inputs = tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_SOURCE_LENGTH,
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager: the summaries are encoded as targets. A separate call is used
    # because the target length budget differs from the source one.
    labels = tokenizer(
        text_target=batch["summary"],
        truncation=True,
        max_length=MAX_TARGET_LENGTH,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Tokenize both splits (train first, then eval), dropping the original columns
# so only the tokenizer outputs remain.
train_dataset, eval_dataset = [
    split.map(preprocess, batched=True, remove_columns=split.column_names)
    for split in (train_dataset, eval_dataset)
]

Map:   0%|          | 0/2718 [00:00<?, ? examples/s]



Map:   0%|          | 0/303 [00:00<?, ? examples/s]

In [16]:
# Batch collator built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [17]:
# ROUGE metric used by compute_metrics.
rouge = evaluate.load(path="rouge")

Downloading builder script: 0.00B [00:00, ?B/s]

In [18]:
def compute_metrics(eval_preds):
    """Compute ROUGE scores for generated summaries.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            also arrive as a tuple whose first element holds the generated ids.

    Returns:
        Dict with the "rouge1", "rouge2", "rougeL" and "rougeLsum" scores.
    """
    preds, labels = eval_preds
    # Keep only the generated ids if extra outputs were returned alongside them.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a vocabulary id; decoding it fails,
    # so replace it with the pad token in both predictions and labels.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = [
        pred.strip()
        for pred in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    decoded_labels = [
        label.strip()
        for label in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    # The metric's default tokenizer keeps only lowercase Latin letters and
    # digits, which discards Khmer script entirely. Score on the model
    # tokenizer's subword pieces instead so the Khmer content is compared.
    scores = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        tokenizer=tokenizer.tokenize,
    )

    return {
        name: scores[name]
        for name in ("rouge1", "rouge2", "rougeL", "rougeLsum")
    }

In [19]:
# Training configuration: batch size 1 with 8 accumulation steps, 3 epochs,
# loss logged every 50 steps, checkpoints every 1000 steps (keeping the last 2).
training_args = Seq2SeqTrainingArguments(
    output_dir="./nllb_khmer_sum",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=50,
    # NOTE(review): with do_eval=False and no evaluation strategy set,
    # eval_steps looks inactive; evaluation is run manually after training.
    do_eval=False,
    eval_steps=500,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; fall back to full precision otherwise
    # instead of failing at construction time.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

In [20]:
# Assemble the trainer from the model, arguments, tokenized splits, collator
# and metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # `processing_class` is the current name for the deprecated `tokenizer`
    # argument of the trainer.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [None]:
# Run fine-tuning with the arguments configured in `training_args`.
trainer.train()


Step,Training Loss
50,2.1364
100,1.9267
150,1.8772


In [None]:
# Evaluate on the held-out split and print the returned metrics.
results = trainer.evaluate()
print("ROUGE scores:", results, sep="\n")

In [None]:
# Write the fine-tuned model and its tokenizer to the same directory.
final_dir = "./final_model"
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)

In [None]:
# Sample Khmer article used as the input for a quick generation check.
text = "លោកគ្រូពេទ្យ Angel Martin បានលើកឡើងថា Kylian Mbappe អាចនឹងត្រូវខកខានការប្រកួតដែលនៅសេសសល់ក្នុងព្រឹត្តិការណ៍ EURO 2024 បន្ទាប់ពីរូបគេជួបនូវរបួសបាក់ក្តោងច្រមុះ។ ជាការពិតណាស់ ក្នុងការប្រកួតកាលពីយប់មិញនេះ នៅក្នុងនាទីទី ៨៧ Mbappe បានប៉ះទង្គិចគ្នាជាមួយ ខ្សែការពារក្រុមអូទ្រីស កីឡាករ Kevin Danso រហូតដល់បាក់ក្តោងចម្រុះ ហូរឈាមពេញច្រមុះ និង ត្រូវបានប្តូរចេញនៅក្នុងនាទី ៩០ ខណៈ ក្រោយចប់ការប្រកួត គ្រូបង្គោលក្រុមបារាំង លោក Didier Deschamps បានបញ្ជាក់ច្បាស់ៗថា Mbappe ពិតជាបាក់ក្តោងច្រមុះពិតមែន។ ដកស្រង់សម្តីរបស់ លោកគ្រូពេទ្យ Angel Martin ទាក់ទងនឹងរឿងខាងលើ លោកបាននិយាយថា៖ ប្រសិនបើ Mbappe របួសបាក់ក្តោងច្រមុះ នោះ គាត់នឹងត្រូវឆ្លងកាត់ការវះកាត់ ព្រមទាំងត្រូវសម្រាក ២-៣ សប្តាហ៍ មិនត្រូវឲ្យច្រមុះមានចលនាឡើយ៕"
# Tokenize with the same truncation budget that was applied to training inputs.
inputs = tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    max_length=MAX_SOURCE_LENGTH,
).to(model.device)

# Make sure dropout is off: the model may still be in training mode here.
model.eval()

outputs = model.generate(
    **inputs,
    max_length=200,
    num_beams=4,
    # NLLB chooses its output language through the first generated token,
    # so force the target language code.
    forced_bos_token_id=tokenizer.convert_tokens_to_ids(TGT_LANG),
    # The cache was disabled in the model config for training; request it
    # explicitly so decoding does not recompute past steps.
    use_cache=True,
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))