In [1]:
!pip install datasets evaluate accelerate
!pip install sacrebleu jiwer

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.

In [2]:
import os

import torch
from datasets import load_dataset
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline
import evaluate
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
output_dir = '/content/saved_model_weights'
os.makedirs(output_dir, exist_ok=True)

In [5]:
MODEL_NAME = "vinai/bartpho-syllable"
MAX_LENGTH = 512

In [6]:
def preprocess_function(examples):
    # Tokenize the text and apply truncation
    return tokenizer(examples["error"],
                     text_target=examples["original"],
                     max_length=MAX_LENGTH,
                     truncation=True,
                     )

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"sacrebleu": result["score"]}

In [7]:
# Define paths to your CSV files
train_path = '/content/drive/MyDrive/Colab Notebooks/BARTpho/data_preparation/train.csv'
test_path = '/content/drive/MyDrive/Colab Notebooks/BARTpho/data_preparation/test.csv'

dataset = load_dataset("csv", data_files={"train": train_path, "test": test_path})

print(dataset['train'])

print(dataset['test'])

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['original', 'error'],
    num_rows: 16099
})
Dataset({
    features: ['original', 'error'],
    num_rows: 4025
})


In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

input = dataset["train"][-1]['error']
output = dataset["train"][-1]['original']

# Tokenizing Input Data
inputs = tokenizer(
    input,
    text_target=output,
    max_length=MAX_LENGTH,
    truncation=True
)

inputs, len(inputs['input_ids'])

config.json:   0%|          | 0.00/897 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

dict.txt:   0%|          | 0.00/360k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

({'input_ids': [0, 1172, 1886, 427, 434, 3270, 309, 155, 4, 10, 71, 478, 731, 7, 197, 924, 706, 197, 331, 63, 1037, 52, 146, 739, 753, 989, 124, 4, 90, 349, 269, 672, 262, 141, 4, 344, 15, 33200, 52, 3682, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [0, 1172, 349, 427, 326, 309, 155, 4, 10, 71, 681, 7, 197, 924, 706, 197, 331, 63, 1037, 52, 146, 1278, 124, 4, 90, 349, 269, 672, 262, 141, 4, 344, 15, 333, 52, 3682, 5, 2]},
 42)

In [9]:
# Apply tokenization in a batched manner for efficiency
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/16099 [00:00<?, ? examples/s]

Map:   0%|          | 0/4025 [00:00<?, ? examples/s]

In [10]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

metric = evaluate.load("sacrebleu")

pytorch_model.bin:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [12]:
args = Seq2SeqTrainingArguments(
    do_train=True,
    do_eval=True,
    output_dir="./checkpoints",
    save_steps=1000,
    save_total_limit=5,
    num_train_epochs=10,
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    evaluation_strategy="steps",
    eval_steps=1500,
    logging_dir="./logs",
    logging_steps=1500,
    predict_with_generate=True,
    save_strategy="no",
    resume_from_checkpoint=False
)

In [13]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss,Validation Loss,Sacrebleu
1500,1.0941,0.120982,34.141534


In [None]:
model.save_pretrained(output_dir)

In [None]:
corrector = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

MAX_LENGTH = 512

# Define the text samples
texts = [
    "Nỗi buồn là giớt sương đg rơi trên mắt e vo anh",
    "Hàn mi cong lun linh trng anh dương vừa tan",
    "Nhìn em như  dang khóc ò ",
    "Khi bắt đầuuu, cả hai ta đi khôi đau dơn lúc gặp nhau",
    "ơi những truyến đi, thế dang bộn bề và lo như gặp nhau",
]

# Batch prediction
predictions = corrector(texts, max_length=MAX_LENGTH)

# Print predictions
for text, pred in zip(texts, predictions):
    print("- " + pred['generated_text'])

In [None]:
!cp -r "/content/saved_model_weights"  "/content/drive/MyDrive/Colab Notebooks/BARTpho/saved_model_weights"