In [1]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install rouge_score
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l- \ done
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=cf417d68f30717e8ed0c995c36a0f638ee69e1d448466999d5d97d6e1de723e0
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [2]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
import torch
import pandas as pd
import numpy as np
import os

In [3]:
train = pd.read_csv(r"/kaggle/input/wikilingua-vn-dataset/dataset/train.csv", index_col=0)
test = pd.read_csv(r"/kaggle/input/wikilingua-vn-dataset/dataset/test.csv", index_col=0)

In [4]:
train.loc[train["document"].isnull(), "document"] = ""
train.drop(columns=["__index_level_0__"], inplace=True)
test.drop(columns=["__index_level_0__"], inplace=True)

In [5]:
import re
def remove_html_tags(text): 
    clean_text = re.sub(r'<.*?>', ' ', text)
    return clean_text

def remove_whitespace(text): 
    cleaned_text = ' '.join(text.split()) 
    return cleaned_text

def remove_special_tokens(text):
    text = re.sub(r'[@$%#&^*\\/_☰."]', ' ', text)
    return text

def preprocessing_flow(text):
    text = remove_whitespace(text)
    text = remove_html_tags(text)
    text = remove_special_tokens(text)
    return text



In [6]:
train["document"] = train["document"].map(preprocessing_flow)
train["summary"] = train["summary"].map(preprocessing_flow)
test["document"] = test["document"].map(preprocessing_flow)
test["summary"] = test["summary"].map(preprocessing_flow)

In [7]:
train.head(20)

Unnamed: 0,summary,document
0,Mở ứng dụng Instagram Nhấn vào biểu tượng kí...,"Nếu đã đăng nhập vào tài khoản Instagram , bạn..."
1,Làm vệ sinh nhà cửa khi quay trở về Mở cửa s...,"Xác bọ chét , dư lượng hoá chất và bụi là nhữn..."
2,Mở Yahoo Mail Chọn email rác Nhấn vào dấu ...,Nhấn vào biểu tượng Yahoo Mail hình phong thư ...
3,Bắt đầu với dạng tổng quát Dùng quy tắc luỹ ...,"Viết hàm bậc hai của bạn ở dạng tổng quát , f ..."
4,Mỉm cười Sở hữu làn da rám nắng khoẻ mạnh ...,Mỉm cười làm sáng bừng khuôn mặt và làm cho vẻ...
5,Biết mình cần gì Dùng lá từ cây khác Không...,"Cơn nhức , nóng , đau , và ngứa , khá dữ dội ..."
6,Trộn nước cốt chanh với sữa chua,Trộn 1 thìa canh ( 15 ml ) nước cốt chanh với ...
7,Đừng liên lạc với người yêu cũ Xoá tên người...,"Khi người yêu cũ bị ám ảnh , họ có thể sẽ tìm ..."
8,Giới hạn khoảng thời gian cố định để viết Vi...,Việc đưa ra một khoảng thời gian cụ thể giúp b...
9,Cho 1 gói bột thạch Jellobah vị rượu vào 200ml...,Khuấy đến khi bột tan


In [8]:
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")  
model = AutoModelForSeq2SeqLM.from_pretrained("VietAI/vit5-base").to('cuda')

tokenizer_config.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/820k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.12k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/904M [00:00<?, ?B/s]

In [9]:
input_max_length = 1024

In [10]:


def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["inputs"], truncation=True, padding="max_length", max_length=input_max_length
    )
    
    labels = tokenizer(text_target=examples["labels"], padding="max_length", truncation=True, max_length=256)
    model_inputs['labels'] = labels['input_ids']
    model_inputs['input_ids'] = model_inputs['input_ids']
    return model_inputs

In [11]:


if 'document' in train.columns and 'summary' in train.columns:
    train['inputs'] = train['document'] + '</s>'  
    train['labels'] = train['summary']
else:
    raise ValueError("CSV file must contain 'document' and 'summary' columns.")


dict_obj = {'inputs': train['inputs'].tolist(), 'labels': train['labels'].tolist()}
dataset = Dataset.from_dict(dict_obj)


tokenized_datasets = dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=['inputs'], 
    num_proc=8
)


  self.pid = os.fork()


Map (num_proc=8):   0%|          | 0/17622 [00:00<?, ? examples/s]

  self.pid = os.fork()


In [12]:
folder_path = "/kaggle/working/model"

os.makedirs(folder_path, exist_ok=True)

In [13]:

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")


training_args = Seq2SeqTrainingArguments(output_dir = "/kaggle/working/model",
                                      do_train=True,
                                      do_eval=False,
                                      num_train_epochs=5,
                                      learning_rate=1e-5,
                                      warmup_ratio=0.05,
                                      weight_decay=0.01,
                                      per_device_train_batch_size=4,
                                      per_device_eval_batch_size=4,
                                      logging_dir='./log',
                                      group_by_length=True,
                                      save_strategy = "no",
                                      # gradient_accumulation_steps=4,
                                      fp16=True,
                                      # gradient_checkpointing=True,
                                      )




In [14]:
key = "fdef48fc279db0325237791cb3cbefa589ad0c92"
import wandb
wandb.login(key=key)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [15]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: Currently logged in as: [33mleductai2201[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241127_073753-45tonasw[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m/kaggle/working/model[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/leductai2201/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/leductai2201/huggingface/runs/45tonasw[0m


Step,Training Loss
500,9.9133
1000,0.602
1500,0.5294
2000,0.5062
2500,0.4877
3000,0.4758
3500,0.4872
4000,0.4807
4500,0.4719
5000,0.4454


TrainOutput(global_step=22030, training_loss=0.6416790678671042, metrics={'train_runtime': 20006.2303, 'train_samples_per_second': 4.404, 'train_steps_per_second': 1.101, 'total_flos': 1.073105594744832e+17, 'train_loss': 0.6416790678671042, 'epoch': 5.0})

In [16]:
trainer.save_model()

In [17]:
from evaluate import load
metric = load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [18]:


if 'document' in test.columns and 'summary' in test.columns:
    test['inputs'] = test['document'] + '</s>'  
    test['labels'] = test['summary']
else:
    raise ValueError("CSV file must contain 'document' and 'summary' columns.")


dict_obj = {'inputs': test['inputs'].tolist(), 'labels': test['labels'].tolist()}
dataset = Dataset.from_dict(dict_obj)


test_tokenized_datasets = dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=['inputs'], 
    num_proc=10
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

  self.pid = os.fork()


Map (num_proc=10):   0%|          | 0/1959 [00:00<?, ? examples/s]

  self.pid = os.fork()


In [19]:
model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/working/model")
model.to('cuda')

T5ForConditionalGeneration(
  (shared): Embedding(36096, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(36096, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [20]:
import torch 
import numpy as np
metrics = load("rouge")

max_target_length = 256
dataloader = torch.utils.data.DataLoader(test_tokenized_datasets, collate_fn=data_collator, batch_size=32)

predictions = []
references = []
for i, batch in enumerate(tqdm(dataloader)):
  outputs = model.generate(
      input_ids=batch['input_ids'].to('cuda'),
      max_length=max_target_length,
      attention_mask=batch['attention_mask'].to('cuda'),
  )
  with tokenizer.as_target_tokenizer():
    outputs = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in outputs]

    labels = np.where(batch['labels'] != -100,  batch['labels'], tokenizer.pad_token_id)
    actuals = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in labels]
  predictions.extend(outputs)
  references.extend(actuals)
  metrics.add_batch(predictions=outputs, references=actuals)


metrics.compute()


  0%|          | 0/62 [00:00<?, ?it/s]



{'rouge1': 0.5158671064837986,
 'rouge2': 0.2585592340250508,
 'rougeL': 0.38263151705599086,
 'rougeLsum': 0.38259222008947086}