In [1]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install rouge_score
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l- \ done
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=66e2e82abb2c127dfc675911e378ed228b2f8f0e10091e29a49d65203c525b21
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected pac

In [2]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
import torch
import pandas as pd
import numpy as np
import os

In [3]:
train = pd.read_csv(r"/kaggle/input/wikilingua-vn-dataset/dataset/train.csv", index_col=0)
test = pd.read_csv(r"/kaggle/input/wikilingua-vn-dataset/dataset/test.csv", index_col=0)

In [4]:
train.loc[train["document"].isnull(), "document"] = ""
train.drop(columns=["__index_level_0__"], inplace=True)
test.drop(columns=["__index_level_0__"], inplace=True)

In [5]:
import re
def remove_html_tags(text): 
    clean_text = re.sub(r'<.*?>', ' ', text)
    return clean_text

def remove_whitespace(text): 
    cleaned_text = ' '.join(text.split()) 
    return cleaned_text

def remove_special_tokens(text):
    text = re.sub(r'[@$%#&^*\\/_‚ò∞."]', ' ', text)
    return text

def preprocessing_flow(text):
    text = remove_whitespace(text)
    text = remove_html_tags(text)
    text = remove_special_tokens(text)
    return text



In [6]:
train["document"] = train["document"].map(preprocessing_flow)
train["summary"] = train["summary"].map(preprocessing_flow)
test["document"] = test["document"].map(preprocessing_flow)
test["summary"] = test["summary"].map(preprocessing_flow)

In [7]:
train.head(20)

Unnamed: 0,summary,document
0,M·ªü ·ª©ng d·ª•ng Instagram Nh·∫•n v√†o bi·ªÉu t∆∞·ª£ng k√≠...,"N·∫øu ƒë√£ ƒëƒÉng nh·∫≠p v√†o t√†i kho·∫£n Instagram , b·∫°n..."
1,L√†m v·ªá sinh nh√† c·ª≠a khi quay tr·ªü v·ªÅ M·ªü c·ª≠a s...,"X√°c b·ªç ch√©t , d∆∞ l∆∞·ª£ng ho√° ch·∫•t v√† b·ª•i l√† nh·ªØn..."
2,M·ªü Yahoo Mail Ch·ªçn email r√°c Nh·∫•n v√†o d·∫•u ...,Nh·∫•n v√†o bi·ªÉu t∆∞·ª£ng Yahoo Mail h√¨nh phong th∆∞ ...
3,B·∫Øt ƒë·∫ßu v·ªõi d·∫°ng t·ªïng qu√°t D√πng quy t·∫Øc lu·ªπ ...,"Vi·∫øt h√†m b·∫≠c hai c·ªßa b·∫°n ·ªü d·∫°ng t·ªïng qu√°t , f ..."
4,M·ªâm c∆∞·ªùi S·ªü h·ªØu l√†n da r√°m n·∫Øng kho·∫ª m·∫°nh ...,M·ªâm c∆∞·ªùi l√†m s√°ng b·ª´ng khu√¥n m·∫∑t v√† l√†m cho v·∫ª...
5,Bi·∫øt m√¨nh c·∫ßn g√¨ D√πng l√° t·ª´ c√¢y kh√°c Kh√¥ng...,"C∆°n nh·ª©c , n√≥ng , ƒëau , v√† ng·ª©a , kh√° d·ªØ d·ªôi ..."
6,Tr·ªôn n∆∞·ªõc c·ªët chanh v·ªõi s·ªØa chua,Tr·ªôn 1 th√¨a canh ( 15 ml ) n∆∞·ªõc c·ªët chanh v·ªõi ...
7,ƒê·ª´ng li√™n l·∫°c v·ªõi ng∆∞·ªùi y√™u c≈© Xo√° t√™n ng∆∞·ªùi...,"Khi ng∆∞·ªùi y√™u c≈© b·ªã √°m ·∫£nh , h·ªç c√≥ th·ªÉ s·∫Ω t√¨m ..."
8,Gi·ªõi h·∫°n kho·∫£ng th·ªùi gian c·ªë ƒë·ªãnh ƒë·ªÉ vi·∫øt Vi...,Vi·ªác ƒë∆∞a ra m·ªôt kho·∫£ng th·ªùi gian c·ª• th·ªÉ gi√∫p b...
9,Cho 1 g√≥i b·ªôt th·∫°ch Jellobah v·ªã r∆∞·ª£u v√†o 200ml...,Khu·∫•y ƒë·∫øn khi b·ªôt tan


In [8]:
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")  
model = AutoModelForSeq2SeqLM.from_pretrained("VietAI/vit5-base").to('cuda')

tokenizer_config.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/820k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.12k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/904M [00:00<?, ?B/s]

In [9]:


def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["inputs"], truncation=True, padding="max_length", max_length=256
    )
    
    labels = tokenizer(text_target=examples["labels"], padding="max_length", truncation=True, max_length=256)
    model_inputs['labels'] = labels['input_ids']
    model_inputs['input_ids'] = model_inputs['input_ids']
    return model_inputs

In [10]:


if 'document' in train.columns and 'summary' in train.columns:
    train['inputs'] = train['document'] + '</s>'  
    train['labels'] = train['summary']
else:
    raise ValueError("CSV file must contain 'document' and 'summary' columns.")


dict_obj = {'inputs': train['inputs'].tolist(), 'labels': train['labels'].tolist()}
dataset = Dataset.from_dict(dict_obj)


tokenized_datasets = dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=['inputs'], 
    num_proc=8
)


  self.pid = os.fork()


Map (num_proc=8):   0%|          | 0/17622 [00:00<?, ? examples/s]

In [11]:
folder_path = "/kaggle/working/model"

os.makedirs(folder_path, exist_ok=True)

In [12]:

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")


training_args = Seq2SeqTrainingArguments(output_dir = "/kaggle/working/model",
                                      do_train=True,
                                      do_eval=False,
                                      num_train_epochs=5,
                                      learning_rate=1e-5,
                                      warmup_ratio=0.05,
                                      weight_decay=0.01,
                                      per_device_train_batch_size=4,
                                      per_device_eval_batch_size=4,
                                      logging_dir='./log',
                                      group_by_length=True,
                                      save_strategy = "no",
                                      # gradient_accumulation_steps=4,
                                      fp16=True,
                                      # gradient_checkpointing=True,
                                      )




In [13]:
key = "fdef48fc279db0325237791cb3cbefa589ad0c92"
import wandb
wandb.login(key=key)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [14]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: Currently logged in as: [33mleductai2201[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241127_035357-wka0p5qm[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m/kaggle/working/model[0m
[34m[1mwandb[0m: ‚≠êÔ∏è View project at [34m[4mhttps://wandb.ai/leductai2201/huggingface[0m
[34m[1mwandb[0m: üöÄ View run at [34m[4mhttps://wandb.ai/leductai2201/huggingface/runs/wka0p5qm[0m


Step,Training Loss
500,10.2024
1000,0.6511
1500,0.5769
2000,0.5541
2500,0.5326
3000,0.5198
3500,0.5365
4000,0.5297
4500,0.5181
5000,0.4945


TrainOutput(global_step=22030, training_loss=0.6970292106521059, metrics={'train_runtime': 7770.4835, 'train_samples_per_second': 11.339, 'train_steps_per_second': 2.835, 'total_flos': 2.68276398686208e+16, 'train_loss': 0.6970292106521059, 'epoch': 5.0})

In [15]:
trainer.save_model()

In [16]:
from evaluate import load
metric = load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [17]:


if 'document' in test.columns and 'summary' in test.columns:
    test['inputs'] = test['document'] + '</s>'  
    test['labels'] = test['summary']
else:
    raise ValueError("CSV file must contain 'document' and 'summary' columns.")


dict_obj = {'inputs': test['inputs'].tolist(), 'labels': test['labels'].tolist()}
dataset = Dataset.from_dict(dict_obj)


test_tokenized_datasets = dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=['inputs'], 
    num_proc=10
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

  self.pid = os.fork()


Map (num_proc=10):   0%|          | 0/1959 [00:00<?, ? examples/s]

  self.pid = os.fork()


In [18]:
model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/working/model")
model.to('cuda')

T5ForConditionalGeneration(
  (shared): Embedding(36096, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(36096, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [19]:
import torch 
import numpy as np
metrics = load("rouge")

max_target_length = 256
dataloader = torch.utils.data.DataLoader(test_tokenized_datasets, collate_fn=data_collator, batch_size=32)

predictions = []
references = []
for i, batch in enumerate(tqdm(dataloader)):
  outputs = model.generate(
      input_ids=batch['input_ids'].to('cuda'),
      max_length=max_target_length,
      attention_mask=batch['attention_mask'].to('cuda'),
  )
  with tokenizer.as_target_tokenizer():
    outputs = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in outputs]

    labels = np.where(batch['labels'] != -100,  batch['labels'], tokenizer.pad_token_id)
    actuals = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in labels]
  predictions.extend(outputs)
  references.extend(actuals)
  metrics.add_batch(predictions=outputs, references=actuals)


metrics.compute()


  0%|          | 0/62 [00:00<?, ?it/s]



{'rouge1': 0.45986957932955463,
 'rouge2': 0.20362181169919547,
 'rougeL': 0.34070607000311726,
 'rougeLsum': 0.34055964043727177}