In [1]:
%%capture
!pip install sentencepiece
!pip install transformers datasets
!pip install evaluate
!pip install rouge_score

In [2]:
%cd /content/drive/MyDrive/Summarization

/content/drive/MyDrive/Summarization


# Prepare data

In [None]:
import gdown
# Download vietnamese syllable file
train_url_path = 'https://drive.google.com/file/d/1sOqpFAbnAPa4G_zzCpIcWKU-rFlpANpi/view?usp=sharing'
train_filename = 'train_articles.json'
gdown.download(train_url_path, train_filename, quiet=False,fuzzy=True)

# Download Vietnamese corpus file 
val_url_path = 'https://drive.google.com/file/d/1yJJ5F_MndddW236Yl50cOZ2dJAmoTX_e/view?usp=sharing'
val_filename = 'val_articles.json'
gdown.download(val_url_path, val_filename, quiet=False,fuzzy=True)

Downloading...
From: https://drive.google.com/uc?id=1sOqpFAbnAPa4G_zzCpIcWKU-rFlpANpi
To: /content/drive/MyDrive/Summarization/train_articles.json
100%|██████████| 51.4M/51.4M [00:01<00:00, 33.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1yJJ5F_MndddW236Yl50cOZ2dJAmoTX_e
To: /content/drive/MyDrive/Summarization/val_articles.json
100%|██████████| 10.2M/10.2M [00:00<00:00, 39.7MB/s]


'val_articles.json'

In [3]:
## Standard libraries
import os
import numpy as np 
import random
import math
import json
from tqdm.auto import tqdm

## PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
from torch.nn import CrossEntropyLoss, NLLLoss
from torch.utils.data import DataLoader

## Transformers & Dataset
import datasets
from datasets import load_dataset
from datasets import Dataset

import transformers
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from transformers import AdamW
from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
from transformers import get_scheduler

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

Device: cuda:0


In [None]:
MAX_ENCODER_LENGTH = 320
MAX_DECODER_LENGTH = 80

BATCH_SIZE = 4
LR=5e-4
EPOCHS=15

In [None]:
# Read data
train_data = json.load(open("train_articles.json", encoding="utf-8"))
train_data = Dataset.from_dict(train_data)

val_data = json.load(open("val_articles.json", encoding="utf-8"))
val_data = Dataset.from_dict(val_data)

## Tokenizer

In [None]:
train_data

Dataset({
    features: ['titles', 'domains', 'messages', 'snippets'],
    num_rows: 10000
})

In [None]:
# tokenizer = MT5Tokenizer.from_pretrained('google/mt5-small')
tokenizer = MT5Tokenizer.from_pretrained('/content/drive/MyDrive/Summarization/MT5-small_tokenizer')


def preprocess_data(batch):
  # tokenize the inputs and labels
  inputs = tokenizer(batch["messages"], padding="max_length", truncation=True, max_length=MAX_ENCODER_LENGTH, add_special_tokens=True)
  outputs = tokenizer(batch["snippets"], padding="max_length", truncation=True, max_length=MAX_ENCODER_LENGTH, add_special_tokens=True)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask
  batch["decoder_input_ids"] = outputs.input_ids
  batch["decoder_attention_mask"] = outputs.attention_mask
  batch["labels"] = outputs.input_ids.copy()

  batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

  return batch

In [None]:
train_data = train_data.map(
    preprocess_data,
    batched = True,
    batch_size = 256,
    remove_columns = ['titles', 'domains']
)

train_data.set_format(
    type='torch', columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

val_data = val_data.map(
    preprocess_data, 
    batched=True, 
    batch_size=256, 
    remove_columns=["titles", "domains"]
)
val_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Convert dataset to dataloader
train_dataloader = DataLoader(train_data, batch_size = BATCH_SIZE, shuffle = True)
val_dataloader = DataLoader(val_data, batch_size = BATCH_SIZE, shuffle = False)

In [None]:
model = MT5ForConditionalGeneration.from_pretrained('/content/drive/MyDrive/Summarization/MT5-small_model')
optimizer = AdamW(model.parameters(), lr = LR)



# Training

In [None]:
num_training_steps = EPOCHS * len(train_dataloader)

lr_scheduler = get_scheduler(
    name='linear', optimizer = optimizer, num_warmup_steps = 0, num_training_steps = num_training_steps
)

In [None]:
progress_bar = tqdm(range(num_training_steps))
model = model.to(device)
model.train()

# Contruct loss
losses = []
best_loss = float('inf')  # Track the best loss
best_epoch = -1  # Track the epoch with the best loss

for epoch in range(EPOCHS):
  # Total loss
  total_loss = 0

  for i, batch in enumerate(train_dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    decoder_attention_mask=batch['decoder_attention_mask'],
                    labels=batch['labels'],
                    return_dict=True)

    loss, logits = outputs['loss'], outputs['logits']

    optimizer.zero_grad()
    outputs['loss'].backward()
    optimizer.step()
    lr_scheduler.step()

    total_loss += loss.item()
    losses.append(loss.item())

    if i % 500 == 0:
      print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch+1, EPOCHS, i, len(train_dataloader), loss.item()))

    progress_bar.update(1)

  avg_loss = total_loss / len(train_dataloader)

  if avg_loss < best_loss:
    best_loss = avg_loss
    best_epoch = epoch
    # Save the model weights at the current best epoch
    model.save_pretrained('MT5-small_model')
    tokenizer.save_pretrained('MT5-small_tokenizer')
  print("*" * 100)
  print(f'Epoch {epoch + 1}/{EPOCHS}, Loss: {avg_loss}')
  print(f"Best loss: {best_loss} at epoch {best_epoch + 1}")
  print("*" * 100)

  0%|          | 0/37500 [00:00<?, ?it/s]

Epoch [1/15], Step [0/2500], Loss: 1.2611
Epoch [1/15], Step [500/2500], Loss: 1.3320
Epoch [1/15], Step [1000/2500], Loss: 0.9698
Epoch [1/15], Step [1500/2500], Loss: 1.1290
Epoch [1/15], Step [2000/2500], Loss: 1.1341
****************************************************************************************************
Epoch 1/15, Loss: 1.178354786169529
Best loss: 1.178354786169529 at epoch 1
****************************************************************************************************
Epoch [2/15], Step [0/2500], Loss: 1.1560
Epoch [2/15], Step [500/2500], Loss: 0.8146
Epoch [2/15], Step [1000/2500], Loss: 1.1496
Epoch [2/15], Step [1500/2500], Loss: 1.1335
Epoch [2/15], Step [2000/2500], Loss: 1.3850
****************************************************************************************************
Epoch 2/15, Loss: 1.1499880502462387
Best loss: 1.1499880502462387 at epoch 2
****************************************************************************************************


In [None]:
import matplotlib.pyplot as plt
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Losses')
plt.show()


In [None]:
!pwd

/content


In [8]:
model = MT5ForConditionalGeneration.from_pretrained('/content/drive/MyDrive/Summarization/MT5-small_model')
tokenizer = MT5Tokenizer.from_pretrained('/content/drive/MyDrive/Summarization/MT5-small_tokenizer')

In [5]:
model = model.to(device)

In [6]:
# Generate summary of an article 
def summarize(text, min_length = 20):
  inputs = tokenizer(text, padding = 'max_length', truncation = True, max_length=320, return_tensors="pt")
  input_ids = inputs.input_ids.to(device)
  attention_mask = inputs.attention_mask.to(device)

  outputs = model.generate(input_ids, attention_mask=attention_mask, min_length=min_length)

  return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [13]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!pip install huggingface_hub
!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [17]:
tokenizer.push_to_hub("Johnx69/mt5_small_summarization", create_pr=1)
model.push_to_hub("Johnx69/mt5_small_summarization", create_pr=1)

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Johnx69/mt5_small_summarization/commit/c1b944a7a95af03bff041c8bc5169a3d53bfee0a', commit_message='Upload MT5ForConditionalGeneration', commit_description='', oid='c1b944a7a95af03bff041c8bc5169a3d53bfee0a', pr_url='https://huggingface.co/Johnx69/mt5_small_summarization/discussions/2', pr_revision='refs/pr/2', pr_num=2)