In [1]:
# Model: https://github.com/vietai/ViT5/blob/main/examples/finetune_huggingface_example.ipynb

import os
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments, T5ForConditionalGeneration, T5Tokenizer, TrainerCallback
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
from functools import partial




In [2]:
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
# Define the preprocess function that tokenizes the inputs
def preprocess_function(examples, tokenizer):
    # Tokenize the input text, setting a max_length for truncation
    model_inputs = tokenizer(examples['inputs'], truncation=True, padding=True, max_length=128)
    # Tokenize the labels (aspect terms) by converting the list of terms into a string
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['labels'], truncation=True, padding=True, max_length=128)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [4]:
# Create a partial function with the tokenizer included
preprocess_function_with_tokenizer = partial(preprocess_function, tokenizer=tokenizer)

In [5]:
import json

# Load the JSON file
file_path = 'label_subset.json'
with open(file_path, 'r', encoding='utf-8-sig') as file:
    data = json.load(file)

# Initialize the lists
input_lines = []
label_lines = []

# Iterate over the entries in the JSON data
for entry in data:
    # Append the 'Content' field to input_lines
    input_lines.append(entry['Content'])

    # Append the 'Aspects' field (concatenated terms) to label_lines
    aspects = [term for aspect in entry['Aspects'] for term in aspect['AspectTerms']]
    label_lines.append(' '.join(aspects))  # Join the list of terms into a single string

In [6]:
# Create a dictionary from the lists
dict_obj = {'inputs': input_lines, 'labels': label_lines}

# Convert the dictionary into a Hugging Face Dataset object
dataset = Dataset.from_dict(dict_obj)


In [7]:
# Apply the preprocessing function to the dataset
tokenized_datasets = dataset.map(
    preprocess_function_with_tokenizer, 
    batched=True, 
    remove_columns=['inputs'], 
    num_proc=8
)

Map (num_proc=8):   0%|          | 0/1100 [00:00<?, ? examples/s]

In [8]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")


In [9]:
training_args = Seq2SeqTrainingArguments(
    "./", 
    do_train=True,
    do_eval=False,
    num_train_epochs=30,
    learning_rate=1e-5,
    warmup_ratio=0.05,
    weight_decay=0.01,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    logging_dir='./',
    group_by_length=True,
    save_strategy="epoch",
    save_total_limit=3,
    fp16=True,
)

In [10]:
# Initialize the Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [11]:
class CustomSaveCallback(TrainerCallback):
    def on_save(self, args, state, control, **kwargs):
        model = kwargs['model']
        # Loop through all model parameters and make them contiguous before saving
        for param in model.parameters():
            if not param.is_contiguous():
                param.data = param.data.contiguous()

In [12]:
# Add this callback to your training setup
trainer.add_callback(CustomSaveCallback())

In [13]:
# Continue training with this callback handling the saving
trainer.train()

  0%|          | 0/33000 [00:00<?, ?it/s]

{'loss': 10.4177, 'grad_norm': 66.326904296875, 'learning_rate': 3.0303030303030305e-06, 'epoch': 0.45}
{'loss': 4.0082, 'grad_norm': 58.12025451660156, 'learning_rate': 6.060606060606061e-06, 'epoch': 0.91}
{'loss': 1.3558, 'grad_norm': 6.654158592224121, 'learning_rate': 9.090909090909091e-06, 'epoch': 1.36}
{'loss': 0.827, 'grad_norm': 13.19107723236084, 'learning_rate': 9.88835725677831e-06, 'epoch': 1.82}
{'loss': 0.6395, 'grad_norm': 59.489261627197266, 'learning_rate': 9.728867623604467e-06, 'epoch': 2.27}
{'loss': 0.4579, 'grad_norm': 2.9564690589904785, 'learning_rate': 9.569377990430623e-06, 'epoch': 2.73}
{'loss': 0.3753, 'grad_norm': 5.818318843841553, 'learning_rate': 9.40988835725678e-06, 'epoch': 3.18}
{'loss': 0.3046, 'grad_norm': 1.9483758211135864, 'learning_rate': 9.250398724082935e-06, 'epoch': 3.64}
{'loss': 0.305, 'grad_norm': 1.5463536977767944, 'learning_rate': 9.090909090909091e-06, 'epoch': 4.09}
{'loss': 0.2623, 'grad_norm': 4.055758476257324, 'learning_rate'

TrainOutput(global_step=33000, training_loss=0.4328357733524207, metrics={'train_runtime': 15978.0891, 'train_samples_per_second': 2.065, 'train_steps_per_second': 2.065, 'total_flos': 962835321323520.0, 'train_loss': 0.4328357733524207, 'epoch': 30.0})

In [14]:
# Set model to evaluation mode
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [15]:
# Preprocess the input for the model
def preprocess_input(input_texts, tokenizer, max_length=128):
    inputs = tokenizer(input_texts, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    return inputs

In [16]:
# Generate predictions (inference)
def generate_predictions(input_texts, model, tokenizer, max_length=128, num_beams=4):
    # Preprocess input
    inputs = preprocess_input(input_texts, tokenizer, max_length)

    # Generate outputs (using beam search to improve prediction quality)
    with torch.no_grad():  # Ensure no gradients are computed during inference
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=max_length,
            num_beams=num_beams,  # Beam search for better quality
            early_stopping=True
        )

    # Decode the predictions into readable text
    decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return decoded_outputs


In [17]:
# Run inference on the example input
# Example input for inference
test_input = ["sản phẩm tốt, giao hàng nhanh"]
predicted_aspects = generate_predictions(test_input, model, tokenizer)
print("Predicted Aspects:", predicted_aspects)

Predicted Aspects: ['giao hàng']


In [18]:
# Specify the directory where you want to save the model
save_directory = "sample_data/Sep23"

# Save the model
model.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)


('sample_data/Sep23\\tokenizer_config.json',
 'sample_data/Sep23\\special_tokens_map.json',
 'sample_data/Sep23\\spiece.model',
 'sample_data/Sep23\\added_tokens.json')