# **Aspect-based Sentiment Analysis using RNNs, Conv1D, Transformer Model**

## **Dataset**

In [None]:
!pip install -q datasets==3.2.0

In [2]:
from datasets import load_dataset

ds = load_dataset("thainq107/abte-restaurants")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/454 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/183k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/61.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3602 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['Tokens', 'Tags', 'Polarities'],
        num_rows: 3602
    })
    test: Dataset({
        features: ['Tokens', 'Tags', 'Polarities'],
        num_rows: 1119
    })
})

In [9]:
ds['train'][0]

{'Tokens': ['But', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', '.'],
 'Tags': ['0', '0', '1', '0', '0', '0', '0', '0', '0'],
 'Polarities': ['-1', '-1', '0', '-1', '-1', '-1', '-1', '-1', '-1']}

## **Tokenizer**

In [4]:
corpus = [" ".join(i) for i in ds['train']['Tokens']]

In [5]:
from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors

# Initialize the tokenizer using WordLevel model (word-based tokenization)
tokenizer = Tokenizer(models.WordLevel(unk_token="<unk>"))

# Use Whitespace pre-tokenizer to split words based on spaces
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Train the tokenizer on the dataset
trainer = trainers.WordLevelTrainer(vocab_size=5000,special_tokens=["<pad>", "<unk>"])
tokenizer.train_from_iterator(corpus, trainer)

# Save the trained tokenizer as a JSON file
tokenizer.save("word_tokenizer.json")

In [6]:
max_len_tokens = max([len(tokens) for tokens in ds['train']['Tokens']])
max_len_tags = max([len([token for token in tokens if token != '0']) for tokens in ds['train']['Tags']])
MAX_LEN = max_len_tokens + max_len_tags
MAX_LEN

100

In [7]:
import torch

def pad_and_truncate(inputs, pad_id):
    if len(inputs) < MAX_LEN:
        padded_inputs = inputs + [pad_id] * (MAX_LEN - len(inputs))
    else:
        padded_inputs = inputs[:MAX_LEN]
    return padded_inputs

def tokenize_and_align_labels(examples):
    tokenized_inputs = []
    labels = []
    for tokens, pols in zip(examples['Tokens'], examples['Polarities']):

        token_pols = []
        pols_label = 0
        for i in range(len(tokens)):
            if int(pols[i]) != -1:
                token_pols.append(tokens[i])
                pols_label = int(pols[i])

        input_tokens = tokens + token_pols # concat sentence + term
        token_ids = [
            tokenizer.token_to_id(token.lower())
            if tokenizer.token_to_id(token.lower()) else 0
            for token in input_tokens
        ]

        token_ids = pad_and_truncate(token_ids, tokenizer.token_to_id("<pad>"))

        tokenized_inputs.append(token_ids)
        labels.append(pols_label)


    return {
            'input_ids': torch.tensor(tokenized_inputs),
            'labels': torch.tensor(labels)
        }

In [8]:
preprocessed_ds = ds.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3602 [00:00<?, ? examples/s]

Map:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [10]:
preprocessed_ds['train'][0]

{'Tokens': ['But', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', '.'],
 'Tags': ['0', '0', '1', '0', '0', '0', '0', '0', '0'],
 'Polarities': ['-1', '-1', '0', '-1', '-1', '-1', '-1', '-1', '-1'],
 'input_ids': [24,
  2,
  67,
  11,
  44,
  509,
  10,
  74,
  3,
  67,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'labels': 0}

## **Model**

### **LSTM**

In [None]:
import torch
import torch.nn as nn
from transformers import PreTrainedModel

class ABSALSTMClassifier(PreTrainedModel):
    def __init__(self, config, vocab_size, num_classes,
                 embedding_dim=256, hidden_dim=256, pad_idx=0, dropout=0.3):
        super().__init__(config)
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        self.fc = nn.Linear(hidden_dim, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None):
        embedded = self.embedding(input_ids)
        outputs, (hidden, _) = self.lstm(embedded)

        hidden = hidden.squeeze(0)

        logits = self.fc(hidden)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)  # labels có shape (batch_size,)

        return {"loss": loss, "logits": logits}

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [None]:
from transformers import PretrainedConfig
config = PretrainedConfig()

# LSTM Model
model = ABSALSTMClassifier(config, len(tokenizer.get_vocab()), num_classes=3)

In [None]:
model

ABSALSTMClassifier(
  (embedding): Embedding(4286, 256, padding_idx=0)
  (lstm): LSTM(256, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=3, bias=True)
  (loss_fn): CrossEntropyLoss()
)

### **Transformer**

In [None]:
import torch
import torch.nn as nn
from transformers import PreTrainedModel

class ABSATransformerClassifier(PreTrainedModel):
    def __init__(self, config, vocab_size, num_classes,
                 max_len=512, embedding_dim=256, num_heads=8,
                 num_layers=6, hidden_dim=1024, pad_idx=0, dropout=0.3):
        super().__init__(config)

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.position_embedding = nn.Embedding(max_len, embedding_dim)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim, nhead=num_heads, dim_feedforward=hidden_dim, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc = nn.Linear(embedding_dim, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None):
        batch_size, seq_length = input_ids.shape

        positions = torch.arange(seq_length, device=input_ids.device).unsqueeze(0)
        embeddings = self.embedding(input_ids) + self.position_embedding(positions)

        encoded = self.transformer_encoder(embeddings)

        cls_representation = encoded[:, 0, :]

        logits = self.fc(cls_representation)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return {"loss": loss, "logits": logits}

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [None]:
from transformers import PretrainedConfig
config = PretrainedConfig()

# Transformer Model
model = ABSATransformerClassifier(config, len(tokenizer.get_vocab()), num_classes=3)

### **Conv1D**

In [None]:
import torch
import torch.nn as nn
from transformers import PreTrainedModel

class ABSAConv1DClassifier(PreTrainedModel):
    def __init__(self, config, vocab_size, num_classes,
                 embedding_dim=256, num_filters=256, kernel_size=3,
                 pad_idx=0, dropout=0.3):
        super().__init__(config)

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.conv = nn.Conv1d(
            in_channels=embedding_dim, out_channels=num_filters,
            kernel_size=kernel_size, padding=1
        )

        self.fc = nn.Linear(num_filters, num_classes)

        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None):
        embedded = self.embedding(input_ids)  # (B, S, E)
        embedded = embedded.permute(0, 2, 1)  # (B, E, S) -> Conv1D cần định dạng này

        features = torch.relu(self.conv(embedded))  # (B, num_filters, S)

        # 🚀 Max-Pooling theo chiều seq_len để lấy feature mạnh nhất
        pooled = torch.max(features, dim=2).values  # (B, num_filters)
        logits = self.fc(pooled)  # (B, num_classes)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)  # labels có shape (B,)

        return {"loss": loss, "logits": logits}

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [None]:
from transformers import PretrainedConfig
config = PretrainedConfig()

# Conv1D Model
model = ABSAConv1DClassifier(config, len(tokenizer.get_vocab()), num_classes=3)

In [None]:
input_ids = torch.tensor([preprocessed_ds['train'][0]['input_ids']])
labels = torch.tensor([preprocessed_ds['train'][0]['labels']])

In [None]:
predictions = model(input_ids)

In [None]:
predictions['logits'].shape

torch.Size([1, 3])

In [None]:
predictions = model(input_ids, labels)

In [None]:
predictions['loss']

tensor(2.2077, grad_fn=<NllLossBackward0>)

## **Evaluate**

In [None]:
!pip install -q evaluate==0.4.3

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(
        predictions=predictions, references=labels
    )

## **Trainer**

In [None]:
# Disable wandb
import os
os.environ['WANDB_DISABLED'] = 'true'

# # Use wandb
# import wandb
# wandb.init(
#     project="aspect-based-sentiment-analysis",
#     name="conv1d" # "transformer-encoder", "lstm", "conv1d"
# )

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="abte-restaurants-conv1d", # "transformer-encoder", "lstm", "conv1d"
    logging_dir="logs",
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=50,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["test"],
    compute_metrics=compute_metrics,
)

### **LSTM**

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,1.1025,1.095834,0.175156
2,1.0949,1.086635,0.175156
3,1.0874,1.077085,0.649687
4,1.0789,1.066388,0.649687
5,1.0687,1.053357,0.649687
6,1.0561,1.035136,0.649687
7,1.0366,1.00546,0.649687
8,0.9995,0.917426,0.649687
9,0.9472,0.888722,0.649687
10,0.9464,0.887091,0.649687


TrainOutput(global_step=1450, training_loss=0.8938708982796505, metrics={'train_runtime': 47.2642, 'train_samples_per_second': 3810.495, 'train_steps_per_second': 30.679, 'total_flos': 56959182420000.0, 'train_loss': 0.8938708982796505, 'epoch': 50.0})

### **Transformer**

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.964,0.895304,0.649687
2,0.9262,0.922936,0.631814
3,0.9204,0.90418,0.644325
4,0.9097,0.888342,0.648794
5,0.8903,0.89499,0.63807
6,0.8767,0.892461,0.625559
7,0.862,0.886962,0.632708
8,0.8625,0.897754,0.635389
9,0.8403,0.88642,0.636282
10,0.8388,0.874027,0.636282


TrainOutput(global_step=1450, training_loss=0.7322444152832032, metrics={'train_runtime': 305.1027, 'train_samples_per_second': 590.293, 'train_steps_per_second': 4.752, 'total_flos': 512132107860000.0, 'train_loss': 0.7322444152832032, 'epoch': 50.0})

### **Conv1D**

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.9696,0.907229,0.649687
2,0.9436,0.892204,0.649687
3,0.9227,0.884161,0.649687
4,0.908,0.875218,0.649687
5,0.8828,0.865475,0.649687
6,0.8692,0.856157,0.649687
7,0.8516,0.85046,0.649687
8,0.842,0.844148,0.649687
9,0.8292,0.838595,0.649687
10,0.8183,0.833856,0.650581


TrainOutput(global_step=1450, training_loss=0.7338019903774919, metrics={'train_runtime': 42.1314, 'train_samples_per_second': 4274.725, 'train_steps_per_second': 34.416, 'total_flos': 21356438100000.0, 'train_loss': 0.7338019903774919, 'epoch': 50.0})