# **Aspect-based Term Extraction using RNN, Conv1D,  Transformer Model**

##**Dataset**

In [None]:
!pip install -q datasets==3.2.0

In [2]:
from datasets import load_dataset

ds = load_dataset("thainq107/abte-restaurants")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/454 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/183k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/61.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3602 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['Tokens', 'Tags', 'Polarities'],
        num_rows: 3602
    })
    test: Dataset({
        features: ['Tokens', 'Tags', 'Polarities'],
        num_rows: 1119
    })
})

In [4]:
ds['train'][0]

{'Tokens': ['But', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', '.'],
 'Tags': ['0', '0', '1', '0', '0', '0', '0', '0', '0'],
 'Polarities': ['-1', '-1', '0', '-1', '-1', '-1', '-1', '-1', '-1']}

##**Tokenizer**

In [6]:
corpus = [" ".join(i) for i in ds['train']['Tokens']]

In [7]:
corpus[0]

'But the staff was so horrible to us .'

In [8]:
from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors

# Initialize the tokenizer using WordLevel model (word-based tokenization)
tokenizer = Tokenizer(models.WordLevel(unk_token="<unk>"))

# Use Whitespace pre-tokenizer to split words based on spaces
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Train the tokenizer on the dataset
trainer = trainers.WordLevelTrainer(vocab_size=5000,special_tokens=["<pad>", "<unk>"])
tokenizer.train_from_iterator(corpus, trainer)

# Save the trained tokenizer as a JSON file
tokenizer.save("word_tokenizer.json")

In [9]:
encoded = tokenizer.encode("The restaurant serves delicious food")
token_ids = [tokenizer.token_to_id(token) for token in encoded.tokens]
encoded.tokens, token_ids

(['The', 'restaurant', 'serves', 'delicious', 'food'], [13, 65, 873, 60, 14])

In [10]:
len(tokenizer.get_vocab())

4286

In [11]:
MAX_LEN = max([len(tokens) for tokens in ds['train']['Tags']])
MAX_LEN

79

In [12]:
tokenizer.token_to_id("<pad>")

0

In [13]:
import torch

def pad_and_truncate(inputs, pad_id):
    if len(inputs) < MAX_LEN:
        padded_inputs = inputs + [pad_id] * (MAX_LEN - len(inputs))
    else:
        padded_inputs = inputs[:MAX_LEN]
    return padded_inputs

def tokenize_and_align_labels(examples):
    tokenized_inputs = []
    labels = []
    for tokens, tags in zip(examples['Tokens'], examples['Tags']):
        token_ids = [
            tokenizer.token_to_id(token.lower())
            if tokenizer.token_to_id(token.lower()) else 0
            for token in tokens
        ] # [13, 65, 873, 60, 14, 0]

        tags = [int(tag) for tag in tags]

        assert len(token_ids) == len(tags)

        token_ids = pad_and_truncate(token_ids, tokenizer.token_to_id("<pad>"))
        tags = pad_and_truncate(tags, -100)

        tokenized_inputs.append(token_ids)
        labels.append(tags)

    return {
            'input_ids': torch.tensor(tokenized_inputs),
            'labels': torch.tensor(labels)
        }

In [14]:
preprocessed_ds = ds.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3602 [00:00<?, ? examples/s]

Map:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [15]:
preprocessed_ds['train']

Dataset({
    features: ['Tokens', 'Tags', 'Polarities', 'input_ids', 'labels'],
    num_rows: 3602
})

## **Model**

### **LSTM**

In [17]:
import torch
import torch.nn as nn
from transformers import PreTrainedModel

class ABTELSTMClassifier(PreTrainedModel):
    def __init__(self, config, vocab_size, num_classes,
                 embedding_dim=256, hidden_dim=256, pad_idx=0):
        super().__init__(config)
        self.embedding = nn.Embedding(
            vocab_size, embedding_dim, padding_idx=pad_idx
        )

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        self.fc = nn.Linear(hidden_dim, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None):
        embedded = self.embedding(input_ids)
        outputs, _ = self.lstm(embedded)
        logits = self.fc(outputs)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits.permute(0, 2, 1), labels)

        return {"loss": loss, "logits": logits}

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [None]:
from transformers import PretrainedConfig
config = PretrainedConfig()

# LSTM Model
model = ABTELSTMClassifier(config, len(tokenizer.get_vocab()), num_classes=3)

In [None]:
model

ABTELSTMClassifier(
  (embedding): Embedding(4286, 256, padding_idx=0)
  (lstm): LSTM(256, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=3, bias=True)
  (loss_fn): CrossEntropyLoss()
)

### **Transformer**

In [None]:
import torch.nn as nn
from transformers import PreTrainedModel

class ABTETransformerClassifier(PreTrainedModel):
    def __init__(self, config, vocab_size, num_classes,
                 max_len=512, embedding_dim=256, num_heads=8,
                 num_layers=6, hidden_dim=1024, pad_idx=0):
        super().__init__(config)
        self.embedding = nn.Embedding(
            vocab_size, embedding_dim, padding_idx = pad_idx)
        self.position_embedding = nn.Embedding(max_len, embedding_dim)
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=embedding_dim, nhead=num_heads, dim_feedforward=hidden_dim)
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layers, num_layers=num_layers)
        self.fc = nn.Linear(embedding_dim, num_classes)

        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None):
        seq_length = input_ids.size(1)
        positions = torch.arange(
            seq_length, device=input_ids.device).unsqueeze(0)

        outputs = self.embedding(input_ids) + self.position_embedding(positions)

        outputs = outputs.permute(1, 0, 2)
        outputs = self.transformer_encoder(outputs)

        outputs = outputs.permute(1, 0, 2) #=> BxSxE
        logits = self.fc(outputs)
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits.permute(0, 2, 1), labels)
        return {"loss": loss, "logits": logits}

In [None]:
from transformers import PretrainedConfig
config = PretrainedConfig()

# Transformer Model
model = ABTETransformerClassifier(config, len(tokenizer.get_vocab()), num_classes=3)

In [None]:
model

ABTETransformerClassifier(
  (embedding): Embedding(4286, 256, padding_idx=0)
  (position_embedding): Embedding(512, 256)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=1024, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc): Linear(in_features=256, out_features=3, bias=True)
)

### **Conv1D**

In [None]:
import torch
import torch.nn as nn
from transformers import PreTrainedModel

class ABTEConv1DClassifier(PreTrainedModel):
    def __init__(self, config, vocab_size, num_classes,
                 embedding_dim=256, num_filters=256, kernel_size=3, pad_idx=0):
        super().__init__(config)
        self.embedding = nn.Embedding(
            vocab_size, embedding_dim, padding_idx=pad_idx)

        self.conv = nn.Conv1d(
            in_channels=embedding_dim, out_channels=num_filters,
            kernel_size=kernel_size, padding=1)

        self.fc = nn.Linear(num_filters, num_classes)

        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None):
        embedded = self.embedding(input_ids) # BxSxE
        embedded = embedded.permute(0, 2, 1) # BxExS

        # Áp dụng Conv1D
        features = torch.relu(self.conv(embedded))

        features = features.permute(0, 2, 1)
        logits = self.fc(features)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits.permute(0, 2, 1), labels)

        return {"loss": loss, "logits": logits}

In [None]:
from transformers import PretrainedConfig
config = PretrainedConfig()

# Conv1D Model
model = ABTEConv1DClassifier(config, len(tokenizer.get_vocab()), num_classes=3)

In [None]:
model

ABTEConv1DClassifier(
  (embedding): Embedding(4286, 256, padding_idx=0)
  (conv): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (fc): Linear(in_features=256, out_features=3, bias=True)
)

In [None]:
input_ids = torch.tensor([preprocessed_ds['train'][0]['input_ids']])
labels = torch.tensor([preprocessed_ds['train'][0]['labels']])

In [None]:
input_ids

tensor([[ 24,   2,  67,  11,  44, 509,  10,  74,   3,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0]])

In [None]:
labels

tensor([[   0,    0,    1,    0,    0,    0,    0,    0,    0, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100]])

In [None]:
predictions = model(input_ids)

In [None]:
predictions['logits'].shape

torch.Size([1, 79, 3])

In [None]:
predictions = model(input_ids, labels)

In [None]:
predictions['loss']

tensor(1.0296, grad_fn=<NllLoss2DBackward0>)

## **Evaluation**

In [None]:
!pip install -q seqeval==1.2.2

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [None]:
id2label = {
    0: "O",
    1: "B-Term",
    2: "I-Term"
}
label2id = {
    "O": 0,
    "B-Term": 1,
    "I-Term": 2
}

In [None]:
import numpy as np
from seqeval.metrics import f1_score

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = f1_score(true_predictions, true_labels)
    return {"F1-score": results}

## **Trainer**

In [None]:
# Disable wandb
import os
os.environ['WANDB_DISABLED'] = 'true'

# # Use wandb
# import wandb
# wandb.init(
#     project="aspect-based-term-extraction",
#     name="con1d" # "transformer-encoder", "lstm", "conv1d"
# )

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="abte-restaurants-conv1d", # "transformer-encoder", "lstm", "conv1d"
    logging_dir="logs",
    learning_rate=2e-5,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    num_train_epochs=100,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="F1-score",
    # report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["test"],
    compute_metrics=compute_metrics,
)

In [None]:
trainer.push_to_hub(token="")

### **LSTM**

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,F1-score
1,1.0852,1.069513,0.097305
2,1.058,1.04464,0.101456
3,1.0328,1.020347,0.085228
4,1.0092,0.996222,0.077253
5,0.9844,0.971953,0.074405
6,0.9591,0.947189,0.067629
7,0.9324,0.921632,0.064299
8,0.9053,0.895053,0.063199
9,0.8774,0.867224,0.056789
10,0.8465,0.838331,0.059572


TrainOutput(global_step=1500, training_loss=0.5357293043136596, metrics={'train_runtime': 123.0388, 'train_samples_per_second': 2927.532, 'train_steps_per_second': 12.191, 'total_flos': 89995508223600.0, 'train_loss': 0.5357293043136596, 'epoch': 100.0})

### **Transformer**

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1-score
1,0.7234,0.649193,0.0
2,0.5493,0.58964,0.0
3,0.5152,0.566954,0.065474
4,0.4837,0.532524,0.185592
5,0.4652,0.50792,0.296152
6,0.439,0.487792,0.375375
7,0.4179,0.475419,0.396943
8,0.3974,0.468168,0.409014
9,0.3914,0.449159,0.444253
10,0.3781,0.459076,0.435412


TrainOutput(global_step=1500, training_loss=0.21848222875595094, metrics={'train_runtime': 527.2961, 'train_samples_per_second': 683.108, 'train_steps_per_second': 2.845, 'total_flos': 809168730418800.0, 'train_loss': 0.21848222875595094, 'epoch': 100.0})

### **Conv1D**

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,F1-score
1,0.9904,0.959926,0.115755
2,0.9273,0.902819,0.159195
3,0.8696,0.851609,0.21878
4,0.8201,0.805694,0.27037
5,0.7762,0.764947,0.30723
6,0.7348,0.728439,0.331497
7,0.6987,0.695728,0.343547
8,0.6645,0.666544,0.346776
9,0.637,0.640447,0.351184
10,0.6091,0.616734,0.364348


TrainOutput(global_step=1500, training_loss=0.4019995946884155, metrics={'train_runtime': 124.4357, 'train_samples_per_second': 2894.668, 'train_steps_per_second': 12.054, 'total_flos': 33743172198000.0, 'train_loss': 0.4019995946884155, 'epoch': 100.0})