<a href="https://colab.research.google.com/github/GinuraAdikari/InsightHive/blob/main/model_ATE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install seqeval



In [2]:
import pandas as pd
import spacy
import ast

# Load dataset
df = pd.read_csv("ABSA_dataset.csv")
df.head()

Unnamed: 0,No.,Review_Text,Sentiment_Label,review_length,tokens,Cleaned_review
0,1,one best game music soundtracks game didnt rea...,1,79,"['one', 'best', 'game', 'music', 'soundtrack',...",one best game music soundtrack game didnt real...
1,10001,best purchase ever bought exersaucer little gu...,1,39,"['best', 'purchase', 'ever', 'bought', 'exersa...",best purchase ever bought exersaucer little gu...
2,20001,book slow weak one beststhank god slow weak bo...,0,20,"['book', 'slow', 'weak', 'one', 'beststhank', ...",book slow weak one beststhank god slow weak bo...
3,30001,mustread every southern lady failed love book ...,1,49,"['mustread', 'every', 'southern', 'lady', 'fai...",mustread every southern lady failed love book ...
4,40001,horrible watch napoleon want funny movie sucks...,0,32,"['horrible', 'watch', 'napoleon', 'want', 'fun...",horrible watch napoleon want funny movie suck ...


In [3]:
df["tokens"] = df["tokens"].apply(ast.literal_eval)  # Convert stored lists back to Python lists

# Load SpaCy NLP model
nlp = spacy.load("en_core_web_sm")

def bio_tagging(tokens):
    text = " ".join(tokens)  # Convert token list to text
    doc = nlp(text)

    tags = ["O"] * len(tokens)  # Default all tokens as 'O'

    for chunk in doc.noun_chunks:  # Detect noun phrases
        chunk_tokens = chunk.text.split()
        start_idx = -1

        # Find start index of noun chunk in token list
        for i in range(len(tokens) - len(chunk_tokens) + 1):
            if tokens[i:i + len(chunk_tokens)] == chunk_tokens:
                start_idx = i
                break

        # Assign BIO tags
        if start_idx != -1:
            tags[start_idx] = "B-Aspect"
            for i in range(start_idx + 1, start_idx + len(chunk_tokens)):
                tags[i] = "I-Aspect"

    return tags

# Apply BIO tagging function
df["bio_tags"] = df["tokens"].apply(bio_tagging)

# Save dataset with BIO tags
df.to_csv("bio_tagged_dataset.csv", index=False)

print("✅ BIO-tagged dataset saved as 'bio_tagged_dataset.csv'!")

✅ BIO-tagged dataset saved as 'bio_tagged_dataset.csv'!


In [4]:
import pandas as pd

# Load preprocessed dataset
df = pd.read_csv("bio_tagged_dataset.csv")

# Convert token lists and BIO tag lists from string to actual lists
import ast
df["tokens"] = df["tokens"].apply(ast.literal_eval)
df["bio_tags"] = df["bio_tags"].apply(ast.literal_eval)

print("✅ Dataset loaded successfully!")
print(df.head())  # Display first few rows

✅ Dataset loaded successfully!
     No.                                        Review_Text  Sentiment_Label  \
0      1  one best game music soundtracks game didnt rea...                1   
1  10001  best purchase ever bought exersaucer little gu...                1   
2  20001  book slow weak one beststhank god slow weak bo...                0   
3  30001  mustread every southern lady failed love book ...                1   
4  40001  horrible watch napoleon want funny movie sucks...                0   

   review_length                                             tokens  \
0             79  [one, best, game, music, soundtrack, game, did...   
1             39  [best, purchase, ever, bought, exersaucer, lit...   
2             20  [book, slow, weak, one, beststhank, god, slow,...   
3             49  [mustread, every, southern, lady, failed, love...   
4             32  [horrible, watch, napoleon, want, funny, movie...   

                                      Cleaned_review  \
0  on

In [5]:
from transformers import AutoTokenizer
from tqdm import tqdm
import torch

tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)

# Label mapping for BIO tagging
label_map = {"O": 0, "B-Aspect": 1, "I-Aspect": 2}

def tokenize_and_align_labels(tokens, bio_tags):
    tokenized_input = tokenizer(tokens, truncation=True, is_split_into_words=True, padding="longest", add_special_tokens=True)
    word_ids = tokenized_input.word_ids()

    aligned_labels = []
    prev_word = None
    for word_id in word_ids:
        if word_id is None:
            aligned_labels.append(-100)  # Ignore special tokens
        elif word_id != prev_word:
            aligned_labels.append(label_map[bio_tags[word_id]])  # Assign correct label
        else:
            aligned_labels.append(-100)  # Ignore subword parts
        prev_word = word_id

    tokenized_input["labels"] = aligned_labels
    return tokenized_input

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
import torch

tokenized_inputs = []

for tokens, bio_tags in tqdm(zip(df["tokens"], df["bio_tags"]), total=len(df)):
    tokenized_input = tokenize_and_align_labels(tokens, bio_tags)
    tokenized_inputs.append(tokenized_input)

print(f"✅ Tokenization completed for {len(tokenized_inputs)} samples.")

# Find max length from all tokenized inputs
max_len = max(len(x["input_ids"]) for x in tokenized_inputs)

# Apply padding manually to ensure all tensors have the same size
def pad_tensor(tensor, max_length, pad_value=0):
    """Pads tensor to max_length with pad_value."""
    padding_size = max_length - len(tensor)
    return torch.cat([tensor, torch.full((padding_size,), pad_value, dtype=torch.long)])

# Convert tokenized inputs to tensors and pad
input_ids = torch.stack([pad_tensor(torch.tensor(x["input_ids"]), max_len) for x in tokenized_inputs])
attention_mask = torch.stack([pad_tensor(torch.tensor(x["attention_mask"]), max_len) for x in tokenized_inputs])
labels = torch.stack([pad_tensor(torch.tensor(x["labels"]), max_len, pad_value=-100) for x in tokenized_inputs])  # ✅ Use -100 to ignore padding in loss calculation

# Save dataset in PyTorch format
torch.save({
    "input_ids": input_ids,
    "attention_mask": attention_mask,
    "labels": labels
}, "bert_ner_dataset.pt")

print(f"✅ Tokenized dataset saved successfully! (Padded to max length {max_len})")

100%|██████████| 1310/1310 [00:02<00:00, 576.38it/s]


✅ Tokenization completed for 1310 samples.
✅ Tokenized dataset saved successfully! (Padded to max length 512)


In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
import torch
import numpy as np
from torch.utils.data import Dataset
from seqeval.metrics import precision_score, recall_score, f1_score

# Load dataset
dataset = torch.load("bert_ner_dataset.pt")

# Label setup
label_list = ["O", "B-Aspect", "I-Aspect"]
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

# Optional: sample for testing stability
# dataset = {k: v[:300] for k, v in dataset.items()}  # Uncomment for quick tests

In [13]:
# Dataset class
class BERTNERDataset(Dataset):
       def __init__(self, encodings):  # Changed _init_ to __init__
           self.encodings = encodings
       def __len__(self):
           return len(self.encodings["input_ids"])
       def __getitem__(self, idx):
           return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

In [14]:
# Train/validation split
train_size = int(0.8 * len(dataset["input_ids"]))
train_dataset = BERTNERDataset({k: v[:train_size] for k, v in dataset.items()})
val_dataset = BERTNERDataset({k: v[train_size:] for k, v in dataset.items()})

# Load tokenizer & model
tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)
model = AutoModelForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Evaluation metrics
def compute_metrics(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=2)
    true_preds = [
        [label_list[p] for (p, l) in zip(pred_row, label_row) if l != -100]
        for pred_row, label_row in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(pred_row, label_row) if l != -100]
        for pred_row, label_row in zip(predictions, labels)
    ]
    return {
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds),
    }

In [16]:
# Training configuration (Colab-optimized)
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir="./logs",
    fp16=torch.cuda.is_available(),  # Mixed precision if GPU available
    report_to="none"
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [17]:
# Train
trainer.train()

# Save model/tokenizer
model.save_pretrained("roberta_ate_model")
tokenizer.save_pretrained("roberta_tokenizer")

print("✅ Model training complete and saved!")

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.589008,0.434575,0.480718,0.456484


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.589008,0.434575,0.480718,0.456484
2,No log,0.5868,0.457659,0.508188,0.481602
3,No log,0.576258,0.466311,0.508188,0.48635


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


✅ Model training complete and saved!


In [18]:
from google.colab import files

# Zip and download ATE model
!zip -r roberta_ate_model.zip roberta_ate_model
files.download("roberta_ate_model.zip")

# Zip and download tokenizer
!zip -r roberta_tokenizer.zip roberta_tokenizer
files.download("roberta_tokenizer.zip")

  adding: roberta_ate_model/ (stored 0%)
  adding: roberta_ate_model/config.json (deflated 51%)
  adding: roberta_ate_model/model.safetensors (deflated 15%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  adding: roberta_tokenizer/ (stored 0%)
  adding: roberta_tokenizer/special_tokens_map.json (deflated 52%)
  adding: roberta_tokenizer/tokenizer.json (deflated 82%)
  adding: roberta_tokenizer/vocab.json (deflated 59%)
  adding: roberta_tokenizer/tokenizer_config.json (deflated 75%)
  adding: roberta_tokenizer/merges.txt (deflated 53%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>