<a href="https://colab.research.google.com/github/GinuraAdikari/InsightHive/blob/Sentiment_Analysis/model_ABSC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Mount Google drive & Load ATE**

In [1]:
import os
import pandas as pd
import numpy as np
import string
import torch
import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForTokenClassification, BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# ========== 1. Load Preprocessed Dataset ==========

absa_file = "ABSA_dataset.csv"
absa_df = pd.read_csv(absa_file)
print(f"Loaded dataset with {len(absa_df)} rows.")

Loaded dataset with 1310 rows.


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# ========== 2. Load ATE Model ==========

ate_model_path = "/content/drive/MyDrive/ABSA/roberta_ate_model"
ate_tokenizer_path = "/content/drive/MyDrive/ABSA/roberta_tokenizer"

ate_model = AutoModelForTokenClassification.from_pretrained(ate_model_path)
ate_tokenizer = AutoTokenizer.from_pretrained(ate_tokenizer_path, add_prefix_space=True)
ate_model.eval()

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
            

## **Extract Aspect using ATE**

In [5]:
# ========== 3. Aspect Extraction ==========
def extract_aspects(text):
    tokens = ate_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = ate_model(**tokens).logits
    predictions = torch.argmax(outputs, dim=2).squeeze().tolist()
    tokens_list = ate_tokenizer.convert_ids_to_tokens(tokens["input_ids"].squeeze())

    aspect_terms = []
    current_aspect = []
    for token, label in zip(tokens_list, predictions):
        if token in string.punctuation:
            continue
        if label == 1:
            if current_aspect:
                aspect_terms.append(" ".join(current_aspect))
                current_aspect = []
            current_aspect.append(token.replace("Ġ", ""))
        elif label == 2 and current_aspect:
            current_aspect.append(token.replace("Ġ", ""))
        else:
            if current_aspect:
                aspect_terms.append(" ".join(current_aspect))
                current_aspect = []
    if current_aspect:
        aspect_terms.append(" ".join(current_aspect))
    return aspect_terms

In [6]:
# Apply ATE
absa_df["Aspects"] = absa_df["Cleaned_review"].apply(extract_aspects)

## **Create ABSC Training Data**

In [7]:
# ========== 4. Clean Aspects ==========

# Define stopwords
stopwords = set(stopwords.words('english'))

def clean_aspects(aspects):
    cleaned_aspects = []
    for aspect in aspects:
        # Remove special tokens
        if aspect in ["<s>", "</s>"]:
            continue
        # Remove punctuation and split into words
        words = [word.strip(string.punctuation) for word in aspect.split()]
        # Remove stopwords and empty strings
        words = [word for word in words if word.lower() not in stopwords and word]
        if words:
            cleaned_aspects.append(" ".join(words))
    return cleaned_aspects

absa_df["Cleaned_Aspects"] = absa_df["Aspects"].apply(clean_aspects)
print(absa_df[["Cleaned_review", "Cleaned_Aspects"]].head())

                                      Cleaned_review  \
0  one best game music soundtrack game didnt real...   
1  best purchase ever bought exersaucer little gu...   
2  book slow weak one beststhank god slow weak bo...   
3  mustread every southern lady failed love book ...   
4  horrible watch napoleon want funny movie suck ...   

                                     Cleaned_Aspects  
0  [one best game music soundtrack game, fact, sm...  
1  [best purchase, ex ers au cer little guy, musi...  
2         [book, usual frightening detailed book st]  
3  [every southern lady failed love book book che...  
4  [horrible watch nap oleon, funny movie suck mo...  


In [8]:
# ========== 5. Create ABSC Dataset ==========

sia = SentimentIntensityAnalyzer()

def get_aspect_context_sentiment(review, aspect, window=30):
    """
    Extracts sentiment for a given aspect using its surrounding context in the review.
    """
    idx = review.lower().find(aspect.lower())
    if idx == -1:
        return 2  # Neutral or unknown

    start = max(0, idx - window)
    end = min(len(review), idx + len(aspect) + window)
    context = review[start:end]

    score = sia.polarity_scores(context)["compound"]
    if score > 0.05:
        return 1  # Positive
    elif score < -0.05:
        return 0  # Negative
    else:
        return 2  # Neutral (optional to include)

In [9]:
# Create aspect-review-sentiment pairs
absc_data = []
for _, row in absa_df.iterrows():
    review = row["Cleaned_review"]
    for aspect in row["Cleaned_Aspects"]:
        label = get_aspect_context_sentiment(review, aspect)
        if label in [0, 1]:  # binary
            absc_data.append({
                "Review": review,
                "Aspect": aspect,
                "Sentiment": label
            })

absc_df = pd.DataFrame(absc_data)
absc_df["Sentiment"] = absc_df["Sentiment"].astype(int)

## **Tokenize & Build Dataset**

In [10]:
# ========== 6. Tokenization ==========

absc_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_absc(examples):
    return absc_tokenizer(
        examples["Review"],
        examples["Aspect"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

tokenized = tokenize_absc(absc_df.to_dict("list"))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [11]:
# === ABSC Pre-Training Validation ===

print("\n===== ABSC Dataset Validation =====")

# 1. Check for missing values
missing = absc_df.isnull().sum()
print("\nMissing values:\n", missing)

# 2. Class distribution
print("\nClass distribution:\n", absc_df["Sentiment"].value_counts())

# 3. Unique sentiment labels
print("\nUnique Sentiment Labels:", absc_df["Sentiment"].unique())

# 4. Sample review-aspect-sentiment
print("\nRandom Samples:")
print(absc_df[["Review", "Aspect", "Sentiment"]].sample(5))

# 5. Check if any aspect is not in review
print("\nChecking if aspects are found in reviews...")
aspect_not_found = absc_df[~absc_df.apply(lambda row: row["Aspect"].lower() in row["Review"].lower(), axis=1)]
print(f"Aspects not found in review: {len(aspect_not_found)}")

# 6. Tokenization stats
tokenized = absc_tokenizer(
    absc_df["Review"].tolist(),
    absc_df["Aspect"].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
    max_length=128
)
print("\nMax token length per sample:", tokenized['input_ids'].shape[1])

# 7. Dataset size
print(f"\nTotal training samples: {len(absc_df)}")

print("\n✅ Validation complete. Safe to begin training.")


===== ABSC Dataset Validation =====

Missing values:
 Review       0
Aspect       0
Sentiment    0
dtype: int64

Class distribution:
 Sentiment
1    4275
0    1597
Name: count, dtype: int64

Unique Sentiment Labels: [1 0]

Random Samples:
                                                 Review  \
575   nice feature poor construction many pump past ...   
1943  first time read confederacy ambivalent couldnt...   
5803  brought bundle better half cant find original ...   
2001  wonderful wonderful book wee hour finishing st...   
5385  case work well viewing video also hold well li...   

                                 Aspect  Sentiment  
575                      pump top twist          0  
1943                 even first reading          0  
5803                      original item          1  
2001  wonderful wonderful book wee hour          1  
5385                          something          1  

Checking if aspects are found in reviews...
Aspects not found in review: 0


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai


Max token length per sample: 128

Total training samples: 5872

✅ Validation complete. Safe to begin training.


## **Train & Save**

In [12]:
# ========== 7. Dataset Class ==========

class ABSADataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

In [13]:
labels = absc_df["Sentiment"].tolist()
encodings = tokenized  # keep this as is

# First, split indices
indices = list(range(len(labels)))
train_idx, temp_idx, y_train, y_temp = train_test_split(
    indices, labels, test_size=0.2, stratify=labels, random_state=42
)
val_idx, test_idx, y_val, y_test = train_test_split(
    temp_idx, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Use the custom ABSADataset to create datasets with selected indices
train_dataset = ABSADataset({k: [v[i] for i in train_idx] for k, v in encodings.items()},
                            [labels[i] for i in train_idx])
val_dataset = ABSADataset({k: [v[i] for i in val_idx] for k, v in encodings.items()},
                          [labels[i] for i in val_idx])
test_dataset = ABSADataset({k: [v[i] for i in test_idx] for k, v in encodings.items()},
                           [labels[i] for i in test_idx])

In [14]:
# ========== 8. Class Weights ==========
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(labels), y=labels)
weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# ========== 9. Model ==========
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.classifier = torch.nn.Linear(model.config.hidden_size, 2)
model.weighted_loss = torch.nn.CrossEntropyLoss(weight=weights_tensor)

# ========== 10. Metrics ==========
def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds),
        "recall": recall_score(labels, preds),
        "f1": f1_score(labels, preds),
        "roc_auc": roc_auc_score(labels, preds),
        "confusion_matrix": confusion_matrix(labels, preds).tolist()
    }

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# ========== 11. Training Arguments ==========
training_args = TrainingArguments(
    output_dir="./absc_results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=2,
    logging_dir="./logs",
    report_to="none",
    fp16=torch.cuda.is_available()
)

# ========== 12. Trainer ==========
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=absc_tokenizer,
    compute_metrics=compute_metrics
)

# ========== 13. Train ==========
trainer.train()

# ========== 14. Save ==========
model.save_pretrained("bert_absc_model")
absc_tokenizer.save_pretrained("bert_absc_tokenizer")

  trainer = Trainer(
  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc,Confusion Matrix
1,0.5881,0.585818,0.727428,0.727428,1.0,0.842209,0.5,"[[0, 160], [0, 427]]"
2,0.5925,0.586336,0.727428,0.727428,1.0,0.842209,0.5,"[[0, 160], [0, 427]]"
3,0.5947,0.58654,0.727428,0.727428,1.0,0.842209,0.5,"[[0, 160], [0, 427]]"


  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}


('bert_absc_tokenizer/tokenizer_config.json',
 'bert_absc_tokenizer/special_tokens_map.json',
 'bert_absc_tokenizer/vocab.txt',
 'bert_absc_tokenizer/added_tokens.json')

In [16]:
from google.colab import files

# Zip and download ATE model
!zip -r bert_absc_model.zip bert_absc_model
files.download("bert_absc_model.zip")

# Zip and download tokenizer
!zip -r bert_absc_tokenizer.zip bert_absc_tokenizer
files.download("bert_absc_tokenizer.zip")

  adding: bert_absc_model/ (stored 0%)
  adding: bert_absc_model/config.json (deflated 49%)
  adding: bert_absc_model/model.safetensors (deflated 7%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  adding: bert_absc_tokenizer/ (stored 0%)
  adding: bert_absc_tokenizer/special_tokens_map.json (deflated 42%)
  adding: bert_absc_tokenizer/vocab.txt (deflated 53%)
  adding: bert_absc_tokenizer/tokenizer_config.json (deflated 75%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>