<a href="https://colab.research.google.com/github/Ghostalp07/SPROJ/blob/aspect-based-sentiment-analyzer/hybrid_aspect_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F


In [None]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the CSV inside the Kaggle dataset archive that should be loaded.
file_path = "yelp.csv"

# Load the latest version of the dataset as a pandas DataFrame.
# `dataset_load` is the current entry point; the older `load_dataset` alias
# emits a DeprecationWarning on every call.
df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "omkarsabnis/yelp-reviews-dataset",
  file_path,
  # Provide any additional arguments like
  # sql_query or pandas_kwargs. See the
  # documentation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

print("First 5 records:", df.head())

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/omkarsabnis/yelp-reviews-dataset?dataset_version_number=1&file_name=yelp.csv...


100%|██████████| 3.49M/3.49M [00:00<00:00, 111MB/s]

Extracting zip of yelp.csv...





First 5 records:               business_id        date               review_id  stars  \
0  9yKzy9PApeiPPOUJEtnvkg  2011-01-26  fWKvX83p0-ka4JS3dc6E5A      5   
1  ZRJwVLyzEJq1VAihDhYiow  2011-07-27  IjZ33sJrzXqU-0X6U8NwyA      5   
2  6oRAC4uyJCsJl1X0WZpVSA  2012-06-14  IESLBzqUCLdSzSqm0eCSxQ      4   
3  _1QQZuf4zZOyFCvXc0o6Vg  2010-05-27  G-WvGaISbqqaMHlNnByodA      5   
4  6ozycU1RpktNG2-1BroVtw  2012-01-05  1uJFq2r5QfJG_6ExMRCaGw      5   

                                                text    type  \
0  My wife took me here on my birthday for breakf...  review   
1  I have no idea why some people give bad review...  review   
2  love the gyro plate. Rice is so good and I als...  review   
3  Rosie, Dakota, and I LOVE Chaparral Dog Park!!...  review   
4  General Manager Scott Petello is a good egg!!!...  review   

                  user_id  cool  useful  funny  
0  rLtl8ZkDX5vH5nAx9C3q5Q     2       5      0  
1  0a2KyEL0d3Yb1V6aivbIuQ     0       0      0  
2  0hT2KtfLiobPvh6

In [None]:
# Maximum number of reviews kept per star rating, and the seed that makes the
# random draw repeatable across kernel restarts.
SAMPLES_PER_CLASS = 100
SAMPLE_SEED = 42

# Drop incomplete rows *before* sampling: dropping afterwards can remove rows
# from an already-sampled class and silently unbalance the result.
reviews_complete = df.dropna()

# Sample each star group separately and stitch the pieces back together.
# Iterating over the groups (instead of groupby.apply) keeps the 'stars'
# column in the result and avoids the pandas deprecation warning about
# apply operating on the grouping columns.
balanced_df = pd.concat(
    [
        group.sample(n=min(len(group), SAMPLES_PER_CLASS), random_state=SAMPLE_SEED)
        for _, group in reviews_complete.groupby('stars')
    ]
)
balanced_df['stars'] = balanced_df['stars'].astype(int)
print(balanced_df['stars'].value_counts())

# Aspect labels; their count sizes the model's aspect head and their order
# maps aspect logits back to names at prediction time.
aspect_keywords = ['food', 'service', 'price', 'ambience', 'location']

stars
1    100
2    100
3    100
4    100
5    100
Name: count, dtype: int64


  balanced_df = df.groupby('stars', group_keys=False).apply(lambda x: x.sample(min(len(x), 100)))


In [None]:
from transformers import AutoTokenizer
import torch

In [None]:
# Module-level tokenizer for the "bert-base-uncased" checkpoint.
# YelpDataset.__getitem__ looks this name up each time an item is fetched.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

class YelpDataset(Dataset):
    """Map-style dataset that tokenizes one review per item on access.

    Args:
        texts: Sequence of review strings.
        labels: Sequence of integer class labels, aligned with ``texts``.
        tokenizer: Optional callable with the Hugging Face tokenizer call
            signature. When omitted, the module-level ``tokenizer`` is looked
            up at item-access time, which is the original behaviour.
        max_length: Length every encoded review is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'label' tensors."""
        # Fall back to the notebook-global tokenizer only when none was injected.
        tokenize = self.tokenizer if self.tokenizer is not None else tokenizer
        encoding = tokenize(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # The tokenizer returns a leading batch dimension of 1; drop it so
            # the DataLoader can stack items into (batch, max_length).
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Star rating -> sentiment class: 1-2 stars = negative, 3 = neutral, 4-5 = positive
def star_to_sentiment(star):
    """Collapse a star rating into a sentiment class id.

    Returns 0 (negative) for ratings of 2 or lower, 1 (neutral) for exactly 3,
    and 2 (positive) for everything else.
    """
    if star == 3:
        return 1
    return 0 if star <= 2 else 2

# Derive the 3-way sentiment target from the star rating.
balanced_df['sentiment'] = balanced_df['stars'].apply(star_to_sentiment)

# Stratify on the target: negative and positive each pool two star ratings
# while neutral covers only one, so an unstratified 80/20 split can leave the
# small test set with skewed class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    balanced_df['text'],
    balanced_df['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=balanced_df['sentiment'],
)

# Plain Python lists give positional indexing, which YelpDataset relies on
# (the split Series keep their original, non-contiguous index labels).
train_dataset = YelpDataset(X_train.tolist(), y_train.tolist())
test_dataset = YelpDataset(X_test.tolist(), y_test.tolist())

# Only the training loader is shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
class HybridAspectModel(nn.Module):
    """Transformer encoder followed by a bidirectional GRU and two linear heads.

    The encoder's token representations are passed through the GRU, pooled
    into one vector per sequence, and fed to an aspect head and a sentiment
    head that share that pooled vector.

    Args:
        model_name: Checkpoint passed to ``AutoModel.from_pretrained``.
        gru_hidden: Hidden size of each GRU direction; the pooled vector has
            ``2 * gru_hidden`` features because the GRU is bidirectional.
        num_aspects: Output size of the aspect head. Defaults to
            ``len(aspect_keywords)`` (the module-level aspect list).
        num_sentiments: Output size of the sentiment head.
    """

    def __init__(self, model_name="bert-base-uncased", gru_hidden=256, num_aspects=None, num_sentiments=3):
        super().__init__()
        if num_aspects is None:
            num_aspects = len(aspect_keywords)
        self.bert = AutoModel.from_pretrained(model_name)
        # Read the encoder width from its config instead of hard-coding 768 so
        # other checkpoints work without further edits.
        self.gru = nn.GRU(self.bert.config.hidden_size, gru_hidden, bidirectional=True, batch_first=True)
        self.aspect_head = nn.Linear(2 * gru_hidden, num_aspects)
        self.sentiment_head = nn.Linear(2 * gru_hidden, num_sentiments)

    def forward(self, input_ids, attention_mask):
        """Return ``(aspect_logits, sentiment_logits)`` for a batch.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Tensor aligned with ``input_ids``; 1 marks a real
                token and 0 marks padding.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        gru_out, _ = self.gru(outputs.last_hidden_state)

        # Average only over real tokens. A plain mean over the sequence axis
        # also averages in the padded positions, diluting short inputs.
        mask = attention_mask.unsqueeze(-1).to(gru_out.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1.0)  # guard against an all-zero mask
        pooled = (gru_out * mask).sum(dim=1) / token_counts

        aspect_logits = self.aspect_head(pooled)
        sentiment_logits = self.sentiment_head(pooled)
        return aspect_logits, sentiment_logits


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch's RNG before the model is built so the randomly initialised
# layers (and the DataLoader's shuffling, which draws from the same default
# generator) are repeatable after a kernel restart.
torch.manual_seed(42)

model = HybridAspectModel().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

# NOTE: only the sentiment logits contribute to the loss below; the aspect
# logits are discarded, so the aspect head receives no training signal here.
for epoch in range(3):
    model.train()
    total_loss = 0  # sum of per-batch losses for this epoch
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        _, sentiment_logits = model(input_ids, attention_mask)
        loss = loss_fn(sentiment_logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1, Loss: 25.4414
Epoch 2, Loss: 17.3439
Epoch 3, Loss: 9.6995


In [None]:
def predict_sentiment(text, model, tokenizer, aspects=None):
    """Predict the aspects and overall sentiment of a single review.

    Args:
        text: Review text to classify.
        model: Module whose call returns ``(aspect_logits, sentiment_logits)``
            given ``(input_ids, attention_mask)``. It is switched to eval mode
            and left in eval mode.
        tokenizer: Callable with the Hugging Face tokenizer call signature.
        aspects: Optional list of aspect names, one per aspect logit. Defaults
            to the module-level ``aspect_keywords``.

    Returns:
        dict with keys 'text' (the input), 'aspects' (names whose logit is
        greater than 0) and 'sentiment' ('negative', 'neutral' or 'positive').

    Raises:
        ValueError: If the number of aspect names differs from the number of
            aspect logits produced by the model.
    """
    if aspects is None:
        aspects = aspect_keywords

    model.eval()

    # Send the inputs to wherever the model's weights live rather than to a
    # notebook-global device, so a model on another device still works.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    encoding = tokenizer(text, truncation=True, padding="max_length", max_length=128, return_tensors="pt")
    input_ids = encoding['input_ids'].to(target_device)
    attention_mask = encoding['attention_mask'].to(target_device)

    with torch.no_grad():
        aspect_logits, sentiment_logits = model(input_ids, attention_mask)

    aspect_scores = aspect_logits[0].tolist()
    if len(aspect_scores) != len(aspects):
        raise ValueError(
            f"model produced {len(aspect_scores)} aspect logits but {len(aspects)} aspect names were given"
        )

    # A logit above 0 marks the aspect as present.
    predicted_aspects = [name for name, score in zip(aspects, aspect_scores) if score > 0]
    predicted_sentiment = torch.argmax(sentiment_logits, dim=1).item()

    sentiment_map = {0: "negative", 1: "neutral", 2: "positive"}

    return {
        'text': text,
        'aspects': predicted_aspects,
        'sentiment': sentiment_map[predicted_sentiment]
    }

In [None]:
# Reuse the `model` fine-tuned in the training cell. Constructing a fresh
# HybridAspectModel() here would discard those weights and re-initialise the
# GRU and both heads at random, making the prediction meaningless.
# The `tokenizer` created alongside the dataset is reused as well.
model.eval()

# NOTE: the training loop optimises the sentiment loss only, so the predicted
# aspects below come from an aspect head that was never trained.
test_text = "oh wow food was cold again"
result = predict_sentiment(test_text, model, tokenizer)

print("\nPrediction Result:")
print(f"Text: {result['text']}")
print(f"Predicted aspects: {result['aspects']}")
print(f"Predicted sentiment: {result['sentiment']}")


Prediction Result:
Text: oh wow food was cold again
Predicted aspects: ['service', 'price']
Predicted sentiment: negative


In [None]:
import gradio as gr