<a href="https://colab.research.google.com/github/Harjandar/absa-restaurant-sentiment/blob/main/notebooks/ABSA_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [74]:
# ==============================
# STEP 0: Import Libraries
# ==============================
import pandas as pd  # For loading and handling CSV datasets
import re           # For text cleaning using regular expressions
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [75]:
# ==============================
# STEP 1: Load TRAIN dataset
# ==============================
# Raw training split of the restaurant review dataset, read straight from GitHub
url_train = (
    "https://raw.githubusercontent.com/Harjandar/absa-restaurant-sentiment/"
    "main/data/raw/restaurants_train_single.csv"
)
df_train = pd.read_csv(url_train)
print("Original train dataset shape:", df_train.shape)
df_train.head(5)  # preview the first 5 rows


Original train dataset shape: (2507, 7)


Unnamed: 0,sentence Id,sentence,aspect_category,aspect_term,from,to,polarity
0,1004293:0,Judging from previous posts this used to be a ...,RESTAURANT#GENERAL,place,51,56,negative
1,1004293:1,"We, there were four of us, arrived at noon - t...",SERVICE#GENERAL,staff,75,80,negative
2,1004293:2,"They never brought us complimentary noodles, i...",SERVICE#GENERAL,,0,0,negative
3,1004293:3,The food was lousy - too sweet or too salty an...,FOOD#QUALITY,food,4,8,negative
4,1004293:3,The food was lousy - too sweet or too salty an...,FOOD#STYLE_OPTIONS,portions,52,60,negative


In [76]:
# ==============================
# STEP 2: Keep only relevant aspects
# ==============================
# Allow-list of aspect categories used for ABSA (Aspect-Based Sentiment Analysis);
# rows with any other category are dropped.
keep_aspects = [
    "FOOD#QUALITY", "FOOD#STYLE_OPTIONS", "FOOD#PRICES",
    "SERVICE#GENERAL",
    "DELIVERY#GENERAL",
    "RESTAURANT#GENERAL",  # will rename later
]

# Row filter: keep only training rows whose aspect_category is on the allow-list
df_train = df_train.loc[df_train['aspect_category'].isin(keep_aspects)]

In [77]:
# ==============================
# STEP 3: Rename aspects
# ==============================
# Rename "RESTAURANT#GENERAL" to "OVERALL#GENERAL" for clarity.
# df_train is the result of a boolean filter at this point, so the column is
# rebuilt with .assign() (which returns a new frame) instead of being written
# into the filtered slice, which would trigger pandas' SettingWithCopyWarning.
df_train = df_train.assign(
    aspect_category=df_train['aspect_category'].replace({"RESTAURANT#GENERAL": "OVERALL#GENERAL"})
)
df_train['aspect_category'].unique()


array(['OVERALL#GENERAL', 'SERVICE#GENERAL', 'FOOD#QUALITY',
       'FOOD#STYLE_OPTIONS', 'FOOD#PRICES'], dtype=object)

In [78]:
# ==============================
# STEP 4: Remove rows with NULL aspect_term
# ==============================
# Drop every row whose aspect_term is missing; all other columns are ignored
df_train = df_train.dropna(subset=['aspect_term'])

In [79]:
# ==============================
# STEP 5: Remove neutral reviews
# ==============================
# The model is a binary (positive vs. negative) classifier, so neutral rows are dropped
df_train = df_train.loc[df_train['polarity'].ne('neutral')]
df_train['polarity'].value_counts()  # remaining rows per class

Unnamed: 0_level_0,count
polarity,Unnamed: 1_level_1
positive,1001
negative,412


In [80]:
# ==============================
# STEP 6: Clean text
# ==============================
def clean_text(text):
    """
    Normalize a piece of text for tokenization.

    The input is converted with str(), lowercased, stripped of every
    character that is neither an ASCII letter nor whitespace (the characters
    are deleted, not replaced by a space), and finally has runs of whitespace
    collapsed to a single space with leading/trailing whitespace removed.

    Returns the cleaned string.
    """
    lowered = str(text).lower()
    # Delete digits, punctuation and any other non-letter, non-whitespace character
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', lowered)
    # Collapse whitespace runs, then trim both ends
    collapsed = re.sub(r'\s+', ' ', letters_and_spaces)
    return collapsed.strip()

# Apply cleaning to all sentences in training data.
# df_train is a boolean-filtered slice here, so the cleaned column is attached
# with .assign() (new frame) rather than written into the slice, which would
# trigger pandas' SettingWithCopyWarning.
df_train = df_train.assign(sentence=df_train['sentence'].apply(clean_text))

In [81]:
# ==============================
# STEP 7: Balance the dataset
# ==============================
# Split the training rows by sentiment label
df_pos = df_train.loc[df_train['polarity'].eq('positive')]
df_neg = df_train.loc[df_train['polarity'].eq('negative')]

# Draw negative rows with replacement until there are as many as positive rows
df_neg_upsampled = df_neg.sample(n=len(df_pos), replace=True, random_state=42)

# Stack both classes with a fresh 0..n-1 index, shuffle the rows with a fixed
# seed, then renumber the index again
df_train_balanced = (
    pd.concat([df_pos, df_neg_upsampled], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("Balanced class counts:\n", df_train_balanced['polarity'].value_counts())

Balanced class counts:
 polarity
negative    1001
positive    1001
Name: count, dtype: int64


In [93]:
# Report the shape of the balanced TRAINING frame (df_train_balanced); the
# label names the train set so it cannot be confused with the test-set report.
print("Final train dataset shape:", df_train_balanced.shape)

Final test dataset shape: (2002, 7)


In [82]:
# ==============================
# STEP 8: Load TEST dataset
# ==============================
url_test = "https://raw.githubusercontent.com/Harjandar/absa-restaurant-sentiment/main/data/raw/restaurants_test_single.csv"
df_test = pd.read_csv(url_test)
print("Original test dataset shape:", df_test.shape)

# Keep only same aspects as training. .copy() detaches the filtered rows from
# the raw frame so the column assignment below works on an independent frame
# and does not trigger pandas' SettingWithCopyWarning.
df_test = df_test[df_test['aspect_category'].isin(keep_aspects)].copy()

# Rename aspect
df_test['aspect_category'] = df_test['aspect_category'].replace({"RESTAURANT#GENERAL": "OVERALL#GENERAL"})

# Remove neutral reviews (copied again for the same reason as above)
df_test = df_test[df_test['polarity'] != 'neutral'].copy()

# NOTE(review): the training pipeline also drops rows with a null aspect_term
# (STEP 4); the test set is not filtered that way here — confirm this is intended.

# Clean test sentences
df_test['sentence'] = df_test['sentence'].apply(clean_text)

print("Final test dataset shape:", df_test.shape)

Original test dataset shape: (859, 7)
Final test dataset shape: (655, 7)


In [83]:
# ==============================
# STEP 9: Prepare input and output
# ==============================
# For each split, pull the cleaned sentences (model input) and the polarity
# strings (labels, still un-encoded) out of the frame as arrays.
X_train, y_train_labels = (df_train_balanced[col].values for col in ('sentence', 'polarity'))
X_test, y_test_labels = (df_test[col].values for col in ('sentence', 'polarity'))

In [84]:
# ==============================
# STEP 10: Encode labels correctly
# ==============================
# The encoder is fitted on the two known class names only, not on the data
le = LabelEncoder().fit(['positive', 'negative'])

# Map train and test polarity strings to integers (positive=1, negative=0)
y_train, y_test = (le.transform(labels) for labels in (y_train_labels, y_test_labels))

In [85]:
# ==============================
# STEP 11: Tokenize and pad sequences
# ==============================
max_len = 50  # maximum sentence length

# Vocabulary is learned from the training sentences only; words unseen at
# fit time map to the "<OOV>" token.
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Text -> lists of integer word ids
X_train_seq, X_test_seq = (tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test))

# Pad / truncate at the end so every sequence has exactly max_len entries
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
    for seqs in (X_train_seq, X_test_seq)
)

In [86]:
# ==============================
# STEP 12: Build Bidirectional LSTM model
# ==============================
# Layer stack, in order: embedding -> Bi-LSTM -> dropout -> sigmoid output
model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=max_len),  # word ids -> 64-d vectors
    Bidirectional(LSTM(128, return_sequences=False)),  # only the final state of each direction is returned
    Dropout(0.5),  # regularization
    Dense(1, activation='sigmoid'),  # single probability for binary sentiment
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [87]:
# ==============================
# STEP 13: Train LSTM model
# ==============================
# Fit the compiled model on the padded training sequences and keep the
# returned training record in `history`.
# NOTE(review): no random seed is set for TensorFlow in the visible cells, so
# the logged metrics are not expected to reproduce exactly between runs.
# NOTE(review): the negative class was upsampled with replacement (STEP 7)
# before this call, so copies of the same sentence can presumably end up in
# both the training and the validation portion, which would inflate
# val_accuracy — confirm, and consider splitting before upsampling.
history = model.fit(
    X_train_pad, y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2  # fraction of the training arrays held out for validation
)

Epoch 1/5
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 141ms/step - accuracy: 0.5449 - loss: 0.6815 - val_accuracy: 0.7681 - val_loss: 0.5130
Epoch 2/5
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 154ms/step - accuracy: 0.8233 - loss: 0.4426 - val_accuracy: 0.8229 - val_loss: 0.4429
Epoch 3/5
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 129ms/step - accuracy: 0.8954 - loss: 0.2842 - val_accuracy: 0.8928 - val_loss: 0.2806
Epoch 4/5
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 177ms/step - accuracy: 0.9420 - loss: 0.1592 - val_accuracy: 0.8828 - val_loss: 0.2957
Epoch 5/5
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 120ms/step - accuracy: 0.9061 - loss: 0.2270 - val_accuracy: 0.8853 - val_loss: 0.2794


In [88]:
# ==============================
# STEP 14: Evaluate on test
# ==============================
y_pred_prob = model.predict(X_test_pad)
# Flatten the predicted probabilities to 1-D and threshold at 0.5 -> 0/1 labels
y_pred = (y_pred_prob.ravel() > 0.5).astype(int)

# Compute the four metrics with the same (y_true, y_pred) arguments
acc, prec, rec, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\n✅ LSTM Test Performance")
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 120ms/step

✅ LSTM Test Performance
Accuracy: 0.7190839694656489
Precision: 0.8626506024096385
Recall: 0.7381443298969073
F1-score: 0.7955555555555556


In [89]:
# ==============================
# STEP 15: Define aspect rules
# ==============================
# Keyword lexicon: main aspect -> sub-aspect -> list of trigger keywords
aspect_rules = dict(
    FOOD=dict(
        QUALITY=["tasty", "delicious", "bland"],
        SPICE=["spicy", "hot"],
        ITEM=["biryani", "pizza", "burger"],
    ),
    SERVICE=dict(
        SPEED=["slow", "fast"],
        STAFF=["staff", "waiter", "rude"],
    ),
    DELIVERY=dict(
        GENERAL=["delivery", "late", "rider"],
    ),
)

In [90]:
# ==============================
# STEP 16: Detect aspects and keywords
# ==============================
def detect_aspects_with_keywords(sentence, rules=None):
    """
    Detect main aspects and keywords in a sentence.

    Keywords are matched as whole words (optionally followed by a plural
    "s"/"es"), not as raw substrings, so "hot" does not fire on "hotel",
    "late" does not fire on "plate" and "fast" does not fire on "breakfast",
    while "waiters" still counts as a mention of "waiter".

    Parameters:
        sentence: text to scan; it is lowercased before matching.
        rules: optional mapping {main_aspect: {sub_aspect: [keywords]}}.
            Defaults to the module-level aspect_rules.

    Returns a dict: {aspect: [keywords found]}. Keywords are listed in the
    order they appear in the rules; aspects with no match are omitted.
    """
    if rules is None:
        rules = aspect_rules

    sentence = sentence.lower()
    detected = {}
    for main_aspect, sub_aspects in rules.items():
        matched_keywords = []
        for keywords in sub_aspects.values():
            for word in keywords:
                # Lookarounds require the keyword not to be glued to other word
                # characters; (?:e?s)? tolerates a simple plural ending.
                pattern = r'(?<!\w)' + re.escape(word) + r'(?:e?s)?(?!\w)'
                if re.search(pattern, sentence):
                    matched_keywords.append(word)
        if matched_keywords:
            detected[main_aspect] = matched_keywords
    return detected

In [91]:
# ==============================
# STEP 17: Predict aspect-level sentiment
# ==============================
def predict_aspect_sentiment_detailed(sentence):
    """
    Predict sentiment for each detected aspect in a review.

    Steps:
    1) Clean the sentence with clean_text
    2) Detect aspects and their matched keywords
    3) For each aspect, keep only the sentence tokens that are keywords of
       that aspect (falling back to the full cleaned sentence if none remain)
    4) Tokenize and pad all aspect texts exactly like the training data
       (post-padding AND post-truncation) and score them in one model call
    5) Label each aspect "Positive" if its probability is > 0.5, else "Negative"

    Returns:
        dict {aspect: {"sentiment": str, "mentioned_keywords": list}} when at
        least one aspect is detected; otherwise the string
        "No aspects detected." (kept for backward compatibility).
    """
    sentence_clean = clean_text(sentence)
    detected_aspects = detect_aspects_with_keywords(sentence_clean)
    if not detected_aspects:
        return "No aspects detected."

    tokens = sentence_clean.split()  # split once, reused for every aspect
    aspects = list(detected_aspects)

    # Build one aspect-specific text per detected aspect
    sub_sentences = []
    for aspect in aspects:
        keywords = detected_aspects[aspect]
        sub_sentence = " ".join(word for word in tokens if word in keywords)
        # Fallback to the full sentence when no token equals a keyword exactly
        sub_sentences.append(sub_sentence or sentence_clean)

    # Convert to sequences and pad. truncating='post' matches the training
    # preprocessing; the pad_sequences default ('pre') would instead cut the
    # beginning of any text longer than max_len.
    seqs = tokenizer.texts_to_sequences(sub_sentences)
    padded = pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')

    # Single batched prediction: one probability per aspect
    probs = model.predict(padded).reshape(-1)

    results = {}
    for aspect, prob in zip(aspects, probs):
        results[aspect] = {
            "sentiment": "Positive" if prob > 0.5 else "Negative",
            "mentioned_keywords": detected_aspects[aspect]
        }

    return results

In [92]:
# ==============================
# STEP 18: Test example review
# ==============================
# Sample review containing FOOD keywords (biryani, spicy, tasty) and a
# SERVICE keyword (slow) from aspect_rules.
review = "biryani was very spicy but tasty and service was slow"

# Predict aspect-level sentiment; as the last expression in the cell, the
# returned value is shown as the cell output.
predict_aspect_sentiment_detailed(review)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 243ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step


{'FOOD': {'sentiment': 'Positive',
  'mentioned_keywords': ['tasty', 'spicy', 'biryani']},
 'SERVICE': {'sentiment': 'Negative', 'mentioned_keywords': ['slow']}}