<a href="https://colab.research.google.com/github/Harjandar/absa-restaurant-sentiment/blob/main/notebooks/ABSA_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ==============================
# STEP 0: IMPORT LIBRARIES
# ==============================

# Regular expressions for text cleaning
import re

# Pandas & NumPy for data handling
import pandas as pd
import numpy as np

# NLTK for stopwords and lemmatization
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# TensorFlow / Keras for LSTM modeling
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam

# Scikit-learn for resampling & metrics
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Download the NLTK corpora used further down: the English stopword list
# (read via stopwords.words('english')) and the WordNet data that backs
# WordNetLemmatizer. Only needs to run once per environment.
# The second call is the cell's last expression, so its return value is
# echoed as the cell output.
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
# ==============================
# STEP 1: LOAD TRAIN DATASET
# ==============================

# Training split (Western-style restaurant reviews), served as a raw CSV from GitHub
train_url = "https://raw.githubusercontent.com/Harjandar/absa-restaurant-sentiment/main/data/raw/restaurants_train_single.csv"

# pandas fetches and parses the file straight from the URL (network access required)
train_df = pd.read_csv(filepath_or_buffer=train_url)

# Report (rows, columns), then show the first five rows as the cell output
print("Train dataset shape:", train_df.shape)
train_df.head(5)


Train dataset shape: (2507, 7)


Unnamed: 0,sentence Id,sentence,aspect_category,aspect_term,from,to,polarity
0,1004293:0,Judging from previous posts this used to be a ...,RESTAURANT#GENERAL,place,51,56,negative
1,1004293:1,"We, there were four of us, arrived at noon - t...",SERVICE#GENERAL,staff,75,80,negative
2,1004293:2,"They never brought us complimentary noodles, i...",SERVICE#GENERAL,,0,0,negative
3,1004293:3,The food was lousy - too sweet or too salty an...,FOOD#QUALITY,food,4,8,negative
4,1004293:3,The food was lousy - too sweet or too salty an...,FOOD#STYLE_OPTIONS,portions,52,60,negative


In [3]:
# ==============================
# STEP 2: LOAD TEST DATASET
# ==============================

# Held-out test split, served as a raw CSV from the same GitHub repository
test_url = "https://raw.githubusercontent.com/Harjandar/absa-restaurant-sentiment/main/data/raw/restaurants_test_single.csv"

# pandas fetches and parses the file straight from the URL (network access required)
test_df = pd.read_csv(filepath_or_buffer=test_url)

# Report (rows, columns), then show the first five rows as the cell output
print("Test dataset shape:", test_df.shape)
test_df.head(5)


Test dataset shape: (859, 7)


Unnamed: 0,sentence Id,sentence,aspect_category,aspect_term,from,to,polarity
0,en_BlueRibbonSushi_478218171:0,Yum!,FOOD#QUALITY,,0,0,positive
1,en_BlueRibbonSushi_478218171:1,Serves really good sushi.,FOOD#QUALITY,sushi,19,24,positive
2,en_BlueRibbonSushi_478218171:2,Not the biggest portions but adequate.,FOOD#STYLE_OPTIONS,portions,16,24,neutral
3,en_BlueRibbonSushi_478218171:3,Green Tea creme brulee is a must!,FOOD#QUALITY,Green Tea creme brulee,0,22,positive
4,en_BlueRibbonSushi_478218171:4,Don't leave the restaurant without it.,FOOD#QUALITY,,0,0,positive


In [4]:
# ==============================
# STEP 3: MAP ASPECTS FOR OUR PROJECT
# ==============================

# Our project focuses on 4 aspects:
# 1. FOOD    -> taste, quality
# 2. SERVICE -> staff behavior
# 3. DELIVERY -> packaging, delivery speed, hot/cold
# 4. OVERALL -> general restaurant experience
#
# NOTE: none of the source categories below maps to DELIVERY, so frames
# produced with this mapping only ever contain FOOD, SERVICE and OVERALL.

aspect_mapping = {
    'FOOD#QUALITY': 'FOOD',
    'FOOD#STYLE_OPTIONS': 'FOOD',
    'FOOD#PRICES': 'FOOD',
    'SERVICE#GENERAL': 'SERVICE',
    'RESTAURANT#GENERAL': 'OVERALL'
}

# Keep only the rows whose category we can map.
# `.copy()` makes the filtered selection an independent DataFrame, so adding
# the 'aspect' column below is a plain assignment instead of a write to a
# slice of the original frame (which raises SettingWithCopyWarning).

# Apply mapping to train dataset
train_df = train_df[train_df['aspect_category'].isin(aspect_mapping.keys())].copy()
train_df['aspect'] = train_df['aspect_category'].map(aspect_mapping)

# Apply mapping to test dataset
test_df = test_df[test_df['aspect_category'].isin(aspect_mapping.keys())].copy()
test_df['aspect'] = test_df['aspect_category'].map(aspect_mapping)

# Verify counts
print("Train aspect counts:\n", train_df['aspect'].value_counts())
print("Test aspect counts:\n", test_df['aspect'].value_counts())


Train aspect counts:
 aspect
FOOD       1076
SERVICE     449
OVERALL     422
Name: count, dtype: int64
Test aspect counts:
 aspect
FOOD       391
SERVICE    155
OVERALL    142
Name: count, dtype: int64


In [5]:
# ==============================
# STEP 4: FILTER ONLY POSITIVE & NEGATIVE
# ==============================

# For simplicity, we remove neutral reviews (binary sentiment only).
# `.copy()` detaches the filtered rows from the frame they were selected
# from: the following steps add new columns ('text', 'text_clean', 'label')
# to these frames, and doing that on a slice raises SettingWithCopyWarning.
train_df = train_df[train_df['polarity'].isin(['positive','negative'])].copy()
test_df  = test_df[test_df['polarity'].isin(['positive','negative'])].copy()

# Verify distribution
print("Train polarity counts:\n", train_df['polarity'].value_counts())
print("Test polarity counts:\n", test_df['polarity'].value_counts())


Train polarity counts:
 polarity
positive    1265
negative     624
Name: count, dtype: int64
Test polarity counts:
 polarity
positive    485
negative    170
Name: count, dtype: int64


In [6]:
# ==============================
# STEP 5: CREATE MODEL INPUT
# ==============================

# The LSTM sees one string per (sentence, aspect) pair: the sentence,
# the literal marker " [ASP] ", then the aspect name,
# e.g. "Burger was tasty [ASP] FOOD".
# Both splits get the same treatment, so build the column in one loop.
for frame in (train_df, test_df):
    frame['text'] = frame['sentence'] + " [ASP] " + frame['aspect']

# Preview the source columns next to the combined input
train_df[['sentence', 'aspect', 'text']].head(5)


Unnamed: 0,sentence,aspect,text
0,Judging from previous posts this used to be a ...,OVERALL,Judging from previous posts this used to be a ...
1,"We, there were four of us, arrived at noon - t...",SERVICE,"We, there were four of us, arrived at noon - t..."
2,"They never brought us complimentary noodles, i...",SERVICE,"They never brought us complimentary noodles, i..."
3,The food was lousy - too sweet or too salty an...,FOOD,The food was lousy - too sweet or too salty an...
4,The food was lousy - too sweet or too salty an...,FOOD,The food was lousy - too sweet or too salty an...


In [7]:
# ==============================
# STEP 6: TEXT PREPROCESSING
# ==============================

# English stopword set (for fast membership tests) and a WordNet lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Normalize one input string for the tokenizer.

    Steps, in order:
    1. Lowercase the text.
    2. Delete every character that is not a-z or whitespace. This removes
       punctuation and digits, and also the brackets of the "[ASP]" marker,
       which therefore survives as the plain token "asp".
    3. Split on whitespace and drop English stopwords, except the negations
       'not', 'no' and 'never', which are kept.
    4. Lemmatize each remaining token.

    Returns the surviving tokens joined by single spaces.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    kept_tokens = []
    for token in letters_only.split():
        # Skip stopwords unless they are one of the protected negations
        if token in stop_words and token not in ('not', 'no', 'never'):
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return " ".join(kept_tokens)

# Clean the combined "sentence [ASP] aspect" strings of both splits
for frame in (train_df, test_df):
    frame['text_clean'] = frame['text'].apply(preprocess_text)

# Compare raw and cleaned text side by side
train_df[['text', 'text_clean']].head(5)


Unnamed: 0,text,text_clean
0,Judging from previous posts this used to be a ...,judging previous post used good place not long...
1,"We, there were four of us, arrived at noon - t...",four u arrived noon place empty staff acted li...
2,"They never brought us complimentary noodles, i...",never brought u complimentary noodle ignored r...
3,The food was lousy - too sweet or too salty an...,food lousy sweet salty portion tiny asp food
4,The food was lousy - too sweet or too salty an...,food lousy sweet salty portion tiny asp food


In [8]:
# ==============================
# STEP 7: BALANCE TRAINING DATA PER ASPECT
# ==============================

# Within every aspect, bootstrap (sample with replacement) one polarity
# class up to the size of the other so both end up with the same row count.
balanced_dfs = []

for aspect_name in ('FOOD', 'SERVICE', 'DELIVERY', 'OVERALL'):
    in_aspect = train_df.loc[train_df['aspect'] == aspect_name]

    # Aspects with no rows at all are skipped
    if not in_aspect.empty:
        positives = in_aspect.loc[in_aspect['polarity'] == 'positive']
        negatives = in_aspect.loc[in_aspect['polarity'] == 'negative']
        n_pos, n_neg = len(positives), len(negatives)

        # Strictly more positives -> grow the negatives; otherwise (including
        # a tie) the positives are the ones that get resampled.
        if n_pos > n_neg:
            negatives = resample(negatives, replace=True, n_samples=n_pos, random_state=42)
        else:
            positives = resample(positives, replace=True, n_samples=n_neg, random_state=42)

        balanced_dfs.append(pd.concat([positives, negatives]))

# Stack the per-aspect frames and shuffle the rows (fixed seed for reproducibility)
train_df = (
    pd.concat(balanced_dfs)
    .sample(frac=1, random_state=42)
)

# Verify
print("Balanced train counts per aspect:\n", train_df.groupby('aspect')['polarity'].value_counts())


Balanced train counts per aspect:
 aspect   polarity
FOOD     negative    741
         positive    741
OVERALL  negative    313
         positive    313
SERVICE  negative    226
         positive    226
Name: count, dtype: int64


In [9]:
# ==============================
# STEP 8: ENCODE LABELS
# ==============================

# Binary target for the sigmoid output: 0 = negative, 1 = positive
label_map = {'negative': 0, 'positive': 1}

# Add the numeric 'label' column to both splits
for frame in (train_df, test_df):
    frame['label'] = frame['polarity'].map(label_map)


In [10]:
# ==============================
# STEP 9: TOKENIZATION & PADDING
# ==============================

MAX_WORDS = 5000   # vocabulary cap passed to the tokenizer (num_words)
MAX_LEN   = 50     # every sequence is padded/truncated to this many tokens

# Vocabulary is learned from the training text only; words the tokenizer
# does not know are encoded as the "<OOV>" token.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['text_clean'])

# Text -> lists of integer word indices, for both splits
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(frame['text_clean'])
    for frame in (train_df, test_df)
)

# Zero-pad at the end ('post') up to MAX_LEN. Truncation is left at the
# pad_sequences default.
X_train, X_test = (
    pad_sequences(sequences, maxlen=MAX_LEN, padding='post')
    for sequences in (X_train_seq, X_test_seq)
)

# Targets as plain NumPy arrays
y_train, y_test = train_df['label'].values, test_df['label'].values


In [11]:
# ==============================
# STEP 10: BUILD LSTM MODEL
# ==============================

# NOTE: the `input_length` argument of Embedding is deprecated in Keras 3
# (it only emits a warning and is ignored), so it is no longer passed.
# The input shape is supplied through an explicit model.build() instead.
model = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=100),  # Convert word index → dense 100-d vector
    Bidirectional(LSTM(128)),  # Bidirectional LSTM for context understanding
    Dropout(0.3),              # Regularization
    Dense(1, activation='sigmoid')  # Binary sentiment (probability of "positive")
])

# Build for batches of MAX_LEN-token sequences so that summary() can
# report output shapes and parameter counts before training starts.
model.build(input_shape=(None, MAX_LEN))

# Compile model
model.compile(loss='binary_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])
model.summary()




In [None]:
# ==============================
# STEP 11: TRAIN LSTM MODEL
# ==============================

# Fit for 10 epochs in mini-batches of 32; 10% of the training arrays is
# held out by Keras for validation, and per-epoch progress is printed.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_split=0.1,
)


Epoch 1/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 107ms/step - accuracy: 0.5616 - loss: 0.6704 - val_accuracy: 0.8164 - val_loss: 0.4257
Epoch 2/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 112ms/step - accuracy: 0.8985 - loss: 0.2927 - val_accuracy: 0.8633 - val_loss: 0.3524
Epoch 3/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 112ms/step - accuracy: 0.9465 - loss: 0.1677 - val_accuracy: 0.8516 - val_loss: 0.3952
Epoch 4/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 112ms/step - accuracy: 0.9579 - loss: 0.1247 - val_accuracy: 0.8633 - val_loss: 0.4023
Epoch 5/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 102ms/step - accuracy: 0.9594 - loss: 0.1045 - val_accuracy: 0.8711 - val_loss: 0.4088
Epoch 6/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 113ms/step - accuracy: 0.9622 - loss: 0.0954 - val_accuracy: 0.8594 - val_loss: 0.4478
Epoch 7/10
[1m72/72[0m 

In [None]:
# ==============================
# STEP 12: EVALUATE MODEL
# ==============================

# Sigmoid probabilities for the test set, thresholded at 0.5
# (strictly greater than 0.5 counts as positive / class 1)
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Compute the four scikit-learn metrics against the true labels in one pass
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print("✅ TEST PERFORMANCE")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-score : {f1:.4f}")


In [None]:
# ==============================
# STEP 13: PREDICT SENTIMENT PER ASPECT FOR ANY REVIEW
# ==============================

# NOTE: aspect_mapping never produces the DELIVERY aspect, so the model has
# seen no DELIVERY training examples; treat that prediction with caution.
aspects_list = ['FOOD','SERVICE','DELIVERY','OVERALL']

def predict_aspects(review):
    """
    Predict the sentiment of one review for every aspect in `aspects_list`.

    For each aspect the review is turned into the same input format used
    for training ("<review> [ASP] <aspect>"), cleaned with preprocess_text,
    tokenized with the current `tokenizer` (unknown words map to "<OOV>"),
    padded to MAX_LEN and scored by the current `model`.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    dict
        Maps each aspect name to a dict with:
        - 'sentiment'   : "Positive" if the probability is >= 0.5, else "Negative"
        - 'probability' : the model's sigmoid output as a Python float
        - 'words'       : the cleaned tokens that were fed to the tokenizer
    """
    # One cleaned input string per aspect, in aspects_list order
    cleaned_texts = [
        preprocess_text(review + " [ASP] " + aspect)
        for aspect in aspects_list
    ]

    seqs = tokenizer.texts_to_sequences(cleaned_texts)
    padded = pad_sequences(seqs, maxlen=MAX_LEN, padding='post')

    # Score all aspects in a single batched call; flatten the (n, 1) output
    pred_probs = model.predict(padded, verbose=0).ravel()

    results = {}
    for aspect, text_clean, pred_prob in zip(aspects_list, cleaned_texts, pred_probs):
        pred_prob = float(pred_prob)
        results[aspect] = {
            'sentiment': "Positive" if pred_prob >= 0.5 else "Negative",
            'probability': pred_prob,
            'words': text_clean.split(),
        }

    return results

# Example usage
review = "Burger was tasty but delivery was slow and staff was rude"
print(predict_aspects(review))


In [None]:
# ==============================
# STEP 14: LOAD PAKISTANI REVIEWS
# ==============================

# Additional Pakistani-style reviews, served as a raw CSV from GitHub
pak_url = "https://raw.githubusercontent.com/Harjandar/absa-restaurant-sentiment/main/data/raw/pakistani_reviews_150.csv"

# pandas fetches and parses the file straight from the URL (network access required)
pak_df = pd.read_csv(filepath_or_buffer=pak_url)

# Report the shape and the column index so the schema can be compared
# with the train/test files loaded earlier
print("Pakistani dataset shape:", pak_df.shape)
print("Pakistani dataset columns:", pak_df.columns)

# Show the first five rows as the cell output
pak_df.head(5)


In [None]:
# ==============================
# STEP 14.1: MAP ASPECTS (PAK DATA)
# ==============================

# Keep only rows whose category appears in aspect_mapping, then translate
# the category to its project-level aspect.
# `.copy()` makes the filtered rows an independent DataFrame, so adding the
# 'aspect' column is not a write to a slice (SettingWithCopyWarning).
pak_df = pak_df[pak_df['aspect_category'].isin(aspect_mapping.keys())].copy()
pak_df['aspect'] = pak_df['aspect_category'].map(aspect_mapping)

print("Pakistani aspect counts:\n", pak_df['aspect'].value_counts())


In [None]:
# ==============================
# STEP 14.2: FILTER POLARITY
# ==============================

# Keep only positive/negative rows (binary sentiment, same as train/test).
# `.copy()` detaches the result from the frame it was selected from; the
# next steps add 'text', 'text_clean' and 'label' columns to it, which
# would otherwise raise SettingWithCopyWarning.
pak_df = pak_df[pak_df['polarity'].isin(['positive', 'negative'])].copy()

print("Pakistani polarity counts:\n", pak_df['polarity'].value_counts())


In [None]:
# ==============================
# STEP 14.3: CREATE INPUT TEXT
# ==============================

# Same input format as the train/test data: sentence, the literal
# " [ASP] " marker, then the aspect name.
pak_df['text'] = (
    pak_df['sentence']
    + " [ASP] "
    + pak_df['aspect']
)


In [None]:
# ==============================
# STEP 14.4: PREPROCESS TEXT
# ==============================

# Run every combined input string through the shared preprocess_text cleaner
pak_df['text_clean'] = pak_df['text'].map(preprocess_text)

# Compare raw and cleaned text side by side
pak_df[['text', 'text_clean']].head(5)


In [None]:
# ==============================
# STEP 14.5: ENCODE LABELS
# ==============================

# Translate the polarity strings to numeric targets with the shared label_map
pak_df['label'] = pak_df['polarity'].map(
    label_map
)


In [None]:
# ==============================
# STEP 15: MERGE DATASETS
# ==============================

# Stack the current training frame and the Pakistani reviews row-wise,
# then shuffle all rows with a fixed seed.
combined_train_df = (
    pd.concat([train_df, pak_df], axis=0)
    .sample(frac=1, random_state=42)
)

# Sanity-check size and class/aspect distribution of the merged data
print("Combined dataset shape:", combined_train_df.shape)
print("Combined aspect counts:\n", combined_train_df['aspect'].value_counts())
print("Combined polarity counts:\n", combined_train_df['polarity'].value_counts())


In [None]:
# ==============================
# STEP 15.1: RE-BALANCE PER ASPECT
# ==============================

# Same scheme as the first balancing step, now on the merged data: within
# each aspect, bootstrap one polarity class up to the size of the other.
balanced_dfs = []

for aspect_name in ('FOOD', 'SERVICE', 'DELIVERY', 'OVERALL'):
    in_aspect = combined_train_df.loc[combined_train_df['aspect'] == aspect_name]

    # Aspects with no rows at all are skipped
    if not in_aspect.empty:
        positives = in_aspect.loc[in_aspect['polarity'] == 'positive']
        negatives = in_aspect.loc[in_aspect['polarity'] == 'negative']
        n_pos, n_neg = len(positives), len(negatives)

        # Strictly more positives -> grow the negatives; otherwise (including
        # a tie) the positives are the ones that get resampled.
        if n_pos > n_neg:
            negatives = resample(negatives, replace=True, n_samples=n_pos, random_state=42)
        else:
            positives = resample(positives, replace=True, n_samples=n_neg, random_state=42)

        balanced_dfs.append(pd.concat([positives, negatives]))

# Stack the per-aspect frames and shuffle the rows (fixed seed for reproducibility)
combined_train_df = (
    pd.concat(balanced_dfs)
    .sample(frac=1, random_state=42)
)

print("Balanced combined counts:\n",
      combined_train_df.groupby('aspect')['polarity'].value_counts())


In [None]:
# ==============================
# STEP 16: TOKENIZATION (COMBINED)
# ==============================

# A fresh tokenizer replaces the earlier one: its vocabulary is learned
# from the combined training text, so word indices differ from before and
# anything encoded with the old tokenizer must be re-encoded.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=MAX_WORDS)
tokenizer.fit_on_texts(combined_train_df['text_clean'])

# Text -> integer sequences -> fixed-length arrays padded at the end
X_train_seq = tokenizer.texts_to_sequences(combined_train_df['text_clean'])
X_train = pad_sequences(
    X_train_seq,
    maxlen=MAX_LEN,
    padding='post',
)

# Targets as a plain NumPy array
y_train = combined_train_df['label'].to_numpy()


In [None]:
# ==============================
# STEP 16.1: BUILD MODEL AGAIN
# ==============================

# A brand-new model with the same architecture as before, so training on
# the combined data starts from freshly initialized weights.
# NOTE: the `input_length` argument of Embedding is deprecated in Keras 3
# (it only emits a warning and is ignored), so it is no longer passed.
# The input shape is supplied through an explicit model.build() instead.
model = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=100),  # word index → dense 100-d vector
    Bidirectional(LSTM(128)),                        # reads the sequence in both directions
    Dropout(0.3),                                    # regularization
    Dense(1, activation='sigmoid')                   # probability of "positive"
])

# Build for batches of MAX_LEN-token sequences so that summary() can
# report output shapes and parameter counts before training starts.
model.build(input_shape=(None, MAX_LEN))

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(0.001),
    metrics=['accuracy']
)

model.summary()


In [None]:
# ==============================
# STEP 16.2: TRAIN MODEL
# ==============================

# Same training setup as the first run: 10 epochs, mini-batches of 32,
# 10% of the training arrays held out by Keras for validation.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_split=0.1,
)


In [None]:
# ==============================
# STEP 16.3: EVALUATION
# ==============================

# Re-encode the test text with the tokenizer that was refit on the combined
# data, so word indices match what the retrained model learned.
X_test_seq = tokenizer.texts_to_sequences(test_df['text_clean'])
X_test = pad_sequences(
    X_test_seq,
    maxlen=MAX_LEN,
    padding='post',
)
y_test = test_df['label'].to_numpy()

# Sigmoid probabilities, thresholded at 0.5 (strictly greater -> class 1)
y_pred_prob = model.predict(X_test)
y_pred = np.greater(y_pred_prob, 0.5).astype(int)

print("✅ PERFORMANCE AFTER ADDING PAKISTANI REVIEWS")
print(f"Accuracy : {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall   : {recall_score(y_test, y_pred):.4f}")
print(f"F1-score : {f1_score(y_test, y_pred):.4f}")
