<a href="https://colab.research.google.com/github/Harjandar/absa-restaurant-sentiment/blob/main/notebooks/ABSA_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [92]:
# ==============================
# STEP 0: Import Libraries
# ==============================
import pandas as pd   # for handling CSVs and DataFrames
import re             # for text cleaning using regular expressions

# ==============================
# STEP 1: Load TRAIN dataset
# ==============================
url = "https://raw.githubusercontent.com/Harjandar/absa-restaurant-sentiment/main/data/raw/restaurants_train_single.csv"

# Load dataset from GitHub
df = pd.read_csv(url)
df.head()  # Show first 5 rows



Unnamed: 0,sentence Id,sentence,aspect_category,aspect_term,from,to,polarity
0,1004293:0,Judging from previous posts this used to be a ...,RESTAURANT#GENERAL,place,51,56,negative
1,1004293:1,"We, there were four of us, arrived at noon - t...",SERVICE#GENERAL,staff,75,80,negative
2,1004293:2,"They never brought us complimentary noodles, i...",SERVICE#GENERAL,,0,0,negative
3,1004293:3,The food was lousy - too sweet or too salty an...,FOOD#QUALITY,food,4,8,negative
4,1004293:3,The food was lousy - too sweet or too salty an...,FOOD#STYLE_OPTIONS,portions,52,60,negative


In [93]:
df.head(10)  # Show first 10 rows for detailed view
df.columns   # Print column names

Index(['sentence Id', 'sentence', 'aspect_category', 'aspect_term', 'from',
       'to', 'polarity'],
      dtype='object')

In [94]:
 #==============================
# STEP 2: Keep only relevant aspects
# ==============================
# These are the aspects we want to focus on for ABSA
keep_aspects = [
    "FOOD#QUALITY",
    "FOOD#STYLE_OPTIONS",
    "FOOD#PRICES",
    "SERVICE#GENERAL",
    "DELIVERY#GENERAL",
    "RESTAURANT#GENERAL"  # will rename to OVERALL#GENERAL
]

# Filter dataset to only include these aspects
df = df[df['aspect_category'].isin(keep_aspects)]
df.head(10)


Unnamed: 0,sentence Id,sentence,aspect_category,aspect_term,from,to,polarity
0,1004293:0,Judging from previous posts this used to be a ...,RESTAURANT#GENERAL,place,51,56,negative
1,1004293:1,"We, there were four of us, arrived at noon - t...",SERVICE#GENERAL,staff,75,80,negative
2,1004293:2,"They never brought us complimentary noodles, i...",SERVICE#GENERAL,,0,0,negative
3,1004293:3,The food was lousy - too sweet or too salty an...,FOOD#QUALITY,food,4,8,negative
4,1004293:3,The food was lousy - too sweet or too salty an...,FOOD#STYLE_OPTIONS,portions,52,60,negative
5,1004293:4,"After all that, they complained to me about th...",SERVICE#GENERAL,,0,0,negative
6,1004293:5,Avoid this place!,RESTAURANT#GENERAL,place,11,16,negative
7,1014458:0,"I have eaten at Saul, many times, the food is ...",FOOD#QUALITY,food,38,42,positive
8,1014458:1,Saul is the best restaurant on Smith Street an...,RESTAURANT#GENERAL,Saul,0,4,positive
9,1014458:2,The duck confit is always amazing and the foie...,FOOD#QUALITY,foie gras terrine with figs,42,69,positive


In [95]:
# ==============================
# STEP 3: Rename aspects
# ==============================
# Rename "RESTAURANT#GENERAL" to "OVERALL#GENERAL" for clarity
df['aspect_category'] = df['aspect_category'].replace({"RESTAURANT#GENERAL": "OVERALL#GENERAL"})
df['aspect_category'].unique()  # check unique aspects

array(['OVERALL#GENERAL', 'SERVICE#GENERAL', 'FOOD#QUALITY',
       'FOOD#STYLE_OPTIONS', 'FOOD#PRICES'], dtype=object)

In [96]:
# ==============================
# STEP 4: Remove rows with NULL aspect_term
# ==============================
df = df[df['aspect_term'].notnull()]  # keep only rows with aspect_term

In [97]:
# ==============================
# STEP 5: Check polarity distribution
# ==============================
df['polarity'].value_counts()  # counts of positive, negative, neutral

Unnamed: 0_level_0,count
polarity,Unnamed: 1_level_1
positive,1001
negative,412
neutral,46


In [98]:
# ==============================
# STEP 6: Remove neutral reviews
# ==============================
# Neutral reviews are not useful for binary sentiment classification
df = df[df['polarity'] != 'neutral']
df['polarity'].value_counts()  # check distribution after removal


Unnamed: 0_level_0,count
polarity,Unnamed: 1_level_1
positive,1001
negative,412


In [99]:
# ==============================
# STEP 7: Clean the text
# ==============================
# Convert text to lowercase, remove punctuation and extra spaces
def clean_text(text):
    text = str(text).lower()  # convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # keep only letters and spaces
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text

df['sentence'] = df['sentence'].apply(clean_text)  # apply cleaning

In [100]:
# ==============================
# STEP 8: Keep relevant aspects again (optional redundancy)
# ==============================
keep_aspects = [
    "FOOD#QUALITY",
    "FOOD#STYLE_OPTIONS",
    "SERVICE#GENERAL",
    "RESTAURANT#GENERAL",  # for overall experience or delivery
    "RESTAURANT#PRICES"
]

df = df[df['aspect_category'].isin(keep_aspects)].reset_index(drop=True)

# Replace 'NULL' aspect_term with empty string
df['aspect_term'] = df['aspect_term'].replace('NULL', '')

df['polarity'].value_counts()  # check counts

Unnamed: 0_level_0,count
polarity,Unnamed: 1_level_1
positive,821
negative,341


In [101]:
# ==============================
# STEP 9: Balance dataset
# ==============================
# Separate positive and negative examples
df_pos = df[df['polarity'] == 'positive']
df_neg = df[df['polarity'] == 'negative']

# Upsample negative class to match positive
df_neg_upsampled = df_neg.sample(n=len(df_pos), replace=True, random_state=42)

# Combine positive and upsampled negative
df_balanced = pd.concat([df_pos, df_neg_upsampled]).reset_index(drop=True)

# Shuffle the dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Check new counts
print(df_balanced['polarity'].value_counts())

polarity
positive    821
negative    821
Name: count, dtype: int64


In [102]:
# ==============================
# STEP 10: Load TEST dataset
# ==============================
url_test = "https://raw.githubusercontent.com/Harjandar/absa-restaurant-sentiment/main/data/raw/restaurants_test_single.csv"
df_test = pd.read_csv(url_test)

print("Original test dataset shape:", df_test.shape)

Original test dataset shape: (859, 7)


In [103]:
# ==============================
# STEP 11: Keep SAME aspects as TRAIN
# ==============================
keep_aspects = [
    "FOOD#QUALITY",
    "FOOD#STYLE_OPTIONS",
    "FOOD#PRICES",
    "SERVICE#GENERAL",
    "DELIVERY#GENERAL",
    "RESTAURANT#GENERAL"
]

df_test = df_test[df_test['aspect_category'].isin(keep_aspects)]
print("After aspect filtering:", df_test.shape)


After aspect filtering: (688, 7)


In [104]:
# ==============================
# STEP 12: Rename aspect
# ==============================
df_test['aspect_category'] = df_test['aspect_category'].replace({"RESTAURANT#GENERAL": "OVERALL#GENERAL"})

In [105]:
# ==============================
# STEP 13: Remove neutral reviews
# ==============================
df_test = df_test[df_test['polarity'] != 'neutral']  # do not balance test
print("After removing neutral reviews:")
print(df_test['polarity'].value_counts())


After removing neutral reviews:
polarity
positive    485
negative    170
Name: count, dtype: int64


In [106]:
# ==============================
# STEP 14: Clean test sentences
# ==============================
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_test['sentence'] = df_test['sentence'].apply(clean_text)

print("\nFinal test dataset shape:", df_test.shape)
df_test.head()



Final test dataset shape: (655, 7)


Unnamed: 0,sentence Id,sentence,aspect_category,aspect_term,from,to,polarity
0,en_BlueRibbonSushi_478218171:0,yum,FOOD#QUALITY,,0,0,positive
1,en_BlueRibbonSushi_478218171:1,serves really good sushi,FOOD#QUALITY,sushi,19,24,positive
3,en_BlueRibbonSushi_478218171:3,green tea creme brulee is a must,FOOD#QUALITY,Green Tea creme brulee,0,22,positive
4,en_BlueRibbonSushi_478218171:4,dont leave the restaurant without it,FOOD#QUALITY,,0,0,positive
5,en_BlueRibbonSushi_478218345:0,no comparison,OVERALL#GENERAL,,0,0,positive


In [107]:
# ==============================
# STEP 15: Prepare Input & Output for LSTM
# ==============================
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train data
X_train = df_balanced['sentence'].values
y_train = df_balanced['polarity'].values

# Encode labels: positive=1, negative=0
le = LabelEncoder()
y_train = le.fit_transform(y_train)

# Test data
X_test = df_test['sentence'].values
y_test = df_test['polarity'].values
y_test = le.transform(y_test)  # use same encoder

In [108]:
# ==============================
# STEP 16: Tokenize & Pad
# ==============================
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)  # fit only on train

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_len = 50  # maximum sentence length
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')


# ==============================
# STEP 17: Build LSTM Model
# ==============================

Use Bidirectional LSTM

Where: Replace your current LSTM layer in STEP 17: Build LSTM Model

Why: Bidirectional LSTM reads sentences both forward and backward → better understanding of context.

In [112]:
# ==============================
# STEP 17: Build LSTM Model
# ==============================
from tensorflow.keras.layers import Bidirectional
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64, input_length=max_len))  # embedding layer
#Replace your current LSTM layer in STEP 17: Build LSTM Model
#model.add(LSTM(64, return_sequences=False))
model.add(Bidirectional(LSTM(128, return_sequences=False))) #Bidirectional LSTM
model.add(Dropout(0.5))  # dropout to prevent overfitting
model.add(Dense(1, activation='sigmoid'))  # output layer for binary classification

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [110]:
# ==============================
# STEP 18: Train LSTM
# ==============================
history = model.fit(
    X_train_pad, y_train,
    epochs=5,         # can increase for better training
    batch_size=32,
    validation_split=0.2
)

Epoch 1/5
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 168ms/step - accuracy: 0.5496 - loss: 0.6869 - val_accuracy: 0.7112 - val_loss: 0.5945
Epoch 2/5
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 126ms/step - accuracy: 0.7792 - loss: 0.5087 - val_accuracy: 0.8116 - val_loss: 0.3866
Epoch 3/5
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 146ms/step - accuracy: 0.8877 - loss: 0.2990 - val_accuracy: 0.7599 - val_loss: 0.4965
Epoch 4/5
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 129ms/step - accuracy: 0.8614 - loss: 0.3854 - val_accuracy: 0.8997 - val_loss: 0.2863
Epoch 5/5
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 171ms/step - accuracy: 0.9479 - loss: 0.1741 - val_accuracy: 0.9119 - val_loss: 0.2485


In [111]:
# ==============================
# STEP 19: Evaluate on Test
# ==============================
y_pred_prob = model.predict(X_test_pad)  # predict probabilities
y_pred = (y_pred_prob > 0.5).astype(int).flatten()  # convert to 0/1

# Compute metrics
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\n✅ LSTM Test Performance")
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step

✅ LSTM Test Performance
Accuracy: 0.7816793893129771
Precision: 0.8958333333333334
Recall: 0.797938144329897
F1-score: 0.8440567066521265
