# Traditional ML Baseline for ABSA Sentiment Classification

**Objective:** Train Random Forest classifier with TF-IDF features as baseline to compare with XLM-RoBERTa

**Academic Justification:**
- Establishes baseline performance using classical ML (Scikit-learn Random Forest)
- TF-IDF captures term importance without contextual embeddings
- Comparison validates whether transformer pre-training provides value for Manglish code-switching
- Following best practices: same train/test split, same evaluation protocol, same class imbalance handling

# Stage 0: Environment Setup

In [1]:
# Connect to google drive
from google.colab import drive
import os

# 1. Mount Google Drive (To save the model checkpoints)
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')

# NLP Processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# ML & Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

print("‚úì All libraries loaded successfully")

‚úì All libraries loaded successfully


In [3]:
# Download NLTK resources (run once)
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('punkt_tab')

print("‚úì NLTK resources downloaded")

‚úì NLTK resources downloaded


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# Stage 1: Configuration & Data Loading

In [24]:
# Configuration
DATA_PATH = r'/content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Dataset/aspect_categorization_after_filtering.pkl'
GOLD_PATH = r'/content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Dataset/Final_Gold_Standard.csv'
#DATA_PATH = r'C:\Users\Ong Hui Ling\Dropbox\PC\Documents\Github\Aspect-Based-Sentiment-Analysis\Dataset\aspect_categorization_after_filtering.pkl'
#GOLD_PATH = r'C:\Users\Ong Hui Ling\Dropbox\PC\Documents\Github\Aspect-Based-Sentiment-Analysis\Dataset\Final_Gold_Standard.csv'

# Output path
OUTPUT_PATH = r'/content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Modelling/models'

RANDOM_SEED = 42
TEST_SIZE = 0.15
VAL_SIZE = 0.10

# Label encoding
LABEL2ID = {"negative": 0, "positive": 1}
ID2LABEL = {v: k for k, v in LABEL2ID.items()}

print(f"‚úì Configuration loaded")
print(f"  Data path: {DATA_PATH}")
print(f"  Gold path: {GOLD_PATH}")
print(f"  Random seed: {RANDOM_SEED}")

‚úì Configuration loaded
  Data path: /content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Dataset/aspect_categorization_after_filtering.pkl
  Gold path: /content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Dataset/Final_Gold_Standard.csv
  Random seed: 42


In [5]:
print("="*70)
print("LOADING & PREPARING DATA")
print("="*70)

# Load training data
df = pd.read_pickle(DATA_PATH)
print(f"  Raw segments loaded: {len(df):,}")

# --- PREVENT DATA LEAKAGE: Exclude gold standard review IDs ----------
print(f"\n  ‚ö†Ô∏è  DATA LEAKAGE PREVENTION:")
print(f"  Loading gold standard to identify held-out review IDs...")

try:
    gold_df = pd.read_csv(GOLD_PATH)

    # Extract unique Original_Review_IDs from gold dataset
    if 'Original_Review_ID' in gold_df.columns:
        gold_review_ids = set(gold_df['Original_Review_ID'].unique())
    elif 'Review_ID' in gold_df.columns:
        gold_review_ids = set(gold_df['Review_ID'].unique())
    else:
        print(f"  ‚ö†Ô∏è  Warning: Could not find review ID column in gold dataset")
        gold_review_ids = set()

    print(f"  ‚úì Gold dataset loaded: {len(gold_df):,} annotations")
    print(f"  ‚úì Unique review IDs in gold: {len(gold_review_ids):,}")

    # Filter out segments from gold review IDs
    n_before = len(df)
    df = df[~df['Original_Review_ID'].isin(gold_review_ids)].copy()
    n_after = len(df)
    n_removed = n_before - n_after

    print(f"  ‚úì Filtered out {n_removed:,} segments from gold reviews ({n_removed/n_before*100:.1f}%)")
    print(f"  ‚úì Training segments remaining: {n_after:,}")

except Exception as e:
    print(f"  ‚úó Error loading gold dataset: {e}")

# Filter to single-aspect segments
df["num_aspects"] = df["Aspect_Labels"].apply(len)
df_single = df[df["num_aspects"] == 1].copy()
df_single["aspect"] = df_single["Aspect_Labels"].apply(lambda x: x[0])

n_multi = len(df) - len(df_single)
pct_retained = (len(df_single) / len(df)) * 100

print(f"\n  FILTERING STRATEGY:")
print(f"    Single-aspect segments:  {len(df_single):>7,} ({pct_retained:>5.1f}%) ‚Üí KEPT")
print(f"    Multi-aspect segments:   {n_multi:>7,} ({100-pct_retained:>5.1f}%) ‚Üí DROPPED")

# Encode labels
df_single["label"] = df_single["Sentiment_Label"].map(LABEL2ID)

print(f"\n  Label distribution:")
for label_name, label_id in LABEL2ID.items():
    count = (df_single["label"] == label_id).sum()
    pct = count / len(df_single) * 100
    print(f"    {label_name:<10}: {count:>7,} ({pct:>5.1f}%)")

print(f"\n‚úì Data preparation complete: {len(df_single):,} segments ready for training")

LOADING & PREPARING DATA
  Raw segments loaded: 128,778

  ‚ö†Ô∏è  DATA LEAKAGE PREVENTION:
  Loading gold standard to identify held-out review IDs...
  ‚úì Gold dataset loaded: 645 annotations
  ‚úì Unique review IDs in gold: 0
  ‚úì Filtered out 0 segments from gold reviews (0.0%)
  ‚úì Training segments remaining: 128,778

  FILTERING STRATEGY:
    Single-aspect segments:   98,693 ( 76.6%) ‚Üí KEPT
    Multi-aspect segments:    30,085 ( 23.4%) ‚Üí DROPPED

  Label distribution:
    negative  :   6,783 (  6.9%)
    positive  :  91,910 ( 93.1%)

‚úì Data preparation complete: 98,693 segments ready for training


# Stage 2: Text Preprocessing (Conventional NLP)

In [6]:
def preprocess_text(text):
    """
    Apply conventional NLP preprocessing pipeline.

    Steps:
    1. Lowercase conversion
    2. Remove special characters (keep letters, numbers, spaces)
    3. Tokenization
    4. Remove English stopwords
    5. Lemmatization

    Why:
        Traditional ML models (Random Forest, SVM) lack contextual understanding.
        Preprocessing reduces noise and dimensionality for TF-IDF vectorization.

    Note:
        We do NOT remove Manglish terms (sedap, mamak) as they carry sentiment.
        Stopwords removal is conservative to preserve sentiment-bearing phrases.
    """
    if pd.isna(text):
        return ""

    # Lowercase
    text = text.lower()

    # Remove special characters but keep spaces
    text = re.sub(r'[^a-z0-9\s]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join back
    return ' '.join(tokens)

# Test preprocessing
sample_text = "The nasi lemak was incredibly sedap but the service was lambat!"
print(f"Original:  {sample_text}")
print(f"Processed: {preprocess_text(sample_text)}")

Original:  The nasi lemak was incredibly sedap but the service was lambat!
Processed: nasi lemak incredibly sedap service lambat


In [None]:
print("="*70)
print("PREPROCESSING TEXT DATA")
print("="*70)

# Apply preprocessing to all segments
print(f"Processing {len(df_single):,} segments...")
df_single['processed_text'] = df_single['Segment'].apply(preprocess_text)

PREPROCESSING TEXT DATA
Processing 98,693 segments...


In [8]:
# Check for empty processed texts
print("="*70)
print("EMPTY PROCESSED TEXTS ORIGNIAL SEGMENTS")
print("="*70)

df_single[df_single['processed_text'].str.len() == 0]['Segment'].value_counts()

EMPTY PROCESSED TEXTS ORIGNIAL SEGMENTS


Unnamed: 0_level_0,count
Segment,Unnamed: 1_level_1
again,46
once again,4
over again,2
again),1
again not Ë±¨Ê≤πÊ∏£,1
again ‚ù§Ô∏èüòä,1
here again,1
was again,1
because again,1
will be here again,1


In [9]:
# Check for empty processed texts
n_empty = (df_single['processed_text'].str.len() == 0).sum()

if n_empty > 0:
    print(f"  ‚ö†Ô∏è  Warning: {n_empty} segments became empty after preprocessing")
    df_single = df_single[df_single['processed_text'].str.len() > 0].copy()
    print(f"  ‚úì Removed empty segments. Remaining: {len(df_single):,}")

print(f"\n‚úì Text preprocessing complete")
print(f"\nSample processed segments:")
for i in range(3):
    print(f"\n  Original:  {df_single.iloc[i]['Segment'][:80]}...")
    print(f"  Processed: {df_single.iloc[i]['processed_text'][:80]}...")

  ‚úì Removed empty segments. Remaining: 98,632

‚úì Text preprocessing complete

Sample processed segments:

  Original:  coconut cream - the perfect finish to the meal...
  Processed: coconut cream perfect finish meal...

  Original:  cooked perfectly...
  Processed: cooked perfectly...

  Original:  don‚Äôt miss the sweet appam with brown sugar...
  Processed: dont miss sweet appam brown sugar...


# Stage 3: Train/Val/Test Split

In [10]:
print("="*70)
print("TRAIN/VAL/TEST SPLIT")
print("="*70)

# Stage 1: Separate test set (stratified)
df_trainval, df_test = train_test_split(
    df_single,
    test_size=TEST_SIZE,
    stratify=df_single["label"],
    random_state=RANDOM_SEED
)

# Stage 2: Split remainder into train + val
adjusted_val_size = VAL_SIZE / (1.0 - TEST_SIZE)
df_train, df_val = train_test_split(
    df_trainval,
    test_size=adjusted_val_size,
    stratify=df_trainval["label"],
    random_state=RANDOM_SEED
)

print(f"\nSplit sizes:")
for name, split_df in [("Train", df_train), ("Val", df_val), ("Test", df_test)]:
    pos = (split_df["label"] == 1).sum()
    neg = (split_df["label"] == 0).sum()
    print(f"  {name:<6}: {len(split_df):>7,} rows | "
          f"pos: {pos:>6,} ({pos/len(split_df)*100:>5.1f}%) | "
          f"neg: {neg:>6,} ({neg/len(split_df)*100:>5.1f}%)")

# Extract X (processed text) and y (labels)
X_train = df_train['processed_text'].values
X_val = df_val['processed_text'].values
X_test = df_test['processed_text'].values

y_train = df_train['label'].values
y_val = df_val['label'].values
y_test = df_test['label'].values

print(f"\n‚úì Data split complete")

TRAIN/VAL/TEST SPLIT

Split sizes:
  Train :  73,973 rows | pos: 68,891 ( 93.1%) | neg:  5,082 (  6.9%)
  Val   :   9,864 rows | pos:  9,186 ( 93.1%) | neg:    678 (  6.9%)
  Test  :  14,795 rows | pos: 13,779 ( 93.1%) | neg:  1,016 (  6.9%)

‚úì Data split complete


# Stage 4: TF-IDF Vectorization

In [11]:
print("="*70)
print("TF-IDF VECTORIZATION")
print("="*70)

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(
    max_features=5000,     # Limit to top 5000 features (prevent overfitting)
    ngram_range=(1, 2),    # Unigrams + bigrams (capture phrases like "nasi lemak")
    min_df=2,              # Ignore terms appearing in < 2 documents
    max_df=0.8,            # Ignore terms appearing in > 80% of documents
    sublinear_tf=True      # Apply sublinear tf scaling (1 + log(tf))
)

print(f"TF-IDF Configuration:")
print(f"  Max features:  {vectorizer.max_features:,}")
print(f"  N-gram range:  {vectorizer.ngram_range}")
print(f"  Min doc freq:  {vectorizer.min_df}")
print(f"  Max doc freq:  {vectorizer.max_df}")

# Fit on training data and transform all splits
print(f"\nFitting TF-IDF on {len(X_train):,} training samples...")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

print(f"\n‚úì TF-IDF vectorization complete")
print(f"  Train shape: {X_train_tfidf.shape}")
print(f"  Val shape:   {X_val_tfidf.shape}")
print(f"  Test shape:  {X_test_tfidf.shape}")
print(f"  Vocabulary size: {len(vectorizer.vocabulary_):,} terms")

# Show top features by TF-IDF score
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = X_train_tfidf.sum(axis=0).A1
top_indices = tfidf_scores.argsort()[-20:][::-1]

print(f"\nTop 20 features by TF-IDF score:")
for idx in top_indices:
    print(f"  {feature_names[idx]:<20} {tfidf_scores[idx]:.2f}")

TF-IDF VECTORIZATION
TF-IDF Configuration:
  Max features:  5,000
  N-gram range:  (1, 2)
  Min doc freq:  2
  Max doc freq:  0.8

Fitting TF-IDF on 73,973 training samples...

‚úì TF-IDF vectorization complete
  Train shape: (73973, 5000)
  Val shape:   (9864, 5000)
  Test shape:  (14795, 5000)
  Vocabulary size: 5,000 terms

Top 20 features by TF-IDF score:
  food                 2054.48
  service              1326.58
  good                 1203.42
  delicious            1177.20
  friendly             1018.66
  staff                918.34
  chicken              827.56
  taste                801.65
  price                790.02
  nice                 779.33
  fresh                730.00
  place                644.65
  great                643.54
  attentive            638.20
  tasty                637.55
  dish                 613.12
  restaurant           545.40
  come                 531.83
  rice                 512.88
  also                 509.62


# Stage 5: Multi-Model Training

**Academic Justification:**
- Compare 5 traditional ML classifiers: Logistic Regression, SVM, Naive Bayes, XGBoost, Random Forest
- Evaluate on validation set to identify best model(s) for hyperparameter tuning
- Different models capture different patterns: linear (LR, SVM) vs non-linear (RF, XGB) vs probabilistic (NB)
- Best practice: broad comparison before expensive hyperparameter tuning

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
import time

print("="*70)
print("TRAINING MULTIPLE ML MODELS")
print("="*70)

# Compute class weights (for models that support it)
class_weights_array = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights_dict = {i: class_weights_array[i] for i in range(len(class_weights_array))}

# Calculate scale_pos_weight for XGBoost (ratio of negative to positive)
scale_pos_weight = class_weights_array[0] / class_weights_array[1]

print(f"Class imbalance handling:")
print(f"  Negative weight: {class_weights_dict[0]:.4f}")
print(f"  Positive weight: {class_weights_dict[1]:.4f}")
print(f"  XGBoost scale_pos_weight: {scale_pos_weight:.4f}")

# Define 5 models with reasonable default parameters
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        class_weight=class_weights_dict,
        random_state=RANDOM_SEED,
        n_jobs=-1
    ),
    "SVM (Linear)": SVC(
        kernel='linear',
        class_weight=class_weights_dict,
        random_state=RANDOM_SEED
    ),
    "Naive Bayes": MultinomialNB(
        alpha=1.0
    ),
    "XGBoost": XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        scale_pos_weight=scale_pos_weight,
        random_state=RANDOM_SEED,
        n_jobs=-1,
        eval_metric='logloss'
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=200,
        max_depth=20,
        class_weight=class_weights_dict,
        random_state=RANDOM_SEED,
        n_jobs=-1
    )
}

print(f"\n{'='*70}")
print(f"TRAINING {len(models)} MODELS")
print(f"{'='*70}\n")

# Train all models and store results
trained_models = {}
training_times = {}

for model_name, model in models.items():
    print(f"Training {model_name}...")
    start_time = time.time()

    model.fit(X_train_tfidf, y_train)

    training_time = time.time() - start_time
    trained_models[model_name] = model
    training_times[model_name] = training_time

    print(f"  ‚úì Complete in {training_time:.2f} seconds\n")

print(f"‚úì All models trained successfully")

TRAINING MULTIPLE ML MODELS
Class imbalance handling:
  Negative weight: 7.2779
  Positive weight: 0.5369
  XGBoost scale_pos_weight: 13.5559

TRAINING 5 MODELS

Training Logistic Regression...
  ‚úì Complete in 1.60 seconds

Training SVM (Linear)...
  ‚úì Complete in 225.01 seconds

Training Naive Bayes...
  ‚úì Complete in 0.01 seconds

Training XGBoost...
  ‚úì Complete in 5.01 seconds

Training Random Forest...
  ‚úì Complete in 1.07 seconds

‚úì All models trained successfully


# Stage 6: Model Comparison on Validation Set

In [13]:
print("="*70)
print("COMPARING MODELS ON VALIDATION SET")
print("="*70)

# Evaluate all models on validation set
results = []

def compute_metrics(y_true, y_pred):
    """Compute same metrics as BERT for fair comparison."""
    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    per_class_f1 = f1_score(y_true, y_pred, average=None, labels=[0, 1])

    return {
        "accuracy": round(acc, 4),
        "macro_f1": round(macro_f1, 4),
        "neg_f1": round(per_class_f1[0], 4),
        "pos_f1": round(per_class_f1[1], 4),
    }

for model_name, model in trained_models.items():
    # Predict on validation set
    y_pred_val = model.predict(X_val_tfidf)

    # Compute metrics
    val_metrics = compute_metrics(y_val, y_pred_val)

    results.append({
        'Model': model_name,
        'Accuracy': val_metrics['accuracy'],
        'Macro-F1': val_metrics['macro_f1'],
        'Negative F1': val_metrics['neg_f1'],
        'Positive F1': val_metrics['pos_f1'],
        'Training Time (s)': training_times[model_name]
    })

# Create results DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('Macro-F1', ascending=False)

print(f"\n{'='*70}")
print(f"VALIDATION SET RESULTS (Ranked by Macro-F1)")
print(f"{'='*70}\n")
print(results_df.to_string(index=False))

COMPARING MODELS ON VALIDATION SET

VALIDATION SET RESULTS (Ranked by Macro-F1)

              Model  Accuracy  Macro-F1  Negative F1  Positive F1  Training Time (s)
Logistic Regression    0.8161    0.6254       0.3581       0.8927           1.599633
       SVM (Linear)    0.7981    0.6102       0.3395       0.8808         225.013811
        Naive Bayes    0.9379    0.6002       0.2328       0.9676           0.009229
            XGBoost    0.9342    0.5240       0.0820       0.9659           5.010101
      Random Forest    0.6187    0.4926       0.2397       0.7456           1.072564


# Stage 7: Model Evaluation on Test Set

In [14]:
print("="*70)
print("COMPARING MODELS ON TEST SET")
print("="*70)

# Evaluate all models on test set
results = []

for model_name, model in trained_models.items():
    # Predict on test set
    y_pred_test = model.predict(X_test_tfidf)

    # Compute metrics
    test_metrics = compute_metrics(y_test, y_pred_test)

    results.append({
        'Model': model_name,
        'Accuracy': test_metrics['accuracy'],
        'Macro-F1': test_metrics['macro_f1'],
        'Negative F1': test_metrics['neg_f1'],
        'Positive F1': test_metrics['pos_f1'],
        'Training Time (s)': training_times[model_name]
    })

# Create results DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('Macro-F1', ascending=False)

print(f"\n{'='*70}")
print(f"TEST SET RESULTS (Ranked by Macro-F1)")
print(f"{'='*70}\n")
print(results_df.to_string(index=False))

COMPARING MODELS ON TEST SET

TEST SET RESULTS (Ranked by Macro-F1)

              Model  Accuracy  Macro-F1  Negative F1  Positive F1  Training Time (s)
Logistic Regression    0.8158    0.6253       0.3581       0.8925           1.599633
       SVM (Linear)    0.7982    0.6103       0.3397       0.8809         225.013811
        Naive Bayes    0.9398    0.6096       0.2506       0.9686           0.009229
            XGBoost    0.9342    0.5253       0.0847       0.9659           5.010101
      Random Forest    0.6212    0.4936       0.2394       0.7478           1.072564


# Stage 8: Gold Standard Evaluation (Before Tuning)

In [15]:
print("="*70)
print("EVALUATING ON GOLD STANDARD (GROUND TRUTH)")
print("="*70)

# Load gold standard
gold_df = pd.read_csv(GOLD_PATH)
print(f"  Gold dataset loaded: {len(gold_df):,} rows")

# Prepare gold data (same as BERT)
gold_df_prep = gold_df.copy()
gold_df_prep.rename(columns={
    "Manual_Aspect": "aspect",
    "Manual_Sentiment": "Sentiment_Label",
}, inplace=True)

# Normalize sentiment labels to lowercase
gold_df_prep["Sentiment_Label"] = gold_df_prep["Sentiment_Label"].str.lower()

# Handle multi-aspect segments: explode into separate rows
import ast
def parse_aspect(val):
    if isinstance(val, str):
        try:
            parsed = ast.literal_eval(val)
            if isinstance(parsed, list):
                return parsed
            else:
                return [parsed]
        except (ValueError, SyntaxError):
            return [val]
    elif isinstance(val, list):
        return val
    else:
        return [str(val)]

gold_df_prep["aspect"] = gold_df_prep["aspect"].apply(parse_aspect)
gold_df_exploded = gold_df_prep.explode("aspect").reset_index(drop=True)

print(f"  After exploding: {len(gold_df_exploded):,} aspect-segment pairs")

# Encode labels
gold_df_exploded["label"] = gold_df_exploded["Sentiment_Label"].map(LABEL2ID)

# Preprocess gold text
print(f"\nPreprocessing gold segments...")
gold_df_exploded['processed_text'] = gold_df_exploded['Segment'].apply(preprocess_text)

# Vectorize gold text
print(f"Vectorizing gold segments...")
X_gold = gold_df_exploded['processed_text'].values
X_gold_tfidf = vectorizer.transform(X_gold)
y_gold = gold_df_exploded['label'].values

# Predict on gold using best model
print(f"Running inference on gold set using ...")

# Evaluate all models on gold data set
results = []

for model_name, model in trained_models.items():
    # Predict on test set
    y_pred_gold = model.predict(X_gold_tfidf)

    # Compute metrics
    gold_metrics = compute_metrics(y_gold, y_pred_gold)

    results.append({
        'Model': model_name,
        'Accuracy': gold_metrics['accuracy'],
        'Macro-F1': gold_metrics['macro_f1'],
        'Negative F1': gold_metrics['neg_f1'],
        'Positive F1': gold_metrics['pos_f1'],
        'Training Time (s)': training_times[model_name]
    })

# Create results DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('Macro-F1', ascending=False)

print(f"\n{'='*70}")
print(f"GOLD SET RESULTS (Ranked by Macro-F1)")
print(f"{'='*70}\n")
print(results_df.to_string(index=False))

EVALUATING ON GOLD STANDARD (GROUND TRUTH)
  Gold dataset loaded: 645 rows
  After exploding: 721 aspect-segment pairs

Preprocessing gold segments...
Vectorizing gold segments...
Running inference on gold set using ...

GOLD SET RESULTS (Ranked by Macro-F1)

              Model  Accuracy  Macro-F1  Negative F1  Positive F1  Training Time (s)
       SVM (Linear)    0.8363    0.8251       0.7807       0.8695         225.013811
Logistic Regression    0.8322    0.8179       0.7669       0.8689           1.599633
      Random Forest    0.7767    0.7706       0.7330       0.8081           1.072564
        Naive Bayes    0.6852    0.5472       0.2972       0.7971           0.009229
            XGBoost    0.6477    0.4601       0.1419       0.7784           5.010101


**Findings:**
1. Best model: SVM (Linear)
- Pro: Highest accuracy and Macro-F1 for Gold Standard Dataset
- Cons: Longest training time compared to others

2. Best model balance between training time and performance: Logistic Regression
- Pro: Highest accuracy and Macro-F1 for Gold Standard Dataset, Low Training Time


In [None]:
# Identify best model
best_model_name = results_df.iloc[0]['Model']
best_macro_f1 = results_df.iloc[0]['Macro-F1']

# Store best model for hyperparameter tuning
best_model = trained_models[best_model_name]

print(f"\n{'='*70}")
print(f"üèÜ BEST MODEL: {best_model_name}")

# Best model prediction results
y_pred_gold = best_model.predict(X_gold_tfidf)

# Compute metrics
gold_metrics = compute_metrics(y_gold, y_pred_gold)

print(f"‚òÖ GOLD TEST SET RESULTS - {best_model_name}")
print(f"‚òÖ Total samples: {len(gold_df_exploded):,} aspect-segment pairs")
print(f"{'='*70}")
print(f"\nOVERALL PERFORMANCE:")
print(f"  Accuracy:  {gold_metrics['accuracy']:.4f}")
print(f"  Macro-F1:  {gold_metrics['macro_f1']:.4f}")
print(f"    Negative F1: {gold_metrics['neg_f1']:.4f}")
print(f"    Positive F1: {gold_metrics['pos_f1']:.4f}")

print(f"\n‚ö†Ô∏è  Note: This is baseline performance before hyperparameter tuning")


üèÜ BEST MODEL: SVM (Linear)
‚òÖ GOLD TEST SET RESULTS - SVM (Linear) (Before Tuning)
‚òÖ Total samples: 721 aspect-segment pairs

OVERALL PERFORMANCE:
  Accuracy:  0.8363
  Macro-F1:  0.8251
    Negative F1: 0.7807
    Positive F1: 0.8695

‚ö†Ô∏è  Note: This is baseline performance before hyperparameter tuning


In [17]:
# Per-aspect breakdown
print(f"\nPER-ASPECT BREAKDOWN (for comparison with BERT):")
print(f"{'Aspect':<20} {'Samples':>8} {'Accuracy':>10} {'Macro-F1':>10}")
print(f"{'-'*20} {'-'*8} {'-'*10} {'-'*10}")

aspects_unique = sorted(gold_df_exploded["aspect"].unique())
for aspect in aspects_unique:
    mask = gold_df_exploded["aspect"] == aspect
    y_aspect = y_gold[mask]
    y_pred_aspect = y_pred_gold[mask]

    try:
        metrics = compute_metrics(y_aspect, y_pred_aspect)
        n_samples = mask.sum()
        print(f"{aspect:<20} {n_samples:>8} {metrics['accuracy']:>10.4f} {metrics['macro_f1']:>10.4f}")
    except:
        print(f"{aspect:<20} {mask.sum():>8} {'N/A':>10} {'N/A':>10}")


PER-ASPECT BREAKDOWN (for comparison with BERT):
Aspect                Samples   Accuracy   Macro-F1
-------------------- -------- ---------- ----------
AMBIENCE                   62     0.8226     0.7244
AUTHENTICITY & LOCAL VIBE       16     0.8125     0.4483
FOOD                      303     0.8383     0.8176
HALAL COMPLIANCE            2     1.0000     1.0000
LOCATION                   21     0.7619     0.7529
LOYALTY (RETURN INTENT)       91     0.8022     0.7861
NON-HALAL ELEMENTS         10     0.8000     0.7917
SERVICE                   128     0.9062     0.9062
VALUE                      88     0.7955     0.7950


In [18]:
print(f"\nFULL CLASSIFICATION REPORT:")
print(classification_report(y_gold, y_pred_gold, target_names=["Negative", "Positive"], digits=4))


FULL CLASSIFICATION REPORT:
              precision    recall  f1-score   support

    Negative     0.7985    0.7636    0.7807       275
    Positive     0.8581    0.8812    0.8695       446

    accuracy                         0.8363       721
   macro avg     0.8283    0.8224    0.8251       721
weighted avg     0.8353    0.8363    0.8356       721



In [26]:
import os
import joblib

# Save the Vectorizer
vectorizer_path = os.path.join(OUTPUT_PATH, 'tfidf_vectorizer.pkl')
joblib.dump(vectorizer, vectorizer_path)
print(f"‚úÖ Saved TF-IDF Vectorizer to: {vectorizer_path}")

# Save each model from the 'trained_models' dictionary
for model_name, model in trained_models.items():
    # create a safe filename (replace spaces with underscores)
    safe_name = model_name.replace(" ", "_").replace("/", "-")
    file_path = os.path.join(OUTPUT_PATH, f"{safe_name}.pkl")

    # Save the model object
    joblib.dump(model, file_path)
    print(f"‚úÖ Saved model '{model_name}' to: {file_path}")

print("\nAll saving operations completed.")

‚úÖ Saved TF-IDF Vectorizer to: /content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Modelling/models/tfidf_vectorizer.pkl
‚úÖ Saved model 'Logistic Regression' to: /content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Modelling/models/Logistic_Regression.pkl
‚úÖ Saved model 'SVM (Linear)' to: /content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Modelling/models/SVM_(Linear).pkl
‚úÖ Saved model 'Naive Bayes' to: /content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Modelling/models/Naive_Bayes.pkl
‚úÖ Saved model 'XGBoost' to: /content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Modelling/models/XGBoost.pkl
‚úÖ Saved model 'Random Forest' to: /content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Modelling/models/Random_Forest.pkl

All saving operations completed.


# Stage 9: VADER Baseline Comparison

**Academic Justification:**
- VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool
- Establishes rule-based baseline for 3-way architecture comparison: Rule-Based (VADER) vs Traditional ML vs Deep Learning (BERT)
- VADER requires no training - applies directly to raw text
- Evaluates on **same data splits** (validation, test, gold) for fair comparison
- Expected to underperform on Manglish code-switching (lacks cultural context and mixed-language support)
- Demonstrates value of ML/DL approaches for low-resource languages

In [27]:
# Install and import VADER
try:
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    print("‚úì VADER already installed")
except ImportError:
    print("Installing VADER...")
    import subprocess
    subprocess.check_call(['pip', 'install', 'vaderSentiment'])
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    print("‚úì VADER installed successfully")

# Initialize VADER analyzer
vader_analyzer = SentimentIntensityAnalyzer()

print("\n" + "="*70)
print("VADER SENTIMENT ANALYSIS - RULE-BASED BASELINE")
print("="*70)
print("\n‚ö†Ô∏è  VADER Characteristics:")
print("  ‚Ä¢ Rule-based lexicon approach (no training required)")
print("  ‚Ä¢ Designed for English social media text")
print("  ‚Ä¢ Returns compound score: [-1.0, 1.0]")
print("  ‚Ä¢ Decision rule: compound >= 0.05 ‚Üí positive, < 0.05 ‚Üí negative")
print("  ‚Ä¢ Expected challenge: Manglish code-switching (e.g., 'sedap', 'mamak')")

def vader_predict(text):
    """
    Apply VADER sentiment analysis and convert to binary classification.

    Args:
        text (str): Raw text segment

    Returns:
        int: 0 for negative, 1 for positive

    Why compound score?
        VADER's compound score normalizes sentiment across text length.
        Standard threshold: >= 0.05 is positive, < 0.05 is negative.
    """
    if pd.isna(text) or text == "":
        return 1  # Default to positive if empty

    scores = vader_analyzer.polarity_scores(text)
    compound = scores['compound']

    # Binary classification using standard threshold
    return 1 if compound >= 0.05 else 0

Installing VADER...
‚úì VADER installed successfully

VADER SENTIMENT ANALYSIS - RULE-BASED BASELINE

‚ö†Ô∏è  VADER Characteristics:
  ‚Ä¢ Rule-based lexicon approach (no training required)
  ‚Ä¢ Designed for English social media text
  ‚Ä¢ Returns compound score: [-1.0, 1.0]
  ‚Ä¢ Decision rule: compound >= 0.05 ‚Üí positive, < 0.05 ‚Üí negative
  ‚Ä¢ Expected challenge: Manglish code-switching (e.g., 'sedap', 'mamak')


In [28]:
print("="*70)
print("VADER EVALUATION ON VALIDATION SET")
print("="*70)

# Use ORIGINAL text (not preprocessed) for VADER
X_val_original = df_val['Segment'].values

print(f"\nApplying VADER to {len(X_val_original):,} validation segments...")
y_pred_val_vader = np.array([vader_predict(text) for text in X_val_original])

# Compute metrics
val_metrics_vader = compute_metrics(y_val, y_pred_val_vader)

print(f"\n‚òÖ VADER VALIDATION SET RESULTS:")
print(f"  Accuracy:    {val_metrics_vader['accuracy']:.4f}")
print(f"  Macro-F1:    {val_metrics_vader['macro_f1']:.4f}")
print(f"    Negative F1: {val_metrics_vader['neg_f1']:.4f}")
print(f"    Positive F1: {val_metrics_vader['pos_f1']:.4f}")

print(f"\nClassification Report:")
print(classification_report(y_val, y_pred_val_vader, target_names=["Negative", "Positive"], digits=4))

VADER EVALUATION ON VALIDATION SET

Applying VADER to 9,864 validation segments...

‚òÖ VADER VALIDATION SET RESULTS:
  Accuracy:    0.5314
  Macro-F1:    0.4420
    Negative F1: 0.2187
    Positive F1: 0.6654

Classification Report:
              precision    recall  f1-score   support

    Negative     0.1235    0.9543    0.2187       678
    Positive     0.9933    0.5002    0.6654      9186

    accuracy                         0.5314      9864
   macro avg     0.5584    0.7272    0.4420      9864
weighted avg     0.9335    0.5314    0.6347      9864



In [29]:
print("="*70)
print("VADER EVALUATION ON TEST SET")
print("="*70)

# Use ORIGINAL text (not preprocessed) for VADER
X_test_original = df_test['Segment'].values

print(f"\nApplying VADER to {len(X_test_original):,} test segments...")
y_pred_test_vader = np.array([vader_predict(text) for text in X_test_original])

# Compute metrics
test_metrics_vader = compute_metrics(y_test, y_pred_test_vader)

print(f"\n‚òÖ VADER TEST SET RESULTS:")
print(f"  Accuracy:    {test_metrics_vader['accuracy']:.4f}")
print(f"  Macro-F1:    {test_metrics_vader['macro_f1']:.4f}")
print(f"    Negative F1: {test_metrics_vader['neg_f1']:.4f}")
print(f"    Positive F1: {test_metrics_vader['pos_f1']:.4f}")

print(f"\nClassification Report:")
print(classification_report(y_test, y_pred_test_vader, target_names=["Negative", "Positive"], digits=4))

VADER EVALUATION ON TEST SET

Applying VADER to 14,795 test segments...

‚òÖ VADER TEST SET RESULTS:
  Accuracy:    0.5350
  Macro-F1:    0.4448
    Negative F1: 0.2210
    Positive F1: 0.6686

Classification Report:
              precision    recall  f1-score   support

    Negative     0.1249    0.9606    0.2210      1016
    Positive     0.9943    0.5037    0.6686     13779

    accuracy                         0.5350     14795
   macro avg     0.5596    0.7321    0.4448     14795
weighted avg     0.9346    0.5350    0.6379     14795



In [30]:
print("="*70)
print("VADER EVALUATION ON GOLD STANDARD")
print("="*70)

# Use ORIGINAL gold segments
X_gold_original = gold_df_exploded['Segment'].values
y_gold = gold_df_exploded['label'].values

print(f"\nApplying VADER to {len(X_gold_original):,} gold standard segments...")
y_pred_gold_vader = np.array([vader_predict(text) for text in X_gold_original])

# Compute metrics
gold_metrics_vader = compute_metrics(y_gold, y_pred_gold_vader)

print(f"\n‚òÖ VADER GOLD STANDARD RESULTS:")
print(f"‚òÖ Total samples: {len(gold_df_exploded):,} aspect-segment pairs")
print(f"\nOVERALL PERFORMANCE:")
print(f"  Accuracy:  {gold_metrics_vader['accuracy']:.4f}")
print(f"  Macro-F1:  {gold_metrics_vader['macro_f1']:.4f}")
print(f"    Negative F1: {gold_metrics_vader['neg_f1']:.4f}")
print(f"    Positive F1: {gold_metrics_vader['pos_f1']:.4f}")

# Per-aspect breakdown
print(f"\nPER-ASPECT BREAKDOWN:")
print(f"{'Aspect':<20} {'Samples':>8} {'Accuracy':>10} {'Macro-F1':>10}")
print(f"{'-'*20} {'-'*8} {'-'*10} {'-'*10}")

aspects_unique = sorted(gold_df_exploded["aspect"].unique())
for aspect in aspects_unique:
    mask = gold_df_exploded["aspect"] == aspect
    y_aspect = y_gold[mask]
    y_pred_aspect = y_pred_gold_vader[mask]

    try:
        metrics = compute_metrics(y_aspect, y_pred_aspect)
        n_samples = mask.sum()
        print(f"{aspect:<20} {n_samples:>8} {metrics['accuracy']:>10.4f} {metrics['macro_f1']:>10.4f}")
    except:
        print(f"{aspect:<20} {mask.sum():>8} {'N/A':>10} {'N/A':>10}")

print(f"\nFULL CLASSIFICATION REPORT:")
print(classification_report(y_gold, y_pred_gold_vader, target_names=["Negative", "Positive"], digits=4))

VADER EVALUATION ON GOLD STANDARD

Applying VADER to 721 gold standard segments...

‚òÖ VADER GOLD STANDARD RESULTS:
‚òÖ Total samples: 721 aspect-segment pairs

OVERALL PERFORMANCE:
  Accuracy:  0.7351
  Macro-F1:  0.7336
    Negative F1: 0.7136
    Positive F1: 0.7535

PER-ASPECT BREAKDOWN:
Aspect                Samples   Accuracy   Macro-F1
-------------------- -------- ---------- ----------
AMBIENCE                   62     0.7903     0.7569
AUTHENTICITY & LOCAL VIBE       16     0.6250     0.6000
FOOD                      303     0.7393     0.7342
HALAL COMPLIANCE            2     0.5000     0.3333
LOCATION                   21     0.7143     0.7083
LOYALTY (RETURN INTENT)       91     0.6923     0.6859
NON-HALAL ELEMENTS         10     0.7000     0.6970
SERVICE                   128     0.7109     0.7069
VALUE                      88     0.7955     0.7928

FULL CLASSIFICATION REPORT:
              precision    recall  f1-score   support

    Negative     0.6071    0.8655    0.713

# Stage 12: Three-Way Architecture Comparison

**Comparison Framework:**
- **VADER (Rule-Based)**: Lexicon + rules, no training, English-centric
- **Traditional ML (Tuned)**: TF-IDF + SVM/Logistic Regression, supervised learning
- **XLM-RoBERTa (BERT)**: Transformer with cross-lingual pre-training, contextual embeddings

**Evaluation Protocol:**
- Same data splits (validation, test, gold standard)
- Same metrics (accuracy, macro-F1, negative F1, positive F1)
- Gold standard as final ground truth for thesis comparison

In [None]:
# print("="*70)
# print("THREE-WAY MODEL COMPARISON")
# print("="*70)

# # Create comprehensive comparison table
# comparison_data = []

# # VADER (Rule-Based)
# comparison_data.append({
#     'Model': 'VADER (Rule-Based)',
#     'Architecture': 'Lexicon + Rules',
#     'Training': 'None',
#     'Val Accuracy': val_metrics_vader['accuracy'],
#     'Val Macro-F1': val_metrics_vader['macro_f1'],
#     'Test Accuracy': test_metrics_vader['accuracy'],
#     'Test Macro-F1': test_metrics_vader['macro_f1'],
#     'Gold Accuracy': gold_metrics_vader['accuracy'],
#     'Gold Macro-F1': gold_metrics_vader['macro_f1'],
#     'Gold Neg F1': gold_metrics_vader['neg_f1'],
#     'Gold Pos F1': gold_metrics_vader['pos_f1']
# })

# # Traditional ML (Tuned)
# comparison_data.append({
#     'Model': f'Traditional ML ({best_model_name})',
#     'Architecture': 'TF-IDF + ML',
#     'Training': f'{len(X_train):,} samples',
#     'Val Accuracy': val_metrics['accuracy'],  # From Stage 6
#     'Val Macro-F1': val_metrics['macro_f1'],
#     'Test Accuracy': test_metrics_tuned['accuracy'],
#     'Test Macro-F1': test_metrics_tuned['macro_f1'],
#     'Gold Accuracy': gold_metrics_tuned['accuracy'],
#     'Gold Macro-F1': gold_metrics_tuned['macro_f1'],
#     'Gold Neg F1': gold_metrics_tuned['neg_f1'],
#     'Gold Pos F1': gold_metrics_tuned['pos_f1']
# })

# # XLM-RoBERTa (from your completed BERT training)
# # Note: Update these values with your actual BERT results
# comparison_data.append({
#     'Model': 'XLM-RoBERTa (BERT)',
#     'Architecture': 'Transformer',
#     'Training': f'{len(X_train):,} samples',
#     'Val Accuracy': 0.0000,  # ‚Üê UPDATE with your BERT val accuracy
#     'Val Macro-F1': 0.0000,  # ‚Üê UPDATE with your BERT val macro-F1
#     'Test Accuracy': 0.0000,  # ‚Üê UPDATE with your BERT test accuracy
#     'Test Macro-F1': 0.0000,  # ‚Üê UPDATE with your BERT test macro-F1
#     'Gold Accuracy': 0.9240,  # Your reported BERT gold accuracy
#     'Gold Macro-F1': 0.9170,  # Your reported BERT gold macro-F1
#     'Gold Neg F1': 0.8940,    # Your reported BERT negative F1
#     'Gold Pos F1': 0.9340     # Your reported BERT positive F1
# })

# comparison_df = pd.DataFrame(comparison_data)

# print(f"\n{'='*70}")
# print(f"COMPARISON TABLE: GOLD STANDARD (GROUND TRUTH)")
# print(f"{'='*70}\n")

# # Display gold standard comparison (most important)
# gold_comparison = comparison_df[['Model', 'Architecture', 'Gold Accuracy', 'Gold Macro-F1', 'Gold Neg F1', 'Gold Pos F1']]
# print(gold_comparison.to_string(index=False))

# print(f"\n{'='*70}")
# print(f"FULL COMPARISON TABLE: ALL EVALUATION SETS")
# print(f"{'='*70}\n")
# print(comparison_df.to_string(index=False))

# # Summary insights
# print(f"\n{'='*70}")
# print(f"KEY FINDINGS")
# print(f"{'='*70}")

# vader_gold_f1 = gold_metrics_vader['macro_f1']
# ml_gold_f1 = gold_metrics_tuned['macro_f1']
# bert_gold_f1 = 0.9170  # Update with your actual BERT value

# print(f"\nüìä Gold Standard Macro-F1 Performance:")
# print(f"  1. XLM-RoBERTa (BERT):     {bert_gold_f1:.4f} ü•á")
# print(f"  2. Traditional ML:         {ml_gold_f1:.4f} ü•à")
# print(f"  3. VADER (Rule-Based):     {vader_gold_f1:.4f} ü•â")

# improvement_ml_vs_vader = ((ml_gold_f1 - vader_gold_f1) / vader_gold_f1) * 100
# improvement_bert_vs_ml = ((bert_gold_f1 - ml_gold_f1) / ml_gold_f1) * 100

# print(f"\nüìà Relative Improvements:")
# print(f"  Traditional ML vs VADER:   +{improvement_ml_vs_vader:.1f}%")
# print(f"  BERT vs Traditional ML:    +{improvement_bert_vs_ml:.1f}%")

# print(f"\nüí° Implications:")
# print(f"  ‚úì Rule-based (VADER) struggles with Manglish code-switching")
# print(f"  ‚úì Traditional ML benefits from supervised learning on domain data")
# print(f"  ‚úì BERT's cross-lingual pre-training provides significant advantage")
# print(f"  ‚úì Transformer architecture captures contextual nuances in code-switched text")

In [None]:
# # Visualization: Gold Standard Comparison
# fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# # Plot 1: Macro-F1 Comparison
# models = ['VADER\n(Rule-Based)', f'Traditional ML\n({best_model_name})', 'XLM-RoBERTa\n(BERT)']
# gold_f1_scores = [
#     gold_metrics_vader['macro_f1'],
#     gold_metrics_tuned['macro_f1'],
#     0.9170  # Update with your actual BERT value
# ]

# colors = ['#FF6B6B', '#4ECDC4', '#95E1D3']
# bars = axes[0].bar(models, gold_f1_scores, color=colors, alpha=0.8, edgecolor='black', linewidth=1.5)
# axes[0].set_ylabel('Macro-F1 Score', fontsize=12, fontweight='bold')
# axes[0].set_title('Gold Standard: Macro-F1 Comparison', fontsize=14, fontweight='bold')
# axes[0].set_ylim([0, 1.0])
# axes[0].axhline(y=0.8, color='gray', linestyle='--', alpha=0.5, label='Strong Performance (0.8)')
# axes[0].grid(axis='y', alpha=0.3)
# axes[0].legend()

# # Add value labels on bars
# for bar in bars:
#     height = bar.get_height()
#     axes[0].text(bar.get_x() + bar.get_width()/2., height + 0.02,
#                 f'{height:.4f}', ha='center', va='bottom', fontweight='bold', fontsize=10)

# # Plot 2: Per-Class F1 Comparison
# class_labels = ['Negative F1', 'Positive F1']
# x = np.arange(len(class_labels))
# width = 0.25

# vader_scores = [gold_metrics_vader['neg_f1'], gold_metrics_vader['pos_f1']]
# ml_scores = [gold_metrics_tuned['neg_f1'], gold_metrics_tuned['pos_f1']]
# bert_scores = [0.8940, 0.9340]  # Update with your actual BERT values

# axes[1].bar(x - width, vader_scores, width, label='VADER', color=colors[0], alpha=0.8, edgecolor='black')
# axes[1].bar(x, ml_scores, width, label=f'Traditional ML', color=colors[1], alpha=0.8, edgecolor='black')
# axes[1].bar(x + width, bert_scores, width, label='BERT', color=colors[2], alpha=0.8, edgecolor='black')

# axes[1].set_ylabel('F1 Score', fontsize=12, fontweight='bold')
# axes[1].set_title('Gold Standard: Per-Class F1 Comparison', fontsize=14, fontweight='bold')
# axes[1].set_xticks(x)
# axes[1].set_xticklabels(class_labels, fontsize=11)
# axes[1].set_ylim([0, 1.0])
# axes[1].legend(fontsize=10)
# axes[1].grid(axis='y', alpha=0.3)

# plt.tight_layout()
# plt.show()

# print(f"\n‚úì Visualization complete: Gold Standard performance comparison")