In [1]:
import pandas as pd
import re
from thefuzz import process

In [2]:
#df_gold = pd.read_csv(r'/content/drive/MyDrive/Aspect-Based-Sentiment-Analysis/Dataset/silver_std.pkl')
df_gold = pd.read_csv(r'C:\Users\Ong Hui Ling\Dropbox\PC\Documents\Github\Aspect-Based-Sentiment-Analysis\Dataset\Final_Gold_Standard.csv')

In [3]:
df_gold.head()

Unnamed: 0,Original_Review_ID,Segment,Manual_Aspect,Manual_Sentiment
0,9734,it took a really long time like it took 1 hour...,"['SERVICE', 'VALUE']",NEGATIVE
1,9734,the food is tasty,['FOOD'],POSITIVE
2,9734,the worker here is also rude,['SERVICE'],NEGATIVE
3,10357,great food,['FOOD'],POSITIVE
4,10357,nice beer,['NON-HALAL ELEMENTS'],POSITIVE


In [4]:
# ==========================================
# 1. THE MASTER MALAYSIAN ASPECT DICTIONARY
# ==========================================
ASPECT_DICT = {
    'FOOD': [
        # Taste & Quality
        'food', 'taste', 'tasty', 'delicious', 'sedap', 'yummy', 'flavor', 'flavour',
        'fresh', 'juicy', 'tender', 'crispy', 'flavorful', 'perfect', 
        'bland', 'tasteless', 'hambar', 'burnt', 'raw', 'undercooked',
        'overcooked', 'soggy', 'stale', 'spoiled', 'terrible', 'awful', 'unappetizing',
        'salty', 'masin', 'sweet', 'manis', 'sour', 'masam', 'spicy', 'pedas',
        'umami', 'kick', 'wok hei', 'lemak', 'dishes','herbs', 'spices',
        'chili', 'chewy', 'shiok','well-fried','dish',

        # Specific Dishes & Items
        'chicken', 'rice', 'nasi', 'mee', 'noodle', 'soup', 'meat', 'seafood',
        'drink', 'beverage', 'coffee', 'tea', 'dessert', 'cake', 'roti', 'sambal',
        'kuah', 'goreng', 'nasi lemak', 'char kuey teow', 'roti canai', 'laksa',
        'satay', 'rendang', 'dim sum', 'wan tan mee', 'hokkien mee', 'chicken rice',
        'nasi kandar', 'banana leaf', 'tomyam', 'cendol', 'teh tarik', 'kopi',
        'cook', 'chef', 'latte', 'matcha', 'cheese', 'sauce', 'gravy','beef',
        'mutton', 'sushi','macha', 'kek', 'murtabak', 'cincalok','hot dog',
        'kerabu', 'bak kut teh', 'yong tau foo', 'naan', 'otak-otak', 'kuih',
        'apam', 'chee cheong fun', 'burger', 'biryani', 'ikan', 'ayam', 'watermelon',
        'salmon', 'crab', 'oyster', 'fish','omelette', 'juice', 'samosa','snack', 'kangkung',
        'pumpkin',

        # Meal Times
        'lunch', 'dinner', 'breakfast', 'brunch', 'supper', 'meal', 'eating', 'eat',
        'makan', 'minum', 'hungry', 'full', 'appetite', 'tapau'
    ],

    'SERVICE': [
        # Speed
        'service', 'slow', 'fast', 'laju', 'lambat', 'quick', 'rapid', 'prompt',
        'efficient', 'delay', 'wait', 'waiting', 'queue', 'tunggu', 'late', 'long', 'lama',
        'minutes'

        # Attitude & Staff
        'staff', 'waiter', 'waitress', 'manager', 'boss', 'worker', 'crew',
        'friendly', 'rude', 'polite', 'kasar', 'mesra', 'helpful', 'attentive',
        'welcoming', 'arrogant', 'ignore', 'responsive', 'smiling', 'courteous',
        'knowledgeable', 'clueless', 'kantoi', 'slumber', 'server',
        'amoi', 'anneh', 'brother', 'abang', 'kakak', 'auntie', 'uncle',
        'guy', 'lady', 'personnel', 'team', 'cashier', 'people', 'disrespect', 'thoughtfully',
        'thoughtful', 'cheating'
    ],

    'VALUE': [
        # Price & Worth
        'price', 'cost', 'expensive', 'cheap', 'mahal', 'murah', 'affordable',
        'pricey', 'reasonable', 'worth', 'berbaloi', 'value', 'budget', 'bill',
        'ringgit', 'rm', 'charge', 'tax', 'standard', 'cut throat', 'wallet', 'money',
        'rip-off', 'bargain', 'overpriced','amount',

        # Portion Size
        'portion', 'size', 'quantity', 'serving', 'big', 'besar', 'small',
        'kecil', 'huge', 'tiny', 'generous', 'stingy', 'banyak', 'sikit'
    ],

    'LOCATION': [
        # Accessibility & Parking
        'location', 'loc', 'spot', 'area', 'zone', 'position',
        'parking', 'park', 'carpark', 'valet', 'lot','jauh', 'dekat',
        'waze', 'map', 'maps', 'direction', 'find', 'locate', 'accessible',
        'traffic', 'jam', 'station', 'lrt', 'mrt', 'transport', 'strategic'
    ],

    'AMBIENCE': [
        # Interior & Vibes
        'environment', 'ambience', 'atmosphere','decorate', 'relaxed','vibes'
        'vibe', 'decor', 'decoration', 'interior', 'aesthetic', 'view', 'scenery',
        'comfortable', 'selesa', 'cozy', 'spacious', 'luas', 'sempit', 'cramped',
        'relaxing', 'chill', 'instagram', 'instagrammable', 'music', 'happening',
        'chillax',

        # Comfort & Facilities
        'seat', 'seating', 'table', 'chair', 'toilet', 'washroom', 'aircon',
        'air conditioning', 'fan', 'ventilation', 'hot', 'panas', 'cold', 'sejuk',
        'warm', 'stuffy', 'noise', 'noisy', 'bising', 'loud', 'quiet', 'senyap',
        'crowd', 'crowded', 'sesak', 'packed', 'busy','space',

         # Cleanliness & Safety
        'clean', 'dirty', 'bersih', 'kotor', 'tidy', 'messy', 'spotless',
        'filthy', 'hygiene', 'sanitary', 'grimy', 'dusty', 'smell', 'stink',

        # Pests & Illness
        'fly', 'flies', 'lalat', 'cockroach', 'lipas', 'roach', 'rat', 'tikus',
        'insect', 'bug', 'poisoning', 'sick', 'stomach', 'diarrhea', 'vomit',
        'hair', 'worm'
    ],

    'HALAL COMPLIANCE': [
        'halal', 'muslim', 'syariah', 'zabihah', 'prayer', 'surau', 'solat',
        'mosque', 'wudhu', 'muslimah', 'jakim', 'bersih', 'suci', 'JAKIM',
        'muslim-friendly', 'pork-free'
    ],

    'NON-HALAL ELEMENTS': [
        'pork', 'lard', 'babi', 'alcohol', 'beer', 'wine', 'liquor', 'stout',
        'draught', 'pint', 'cocktail', 'pub', 'bar', 'char siew', 'siew yoke', 'pour'
    ],

    'AUTHENTICITY & LOCAL VIBE':[
        'authentic', 'traditional', 'asli', 'original', 'local', 'typical',
        'kampung', 'fusion', 'modern', 'style', 'muhibbah', 'mamak', 'nyonya',
        'penang', 'ipoh', 'heritage', 'classic'
    ],

    'LOYALTY (RETURN INTENT)': [
        'come', 'coming', 'return', 'visit', 'repeat', 'recommend', 'suggestion', 'recommended'
        'choice', 'option', 'second', 'again', 'definitely','unacceptable',
        'sure', 'always', 'regular', 'back', 'must try', 'disappointed', 'favourite', 'repeatable'
    ]
}

# Flatten dictionary for faster lookup
KEYWORD_TO_ASPECT = {word: aspect for aspect, keywords in ASPECT_DICT.items() for word in keywords}

# Get a clean list of unique keywords
unique_keywords = [k for k in KEYWORD_TO_ASPECT.keys() if len(k) > 3]

In [5]:
def identify_aspects_fuzzy(segment):
    """
    Maps a text segment to aspects using Exact Match AND Fuzzy Match.
    Includes 'Emoji Protection' to stop warnings.
    """
    found_aspects = set()
    words = segment.split()

    for word in words:
        # CLEANUP: Remove punctuation
        clean_word = word.strip(".,!?").lower()

        if len(clean_word) < 3:
            continue

        # Skip emoji
        # If the word has no letters (a-z), it is likely an emoji or number. Skip it.
        if not re.search('[a-zA-Z]', clean_word):
            continue

        # --- PASS 1: EXACT MATCH  ---
        if clean_word in KEYWORD_TO_ASPECT:
            found_aspects.add(KEYWORD_TO_ASPECT[clean_word])
            continue

        # --- PASS 2: FUZZY MATCH  ---
        if len(clean_word) > 4:
            match, score = process.extractOne(clean_word, unique_keywords)

            if score >= 85:
                found_aspects.add(KEYWORD_TO_ASPECT[match])

    if not found_aspects:
        return ['GENERAL']

    return list(found_aspects)

In [6]:
"""
Evaluates the Aspect Dictionary's performance on the Gold Standard Dataset.
This follows a Fuzzy Matching + Exact Matching pipeline and calculates:
- Precision: How many predicted aspects were correct?
- Recall: How many true aspects were captured?
- F1-Score: Harmonic mean of Precision and Recall
- Accuracy: Exact match accuracy (per segment)
"""

import ast
from sklearn.metrics import classification_report, multilabel_confusion_matrix
import numpy as np

print("=" * 60)
print("ASPECT DICTIONARY EVALUATION ON GOLD STANDARD")
print("=" * 60)

# ==========================================
# STEP 1: PREDICT ASPECTS FOR EACH SEGMENT
# ==========================================
print("\n[1/4] Running Aspect Extraction with Fuzzy Matching...")

predictions = []
true_labels = []

for index, row in df_gold.iterrows():
    segment_text = row['Segment']
    
    # Parse the Manual_Aspect column (it's stored as a string representation of a list)
    try:
        true_aspects = ast.literal_eval(row['Manual_Aspect'])
        if not isinstance(true_aspects, list):
            true_aspects = [true_aspects]
    except:
        true_aspects = []
    
    # Predict aspects using our fuzzy matching function
    predicted_aspects = identify_aspects_fuzzy(segment_text)
    
    # Remove 'GENERAL' from predictions for fair comparison
    predicted_aspects = [a for a in predicted_aspects if a != 'GENERAL']
    
    predictions.append(predicted_aspects)
    true_labels.append(true_aspects)

print(f"✓ Processed {len(df_gold)} segments")

# ==========================================
# STEP 2: CALCULATE MULTILABEL METRICS
# ==========================================
print("\n[2/4] Calculating Multilabel Classification Metrics...")

# Get all unique aspects from both predictions and true labels
all_aspects = sorted(list(set(
    [aspect for pred in predictions for aspect in pred] +
    [aspect for true in true_labels for aspect in true]
)))

print(f"✓ Found {len(all_aspects)} unique aspects: {all_aspects}")

# Convert to binary multilabel format
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(classes=all_aspects)
y_true_binary = mlb.fit_transform(true_labels)
y_pred_binary = mlb.transform(predictions)

# Calculate metrics per aspect
print("\n[3/4] Per-Aspect Performance:")
print("-" * 60)

aspect_metrics = []
for i, aspect in enumerate(all_aspects):
    y_true_aspect = y_true_binary[:, i]
    y_pred_aspect = y_pred_binary[:, i]
    
    # Calculate metrics
    tp = np.sum((y_true_aspect == 1) & (y_pred_aspect == 1))
    fp = np.sum((y_true_aspect == 0) & (y_pred_aspect == 1))
    fn = np.sum((y_true_aspect == 1) & (y_pred_aspect == 0))
    tn = np.sum((y_true_aspect == 0) & (y_pred_aspect == 0))
    
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    support = np.sum(y_true_aspect)
    
    aspect_metrics.append({
        'Aspect': aspect,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Support': support
    })
    
    print(f"{aspect:30s} | P: {precision:.3f} | R: {recall:.3f} | F1: {f1:.3f} | Support: {support}")

# ==========================================
# STEP 3: CALCULATE OVERALL METRICS
# ==========================================
print("\n[4/4] Overall Performance Metrics:")
print("-" * 60)

# Micro-averaged metrics (treats each label equally)
tp_total = np.sum((y_true_binary == 1) & (y_pred_binary == 1))
fp_total = np.sum((y_true_binary == 0) & (y_pred_binary == 1))
fn_total = np.sum((y_true_binary == 1) & (y_pred_binary == 0))

micro_precision = tp_total / (tp_total + fp_total) if (tp_total + fp_total) > 0 else 0
micro_recall = tp_total / (tp_total + fn_total) if (tp_total + fn_total) > 0 else 0
micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall) if (micro_precision + micro_recall) > 0 else 0

print(f"Micro-Avg Precision: {micro_precision:.4f}")
print(f"Micro-Avg Recall:    {micro_recall:.4f}")
print(f"Micro-Avg F1-Score:  {micro_f1:.4f}")

# Macro-averaged metrics (average across aspects)
macro_precision = np.mean([m['Precision'] for m in aspect_metrics])
macro_recall = np.mean([m['Recall'] for m in aspect_metrics])
macro_f1 = np.mean([m['F1-Score'] for m in aspect_metrics])

print(f"\nMacro-Avg Precision: {macro_precision:.4f}")
print(f"Macro-Avg Recall:    {macro_recall:.4f}")
print(f"Macro-Avg F1-Score:  {macro_f1:.4f}")

# Exact match accuracy (how many segments had all aspects correctly predicted)
exact_matches = sum([set(pred) == set(true) for pred, true in zip(predictions, true_labels)])
exact_match_accuracy = exact_matches / len(predictions)

print(f"\nExact Match Accuracy: {exact_match_accuracy:.4f} ({exact_matches}/{len(predictions)} segments)")

# ==========================================
# STEP 4: CREATE RESULTS DATAFRAME
# ==========================================
df_results = pd.DataFrame(aspect_metrics)

print("\n" + "=" * 60)
print("EVALUATION COMPLETE ✓")
print("=" * 60)
print(f"\nResults stored in: df_results")
print(f"Binary labels stored in: y_true_binary, y_pred_binary")
print(f"Raw predictions stored in: predictions, true_labels")

ASPECT DICTIONARY EVALUATION ON GOLD STANDARD

[1/4] Running Aspect Extraction with Fuzzy Matching...
✓ Processed 799 segments

[2/4] Calculating Multilabel Classification Metrics...
✓ Found 9 unique aspects: ['AMBIENCE', 'AUTHENTICITY & LOCAL VIBE', 'FOOD', 'HALAL COMPLIANCE', 'LOCATION', 'LOYALTY (RETURN INTENT)', 'NON-HALAL ELEMENTS', 'SERVICE', 'VALUE']

[3/4] Per-Aspect Performance:
------------------------------------------------------------
AMBIENCE                       | P: 0.802 | R: 0.904 | F1: 0.850 | Support: 94
AUTHENTICITY & LOCAL VIBE      | P: 0.769 | R: 0.833 | F1: 0.800 | Support: 24
FOOD                           | P: 0.859 | R: 0.955 | F1: 0.904 | Support: 375
HALAL COMPLIANCE               | P: 1.000 | R: 1.000 | F1: 1.000 | Support: 2
LOCATION                       | P: 0.475 | R: 0.864 | F1: 0.613 | Support: 22
LOYALTY (RETURN INTENT)        | P: 0.727 | R: 0.903 | F1: 0.805 | Support: 103
NON-HALAL ELEMENTS             | P: 0.909 | R: 0.952 | F1: 0.930 | Suppor

In [9]:
"""
Filter and analyze segments where predicted aspects differ from manual annotations.
This helps identify weaknesses in the aspect dictionary.
"""

print("=" * 60)
print("MISMATCH ANALYSIS: Finding Dictionary Weaknesses")
print("=" * 60)

# ==========================================
# CREATE DETAILED COMPARISON DATAFRAME
# ==========================================
mismatches = []

for i, (pred, true) in enumerate(zip(predictions, true_labels)):
    pred_set = set(pred)
    true_set = set(true)
    
    # Check if there's any difference
    if pred_set != true_set:
        segment_text = df_gold.iloc[i]['Segment']
        
        # Calculate what's missing and what's extra
        missing_aspects = true_set - pred_set  # Should have predicted but didn't
        extra_aspects = pred_set - true_set    # Predicted but shouldn't have
        
        mismatches.append({
            'Segment_ID': i,
            'Segment_Text': segment_text,
            'Manual_Aspects': sorted(list(true_set)),
            'Predicted_Aspects': sorted(list(pred_set)),
            'Missing_Aspects': sorted(list(missing_aspects)),
            'Extra_Aspects': sorted(list(extra_aspects)),
            'Mismatch_Type': 'Both' if (missing_aspects and extra_aspects) else ('Missing' if missing_aspects else 'Extra')
        })

df_mismatches = pd.DataFrame(mismatches)

print(f"\n✓ Found {len(df_mismatches)} mismatched segments out of {len(df_gold)} total")
print(f"  → Match Rate: {(1 - len(df_mismatches)/len(df_gold))*100:.2f}%")

# ==========================================
# SUMMARY STATISTICS
# ==========================================
print("\n" + "-" * 60)
print("MISMATCH BREAKDOWN:")
print("-" * 60)

# Count by mismatch type
mismatch_counts = df_mismatches['Mismatch_Type'].value_counts()
print("\nBy Type:")
for mtype, count in mismatch_counts.items():
    print(f"  {mtype:10s}: {count:4d} segments ({count/len(df_mismatches)*100:.1f}%)")

# Most commonly missed aspects
all_missing = [asp for row in df_mismatches['Missing_Aspects'] for asp in row if asp]
if all_missing:
    print("\nMost Commonly MISSED Aspects (False Negatives):")
    missing_counts = pd.Series(all_missing).value_counts()
    for aspect, count in missing_counts.head(5).items():
        print(f"  {aspect:30s}: {count:4d} times")

# Most commonly over-predicted aspects
all_extra = [asp for row in df_mismatches['Extra_Aspects'] for asp in row if asp]
if all_extra:
    print("\nMost Commonly OVER-PREDICTED Aspects (False Positives):")
    extra_counts = pd.Series(all_extra).value_counts()
    for aspect, count in extra_counts.head(5).items():
        print(f"  {aspect:30s}: {count:4d} times")

MISMATCH ANALYSIS: Finding Dictionary Weaknesses

✓ Found 195 mismatched segments out of 799 total
  → Match Rate: 75.59%

------------------------------------------------------------
MISMATCH BREAKDOWN:
------------------------------------------------------------

By Type:
  Extra     :  131 segments (67.2%)
  Both      :   38 segments (19.5%)
  Missing   :   26 segments (13.3%)

Most Commonly MISSED Aspects (False Negatives):
  SERVICE                       :   18 times
  FOOD                          :   17 times
  LOYALTY (RETURN INTENT)       :   10 times
  AMBIENCE                      :    9 times
  AUTHENTICITY & LOCAL VIBE     :    4 times

Most Commonly OVER-PREDICTED Aspects (False Positives):
  FOOD                          :   59 times
  LOYALTY (RETURN INTENT)       :   35 times
  SERVICE                       :   27 times
  LOCATION                      :   21 times
  VALUE                         :   21 times


In [8]:
# Display the dataframe for interactive exploration
df_mismatches.to_csv(r'C:\Users\Ong Hui Ling\Dropbox\PC\Documents\Github\Aspect-Based-Sentiment-Analysis\Dataset\Mismatched_Aspect_Gold_Standard.csv')