# Feedback Aspect-Based Sentiment Analysis Pipeline 

This notebook combines preprocessing, aspect extraction, and sentiment analysis for Vietnamese customer feedback data.

## 1. Setup and Imports

In [19]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
import warnings
import re
import emoji
import json
import torch
import time
import openai
from collections import Counter
from transformers import RobertaForSequenceClassification, AutoTokenizer
from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')
nltk.download('punkt', quiet=True)

# Configure OpenAI settings for local LM Studio
openai.api_base = 'http://localhost:1234/v1'
openai.api_key = ''

# Load PhoBERT model for sentiment analysis
model = RobertaForSequenceClassification.from_pretrained("wonrax/phobert-base-vietnamese-sentiment")
tokenizer = AutoTokenizer.from_pretrained("wonrax/phobert-base-vietnamese-sentiment", use_fast=False)

## 2. Text Preprocessing Functions 


In [20]:
def remove_urls(text):
    """Remove URLs from text.
    
    Args:
        text (str): Input text containing URLs
        
    Returns:
        str: Text with URLs removed
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_html_tags(text):
    """Remove HTML tags from text.
    
    Args:
        text (str): Input text containing HTML tags
        
    Returns:
        str: Clean text without HTML tags
    """
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

def remove_special_characters(text):
    """Remove special characters while preserving Vietnamese diacritics.
    
    Args:
        text (str): Input text with special characters
        
    Returns:
        str: Text with only alphanumeric and Vietnamese characters
    """
    pattern = re.compile(r'[^a-zA-Z0-9\s\u00C0-\u1EF9.,!?]')
    return pattern.sub(r'', text)

def remove_extra_spaces(text):
    """Remove redundant whitespace.
    
    Args:
        text (str): Input text with extra spaces
        
    Returns:
        str: Text with normalized spacing
    """
    return re.sub(r'\s+', ' ', text).strip()

def remove_hashtags(text):
    """Remove hashtags from text.
    
    Args:
        text (str): Input text containing hashtags
        
    Returns:
        str: Text with hashtags removed
    """
    return re.sub(r'#\w+', '', text)

def remove_phone_numbers(text):
    """Replace phone numbers with [PHONE] token.
    
    Args:
        text (str): Input text containing phone numbers
        
    Returns:
        str: Text with phone numbers masked
    """
    return re.sub(r'\b(?:\+?84|0)(?:\d{9,10})\b', '[PHONE]', text)

def is_meaningful(text, min_length=2, max_length=200, max_repetition_ratio=0.5, max_consonant_streak=5):
    """Check if text is meaningful based on various criteria.
    
    Args:
        text (str): Input text to evaluate
        min_length (int): Minimum character length
        max_length (int): Maximum character length 
        max_repetition_ratio (float): Maximum ratio of most common character
        max_consonant_streak (int): Maximum consecutive consonants allowed
        
    Returns:
        bool: True if text is meaningful, False otherwise
    """
    cleaned_text = re.sub(r'[^\w\s]', '', text.lower())
    
    if len(cleaned_text) < min_length or len(cleaned_text) > max_length:
        return False
    
    if re.search(r'[bcdfghjklmnpqrstvwxyz]{' + str(max_consonant_streak) + ',}', cleaned_text):
        return False
    
    char_counts = Counter(cleaned_text)
    most_common_char_count = char_counts.most_common(1)[0][1]
    repetition_ratio = most_common_char_count / len(cleaned_text)
    
    if repetition_ratio > max_repetition_ratio:
        return False
    
    for pattern_length in range(2, 6):
        for i in range(len(cleaned_text) - pattern_length * 2):
            pattern = cleaned_text[i:i+pattern_length]
            if pattern == cleaned_text[i+pattern_length:i+pattern_length*2]:
                return False
    
    return True

def clean_text(text):
    """Apply all cleaning operations to text.
    
    Args:
        text (str): Raw input text
        
    Returns:
        str: Fully cleaned text
    """
    text = remove_urls(text)
    text = remove_html_tags(text)
    text = remove_hashtags(text)
    text = remove_phone_numbers(text)
    text = remove_special_characters(text)
    text = remove_extra_spaces(text)
    text = text.lower()
    text = emoji.demojize(text)
    return text


## 3. LM Studio API Integration Functions

In [21]:
def get_completion(prompt, max_retries=3):
    """Get completion from local LM Studio API.
    
    Args:
        prompt (str): Input prompt for the model
        max_retries (int): Number of retries on failure
        
    Returns:
        str: Model completion response or None on failure
    """
    for attempt in range(max_retries):
        try:
            response = openai.ChatCompletion.create(
                model="local model",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1
            )
            return response.choices[0].message["content"]
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {str(e)}")
            if attempt < max_retries - 1:
                time.sleep(2)
    return None

def clean_and_validate_response(response):
    """Clean and validate JSON response from LM Studio.
    
    Args:
        response (str): Raw response from model
        
    Returns:
        dict: Parsed JSON response or None if invalid
    """
    if not response:
        return None
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        try:
            json_start = response.find('{')
            json_end = response.rfind('}') + 1
            if json_start >= 0 and json_end > json_start:
                return json.loads(response[json_start:json_end])
        except:
            pass
    return None

## 4. Sentiment Analysis Functions


In [22]:
def analyze_sentiment(text):
    """Analyze sentiment of text using PhoBERT model.
    
    Args:
        text (str): Input text for sentiment analysis
        
    Returns:
        dict: Sentiment probabilities for negative, positive, neutral
    """
    input_ids = torch.tensor([tokenizer.encode(text)])
    with torch.no_grad():
        out = model(input_ids)
        probs = out.logits.softmax(dim=-1).tolist()[0]
    return {
        "Negative": probs[0],
        "Positive": probs[1],
        "Neutral": probs[2]
    }

def get_dominant_sentiment(sentiment_dict):
    """Get the dominant sentiment from probability distribution.
    
    Args:
        sentiment_dict (dict): Sentiment probabilities
        
    Returns:
        str: Dominant sentiment category
    """
    return max(sentiment_dict, key=sentiment_dict.get)

def sentiment_to_polarity(sentiment):
    """Convert sentiment category to polarity label.
    
    Args:
        sentiment (str): Sentiment category
        
    Returns:
        str: Polarity label (NEG, POS, NEU)
    """
    mapping = {
        "negative": "NEG",
        "positive": "POS",
        "neutral": "NEU"
    }
    return mapping.get(sentiment.lower(), "NEU")


## 5. Main Processing Functions

In [23]:
def process_feedback(feedback):
    """Process feedback by extracting and analyzing aspects.
    
    Args:
        feedback (dict): Feedback entry with content and aspects
        
    Returns:
        list: Processed aspects with sentiment analysis
    """
    content = feedback['Content']
    results = []
    
    for aspect in feedback['Aspects']:
        if len(aspect['AspectTerms']) == 0:
            results.append({
                'AspectCategory': aspect['AspectCategory'],
                'AspectTerms': [],
                'Polarity': None,
                'DominantScore': None
            })
        else:
            for term in aspect['AspectTerms']:
                start = max(0, content.find(term) - 20)
                end = min(len(content), content.find(term) + len(term) + 20)
                context = content[start:end]
                
                sentiment = analyze_sentiment(context)
                dominant_sentiment = get_dominant_sentiment(sentiment)
                dominant_score = round(sentiment[dominant_sentiment], 3)
                
                results.append({
                    'AspectCategory': aspect['AspectCategory'],
                    'AspectTerms': term,
                    'Polarity': sentiment_to_polarity(dominant_sentiment),
                    'DominantScore': dominant_score
                })
    
    return results

def process_single_feedback(feedback):
    """Process single feedback entry through LM Studio.
    
    Args:
        feedback (dict): Feedback entry
        
    Returns:
        dict: Processed feedback with aspects or None
    """
    prompt = f'''### Instruction:
Analyze this feedback and extract aspects. Return ONLY a JSON object:
{json.dumps(feedback, ensure_ascii=False)}

Required format:
{{
    "GeneralFeedbackID": {feedback["GeneralFeedbackID"]},
    "ID": {feedback["ID"]},
    "Content": "{feedback["Content"]}",
    "Aspects": [
        {{
            "AspectCategory": "Về Sản Phẩm",
            "AspectTerms": []
        }},
        {{
            "AspectCategory": "Về Dịch Vụ", 
            "AspectTerms": []
        }}
    ]
}}
### Response:'''
    
    response = get_completion(prompt)
    if response:
        return clean_and_validate_response(response)
    return None


## 6. Main Pipeline Execution

In [None]:
# Load and preprocess data
print("\n=== Loading and Preprocessing Data ===")
df = pd.read_csv('warehouse/feedback/CustomerSatisfaction.csv', encoding='utf-8-sig')
initial_count = len(df)
print(f'[INFO] Loaded {initial_count:,} initial records')

# Basic preprocessing
df = df[['GeneralFeedbackID', 'Content']].drop_duplicates(subset='GeneralFeedbackID', keep='first')
dedup_count = len(df)
print(f'[INFO] After deduplication: {dedup_count:,} records ({initial_count - dedup_count:,} duplicates removed)')

df['Content'] = df['Content'].astype(str)
print('[INFO] Converted content to string type')

# Clean and tokenize
print("\n=== Cleaning and Filtering ===")
print('[INFO] Applying text cleaning...')
df['Content'] = df['Content'].apply(clean_text)
df['is_meaningful'] = df['Content'].apply(is_meaningful)
df = df[df['is_meaningful']]
df = df.drop(columns=['is_meaningful'])
df = df[df['Content'].str.strip().astype(bool)]
final_count = len(df)
print(f'[INFO] After cleaning: {final_count:,} meaningful records')
print(f'[INFO] Removed {dedup_count - final_count:,} non-meaningful entries')

# Process through LM Studio
print("\n=== Processing through LM Studio ===")
processed_results = []
success_count = 0
failed_count = 0
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing feedback"):
    feedback = {
        'GeneralFeedbackID': row['GeneralFeedbackID'],
        'ID': idx,
        'Content': row['Content']
    }
    processed = process_single_feedback(feedback)
    if processed:
        processed_results.append(processed)
        success_count += 1
    else:
        failed_count += 1

print(f'[INFO] Successfully processed {success_count:,} entries')
print(f'[INFO] Failed to process {failed_count:,} entries')


=== Loading and Preprocessing Data ===
[INFO] Loaded 5,015 initial records
[INFO] After deduplication: 2,858 records (2,157 duplicates removed)
[INFO] Converted content to string type

=== Cleaning and Filtering ===
[INFO] Applying text cleaning...
[INFO] After cleaning: 2,503 meaningful records
[INFO] Removed 355 non-meaningful entries

=== Processing through LM Studio ===


Processing feedback:   0%|          | 0/2503 [00:00<?, ?it/s]

[INFO] Successfully processed 2,502 entries
[INFO] Failed to process 1 entries

=== Performing Sentiment Analysis ===


Analyzing sentiment:   0%|          | 0/2502 [00:00<?, ?it/s]

KeyError: 'AspectTerms'

In [None]:
processed_results

[{'GeneralFeedbackID': 14766294,
  'ID': 0,
  'Content': 'hàng nguyên seal, chất lượng ổn hơn các dòng android tablet khác. dùng để lướt web, xem phim tốt.',
  'Aspects': [{'AspectCategory': 'Về Sản Phẩm',
    'AspectTerms': ['nguyên seal', 'chất lượng ổn']},
   {'AspectCategory': 'Về Dịch Vụ', 'AspectTerms': []}]},
 {'GeneralFeedbackID': 19380399,
  'ID': 1,
  'Content': 'giá tốt, hàng xịn, tiki luôn là lựa chọn số 1 của tôi!',
  'Aspects': [{'AspectCategory': 'Về Sản Phẩm',
    'AspectTerms': ['giá tốt', 'hàng xịn']},
   {'AspectCategory': 'Về Dịch Vụ', 'AspectTerms': []}]},
 {'GeneralFeedbackID': 15188338,
  'ID': 2,
  'Content': 'máy chính hãng nguyên seal, kiểm tra mã imei thì chưa kích hoạt, giá tốt. hàng được xử lí nhanh, đóng gói kỹ lưỡng chắc chắn. giao đúng hẹn.',
  'Aspects': [{'AspectCategory': 'Về Sản Phẩm',
    'AspectTerms': ['chính hãng', 'nguyên seal', 'giá tốt']},
   {'AspectCategory': 'Về Dịch Vụ',
    'AspectTerms': ['xử lí nhanh', 'đóng gói kỹ lưỡng', 'giao đúng hẹ

In [None]:
print("\n=== Performing Sentiment Analysis ===")
final_results = []
pos_count = 0
neg_count = 0
neu_count = 0

for feedback in tqdm(processed_results, desc="Analyzing sentiment"):
    content = feedback['Content']
    results = []
    
    for aspect in feedback['Aspects']:
        if not aspect.get('AspectTerms'):
            results.append({
                "AspectCategory": aspect['AspectCategory'],
                "AspectTerms": [],
                "Polarity": None,
                "DominantScore": None
            })
        else:
            terms = aspect['AspectTerms']
            if isinstance(terms, str):
                terms = [terms]
                
            for term in terms:
                start = max(0, content.find(term) - 20)
                end = min(len(content), content.find(term) + len(term) + 20)
                context = content[start:end]
                
                sentiment = analyze_sentiment(context)
                dominant_sentiment = get_dominant_sentiment(sentiment)
                dominant_score = round(sentiment[dominant_sentiment], 3)
                polarity = sentiment_to_polarity(dominant_sentiment)
                
                if polarity == "POS":
                    pos_count += 1
                elif polarity == "NEG":
                    neg_count += 1
                else:
                    neu_count += 1
                    
                results.append({
                    "AspectCategory": aspect['AspectCategory'],
                    "AspectTerms": term,
                    "Polarity": polarity,
                    "DominantScore": dominant_score
                })

    processed_feedback = {
        "GeneralFeedbackID": feedback['GeneralFeedbackID'],
        "ID": feedback['ID'],
        "Content": content,
        "Aspects": results
    }
    final_results.append(processed_feedback)

print(f'[INFO] Sentiment Distribution:')
print(f'      Positive: {pos_count:,} aspects')
print(f'      Negative: {neg_count:,} aspects') 
print(f'      Neutral: {neu_count:,} aspects')
print(f'[INFO] Completed sentiment analysis for {len(final_results):,} entries')

# Save results
print("\n=== Saving Results ===")
with open('warehouse/feedback/ABSA_CustomerSatisfaction.json', 'w', encoding='utf-8-sig') as f:
   json.dump(final_results, f, ensure_ascii=False, indent=2)
print(f'[SUCCESS] Saved {len(final_results):,} processed entries to ABSA_CustomerSatisfaction.json')
print("\n=== Pipeline Complete ===")


=== Performing Sentiment Analysis ===


Analyzing sentiment:   0%|          | 0/2502 [00:00<?, ?it/s]

[INFO] Sentiment Distribution:
      Positive: 3,811 aspects
      Negative: 785 aspects
      Neutral: 726 aspects
[INFO] Completed sentiment analysis for 2,502 entries

=== Saving Results ===
[SUCCESS] Saved 2,502 processed entries to ABSA_CustomerSatisfaction.json

=== Pipeline Complete ===
