In [1]:
import xml.etree.ElementTree as ET
import torch
from transformers import RobertaForSequenceClassification, AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re


In [None]:
# Load the model and tokenizer
# Both objects are created from the same checkpoint identifier, so it is
# spelled once here and reused for the two from_pretrained() calls.
MODEL_NAME = "wonrax/phobert-base-vietnamese-sentiment"

model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)

In [2]:
def load_xml_data(file_path):
    """Read an XML file, strip illegal control characters, and parse it.

    The file is decoded as UTF-8 (a leading byte-order mark is tolerated via
    the ``utf-8-sig`` codec). Control characters are removed before parsing,
    but tab, line feed and carriage return are kept: they are legal XML
    whitespace and may be the only separator between attributes or words.

    Args:
        file_path: Path of the XML file to load.

    Returns:
        The root element of the parsed document, or ``None`` when the
        cleaned content is not well-formed XML. In the latter case a
        diagnostic with the offending line is printed.

    Raises:
        OSError: If the file cannot be opened or read (not caught here).
    """
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        content = file.read()

    # Remove invisible C0/C1 control characters, except \t (\x09), \n (\x0A)
    # and \r (\x0D). Stripping those would merge adjacent tokens and collapse
    # the document into a single line, making the diagnostics below useless.
    content = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]', '', content)

    try:
        # Parse the cleaned content
        return ET.fromstring(content)
    except ET.ParseError as e:
        line_no, column = e.position
        print(f"XML Parse Error: {e}")
        print(f"Error occurred at line {line_no}, column {column}")

        # Print the problematic line. Line numbers are 1-based while the
        # column is a 0-based offset, so the caret is indented by exactly
        # `column` characters under the (un-stripped) line.
        lines = content.split('\n')
        if 1 <= line_no <= len(lines):
            print(f"Problematic line: {lines[line_no - 1].rstrip()}")
            print(f"                  {' ' * column}^")
        return None

def analyze_sentiment(text):
    """Score a piece of text with the module-level sentiment model.

    The text is encoded with the global ``tokenizer``, wrapped into a batch
    of one, and passed through the global ``model`` with gradient tracking
    disabled. The logits are turned into probabilities with a softmax over
    the last dimension.

    Args:
        text: The string to score.

    Returns:
        A dict with the keys "negative", "positive" and "neutral", holding
        the probabilities found at output indices 0, 1 and 2 respectively.
    """
    token_batch = torch.tensor([tokenizer.encode(text)])

    with torch.no_grad():
        logits = model(token_batch).logits
        # Only one sequence was submitted, so keep the first (only) row.
        scores = logits.softmax(dim=-1).tolist()[0]

    negative, positive, neutral = scores[0], scores[1], scores[2]
    return {"negative": negative, "positive": positive, "neutral": neutral}

def get_dominant_sentiment(sentiment_dict):
    """Return the label with the highest score.

    Args:
        sentiment_dict: Mapping of sentiment label to score.

    Returns:
        The key whose value is largest. When several labels share the top
        score, the one that appears first in the mapping is returned.

    Raises:
        ValueError: If the mapping is empty.
    """
    best_label, _best_score = max(sentiment_dict.items(), key=lambda pair: pair[1])
    return best_label

def train_classifier(xml_root):
    """Build TF-IDF centroids for the product and the service category.

    Every ``sentence`` element below ``xml_root`` contributes the ``term``
    attributes of its ``aspectTerms`` children to the product list when the
    sentence carries the category "Về Sản Phẩm", and to the service list
    when it carries "Về Dịch Vụ" (a sentence with both contributes to both).
    A TF-IDF vectorizer is fitted on all collected terms and each category
    is summarised by the mean of its TF-IDF rows.

    Sentences without an ``aspectTerms`` or ``aspectCategories`` element and
    terms without a ``term`` attribute are skipped instead of aborting the
    whole run.

    Args:
        xml_root: Root element of the labelled XML document.

    Returns:
        A tuple ``(vectorizer, product_vector, service_vector)``. Each
        vector is a 1-D array with one entry per vocabulary item. A category
        with no terms gets an all-zero vector rather than NaN.

    Raises:
        ValueError: If no labelled aspect term is found at all.
    """
    product_terms = []
    service_terms = []

    for sentence in xml_root.findall('.//sentence'):
        terms_node = sentence.find('aspectTerms')
        categories_node = sentence.find('aspectCategories')
        # Without both containers the sentence carries no usable label.
        if terms_node is None or categories_node is None:
            continue

        terms = [
            term.get('term')
            for term in terms_node
            if term.get('term') is not None
        ]
        categories = {cat.get('category') for cat in categories_node}

        if "Về Sản Phẩm" in categories:
            product_terms.extend(terms)
        if "Về Dịch Vụ" in categories:
            service_terms.extend(terms)

    all_terms = product_terms + service_terms
    if not all_terms:
        raise ValueError(
            "No aspect terms labelled 'Về Sản Phẩm' or 'Về Dịch Vụ' were found"
        )

    vectorizer = TfidfVectorizer()
    tfidf_rows = vectorizer.fit_transform(all_terms).toarray()
    product_count = len(product_terms)

    def centroid(rows):
        # The mean over zero rows is NaN, which would poison every later
        # similarity computation; use a zero vector for an empty category.
        if rows.shape[0] == 0:
            return np.zeros(rows.shape[1])
        return np.mean(rows, axis=0)

    # Product rows come first because of the concatenation order above.
    product_vector = centroid(tfidf_rows[:product_count])
    service_vector = centroid(tfidf_rows[product_count:])

    return vectorizer, product_vector, service_vector

def classify_term(term, vectorizer, product_vector, service_vector):
    """Assign an aspect term to the product or the service category.

    The term is transformed with ``vectorizer`` and compared, by cosine
    similarity, against each of the two category vectors.

    Args:
        term: The aspect term string to classify.
        vectorizer: Fitted vectorizer used to encode the term.
        product_vector: 1-D reference vector of the product category.
        service_vector: 1-D reference vector of the service category.

    Returns:
        "Về Sản Phẩm" when the product similarity is strictly greater than
        the service similarity, otherwise "Về Dịch Vụ".
    """
    encoded_term = vectorizer.transform([term]).toarray()

    def similarity_to(reference):
        # cosine_similarity works on 2-D inputs, so present the reference
        # as a single row and read the lone cell of the result.
        return cosine_similarity(encoded_term, reference.reshape(1, -1))[0][0]

    product_score = similarity_to(product_vector)
    service_score = similarity_to(service_vector)

    # A tie falls through to the service category.
    return "Về Sản Phẩm" if product_score > service_score else "Về Dịch Vụ"

def process_sentence(sentence, vectorizer, product_vector, service_vector,
                     context_window=20):
    """Annotate the aspect terms of one sentence element in place.

    For every child of ``aspectTerms`` that has a ``term`` attribute:

    * a context snippet is cut from the sentence text (the term plus up to
      ``context_window`` characters on each side) and scored with
      ``analyze_sentiment``;
    * the winning label and its score are stored in the ``polarity`` and
      ``dominant_score`` attributes;
    * the term is classified with ``classify_term`` and the result stored
      in the ``category`` attribute.

    Afterwards the children of ``aspectCategories`` are replaced by one
    ``aspectCategory`` per distinct category assigned above, in order of
    first appearance.

    Args:
        sentence: The ``sentence`` element to annotate (modified in place).
        vectorizer: Fitted vectorizer forwarded to ``classify_term``.
        product_vector: Product reference vector for ``classify_term``.
        service_vector: Service reference vector for ``classify_term``.
        context_window: Number of characters kept on each side of the term.

    Returns:
        None. A sentence without an ``aspectTerms`` element is left
        untouched.
    """
    text_node = sentence.find('text')
    # A missing or empty <text> element yields None; treat it as "".
    text = (text_node.text if text_node is not None else None) or ''

    aspect_terms = sentence.find('aspectTerms')
    if aspect_terms is None:
        return

    aspect_categories = sentence.find('aspectCategories')
    if aspect_categories is None:
        # Create the container so the derived categories have a home.
        aspect_categories = ET.SubElement(sentence, 'aspectCategories')

    assigned_categories = []
    for term in aspect_terms:
        term_text = term.get('term')
        if term_text is None:
            continue

        position = text.find(term_text)
        if position >= 0:
            start = max(0, position - context_window)
            end = min(len(text), position + len(term_text) + context_window)
            context = text[start:end]
        else:
            # str.find() returns -1 when the term is absent; using it as an
            # index would select an unrelated prefix of the sentence. Score
            # the whole sentence (or the bare term if there is no text).
            context = text or term_text

        sentiment = analyze_sentiment(context)
        dominant_sentiment = get_dominant_sentiment(sentiment)
        dominant_score = sentiment[dominant_sentiment]

        term.set('polarity', dominant_sentiment)
        term.set('dominant_score', f"{dominant_score:.4f}")

        category = classify_term(term_text, vectorizer, product_vector, service_vector)
        term.set('category', category)
        # Ordered de-duplication keeps the written XML reproducible; a set
        # would make the order depend on string hash randomization.
        if category not in assigned_categories:
            assigned_categories.append(category)

    # Drop only the old child elements. Element.clear() would additionally
    # wipe the container's own attributes, text and tail.
    for stale in list(aspect_categories):
        aspect_categories.remove(stale)
    for category in assigned_categories:
        ET.SubElement(aspect_categories, 'aspectCategory', {'category': category})

In [3]:
# Load and process the XML data
xml_root = load_xml_data('manual_label.xml')

if xml_root is None:
    print("Failed to load XML. Please fix the XML file and try again.")
else:
    # Train the classifier
    vectorizer, product_vector, service_vector = train_classifier(xml_root)

    # Process sentences (each sentence element is annotated in place)
    for sentence in xml_root.findall('.//sentence'):
        process_sentence(sentence, vectorizer, product_vector, service_vector)

    # Write the processed XML to a file.
    # The encoding name is copied verbatim into the XML declaration, so it
    # must be a real XML encoding name: 'utf-8-sig' is a Python codec alias
    # and would produce a declaration other XML parsers may not recognise.
    tree = ET.ElementTree(xml_root)
    tree.write('processed.xml', encoding='utf-8', xml_declaration=True)

    print("Processing complete. Results saved in 'processed.xml'.")



Processing complete. Results saved in 'processed.xml'.
