In [None]:
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from datasets import load_dataset
import spacy
import spacy.displacy

# Dataset

In [47]:
df = load_dataset("tomaarsen/setfit-absa-semeval-restaurants")
df

DatasetDict({
    train: Dataset({
        features: ['text', 'span', 'label', 'ordinal'],
        num_rows: 3693
    })
    test: Dataset({
        features: ['text', 'span', 'label', 'ordinal'],
        num_rows: 1134
    })
})

In [48]:
train_df = df['train']
train_df

Dataset({
    features: ['text', 'span', 'label', 'ordinal'],
    num_rows: 3693
})

In [49]:
unique_labels = set(train_df['label'])
print(f"Number of unique labels: {len(unique_labels)}")
print(f"Unique labels: {unique_labels}")

Number of unique labels: 4
Unique labels: {'conflict', 'neutral', 'negative', 'positive'}


In [None]:
train_df = train_df.filter(lambda x: x['label'] != 'conflict')
train_df

Dataset({
    features: ['text', 'span', 'label', 'ordinal'],
    num_rows: 3602
})

In [51]:
print(f"Number of unique labels: {len(unique_labels)}")
unique_labels = set(train_df['label'])
print(f"Unique labels: {unique_labels}")

Number of unique labels: 4
Unique labels: {'neutral', 'negative', 'positive'}


# Compare Pretrained Model

In [52]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
device_index = 0 if device == 'cuda' else -1

Using device: cuda


In [74]:
def evaluate_model(model_name, train_df, device_index=0):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device_index)
    
    correct = 0
    total = len(train_df)

    for i in tqdm(range(total)):
        text = train_df[i]['text']
        label = train_df[i]['label']
        aspect = train_df[i]['span']
        
        result = classifier(text, text_pair=aspect)[0]
        
        sentiment = result.get("label")

        if label.lower() == sentiment.lower():
            correct += 1

    accuracy = correct / total
    return accuracy

## The model that we choose

In [75]:
model_name = "yangheng/deberta-v3-base-absa-v1.1"
accuracy = evaluate_model(model_name, train_df)
print(f"Accuracy: {accuracy * 100:.2f}%")

100%|██████████| 3602/3602 [01:17<00:00, 46.36it/s]


Accuracy: 86.51%


In [57]:
model_name = "yangheng/deberta-v3-large-absa-v1.1"
accuracy = evaluate_model(model_name, train_df)
print(f"Accuracy: {accuracy * 100:.2f}%")

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

100%|██████████| 3602/3602 [02:21<00:00, 25.44it/s]

Accuracy: 86.17%





In [None]:
nlp = spacy.load("en_core_web_sm")

def visualize_dependencies(text):
    doc = nlp(text)
    spacy.displacy.render(doc, style="dep", jupyter=True)

def extract_dependency_context(text, aspect):
    doc = nlp(text)
    aspect_modifiers = []

    for token in doc:
        if aspect.lower() in token.text.lower():
            for child in token.children:
                if child.pos_ == "ADJ" and child.dep_ in ['amod', 'acomp']:
                    aspect_modifiers.append(child.text)

            if token.head.pos_ == "ADJ" and token.head.dep_ in ['acomp', 'attr', 'ROOT']:
                aspect_modifiers.append(token.head.text)

    if not aspect_modifiers:
        for token in doc:
            if token.pos_ == "ADJ":
                return token.text

    return " ".join(aspect_modifiers) if aspect_modifiers else None


def evaluate_model_with_dependency(model_name, train_df, device_index=0):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device_index)
    
    correct = 0
    total = len(train_df)

    # Iterate over the dataset
    for i in tqdm(range(total)):
        text = train_df[i]['text']
        label = train_df[i]['label']
        aspect = train_df[i]['span']
    
        aspect_context = extract_dependency_context(text, aspect)

        # if aspect_context:
        #     print(f"Dependency Graph for Aspect Context: '{aspect_context}'")
            # visualize_dependencies(aspect_context)
        
        if aspect_context:
            result = classifier(aspect_context)[0]
            sentiment = result.get("label")
        else:
            sentiment = "neutral"
        
        if label.lower() == sentiment.lower():
            correct += 1

    accuracy = correct / total
    return accuracy

In [82]:
model_name = "yangheng/deberta-v3-base-absa-v1.1"
accuracy = evaluate_model_with_dependency(model_name, train_df)
print(f"Accuracy: {accuracy * 100:.2f}%")

100%|██████████| 3602/3602 [01:23<00:00, 43.36it/s]


Accuracy: 60.02%


In [84]:
model_name = "yangheng/deberta-v3-large-absa-v1.1"
accuracy = evaluate_model_with_dependency(model_name, train_df)
print(f"Accuracy: {accuracy * 100:.2f}%")

100%|██████████| 3602/3602 [02:20<00:00, 25.67it/s]


Accuracy: 58.05%


# Experiment

In [85]:
df = pd.read_csv('/kaggle/input/restaurant-review/temporary_result.csv')
df

Unnamed: 0.1,Unnamed: 0,text,category,topic_food,score_food,topic_place,score_place,topic_price,score_price,topic_service,score_service,topic_count,main_topics,primary_topic,primary_score
0,0,"To be completely fair, the only redeeming fact...",food,1,0.223695,0,0.000000,0,0.0,0,0.000000,1,food,food,0.223695
1,1,"The food is uniformly exceptional, with a very...",food,1,0.515413,1,0.378481,0,0.0,0,0.000000,2,"food, place",food,0.515413
2,2,"Not only was the food outstanding, but the lit...",food,1,0.330244,0,0.000000,0,0.0,0,0.000000,1,food,food,0.330244
3,3,It is very overpriced and not very tasty.,food,1,0.371381,0,0.000000,0,0.0,0,0.000000,1,food,food,0.371381
4,4,Our agreed favorite is the orrechiete with sau...,food,1,0.424373,1,0.321227,0,0.0,0,0.000000,2,"food, place",food,0.424373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,2195,"Warm, comfortable surroundings, nice appointme...",place,1,0.324377,1,0.594371,0,0.0,1,0.418131,3,"food, place, service",place,0.594371
2196,2196,"This is such a lovely, peaceful place to eat o...",place,1,0.330050,1,0.373284,0,0.0,0,0.000000,2,"food, place",place,0.373284
2197,2197,"This is a great place to take out-of-towners, ...",place,0,0.000000,1,0.386226,0,0.0,0,0.000000,1,place,place,0.386226
2198,2198,"You will pay a lot for the decore, but the foo...",place,1,0.322529,1,0.377474,0,0.0,0,0.000000,2,"food, place",place,0.377474


In [86]:
model_name = "yangheng/deberta-v3-base-absa-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device_index)



In [87]:
def extract_aspect_sentiments(dataset, aspects, max_length=50):
    for row in tqdm(dataset, desc="Processing rows", unit="row"):
        text = row['text']
        for aspect in aspects:
            # Check if the aspect needs to be analyzed
            if row[f'topic_{aspect}'] == 1:
                result = classifier(text, text_pair=aspect)[0]
                row[f'sentiment_{aspect}'] = result.get("label")
            else:
                row[f'sentiment_{aspect}'] = "Not Found"
    return dataset

In [88]:
aspects = ["food", "place", "price", "service"]

In [89]:
updated_data = extract_aspect_sentiments(df.to_dict(orient='records'), aspects)
df_updated = pd.DataFrame(updated_data)

Processing rows: 100%|██████████| 2200/2200 [01:04<00:00, 33.86row/s]


In [90]:
df_updated

Unnamed: 0.1,Unnamed: 0,text,category,topic_food,score_food,topic_place,score_place,topic_price,score_price,topic_service,score_service,topic_count,main_topics,primary_topic,primary_score,sentiment_food,sentiment_place,sentiment_price,sentiment_service
0,0,"To be completely fair, the only redeeming fact...",food,1,0.223695,0,0.000000,0,0.0,0,0.000000,1,food,food,0.223695,Positive,Not Found,Not Found,Not Found
1,1,"The food is uniformly exceptional, with a very...",food,1,0.515413,1,0.378481,0,0.0,0,0.000000,2,"food, place",food,0.515413,Positive,Neutral,Not Found,Not Found
2,2,"Not only was the food outstanding, but the lit...",food,1,0.330244,0,0.000000,0,0.0,0,0.000000,1,food,food,0.330244,Positive,Not Found,Not Found,Not Found
3,3,It is very overpriced and not very tasty.,food,1,0.371381,0,0.000000,0,0.0,0,0.000000,1,food,food,0.371381,Negative,Not Found,Not Found,Not Found
4,4,Our agreed favorite is the orrechiete with sau...,food,1,0.424373,1,0.321227,0,0.0,0,0.000000,2,"food, place",food,0.424373,Neutral,Neutral,Not Found,Not Found
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,2195,"Warm, comfortable surroundings, nice appointme...",place,1,0.324377,1,0.594371,0,0.0,1,0.418131,3,"food, place, service",place,0.594371,Neutral,Positive,Not Found,Positive
2196,2196,"This is such a lovely, peaceful place to eat o...",place,1,0.330050,1,0.373284,0,0.0,0,0.000000,2,"food, place",place,0.373284,Neutral,Positive,Not Found,Not Found
2197,2197,"This is a great place to take out-of-towners, ...",place,0,0.000000,1,0.386226,0,0.0,0,0.000000,1,place,place,0.386226,Not Found,Positive,Not Found,Not Found
2198,2198,"You will pay a lot for the decore, but the foo...",place,1,0.322529,1,0.377474,0,0.0,0,0.000000,2,"food, place",place,0.377474,Negative,Neutral,Not Found,Not Found


In [91]:
output_path = "updated_aspect_sentiments.csv"
df_updated.to_csv(output_path, index=False)
print(f"Updated aspect-based sentiment results saved to {output_path}")

Updated aspect-based sentiment results saved to updated_aspect_sentiments.csv
