# 01-09 : Aspect-Based Sentiment Analysis (LLM)

In [None]:
import os
import pandas as pd
import json
from time import sleep
from typing import List, Dict, Tuple
from pprint import pprint
from tqdm.notebook import tqdm

from dotenv import load_dotenv, find_dotenv
import openai
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

In [None]:
# read local .env file
# find_dotenv() locates the file and load_dotenv() is expected to export its
# variables into os.environ; the return value is discarded via `_`
_ = load_dotenv(find_dotenv())
# hand the key to the openai module; os.environ[...] raises KeyError when
# OPENAI_API_KEY is not defined, so a missing key fails here rather than later
openai.api_key = os.environ['OPENAI_API_KEY']

In [None]:
# enable tqdm's pandas integration
# NOTE(review): no progress_* call is visible in this notebook, so this may be unnecessary — confirm
tqdm.pandas()

## Data Load

In [None]:
# load the human feedback dataset
# NOTE(review): the relative path presumably assumes the notebook is run from its own folder — confirm
df_source = pd.read_parquet('../../data/interim/01-06_human_classified.parquet')

# show the data loaded
# prints (rows, columns) and renders the first three rows; `display` is not
# imported here, so it is expected to come from the notebook environment
print(df_source.shape)
display(df_source.head(3))

## Aspects

In [None]:
# taxonomy for the analysis: each key is an entity and its list holds the
# aspects that can be rated for that entity.
# The dict is serialized to JSON and placed in the prompt, so the wording and
# the order of the entries reach the LLM exactly as written here.
entities_aspects = {
    "Billing & Payments": [
        "Billing accuracy",
        "Payment methods",
        "Refunds/credits",
        "Hidden charges",
        "Monthly costs"
    ],
    "Network & Connectivity": [
        "Signal strength",
        "Network coverage",
        "Data speeds (4G, 5G, etc.)",
        "Call quality",
        "Dropped calls",
        "Roaming"
    ],
    "Customer Service": [
        "Responsiveness",
        "Friendliness/professionalism",
        "Knowledge/competence",
        "Resolution time",
        "Availability (e.g., 24/7 support)"
    ],
    "Chatbots": [
        "User-friendliness",
        "Response accuracy",
        "Speed of response",
        "Ability to understand query",
        "Escalation to human agents"
    ],
    "Account & Plans": [
        "Account management (online portal/apps)",
        "Plan flexibility",
        "Plan pricing",
        "Upgrade/downgrade process",
        "Promotions and offers"
    ],
    "Hardware/Devices": [
        "Setup/ease of installation",
        "Device reliability",
        "Device performance/speed",
        "Rental vs. purchase options",
        "Technical issues"
    ],
    "Value-added Services": [
        "Quality of service",
        "Pricing/value for money",
        "Reliability",
        "Content variety (for streaming)",
        "Ease of use"
    ]
}

## Prompt Template

In [None]:
# get the entities and aspects as a string
# (pretty-printed JSON, 4-space indentation, used to fill the prompt placeholder)
entities_aspects_str = json.dumps(entities_aspects, indent=4)

In [None]:
# an example of the output the llm should produce
# shape: entity -> {aspect: polarity}; only a subset of entities and aspects
# appears, matching the instruction to answer only with what is in the text
output_example = {
    "Customer Service": {
        "Responsiveness": "Negative",
        "Friendliness/professionalism": "Negative",
        "Knowledge/competence": "Negative",
        "Availability (e.g., 24/7 support)": "Negative"
    },
    "Chatbots": {
        "User-friendliness": "Neutral",
        "Response accuracy": "Negative",
        "Ability to understand query": "Negative"
    }
}

# JSON text of the example, used to fill the {output_example} placeholder
output_example_str = json.dumps(output_example, indent=4)

In [None]:
# set the template string
# placeholders to be filled: {entities_aspects}, {text} and {output_example}
# the backslash right after the opening quotes keeps the string from starting
# with an empty line
template_string = """\
Given the following entities and aspects per entity:

```json
{entities_aspects}
```

Please perform Aspect Based Sentiment Analysis on the following text:

```text
{text}
```

Only the JSON output is expected without any "```json" text surrounding it. Do not answer with anything except JSON. Only respond with entities and aspects present in the text.

Output example:

```json
{output_example}
```
"""

In [None]:
# create the prompt template
prompt_template = ChatPromptTemplate.from_template(template_string)
# bare expression on the last line so the notebook echoes the template object
prompt_template

### Test the template

In [None]:
# get the complaint text
# the row with index label 1 serves as the sample (.loc selects by label)
review_title = df_source.loc[1, 'review_title']
review_content = df_source.loc[1, 'review_content']
# title as a markdown-style heading, a blank line, then the review body
complaint_text = f'# {review_title}\n\n{review_content}'

pprint(complaint_text)

In [None]:
# create the prompt
# fills the three placeholders of the template with the sample review
prompt = prompt_template.format_messages(
    entities_aspects=entities_aspects_str,
    text=complaint_text,
    output_example=output_example_str
)

# uncomment to inspect the rendered text of the first message
#print(prompt[0].content)

## Functions

In [None]:
def _strip_code_fence(content:str) -> str:
    """Return the content without a surrounding markdown code fence, if any."""
    stripped = content.strip()
    if not stripped.startswith('```'):
        return stripped

    # drop the opening fence line, which may carry a language tag (e.g. ```json)
    _, _, body = stripped.partition('\n')
    body = body.rstrip()

    # drop the closing fence when present
    if body.endswith('```'):
        body = body[:-3]

    return body.strip()


def classify_aspects(text:str,
                     entities_aspects:str,
                     output_example:str,
                     prompt_template:ChatPromptTemplate) -> Dict:
    """Classify the aspects of a given text and return an output dictionary.

    Args:
        text: the text to analyse.
        entities_aspects: JSON string with the entities and their aspects.
        output_example: JSON string showing the expected answer format.
        prompt_template: template exposing the `entities_aspects`, `text`
            and `output_example` placeholders.

    Returns:
        The LLM answer parsed from JSON. Following the output example, this is
        expected to be a mapping of entity -> {aspect: polarity}.

    Raises:
        json.JSONDecodeError: when the answer is not valid JSON, even after an
            optional surrounding code fence has been removed.
    """
    # temperature 0.0 keeps the answers as repeatable as possible
    # NOTE(review): an answer cut off by the 512-token cap would presumably
    # not parse as JSON — confirm the cap is large enough for long reviews
    chat = ChatOpenAI(
        temperature=0.0,
        max_tokens=512,
        model='gpt-4')

    # create the prompt
    prompt = prompt_template.format_messages(
        entities_aspects=entities_aspects,
        text=text,
        output_example=output_example
    )

    # get the llm response
    response = chat(prompt)

    # the prompt asks for bare JSON, but the model may still wrap the answer
    # in a ```json fence; remove it so json.loads does not fail on the fence
    return json.loads(_strip_code_fence(response.content))

# test the function
# runs one live LLM request against the sample complaint built earlier
aspects = classify_aspects(
    text=complaint_text,
    entities_aspects=entities_aspects_str,
    output_example=output_example_str,
    prompt_template=prompt_template)
pprint(aspects)  

In [None]:
def flatten_aspects(id:str, classification:Dict) -> List[Dict]:
    """Flatten the aspects to be compatible with a dataframe and add the id as a column.

    Args:
        id: identifier of the classified text, copied into every record.
            The name shadows the builtin but is kept because callers pass it
            by keyword.
        classification: mapping of entity -> {aspect: polarity}.

    Returns:
        A list with one dict per (entity, aspect) pair, holding the keys
        'id', 'entity', 'aspect' and 'polarity', in the iteration order of
        the input. The list is empty when there is nothing to flatten.
    """
    return [
        {
            'id': id,
            'entity': entity,
            'aspect': aspect,
            'polarity': polarity,
        }
        for entity, aspects in classification.items()
        for aspect, polarity in aspects.items()
    ]

# test the function
# reuses the `aspects` result of the classify_aspects test with a dummy id
flatten = flatten_aspects("12", aspects)
display(pd.DataFrame(flatten))
#pprint(flatten)

In [None]:
def classify_dataset(
        data:pd.DataFrame,
        entities_aspects:str,
        output_example:str,
        prompt_template:ChatPromptTemplate,
        id_column:str='id',
        text_column:str='text') -> pd.DataFrame:
    """Classify the entire dataset using the given entities and aspects.

    Args:
        data: dataframe holding at least `id_column` and `text_column`.
        entities_aspects: JSON string with the entities and their aspects.
        output_example: JSON string showing the expected answer format.
        prompt_template: template passed through to `classify_aspects`.
        id_column: name of the column with the row identifier.
        text_column: name of the column with the text to classify.

    Returns:
        A dataframe built from the flattened classifications of all rows,
        i.e. one row per (id, entity, aspect) with its polarity.

    Raises:
        KeyError: when `id_column` or `text_column` is not a column of `data`.
        Any exception raised by `classify_aspects` is propagated unchanged.
    """
    result = []

    # iterate only the two needed columns; this avoids building a Series for
    # every row, as iterrows() does, and the unused index that comes with it
    rows = zip(data[id_column], data[text_column])

    # `row_id` rather than `id` so the builtin is not shadowed inside the loop
    for row_id, text in tqdm(rows, total=len(data)):
        # classify the text
        classification = classify_aspects(
            text=text,
            entities_aspects=entities_aspects,
            output_example=output_example,
            prompt_template=prompt_template)

        result.extend(
            flatten_aspects(id=row_id, classification=classification)
        )

    return pd.DataFrame(result)

# test the function
# take an explicit copy: head() returns a slice of df_source, and adding a
# column to such a slice can trigger pandas' SettingWithCopyWarning
df_test = df_source.head(2).copy()
# build the text from the two sampled rows only
df_test['text'] = '# ' + df_test.review_title + '\n\n' + df_test.review_content

df_result = classify_dataset(
    data=df_test,
    entities_aspects=entities_aspects_str,
    output_example=output_example_str,
    prompt_template=prompt_template,
    id_column='id',
    text_column='text')

display(df_result)

## Classification

In [None]:
# add the text column
# "# title", blank line, body; this adds the column to df_source itself
df_source['text'] = '# ' + df_source.review_title + '\n\n' + df_source.review_content

# perform the classification
# classify_dataset calls classify_aspects once per row of df_source
df_result = classify_dataset(
    data=df_source,
    entities_aspects=entities_aspects_str,
    output_example=output_example_str,
    prompt_template=prompt_template,
    id_column='id',
    text_column='text')

# save the results
# nothing is written before this line, so an error during the loop above
# leaves no partial output on disk
df_result.to_parquet('../../data/interim/01-09_absa.parquet')