# Sentiment Analysis

This notebook generates sentiment analysis results and saves them in UC for downstream use.

# Load Data

In [0]:
# Standard library imports
import textwrap

# Third-party imports
import pandas as pd
import mlflow
from transformers import pipeline

# Local application imports
import de_utils as dut

In [0]:
# Read the whole source table through the notebook's Spark session, then
# materialise it as a pandas DataFrame for row-wise processing below.
source_query = "SELECT * FROM bupa_call_synthetic_dataset"
df_spark = spark.sql(source_query)
df = df_spark.toPandas()

# Sentiment Analysis

In [0]:
# Build the Hugging Face sentiment-analysis pipeline once at module level so
# the model is loaded a single time and reused for every text that is scored.
sentiment_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
sentiment_analyzer = pipeline(
    task="sentiment-analysis",
    model=sentiment_model_name,
    framework="pt",
)

def summarize_sentiment(text, chunk_size=500):
    """Score the sentiment of *text* chunk by chunk and summarise the result.

    The text is wrapped into chunks of at most ``chunk_size`` characters,
    each chunk is classified with the module-level ``sentiment_analyzer``
    pipeline, and the per-chunk scores are aggregated.

    Each chunk score is signed: the pipeline's ``score`` is the confidence
    of the predicted ``label``, so it is negated when the label is
    ``NEGATIVE``. Without the sign, a confidently negative chunk and a
    confidently positive chunk would produce the same number.

    Args:
        text: The text to analyse. Anything that is not a ``str`` (for
            example ``None`` or NaN from a missing pandas value) is treated
            as empty text.
        chunk_size: Maximum number of characters per chunk, passed to
            ``textwrap.wrap`` as the width.

    Returns:
        A dict with keys ``'average_score'`` (mean of the signed chunk
        scores, ``0.0`` when there are no chunks), ``'first_chunk_score'``
        and ``'last_chunk_score'`` (signed score of the first/last chunk,
        ``None`` when there are no chunks).
    """
    # Missing values coming from pandas are None/NaN, which textwrap cannot
    # wrap; treat every non-string input as having no content.
    chunks = textwrap.wrap(text, chunk_size) if isinstance(text, str) else []

    # Empty or whitespace-only text yields no chunks: nothing to classify.
    if not chunks:
        return {
            'average_score': 0.0,
            'first_chunk_score': None,
            'last_chunk_score': None
        }

    # Classify all chunks in one call; the pipeline returns one
    # {'label': ..., 'score': ...} dict per input chunk, in order.
    results = sentiment_analyzer(chunks)

    # Fold the predicted label into the score so polarity is preserved.
    scores = [
        -result['score'] if result['label'].upper() == 'NEGATIVE'
        else result['score']
        for result in results
    ]

    return {
        'average_score': sum(scores) / len(scores),
        'first_chunk_score': scores[0],
        'last_chunk_score': scores[-1]
    }

In [0]:
# Score every call summary and attach the three sentiment columns to df.
# The column names are passed explicitly to the DataFrame constructor so the
# result does not depend on dict key order and still has all three columns
# when df has no rows (an empty list would otherwise give a zero-column frame
# and the multi-column assignment would fail).
score_columns = ['average_score', 'first_chunk_score', 'last_chunk_score']
sentiment_scores = df['Summary'].apply(summarize_sentiment)
df[score_columns] = pd.DataFrame(
    sentiment_scores.tolist(), index=df.index, columns=score_columns
)

In [0]:
# Persist the DataFrame, now including the sentiment score columns, under a
# new table name via the project helper in de_utils.
# NOTE(review): "UC" presumably means Unity Catalog; the helper's write mode
# (overwrite vs. append) is not visible here, so confirm against de_utils.
table_name = "bupa_call_synthetic_dataset_with_sentiment_score"
dut.write_data_to_bricks_catalog(df, table_name)