<a href="https://colab.research.google.com/github/JJingLu/CBS5055-Generative-Artificial-Intelligence-for-Innovative-Communications/blob/main/Workshop_4_Generative_AI_for_Advanced_Text_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Workshop 4: Generative AI for Advanced Text Analysis CBS5055

**Instructor: Jessie Lu**  

Welcome to Workshop 4!  

In today’s session, you will learn LoRA (Low-Rank Adaptation), an efficient fine-tuning technique that adapts a pre-trained model to your data while training only a small number of additional parameters. You will get hands-on experience applying the fine-tuned model to real data for emotion and sentiment analysis, and you will learn how to interpret the results.



The dataset we will explore today is **go_emotions**, originally created by Google Research and published on Hugging Face. It contains approximately 58,000 Reddit comments, each annotated by human raters with one or more of 28 labels (27 fine-grained emotion categories plus "neutral"). This dataset is widely used in emotion detection, affective computing, and social media analysis research.  

You can view and explore the dataset directly here:  
*https://huggingface.co/datasets/google-research-datasets/go_emotions*

In [None]:
#@title 1. Install Required Libraries
# Each requirement is quoted on purpose: the line is executed by a shell, and an
# unquoted `>` would be parsed as output redirection (installing the package
# without the version bound and writing a file named e.g. `=4.30.0`).
!pip install -q \
    "transformers>=4.30.0" \
    "datasets>=2.12.0" \
    "peft>=0.4.0" \
    "tqdm>=4.65.0" \
    "scikit-learn>=1.2.2" \
    "torch>=2.0.0" \
    "matplotlib>=3.7.0" \
    "seaborn>=0.12.0"

In [None]:
#@title 2. Import Libraries
from datasets import load_dataset # Import function to load datasets
from transformers import (
    DistilBertTokenizer, # Import tokenizer for DistilBERT
    DistilBertForSequenceClassification, # Import DistilBERT model for sequence classification
    get_linear_schedule_with_warmup # Import learning rate scheduler
)
from torch.optim import AdamW # Import AdamW optimizer
from peft import get_peft_model, LoraConfig, TaskType # Import PEFT utilities for LoRA
from tqdm.auto import tqdm # Import tqdm for progress bars
import torch # Import PyTorch library
from torch.nn.utils import clip_grad_norm_ # Import gradient clipping utility
from sklearn.metrics import accuracy_score # Import accuracy metric from scikit-learn
from torch.utils.data import DataLoader # Import DataLoader for batching data
import os # Import os module for interacting with the operating system
from pathlib import Path # Import Path for object-oriented filesystem paths
import matplotlib.pyplot as plt # Import matplotlib for plotting
import seaborn as sns # Import seaborn for enhanced data visualizations

In [None]:
#@title 3. Directory Setup
# Everything the notebook persists lives under one root folder.
SAVE_DIR = Path("saved_data")
MODEL_DIR = SAVE_DIR / "model"  # model artefacts
DATASET_DIR = SAVE_DIR / "dataset"  # raw dataset copies
TOKENIZED_DIR = SAVE_DIR / "tokenized_dataset"  # tokenized dataset copies
LORA_DIR = MODEL_DIR / "trained_LoRA"  # trained LoRA adapters

# Create each folder up front; exist_ok makes re-running this cell harmless.
for _folder in (SAVE_DIR, MODEL_DIR, DATASET_DIR, TOKENIZED_DIR, LORA_DIR):
    os.makedirs(_folder, exist_ok=True)

In [None]:
#@title 4. Load Dataset
from datasets import load_from_disk  # reader that matches `save_to_disk` below

dataset_path = DATASET_DIR / "go_emotions_simplified"  # local copy of the simplified go_emotions dataset

# The copy on disk is written with `save_to_disk`, so it has to be read back with
# `load_from_disk`; `load_dataset(..., cache_dir=...)` would ignore the saved copy
# and download the dataset again.
dataset = None
if os.path.exists(dataset_path):
    print("Loading cached dataset...")
    try:
        dataset = load_from_disk(str(dataset_path))
    except FileNotFoundError:
        # The folder exists but does not hold a complete saved dataset.
        print("Cached dataset is incomplete, downloading again...")

if dataset is None:
    print("Downloading dataset...")
    dataset = load_dataset("go_emotions", "simplified")
    dataset.save_to_disk(str(dataset_path))

# Number of distinct label ids that occur in the training split.
num_labels = len({label for example in dataset['train'] for label in example['labels']})
print(f"Number of emotion labels: {num_labels}")

# Emotion names indexed by label id. The order must match the dataset's own
# label ids, where the 27 emotions come first (alphabetically) and "neutral" is
# the LAST entry (id 27) - it is not sorted in among the other names.
EMOTIONS = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring",
    "confusion", "curiosity", "desire", "disappointment", "disapproval",
    "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
    "joy", "love", "nervousness", "optimism", "pride", "realization",
    "relief", "remorse", "sadness", "surprise", "neutral"
]

In [None]:
#@title 5. Initialize Model and Tokenizer
from peft import PeftModel, PeftConfig # Import PeftModel and PeftConfig

# Initialize tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Location where a previously trained LoRA adapter may have been saved
model_path = LORA_DIR / "distilbert_lora_go_emotions"

# Only treat the folder as a saved adapter when the adapter config is actually
# there; an empty or partially written folder falls back to a fresh model.
adapter_available = (model_path / "adapter_config.json").exists()
if adapter_available:
    print("Loading saved LoRA adapter...")
else:
    print("Initializing new model without LoRA...")

# Both paths start from the same pre-trained DistilBERT classifier.
base_model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,  # one output logit per emotion label
    problem_type="multi_label_classification"  # explicitly set for multi-label
)

if adapter_available:
    # is_trainable=True keeps the adapter weights trainable; by default PEFT
    # loads adapters frozen, which would prevent the training cell from
    # continuing to fine-tune them.
    model = PeftModel.from_pretrained(base_model, str(model_path), is_trainable=True)
else:
    model = base_model


In [None]:
#@title 6. Configure and Apply LoRA
from peft import PeftModel  # needed to detect a model that already carries an adapter

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, # Define the task type as Sequence Classification
    r=8, # Set the LoRA attention dimension (rank)
    lora_alpha=32, # Set the scaling factor for LoRA weights
    lora_dropout=0.1, # Set the dropout probability for LoRA layers
    bias="none", # Specify that no bias will be trained
    target_modules=["q_lin", "k_lin", "v_lin", "out_lin"] # Specify the modules to apply LoRA to
)

if isinstance(model, PeftModel):
    # `model` is already wrapped (a saved adapter was loaded, or this cell was
    # run before). Wrapping it again would stack a second PEFT wrapper on top
    # of the first, so the existing adapter is reused instead.
    print("Model already has a LoRA adapter; reusing it instead of wrapping again.")
    # A loaded adapter may be frozen; re-enable gradients so training can run.
    # NOTE(review): relies on PEFT naming its parameters with "lora_" /
    # "modules_to_save" - confirm against the installed PEFT version.
    for param_name, param in model.named_parameters():
        if "lora_" in param_name or "modules_to_save" in param_name:
            param.requires_grad = True
else:
    model = get_peft_model(model, lora_config) # Apply the LoRA configuration to the model

model.print_trainable_parameters() # Print the number of trainable parameters after applying LoRA

In [None]:
#@title 7. Data Preprocessing
def tokenize_function(examples):
    """Tokenize a batch of texts and attach multi-hot label vectors.

    Args:
        examples: Batch mapping with a 'text' entry (the texts to tokenize)
            and a 'labels' entry (one list of label ids per text).

    Returns:
        The tokenizer output for the batch, with 'labels' set to one list of
        ``num_labels`` floats per example: 1.0 at every listed label id,
        0.0 everywhere else.
    """
    encoded = tokenizer(
        examples['text'],
        padding='max_length',  # every sequence is padded to max_length
        truncation=True,  # longer sequences are cut to max_length
        max_length=64,
        return_tensors=None  # keep plain Python lists
    )

    def _multi_hot(label_ids):
        # Ids outside [0, num_labels) are skipped rather than raising.
        row = [0.0] * num_labels
        for label_id in label_ids:
            if not 0 <= label_id < num_labels:
                continue
            row[label_id] = 1.0
        return row

    encoded['labels'] = [_multi_hot(label_ids) for label_ids in examples['labels']]
    return encoded

from datasets import load_from_disk  # reader that matches `save_to_disk` below

batch_size = 64 # Define the batch size for data loaders


def _tokenized_cache_is_usable(cached):
    """Return True when the cached splits have the columns and label width used below."""
    required_columns = {"input_ids", "attention_mask", "labels"}
    try:
        train_split = cached["train"]
        return (
            "validation" in cached
            and required_columns.issubset(train_split.column_names)
            and len(train_split[0]["labels"]) == num_labels
        )
    except (KeyError, IndexError, TypeError):
        return False


# Tokenize dataset
tokenized_path = TOKENIZED_DIR / "tokenized_dataset" # Define the path for saving tokenized dataset
tokenized_dataset = None
if os.path.exists(tokenized_path):
    print("Loading cached tokenized dataset...")
    try:
        # Read back what `save_to_disk` wrote so the cache is actually reused.
        tokenized_dataset = load_from_disk(str(tokenized_path))
    except FileNotFoundError:
        tokenized_dataset = None
    # A copy saved in a different format (e.g. other label encoding) is
    # rejected here and rebuilt below.
    if tokenized_dataset is not None and not _tokenized_cache_is_usable(tokenized_dataset):
        tokenized_dataset = None
    if tokenized_dataset is None:
        print("Cached tokenized dataset is missing or outdated, rebuilding it...")

if tokenized_dataset is None:
    print("Tokenizing dataset...")
    tokenized_dataset = dataset.map(
        tokenize_function, # Apply the tokenization function
        batched=True, # Process examples in batches
        batch_size=batch_size * 4, # Set batch size for map function
        remove_columns=dataset["train"].column_names, # Remove original text and labels columns
        num_proc=1 # Use a single process to prevent potential hangs
    )
    tokenized_dataset.save_to_disk(str(tokenized_path)) # Save the tokenized dataset to disk

# Set format for PyTorch
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Create DataLoaders (only the training data is shuffled)
train_loader = DataLoader(tokenized_dataset["train"], batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(tokenized_dataset["validation"], batch_size=batch_size)

In [None]:
#@title 8. Train the Model
# Setup
import torch.cuda.amp as amp # Import Automatic Mixed Precision utilities

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") # Use the GPU when available, otherwise the CPU
print(f"Using device: {device}") # Confirm which device is being used

# Mixed precision is only used on CUDA; the gradient scaler and autocast share
# one flag so they are always enabled or disabled together.
use_amp = device.type == 'cuda'
scaler = amp.GradScaler(enabled=use_amp)

model.to(device) # Move the model to the selected device
optimizer = AdamW(model.parameters(), lr=4e-5) # Initialize the AdamW optimizer with a learning rate
max_grad_norm = 1.0 # Define the maximum gradient norm for clipping
num_epochs = 3 # Set the number of training epochs

num_training_steps = len(train_loader) * num_epochs # Total number of optimizer steps
num_warmup_steps = num_training_steps // 10 # First 10% of the steps are warmup
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

# Training loop
for epoch in range(num_epochs):
    model.train() # Set the model to training mode
    epoch_loss = 0.0 # Sum of the batch losses seen so far in this epoch
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    # `step` counts the batches explicitly: tqdm only refreshes its own counter
    # (`progress_bar.n`) periodically, so using it as the divisor can report an
    # inflated running average.
    for step, batch in enumerate(progress_bar, start=1):
        batch = {k: v.to(device) for k, v in batch.items()} # Move batch tensors to the device
        # Explicitly cast labels to float32 for BCEWithLogitsLoss
        batch['labels'] = batch['labels'].to(torch.float32)
        optimizer.zero_grad() # Reset gradients before the forward pass

        with amp.autocast(enabled=use_amp):
            outputs = model(**batch) # Forward pass
            loss = outputs.loss # Loss computed by the model from the labels

        scaler.scale(loss).backward() # Scale loss and perform backpropagation
        scaler.unscale_(optimizer) # Unscale gradients so clipping sees their true magnitude
        clip_grad_norm_(model.parameters(), max_grad_norm) # Clip gradients to prevent exploding gradients
        scaler.step(optimizer) # Update model parameters
        scaler.update() # Update the scale for the next iteration
        scheduler.step() # Update the learning rate scheduler

        epoch_loss += loss.item() # Accumulate the loss for the epoch
        progress_bar.set_postfix({"loss": f"{epoch_loss / step:.4f}"}) # Running average loss

    avg_loss = epoch_loss / len(train_loader) # Average loss for the epoch
    print(f"Epoch {epoch+1} Average Loss: {avg_loss:.4f}")

In [None]:
#@title 9. Evaluate the Model
# Before running this cell, please ensure that cells 5, 6, and 8 have been executed successfully.

from sklearn.metrics import accuracy_score # Import accuracy metric from scikit-learn
import torch # Ensure torch is imported for tensor operations

# Put the model on the evaluation device and switch it to evaluation mode.
model.to(device)
model.eval()

all_predictions = [] # One binary prediction vector per validation example
all_labels = [] # One true label vector per validation example

with torch.no_grad(): # No gradients are needed for inference
    for eval_batch in tqdm(eval_loader, desc="Evaluating"):
        eval_batch = {name: tensor.to(device) for name, tensor in eval_batch.items()}
        # Labels are passed to the model as float32 (needed by BCEWithLogitsLoss).
        eval_batch['labels'] = eval_batch['labels'].to(torch.float32)
        eval_logits = model(**eval_batch).logits

        # Multi-label decision: a label is predicted when its sigmoid
        # probability is above 0.5.
        batch_predictions = (torch.sigmoid(eval_logits) > 0.5).int()

        all_predictions.extend(batch_predictions.cpu().numpy())
        all_labels.extend(eval_batch['labels'].cpu().numpy())

# Subset accuracy: an example counts only when every label matches.
accuracy = accuracy_score(all_labels, all_predictions)
print(f"Validation Accuracy: {accuracy:.4f}")

In [None]:
#@title 10. Model Prediction
def predict_sentiment(text, threshold=0.5):
    """Predict the emotions expressed in a text with the current model.

    Args:
        text: The text to classify.
        threshold: Sigmoid probability a label must exceed to be reported.
            Defaults to 0.5.

    Returns:
        A comma-separated string of the predicted emotion names, or "neutral"
        when no label exceeds the threshold.
    """
    # Inference must run in evaluation mode; otherwise dropout stays active
    # when this is called right after training and predictions become random.
    model.eval()

    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=64, return_tensors="pt").to(device)
    with torch.no_grad(): # No gradients are needed for inference
        logits = model(**inputs).logits

    # Multi-label decision: every label is thresholded independently.
    probabilities = torch.sigmoid(logits)[0]
    predicted_emotions = [
        EMOTIONS[idx] for idx, prob in enumerate(probabilities.tolist()) if prob > threshold
    ]

    if not predicted_emotions:
        return "neutral"
    return ", ".join(predicted_emotions)

# Try the classifier on a handful of hand-written sentences
examples = [
    "I'm so excited about this workshop!",
    "This is the worst experience ever.",
    "The meeting is scheduled for 3 PM.",
    "Thank you for your help!",
    "I love spending time with my family.",
]

for text in examples:
    predicted = predict_sentiment(text)
    print(f"Text: {text}")
    print(f"Predicted Emotion: {predicted}\n")

In [None]:
#@title 11. Visualization: Emotion Distribution in Training Data

df = dataset["train"].to_pandas() # Training split as a pandas DataFrame


def _first_emotion(label_ids):
    """Name of the first label of an example, or "neutral" when it has none."""
    if len(label_ids) > 0:
        return EMOTIONS[label_ids[0]]
    return "neutral"


df["emotion"] = df["labels"].apply(_first_emotion)
emotion_counts = df["emotion"].value_counts().iloc[:10] # Ten most frequent emotions

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=emotion_counts.values, y=emotion_counts.index, palette="viridis", ax=ax)
ax.set_title("Top 10 Most Common Emotions in GoEmotions Dataset")
ax.set_xlabel("Number of Comments")
ax.set_ylabel("Emotion")
plt.show()

In [None]:
#@title 12. Save the Model
# The model and its tokenizer are written to the same folder.
lora_output_dir = str(LORA_DIR / "distilbert_lora_go_emotions")
for artifact in (model, tokenizer):
    artifact.save_pretrained(lora_output_dir)
print("Model and tokenizer saved successfully!")

In [None]:
#@title 13. Social media engagement analysis
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from collections import Counter
import re
import warnings
warnings.filterwarnings('ignore')

# ====================== Step 1: Load and Preprocess facebook_engagement_data Dataset ======================
# Load dataset (stable and available without extra permissions)
dataset = load_dataset("Falah/facebook_engagement_data", split="train")
df = dataset.to_pandas()

# 1. Standardize core fields (align analysis logic)
df = df.rename(columns={
    'engagement_reaction_count': 'like_count',  # Likes/Reactions (corresponds to like)
    'engagement_comment_count': 'comment_count',  # Comment count
    'engagement_share_count': 'share_count',  # Share count (corresponds to retweet)
    'url_to_image': 'image_url'  # Image link (to determine if media is present)
})

# 2. Clean the engagement counts BEFORE deriving anything from them: values
#    that cannot be parsed as numbers (or are missing) count as 0. Summing
#    first would turn the whole total into NaN (later 0) as soon as one of
#    the three counts is missing.
for col in ['like_count', 'comment_count', 'share_count']:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
df['total_engagement'] = df['like_count'] + df['comment_count'] + df['share_count']  # Total engagement count

# 3. Construct key features
# Title and body are both filled before concatenation; a missing title would
# otherwise make the combined text NaN and drop the record below.
df['text'] = df['title'].fillna("") + " " + df['content'].fillna("")  # Combine title + body as analysis text
df['has_media'] = df['image_url'].notna()  # Boolean: whether it contains an image (True=has media)

# 4. Only filter out empty text (title and body both empty leaves just the
#    single joining space); short texts are kept.
df = df[df['text'].str.len() > 1]
df = df.reset_index(drop=True)

# Print data volume monitoring
print(f"✅ Dataset loaded: {len(df)} valid records")
print(f"Records with media: {df['has_media'].sum()} | Records without media: {len(df) - df['has_media'].sum()}")
print(f"Average likes: {df['like_count'].mean():.2f} | Average shares: {df['share_count'].mean():.2f}")

# ====================== Step 2: Perform Text Sentiment Annotation with Twitter-specific Sentiment Model ======================
# Checkpoint used for sentiment annotation (adapted for social media text).
# Note: `tokenizer` and `model` are reassigned here, replacing any objects
# bound to those names by earlier cells.
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
config = AutoConfig.from_pretrained(MODEL)  # provides the id -> label mapping
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Text preprocessing (adapted to model requirements: replace @users and links)
def preprocess_text(text):
    """Mask user mentions and links in a text.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; a token that starts with
    'http' becomes 'http'. All other tokens, and the spacing between them,
    are left unchanged.
    """
    def _mask(token):
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

# Masked version of every text, used as the sentiment model input
df['clean_text'] = df['text'].map(preprocess_text)

# Batch sentiment annotation (returns positive/neutral/negative labels)
def get_sentiment(text):
    """Return the sentiment label the model scores highest for a text.

    Args:
        text: The (already preprocessed) text to classify. It is truncated
            to at most 128 tokens.

    Returns:
        The entry of ``config.id2label`` for the highest-scoring class.
    """
    import torch  # local import so this cell also works when run on its own

    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=128)
    # Inference only: without no_grad every call would record an autograd
    # graph that is never used.
    with torch.no_grad():
        logits = model(**encoded_input)[0][0]
    # Softmax does not change which class scores highest, so the top class is
    # taken from the raw logits directly (this also avoids exp() overflow).
    return config.id2label[int(logits.argmax())]

# Annotate only the first 2000 records; work on a copy so `df` stays untouched
sample_df = df.iloc[:2000].copy()
sample_df['sentiment'] = sample_df['clean_text'].map(get_sentiment)

# Show how the annotated sample is distributed over the sentiment labels
sentiment_distribution = sample_df['sentiment'].value_counts()
print(f"\n=== Sentiment Label Distribution ===")
print(sentiment_distribution)
print(f"Number of valid annotated samples: {len(sample_df)}")

# ====================== Step 3: Core Correlation Analysis (Content Features → Engagement) ======================
# 3.1 Mean and median of every engagement metric, per sentiment label
sentiment_metric_cols = ['like_count', 'comment_count', 'share_count', 'total_engagement']
sentiment_engage = (
    sample_df
    .groupby('sentiment')
    .agg({metric: ['mean', 'median'] for metric in sentiment_metric_cols})
    .round(2)
)

print("\n=== Sentiment Labels vs Engagement Metrics ===")
print(sentiment_engage)

# 3.2 Mean engagement for records with and without media
media_metric_cols = ['like_count', 'share_count', 'total_engagement']
media_engage = (
    sample_df
    .groupby('has_media')
    .agg(dict.fromkeys(media_metric_cols, 'mean'))
    .round(2)
)

print("\n=== Has Media vs Average Engagement ===")
print(media_engage)

# 3.3 Correlation Analysis (with sample count validation + error handling)
# Encode categorical variables. Labels are lower-cased before mapping so the
# encoding does not depend on how the checkpoint capitalises its label names;
# a label outside the three known ones becomes NaN and is dropped below.
sample_df['sentiment_code'] = sample_df['sentiment'].str.lower().map({'positive': 2, 'neutral': 1, 'negative': 0})
sample_df['has_media_code'] = sample_df['has_media'].astype(int)

# Filter valid data
valid_data = sample_df.dropna(subset=['sentiment_code', 'total_engagement', 'has_media_code'])
print(f"\n=== Correlation Analysis Preparation ===")
print(f"Number of valid samples for analysis: {len(valid_data)}")


def _correlate_with_engagement(data, feature_col, feature_name):
    """Print and return Pearson (r, p) between a feature and total engagement.

    Returns (nan, nan) when either variable is constant, because the
    correlation is undefined in that case.
    """
    if data[feature_col].nunique() <= 1:
        print(f"⚠️ {feature_name.capitalize()} has only one value, correlation cannot be calculated")
        return np.nan, np.nan
    if data['total_engagement'].nunique() <= 1:
        print("⚠️ Total engagement has only one value, correlation cannot be calculated")
        return np.nan, np.nan
    corr, p_value = pearsonr(data[feature_col], data['total_engagement'])
    print(f"Correlation between {feature_name} and total engagement: {corr:.4f} (p-value: {p_value:.4f})")
    return corr, p_value


# A correlation needs at least two samples.
if len(valid_data) >= 2:
    corr_senti_total, p_senti = _correlate_with_engagement(valid_data, 'sentiment_code', 'sentiment label')
    corr_media_total, p_media = _correlate_with_engagement(valid_data, 'has_media_code', 'media feature')
else:
    print("⚠️ Insufficient valid samples (less than 2), skipping correlation calculation")
    corr_senti_total, p_senti = np.nan, np.nan
    corr_media_total, p_media = np.nan, np.nan

# 3.4 Association between high-frequency words and engagement (text feature level)
def clean_text_for_wordcount(text):
    """Turn a text into the list of words used for frequency counting.

    The text is lower-cased; every '@user', every 'http' and every character
    that is neither a word character nor whitespace is removed. The result is
    split on whitespace, and words of one or two characters as well as
    stopwords are discarded.
    """
    ignored_words = {'the', 'and', 'for', 'with', 'to', 'of', 'a', 'in', 'is', 'it', 'on', 'at', 'by', 'from'}
    stripped = re.sub(r'@user|http|[^\w\s]', '', text.lower())

    kept_words = []
    for candidate in stripped.split():
        if len(candidate) <= 2 or candidate in ignored_words:
            continue
        kept_words.append(candidate)
    return kept_words

# Records whose total engagement lies above the 70th percentile are treated as
# high engagement; their most frequent words are reported per sentiment.
high_engage_threshold = sample_df['total_engagement'].quantile(0.7)
above_threshold = sample_df['total_engagement'] > high_engage_threshold
high_engage_df = sample_df[above_threshold]

print(f"\n=== High Engagement Text Analysis ===")
print(f"High engagement threshold: {high_engage_threshold:.2f} | Number of high engagement samples: {len(high_engage_df)}")

if high_engage_df.empty:
    print("⚠️ No high engagement text samples, skipping word frequency analysis")
else:
    emotion_high_words = {}
    for emotion in high_engage_df['sentiment'].unique():
        matching_texts = high_engage_df.loc[high_engage_df['sentiment'] == emotion, 'clean_text']
        word_counts = Counter()
        for post_text in matching_texts:
            word_counts.update(clean_text_for_wordcount(post_text))
        emotion_high_words[emotion] = word_counts.most_common(8)  # eight most frequent words

    print("\n=== High-frequency words in high engagement texts for each sentiment (top 8) ===")
    for emotion, words in emotion_high_words.items():
        print(f"{emotion}: {words}")

# ====================== Step 4: Visualize Analysis Results (Academic Quality Plots) ======================
plt.rcParams.update({
    'font.sans-serif': ['DejaVu Sans'],
    'axes.unicode_minus': False,
})

# 4.1 Total engagement per sentiment label (bar chart), shown and saved as PNG
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='sentiment', y='total_engagement', data=sample_df, palette='Set2', ax=ax)
ax.set_title('Sentiment vs Total Engagement (Facebook Data)', fontsize=14, pad=20)
ax.set_xlabel('Sentiment Label', fontsize=12)
ax.set_ylabel('Average Total Engagement (Like+Comment+Share)', fontsize=12)
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
fig.savefig('facebook_sentiment_engagement.png', dpi=300)
plt.show()


# 4.2 Correlation heatmap of the engagement metrics and encoded features
corr_cols = ['like_count', 'comment_count', 'share_count', 'total_engagement', 'sentiment_code', 'has_media_code']
# Undefined correlations (NaN) are shown as 0 so the heatmap can be drawn.
corr_matrix = sample_df[corr_cols].corr().round(3).fillna(0)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, fmt='.3f', ax=ax)
ax.set_title('Correlation Matrix of Engagement Metrics & Features', fontsize=14, pad=20)
fig.tight_layout()
fig.savefig('facebook_correlation_heatmap.png', dpi=300)
plt.show()