In [1]:
# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

In [3]:
# Checkpoint identifier shared by the tokenizer and the classifier so the two
# always come from the same pre-trained model.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Human-readable class names, looked up by the index of the highest model score.
# NOTE(review): the order is assumed to match the checkpoint's class indices
# (0, 1, 2) — confirm against the model card.
labels = ["Negative", "Neutral", "Positive"]

In [5]:
# Function to perform sentiment analysis with detailed output
def analyze_sentiment(text):
    """Classify one text and print the probability of every class.

    Args:
        text: The string to classify.

    Returns:
        The entry of ``labels`` whose probability is highest.
    """
    # An explicit max_length is required here: the tokenizer of this checkpoint
    # has no predefined maximum length, so truncation=True on its own is
    # ignored ("Default to no truncation") and over-long inputs would reach the
    # model untruncated. 512 is the same limit get_sentiment uses.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)

    # Get model output
    output = model(**encoded_input)

    # Convert the logits of the single input row to probabilities
    scores = output.logits.detach().numpy()
    probs = softmax(scores[0])

    # Print each class with its probability
    print(f"\nText: {text}")
    for i, label in enumerate(labels):
        print(f"{label}: {probs[i]:.4f}")

    # Return the label with the highest score
    sentiment = labels[np.argmax(probs)]
    print(f"\n🔍 Predicted Sentiment: **{sentiment}**")
    return sentiment

# Sentiment analysis function for batch processing
def get_sentiment(text):
    """Return the predicted sentiment label for one value, without printing scores.

    Args:
        text: The value to classify. Anything that is not a string, and any
            string that is empty or only whitespace, is labelled "Neutral"
            without running the model.

    Returns:
        One entry of ``labels``, "Neutral" for blank/non-string input, or
        "Error" if any exception is raised while processing (the exception is
        printed rather than propagated so a batch run can continue).
    """
    try:
        is_blank = not isinstance(text, str) or not text.strip()
        if is_blank:
            return "Neutral"

        # Tokenize (cut to at most 512 tokens), score, and pick the top class.
        tokens = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        logits = model(**tokens).logits
        class_probs = softmax(logits.detach().numpy()[0])
        best_index = np.argmax(class_probs)
        return labels[best_index]
    except Exception as err:
        print(f"Error processing: '{text}' | {err!s}")
        return "Error"

# Bulk sentiment analysis function
def analyze_csv_sentiment(input_csv, text_column="Comment", output_csv="comments_with_sentiment.csv"):
    """Label every row of a CSV with a predicted sentiment and save the result.

    Args:
        input_csv: Path of the CSV file to read.
        text_column: Name of the column that holds the text to classify.
        output_csv: Path the annotated CSV is written to. Its parent directory
            is created if the path names one and it does not exist yet.

    Returns:
        The loaded DataFrame with an added "Predicted_Sentiment" column.

    Raises:
        ValueError: If ``text_column`` is not a column of the input CSV.
    """
    # A bare file name (including the default output_csv) has an empty dirname,
    # and os.makedirs("") raises FileNotFoundError, so only create a directory
    # when the path actually contains one. Done before the analysis so that a
    # directory problem surfaces before the slow model pass, not after it.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Load the input data
    df = pd.read_csv(input_csv)

    # Ensure the text column exists
    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' not found. Available columns: {df.columns.tolist()}")

    # Apply sentiment analysis with progress tracking
    tqdm.pandas(desc="Analyzing Sentiment")
    df["Predicted_Sentiment"] = df[text_column].progress_apply(get_sentiment)

    # Save output
    df.to_csv(output_csv, index=False)
    print(f"\n✅ Analysis complete! {len(df)} records processed")
    print(f"📊 Sentiment distribution:\n{df['Predicted_Sentiment'].value_counts()}")
    print(f"💾 Results saved to: {os.path.abspath(output_csv)}")
    return df

if __name__ == "__main__":
    divider = "=" * 50

    # Demo 1: classify a single sentence and print every class probability.
    print(divider, "SINGLE TEXT ANALYSIS DEMO", divider, sep="\n")
    sample_text = "Don't worry, I will come by next time!"
    analyze_sentiment(sample_text)

    # Demo 2: classify every row of a CSV file (adjust the paths as needed).
    print("", divider, "BATCH CSV PROCESSING", divider, sep="\n")
    input_path = os.path.join("data", "Comment.csv")  # relative to the working directory
    output_path = os.path.join("results", "comments_with_sentiment.csv")

    # Both working folders must exist before anything is read or written.
    for folder in ("data", "results"):
        os.makedirs(folder, exist_ok=True)

    # Write a three-row sample file so the demo can run without user data.
    if not os.path.exists(input_path):
        sample_data = pd.DataFrame({
            "Comment": [
                "This product changed my life!",
                "Terrible experience, never buying again",
                "It's okay, nothing special",
            ]
        })
        sample_data.to_csv(input_path, index=False)
        print(f"⚠️ Created sample data at: {input_path}")

    # Any failure in the batch run is reported instead of raising out of the cell.
    try:
        analyze_csv_sentiment(input_path, text_column="Comment", output_csv=output_path)
    except Exception as err:
        print(f"❌ Analysis failed: {err!s}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


SINGLE TEXT ANALYSIS DEMO

Text: Don't worry, I will come by next time!
Negative: 0.0051
Neutral: 0.1696
Positive: 0.8253

🔍 Predicted Sentiment: **Positive**

BATCH CSV PROCESSING
⚠️ Created sample data at: data\Comment.csv


Analyzing Sentiment: 100%|███████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 11.35it/s]


✅ Analysis complete! 3 records processed
📊 Sentiment distribution:
Predicted_Sentiment
Positive    2
Negative    1
Name: count, dtype: int64
💾 Results saved to: C:\Users\emeka\Downloads\NLP\results\comments_with_sentiment.csv



