In [1]:
import pandas as pd
import numpy as np
import emoji
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from scipy.special import softmax

In [4]:
# Preprocess function
def preprocess(text):
    """Normalize a review string before it is sent to the sentiment model.

    Emojis are replaced by their textual names, punctuation is stripped,
    the text is lower-cased and runs of whitespace are collapsed to a
    single space.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the float NaN
        pandas uses for an empty CSV cell, or None) is treated as empty.

    Returns
    -------
    str
        The cleaned text, or '' when there is nothing to clean.
    """
    # Missing CSV cells arrive as NaN/None rather than str; there is no text
    # to normalize, so report an empty string instead of failing downstream.
    if not isinstance(text, str):
        return ''
    # Convert emojis to their text representation. Space delimiters (instead
    # of the default ':') stop an emoji name from fusing with a neighbouring
    # word or emoji once the punctuation is removed below.
    text = emoji.demojize(text, delimiters=(' ', ' '))
    # Remove special characters and punctuation. '_' is matched by \w, so
    # multi-word emoji names stay in one piece.
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase and collapse all whitespace runs
    return ' '.join(text.lower().split())

# Load the CSV data
df = pd.read_csv('data/in.evolve.android.csv')

# Initialize sentiment analysis model and tokenizer
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Lists to store sentiment and sentiment scores (one entry per DataFrame row)
sentiments = []
sentiment_scores = []

# Chunk size, in characters, for processing long texts
chunk_size = 256

# Score every review in the 'content' column
for content in df['content']:
    # Empty CSV cells are read as NaN (a float); treat them as empty text
    text = preprocess(content) if isinstance(content, str) else ''

    # Split the text into fixed-width character chunks
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

    # One probability vector (softmax over the model's labels) per chunk
    chunk_probs = []
    for chunk in chunks:
        encoded_input = tokenizer(
            chunk,
            return_tensors='pt',
            # Guard against a chunk that tokenizes to more positions than the
            # model accepts (possible for text with many multi-token characters)
            truncation=True,
            max_length=512
        )
        output = model(**encoded_input)
        chunk_probs.append(softmax(output[0][0].detach().numpy()))

    if chunk_probs:
        # Aggregate over the whole review: average the per-chunk distributions
        # first, then pick the winning label, so that the label and the score
        # always refer to the same class and to every chunk (not just the last)
        mean_probs = np.mean(chunk_probs, axis=0)
        top_index = int(np.argmax(mean_probs))
        top_label = config.id2label[top_index]
        average_score = np.round(float(mean_probs[top_index]), 4)
    else:
        # Nothing to score: do not reuse the label of the previous row
        top_label = None
        average_score = np.nan

    # Append the results to the lists
    sentiments.append(top_label)
    sentiment_scores.append(average_score)

# Add new columns to the DataFrame
df['sentiment'] = sentiments
df['sentiment_score'] = sentiment_scores

# Define priority levels based on sentiment scores
def categorize_priority(score):
    """Map a sentiment score to a priority bucket.

    Parameters
    ----------
    score : float
        Sentiment score of a review (the caller applies this to rows whose
        sentiment is 'negative').

    Returns
    -------
    str or None
        'P1' for score >= 0.75, 'P2' for score >= 0.50, 'P3' otherwise.
        None when the score is missing (NaN/None), so that an unscored
        review is not reported as a genuine 'P3'.
    """
    # A NaN compares False against every threshold and would otherwise fall
    # through to the lowest bucket.
    if pd.isna(score):
        return None
    if score >= 0.75:
        return 'P1'  # most negative reviews
    if score >= 0.50:
        return 'P2'  # moderately negative reviews
    return 'P3'  # least negative reviews

# Priorities are assigned only to reviews labelled 'negative'; rows outside
# the mask are not written to
negative_mask = df['sentiment'] == 'negative'
negative_scores = df.loc[negative_mask, 'sentiment_score']
df.loc[negative_mask, 'priority'] = negative_scores.apply(categorize_priority)

# Write the DataFrame, including the new columns, to disk without the index
df.to_csv('data/neg_neu_pos_priority.csv', index=False)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
import pandas as pd
import re
import emoji
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import numpy as np
from scipy.special import softmax

# Preprocess function
def preprocess(text):
    """Normalize a review string before it is sent to the sentiment model.

    Emojis are replaced by their textual names, punctuation is stripped,
    the text is lower-cased and runs of whitespace are collapsed to a
    single space.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the float NaN
        pandas uses for an empty CSV cell, or None) is treated as empty.

    Returns
    -------
    str
        The cleaned text, or '' when there is nothing to clean.
    """
    # Missing CSV cells arrive as NaN/None rather than str; there is no text
    # to normalize, so report an empty string instead of failing downstream.
    if not isinstance(text, str):
        return ''
    # Convert emojis to their text representation. Space delimiters (instead
    # of the default ':') stop an emoji name from fusing with a neighbouring
    # word or emoji once the punctuation is removed below.
    text = emoji.demojize(text, delimiters=(' ', ' '))
    # Remove special characters and punctuation. '_' is matched by \w, so
    # multi-word emoji names stay in one piece.
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase and collapse all whitespace runs
    return ' '.join(text.lower().split())

# Hugging Face checkpoint shared by the tokenizer, the config and the classifier
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Read the reviews that will be scored
df = pd.read_csv('data/in.evolve.android.csv')

# Build tokenizer, config (holds the id -> label mapping) and classifier,
# in that order, from the same checkpoint
tokenizer, config, model = (
    loader.from_pretrained(MODEL)
    for loader in (AutoTokenizer, AutoConfig, AutoModelForSequenceClassification)
)

# Function to perform sentiment analysis and categorize priority
def sentiment_analysis_and_priority(row):
    """Score one review and, if it is negative, assign it a priority.

    The review text is cleaned with ``preprocess``, cut into 256-character
    chunks, and every chunk is classified by the module-level ``model``.
    The per-chunk probability vectors are averaged and the label with the
    highest mean probability is reported, so the label, the score and the
    priority all describe the whole review rather than only its last chunk.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series from ``DataFrame.apply(axis=1)``)
        Must provide ``row['content']`` (review text) and ``row['score']``
        (the rating compared against 2 and 3 below; it is only read for
        negative reviews).

    Returns
    -------
    tuple
        ``(sentiment, sentiment_score, priority)``:
        * sentiment -- label from ``config.id2label``, or None when the
          review has no text to score;
        * sentiment_score -- mean probability of that label, rounded to
          4 decimals, or NaN when there is no text;
        * priority -- 'P1'/'P2'/'P3' for negative reviews, otherwise None.
    """
    # Empty CSV cells are read as NaN (a float); treat them as empty text
    content = row['content']
    preprocessed_text = preprocess(content) if isinstance(content, str) else ''

    # Split the text into fixed-width character chunks
    chunk_size = 256
    chunks = [preprocessed_text[i:i+chunk_size] for i in range(0, len(preprocessed_text), chunk_size)]

    # One probability vector (softmax over the model's labels) per chunk
    chunk_probs = []
    for chunk in chunks:
        # truncation/max_length guard against a chunk that tokenizes to more
        # positions than the model accepts
        encoded_input = tokenizer(chunk, return_tensors='pt', truncation=True, max_length=512)
        output = model(**encoded_input)
        chunk_probs.append(softmax(output[0][0].detach().numpy()))

    # Nothing was scored: report "unknown" instead of letting an all-zero
    # score vector be read as a negative review
    if not chunk_probs:
        return None, np.nan, None

    # Aggregate over all chunks, then pick the winning label
    mean_probs = np.mean(chunk_probs, axis=0)
    top_index = int(np.argmax(mean_probs))
    sentiment = config.id2label[top_index]
    sentiment_score = np.round(float(mean_probs[top_index]), 4)

    # Categorize priority based on sentiment and rating
    if sentiment == 'negative':
        # 'negative' is the winning label here, so its mean probability is
        # the value at top_index
        negative_score = float(mean_probs[top_index])
        rating = row['score']
        if negative_score > 0.6 or rating <= 2:
            priority = 'P1'
        # NOTE(review): with three labels the winning probability is always
        # >= 1/3 > 0.3, so this condition always holds and 'P3' is never
        # produced -- thresholds kept as written; confirm the intended values.
        elif negative_score > 0.3 or rating == 3:
            priority = 'P2'
        else:
            priority = 'P3'
    else:
        priority = None

    return sentiment, sentiment_score, priority

# Run the scorer on every row; result_type='expand' spreads each returned
# tuple across three columns, which are then stored under their final names
scored_rows = df.apply(sentiment_analysis_and_priority, axis=1, result_type='expand')
df[['sentiment', 'sentiment_score', 'priority']] = scored_rows

# Persist the enriched DataFrame as CSV, leaving out the index
df.to_csv('data/processed_data.csv', index=False)



Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
