In [1]:
import re
import textwrap

import emoji
import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig

In [2]:
# Preprocess text
def preprocess(text):
    """Normalize a piece of raw text before it is tokenized.

    Emojis are replaced by their textual names, every character that is
    neither a word character nor whitespace is dropped, the text is
    lower-cased, and runs of whitespace are collapsed to single spaces.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas uses for an
            empty cell) are treated as missing; any other non-string value
            is converted with ``str()``.

    Returns:
        The cleaned string, or ``''`` when the input is missing.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)
    # Convert emojis to their text representation
    text = emoji.demojize(text)
    # Remove special characters and punctuation (underscores count as word
    # characters, so they are kept)
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # split()/join trims the ends and collapses repeated whitespace
    return ' '.join(text.split())
# Checkpoint used for sentiment scoring; its labels are printed below
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
text = "I've been using Epic! with my now 4 year old for over a year. It's awesome. There are a plethora of great books. They miss many of my absolute favorites, but the ones they have are very good and we've found new favorites. And tons of audiobooks! Those are fantastic to have. However, finding all the books in a reading level is impossible, and I very much wish that was MUCH easier. I create lists myself and have found workarounds, but it's annoying and I wish we could search by reading level only."
text = preprocess(text)
# Truncate explicitly: an input longer than the model's 512-token limit would
# otherwise fail inside the forward pass. This sample is far shorter, so its
# scores are unaffected.
encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
# Inference only, so skip building the autograd graph
with torch.no_grad():
    output = model(**encoded_input)
# output[0] holds the logits; [0] selects the single sequence in the batch
scores = output[0][0].numpy()
scores = softmax(scores)
# Print labels and scores, most probable first
ranking = np.argsort(scores)[::-1]
for position, label_id in enumerate(ranking, start=1):
    label = config.id2label[int(label_id)]
    score = scores[label_id]
    print(f"{position}) {label} {np.round(float(score), 4)}")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1) positive 0.9622
2) neutral 0.0275
3) negative 0.0103


In [4]:
# Preprocess function
def preprocess(text):
    """Normalize a piece of raw text before it is tokenized.

    Emojis are replaced by their textual names, every character that is
    neither a word character nor whitespace is dropped, the text is
    lower-cased, and runs of whitespace are collapsed to single spaces.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas uses for an
            empty cell) are treated as missing; any other non-string value
            is converted with ``str()``.

    Returns:
        The cleaned string, or ``''`` when the input is missing.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)
    # Convert emojis to their text representation
    text = emoji.demojize(text)
    # Remove special characters and punctuation (underscores count as word
    # characters, so they are kept)
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # split()/join trims the ends and collapses repeated whitespace
    return ' '.join(text.split())

# Load the reviews to score; the text is taken from the 'content' column
df = pd.read_csv('data/in.evolve.android.csv')

# Initialize sentiment analysis model and tokenizer
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Per-row results, kept in DataFrame row order
sentiments = []
sentiment_scores = []

# Maximum number of characters per chunk when splitting long texts
chunk_size = 256  # Adjust this value as needed

for content in df['content']:
    # Empty cells are read back as float NaN; treat anything that is not a
    # string as "no text" instead of letting preprocess() fail on it.
    text = preprocess(content) if isinstance(content, str) else ''

    # Break the text at whitespace so words are not cut in half at chunk
    # boundaries (only a single word longer than chunk_size is split).
    # Empty text yields no chunks.
    chunks = textwrap.wrap(text, width=chunk_size)

    # One probability vector (softmax over the model's labels) per chunk
    chunk_probs = []
    for chunk in chunks:
        # Truncation is a safety net in case chunk_size is raised past what
        # fits in the model's 512-token input.
        encoded_input = tokenizer(
            chunk,
            return_tensors='pt',
            truncation=True,
            max_length=512
        )
        # Inference only, so skip building the autograd graph
        with torch.no_grad():
            output = model(**encoded_input)
        # output[0] holds the logits; [0] selects the single sequence
        chunk_probs.append(softmax(output[0][0].numpy()))

    if chunk_probs:
        # Average the full distributions first, then pick the winner, so the
        # label and its score describe the same class for the whole text.
        mean_probs = np.mean(chunk_probs, axis=0)
        best_id = int(np.argmax(mean_probs))
        row_label = config.id2label[best_id]
        row_score = np.round(float(mean_probs[best_id]), 4)
    else:
        # No text to score: record an explicit missing label and score
        # rather than reusing the previous row's label.
        row_label = None
        row_score = np.nan

    sentiments.append(row_label)
    sentiment_scores.append(row_score)

# Add new columns to the DataFrame
df['sentiment'] = sentiments
df['sentiment_score'] = sentiment_scores

# Save the DataFrame with the new columns
df.to_csv('data/neg_neu_pos.csv', index=False)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
