In [1]:
import numpy as np
import pandas as pd
from scipy.special import softmax


In [2]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig

from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch

# Print the installed PyTorch version so the run environment is recorded with the notebook
print(torch.__version__)

# Last expression of the cell: shows whether PyTorch can see a CUDA device
torch.cuda.is_available()

2.1.2


True

## Let's do the EDA: load and clean the comments

In [4]:
# Read the comment export from the working directory (comma-separated file)
df = pd.read_csv("youtube_comments.csv", sep=",")

# Only the "Comment" column is used by the rest of the notebook
df_comments = df["Comment"]

In [5]:
# Full slice of the comment column: every row is kept
text  = df_comments[:]
# Last expression of the cell: preview the first rows
text.head()

0                                  USHER IS A LEGEND 🔥
1                                                Yeah!
2                                       Let Him Cook!!
3                                Ok so I ate a raccoon
4    All these star performers came together tonigh...
Name: Comment, dtype: object

In [6]:
max_sequence_length = 514  # Length cap: applied to characters here and reused by the later token-length check
modified_text = []

# Walk the values directly instead of `text[i]` over `range(len(text))`:
# that was a label lookup and raised KeyError whenever the Series index was
# not exactly 0..n-1 (for example after slicing or filtering the comments).
for raw_comment in text:
    # pandas represents missing comments as NaN (a float), so keep real strings only
    if not isinstance(raw_comment, str):
        continue

    # Cheap character-count pre-filter for very long comments
    if len(raw_comment) > max_sequence_length:
        continue

    # Drop every space-separated token that contains an '@@' mention marker or
    # starts with 'http'. The former `elif` branches that rewrote such tokens
    # to '@user' / 'http' could never execute (a token produced by split(' ')
    # cannot contain a space, so the first test always won); they are removed
    # and the produced output is unchanged.
    # NOTE(review): the checkpoint may have been trained with '@user' / 'http'
    # placeholders rather than with these tokens removed — confirm against the
    # model card before relying on the scores.
    modified_words = [
        word
        for word in raw_comment.split(' ')
        if not ('@@' in word or word.startswith('http'))
    ]

    # Comments made up only of mentions/links leave nothing to score: skip them
    if modified_words:
        modified_text.append(' '.join(modified_words))


In [7]:
# Number of comments that survived the cleaning step
len(modified_text)

37946

## Load the model

In [3]:
# Hugging Face Hub id of the checkpoint used for sentiment scoring
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Tokenizer and configuration are loaded from the same checkpoint id
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

# PyTorch sequence-classification model
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Display names for the model's output classes, listed in logit-index order.
# NOTE(review): assumed to match the checkpoint's id2label mapping
# (0 = negative, 1 = neutral, 2 = positive) — confirm against `config.id2label`.
labels = ['Negative', 'Neutral', 'Positive']

In [10]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move model to GPU
model.to(device)

# Initialize variables to store cumulative sentiment scores
total_scores = torch.zeros(len(labels), device=device)
count = 0

# Loop through each text in your dataset
for text in modified_text:
    # Encode the comment and move tensors to GPU
    encoded_comment = tokenizer(text, return_tensors='pt').to(device)
    # Check if the token length exceeds the maximum sequence length
    if encoded_comment['input_ids'].size(1) > max_sequence_length:
        continue

    # Analyze sentiment using the model
    output = model(**encoded_comment)
    
    # Extract sentiment scores and move tensor to CPU
    scores = output.logits[0].detach().cpu().numpy() 
    scores = softmax(scores)
    
    # Accumulate sentiment scores
    total_scores += torch.tensor(scores, device=device)
    count += 1 # To check if there were embedings with more lenght than 514

In [11]:
# Mean probability per class over all scored comments
average_scores = total_scores / count

# Report every class as a percentage with two decimals
percentages = (average_scores * 100).tolist()
for name, pct in zip(labels, percentages):
    print(f"{name}: {pct:.2f}")


Negative: 17.63
Neutral: 26.26
Positive: 56.12
