In [2]:
import pandas as pd

Let's load the dataset and extract just the comments, rather than using the entire dataset..

In [3]:
dataset = pd.read_csv("../datasets/Dataset.csv", encoding='ISO-8859-1')
dataset.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [4]:
dataset.columns

Index(['0', '1467810369', 'Mon Apr 06 22:19:45 PDT 2009', 'NO_QUERY',
       '_TheSpecialOne_',
       '@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D'],
      dtype='object')

The dataset does not have any proper column names, lets provide some useful labels..

In [5]:
dataset.columns = ["sentiment", "id", "date", "query", "user", "tweet"]
dataset.head()

Unnamed: 0,sentiment,id,date,query,user,tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


The only neccessary features at this point of time are sentiment, and tweet features..

In [6]:
dataset = dataset.drop(columns=["id", "date", "query", "user"])

In [7]:
dataset

Unnamed: 0,sentiment,tweet
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew
...,...,...
1599994,4,Just woke up. Having no school is the best fee...
1599995,4,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,Happy 38th Birthday to my boo of alll time!!! ...


Load the transformer Model: RoBerta

In [8]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, pipeline
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"

sentiment_pipeline = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path, device=0) # device=0 means use the first GPU



  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Now let's try and evaluate our model's performance on our dataset:

In [11]:
from sklearn.model_selection import train_test_split


tweets = dataset['tweet'].tolist()
sentiments = dataset['sentiment'].tolist()

X_train, X_test, y_train, y_test = train_test_split(tweets, sentiments, test_size=0.2, random_state=42)



In [12]:
def predict_sentiment(texts):
    results = sentiment_pipeline(texts)
    # Convert the 'positive' and 'negative' labels to 1 and 0 respectively.
    return [1 if res['label'] == 'positive' else 0 for res in results]

y_pred = predict_sentiment(X_test)


In [14]:
print(y_pred)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 

In [16]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.68      0.82      0.75    159494
           1       0.00      0.00      0.00         0
           4       0.00      0.00      0.00    160506

    accuracy                           0.41    320000
   macro avg       0.23      0.27      0.25    320000
weighted avg       0.34      0.41      0.37    320000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Let's plot the confusion Matrix:

In [21]:
from sklearn.metrics import confusion_matrix


confusion_matrix(y_test, y_pred)


array([[131484,  28010,      0],
       [     0,      0,      0],
       [ 60928,  99578,      0]], dtype=int64)

Our overall performance might have taken a hit due to the fact that the Test Data only has 2 labels, but our pipeline produces labels for Three classes i.e. Positive, Neutral and Negative. On the whole, I believe that RoBERT can still be a viable tool for this.

In [23]:
dataset = dataset[:100000]


Now that we've downscaled the dataset, let's try and now predict the label for each of the 

In [33]:
sarcasm_data = pd.read_csv("../datasets/sarcasm_data.csv")

Let's downscale our data:

In [34]:
sarcasm_data = sarcasm_data[:100000]
sarcasm_data.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


Let's drop all tuples with NULL values:

In [35]:
sarcasm_data.dropna()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...
...,...,...,...,...,...,...,...,...,...,...
99995,0,so jealous,rellaller,MaddenUltimateTeam,1,-1,-1,2016-12,2016-12-16 16:39:01,Best pull all year for me!
99996,1,"Yeah, we all know it is the Baby Boomers who h...",lespaulstrat2,AskReddit,1,-1,-1,2016-12,2016-12-16 17:11:25,Look down on the upcoming generation.
99997,0,"Story, setting, artstyle",tullbery,halo,4,-1,-1,2016-12,2016-12-16 17:31:55,What are you most excited about for Halo Wars ...
99998,0,Ebola Virus to Papyrus to the German Iris to M...,cench,AskReddit,4,-1,-1,2016-12,2016-12-24 19:30:40,Natalie Dormer and a clone of the former.


In [36]:
sarcasm_data.columns

Index(['label', 'comment', 'author', 'subreddit', 'score', 'ups', 'downs',
       'date', 'created_utc', 'parent_comment'],
      dtype='object')

Let's drop irrelevant fields:

In [37]:
sarcasm_data = sarcasm_data.drop(columns=["author", "subreddit", "score", "ups", "downs", "date", "created_utc"])
sarcasm_data

Unnamed: 0,label,comment,parent_comment
0,0,NC and NH.,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",deadass don't kill my buzz
4,0,I could use one of those tools.,Yep can confirm I saw the tool they use for th...
...,...,...,...
99995,0,so jealous,Best pull all year for me!
99996,1,"Yeah, we all know it is the Baby Boomers who h...",Look down on the upcoming generation.
99997,0,"Story, setting, artstyle",What are you most excited about for Halo Wars ...
99998,0,Ebola Virus to Papyrus to the German Iris to M...,Natalie Dormer and a clone of the former.


Now let's add features: 
parent_sentiment which has the label of the sentiment, and then parent_setiment_score which determines the extent of it.

In [42]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path).to("cuda")  # Move model to GPU

predicted_sentiments = []
scores = []

for batch_comments in sarcasm_data['parent_comment'].to_numpy().reshape(num_batches, batch_size):
    # Tokenize and ensure each input doesn't exceed the max_length
    inputs = tokenizer(list(batch_comments), return_tensors="pt", padding=True, truncation=True, max_length=max_length).to("cuda")  # Move inputs to GPU

    with torch.no_grad():
        outputs = model(**inputs)
    
    # Extract scores and predicted sentiments
    logits = outputs.logits
    softmax_scores = torch.nn.functional.softmax(logits, dim=1)
    preds = torch.argmax(logits, dim=1)
    
    # Depending on the mapping in your model, 0 might be "negative", 1 might be "neutral", and 2 might be "positive".
    # Adjust this mapping as needed.
    sentiment_mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}
    batch_predicted = [sentiment_mapping[pred.item()] for pred in preds]
    batch_scores = [score[pred.item()].item() for score, pred in zip(softmax_scores, preds)]
    
    predicted_sentiments.extend(batch_predicted)
    scores.extend(batch_scores)

sarcasm_data['predicted_sentiment'] = predicted_sentiments
sarcasm_data['score'] = scores


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
