In [14]:
import numpy as np
import pandas as pd
from transformers import pipeline
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer

In [15]:
# Load data. Column names are supplied explicitly, so pandas reads the
# first line of the file as data rather than as a header.
df_orig = pd.read_csv('twitter_validation.csv', names=["Twitter ID","Topic","Sentiment","Text"])
# Take an independent copy of the first 1000 rows: columns are assigned into
# `df` further down, and writing into a bare slice of `df_orig` raises
# pandas' SettingWithCopyWarning.
df = df_orig.iloc[0:1000].copy()

# Preprocessing (replace usernames and URLs with placeholder tokens)
def preprocess(text):
    """Mask user mentions and links in a tweet.

    The text is split on single spaces. Every token that starts with '@'
    and has more than one character becomes '@user'; every token that
    starts with 'http' becomes 'http'. All other tokens are kept as they
    are, and the tokens are joined back with single spaces.
    """
    def _mask(token):
        # A lone '@' is not a mention, so it is left untouched.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

# Map the "Irrelevant" label to "Neutral", as mentioned in the data set description.
def adjust_ori_sentiment(sentiment):
    """Return "Neutral" for the exact label "Irrelevant"; pass any other value through unchanged."""
    return "Neutral" if sentiment == "Irrelevant" else sentiment

# Clean each tweet and fold "Irrelevant" into "Neutral", one element at a time.
df['Text'] = df['Text'].map(preprocess)
df['Sentiment'] = df['Sentiment'].map(adjust_ori_sentiment)
df

Unnamed: 0,Twitter ID,Topic,Sentiment,Text
0,3364,Facebook,Neutral,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@user Why do I pay for WORD when it functions ...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...
...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Neutral,⭐️ Toronto is the arts and culture capital of ...
996,4359,CS-GO,Neutral,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [16]:
# Get the text data from the frame as a list for the pipeline.
data = df.Text.tolist()

# Choose the pipeline model (kept as a list; only the first entry is used).
model_path = ['cardiffnlp/twitter-roberta-base-sentiment-latest']

sentiment_task = pipeline("sentiment-analysis", model=model_path[0], tokenizer=model_path[0])

# Classify every text in a single call. The truncation arguments are forwarded
# to the tokenizer so an unusually long text is clipped instead of raising.
# NOTE(review): 512 is the usual RoBERTa-base input limit; confirm for this checkpoint.
df_result = pd.DataFrame(sentiment_task(data, truncation=True, max_length=512))

# Attach the predictions as plain lists so the assignment is positional and
# does not depend on index alignment between df and df_result.
df['Label'] = df_result.label.tolist()
df['Confidence'] = df_result.score.tolist()

df

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,Twitter ID,Topic,Sentiment,Text,Label,Confidence
0,3364,Facebook,Neutral,I mentioned on Facebook that I was struggling ...,negative,0.645820
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...,neutral,0.693689
2,8312,Microsoft,Negative,@user Why do I pay for WORD when it functions ...,negative,0.927796
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,...",negative,0.961268
4,4433,Google,Neutral,Now the President is slapping Americans in the...,negative,0.779104
...,...,...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Neutral,⭐️ Toronto is the arts and culture capital of ...,positive,0.939735
996,4359,CS-GO,Neutral,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,positive,0.972971
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...,negative,0.903055
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.,positive,0.908308


In [17]:
def evalable(label):
    """Count agreements between the true and the predicted sentiment.

    Parameters
    ----------
    label : pandas.DataFrame
        Frame holding a 'Sentiment' column and a 'Label' column. Despite
        its name, this is the whole frame, as passed at the call site.

    Returns
    -------
    tuple of (int, int)
        ``(good_label, bad_label)``: the number of rows whose two values are
        equal when compared as lower-cased strings, and the number of rows
        where they differ.
    """
    # Walk the two columns positionally so the result does not depend on the
    # frame's index labels (a sliced or filtered frame works as well).
    good_label = sum(
        str(truth).lower() == str(predicted).lower()
        for truth, predicted in zip(label['Sentiment'], label['Label'])
    )
    bad_label = len(label) - good_label
    return good_label, bad_label

In [18]:
# Returns (matching rows, mismatching rows) for true vs. predicted sentiment.
evalable(df)

(577, 423)