# Emotion


In [2]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-emotion")




Device set to use cpu


In [3]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='emotion'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

text = "Celebrating my promotion 😎"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)

# text = "Celebrating my promotion 😎"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)

ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = labels[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")


1) joy 0.9382
2) optimism 0.0362
3) anger 0.0145
4) sadness 0.0112


## 2. emotion analysis with Emotion English DistilRoBERTa-base

In [4]:
from transformers import pipeline
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)
classifier("I love this!")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu


[[{'label': 'anger', 'score': 0.004419790115207434},
  {'label': 'disgust', 'score': 0.001611991785466671},
  {'label': 'fear', 'score': 0.0004138524236623198},
  {'label': 'joy', 'score': 0.9771687984466553},
  {'label': 'neutral', 'score': 0.005764589179307222},
  {'label': 'sadness', 'score': 0.002092392183840275},
  {'label': 'surprise', 'score': 0.008528688922524452}]]

In [5]:
classifier("the boy tom hansen of margate new jersey grew up believing that hed never truly be happy until the day he met the one.")

[[{'label': 'anger', 'score': 0.004169111605733633},
  {'label': 'disgust', 'score': 0.0010336566483601928},
  {'label': 'fear', 'score': 0.002204807009547949},
  {'label': 'joy', 'score': 0.7707111835479736},
  {'label': 'neutral', 'score': 0.057463470846414566},
  {'label': 'sadness', 'score': 0.10220526903867722},
  {'label': 'surprise', 'score': 0.06221246346831322}]]

In [6]:
classifier("this is not a love story.")

[[{'label': 'anger', 'score': 0.004346416797488928},
  {'label': 'disgust', 'score': 0.010130811482667923},
  {'label': 'fear', 'score': 0.0022675523068755865},
  {'label': 'joy', 'score': 0.003684586612507701},
  {'label': 'neutral', 'score': 0.9365279078483582},
  {'label': 'sadness', 'score': 0.011444678530097008},
  {'label': 'surprise', 'score': 0.03159790113568306}]]

# Sentiment


## 1. sentiment analysis with Twitter-roBERTa-base for Sentiment Analysis

In [29]:
import pandas as pd
import numpy as np

# Read the excel file into a pandas DataFrame
df = pd.read_excel('RimshaResults.xlsx')
test_sentences = df['Cleaned']
mask = test_sentences.isnull()
test_sentences = test_sentences[~mask]
test_sentences = test_sentences.to_frame(name = 'sentences')

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(MODEL)
text = "Covid cases are increasing fast!"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    #print(f"{i+1}) {l} {np.round(float(s), 4)}")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initia

{'input_ids': tensor([[   0,  347, 1417,  808, 1200,   32, 2284, 1769,  328,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [3]:
type(encoded_input)

transformers.tokenization_utils_base.BatchEncoding

In [10]:
#negative neutral positive
all_scores = []
for sentence in test_sentences:
    encoded_input = tokenizer(sentence, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    all_scores.append(scores)

In [11]:
score_roberta = pd.DataFrame(all_scores, columns=['negative', 'neutral', 'positive'])
score_roberta['label'] = score_roberta.apply(lambda x: x.idxmax(), axis=1)
score_roberta['label'] = score_roberta['label'].map({'neutral': 0, 'negative': 1, 'positive': 2})

In [12]:
score_roberta

Unnamed: 0,negative,neutral,positive,label
0,0.039184,0.817278,0.143539,0
1,0.607574,0.338367,0.054060,1
2,0.846521,0.144251,0.009228,1
3,0.019197,0.938852,0.041951,0
4,0.021781,0.188018,0.790200,2
...,...,...,...,...
283,0.126397,0.749557,0.124047,0
284,0.103022,0.668318,0.228661,0
285,0.073743,0.804408,0.121849,0
286,0.008609,0.033962,0.957429,2


## 2. sentiment analysis with vader

In [13]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
sentiment = []
for sentence in test_sentences:
    vs = analyzer.polarity_scores(sentence)
    sentiment.append(vs)

In [14]:
score_vader = pd.DataFrame(sentiment)
score_vader['label'] = score_vader['compound'].apply(lambda x: 2 if x >= 0.05 else (1 if x <= -0.05 else 0))
score_vader
# 0 neutral, 1 negative, 2 positive

Unnamed: 0,neg,neu,pos,compound,label
0,0.000,1.000,0.000,0.0000,0
1,0.294,0.706,0.000,-0.6602,1
2,0.237,0.763,0.000,-0.4767,1
3,0.000,0.855,0.145,0.2960,2
4,0.000,0.435,0.565,0.5994,2
...,...,...,...,...,...
283,0.000,1.000,0.000,0.0000,0
284,0.000,0.345,0.655,0.2263,2
285,0.000,0.671,0.329,0.5267,2
286,0.000,0.253,0.747,0.7096,2


# word to vec test

In [30]:
def sentence_to_vec(sentence, model, vector_size=300):
    words = sentence.lower().split()  # simple tokenization; you can use nltk or spacy for better tokenization
    word_vectors = [model[word] for word in words if word in model.key_to_index]
    
    if len(word_vectors) == 0:
        return np.zeros(vector_size)
    else:
        return np.mean(word_vectors, axis=0)


In [32]:
from gensim.models import Word2Vec
from gensim.downloader import load

# Load a pre-trained Word2Vec model
model = load("word2vec-google-news-300")


In [33]:
test_sentences['embedding'] = test_sentences['sentences'].apply(lambda x: sentence_to_vec(x, model))


In [36]:
test_sentences

Unnamed: 0,sentences,embedding
0,story boy meets girl,"[0.06713867, 0.07640457, 0.0021972656, -0.0164..."
1,boy tom hansen margate new jersey grew believi...,"[-0.013538905, 0.06836809, -0.021657126, 0.052..."
2,belief stemmed early exposure sad british pop ...,"[0.059362236, 0.008356268, 0.012291648, 0.0843..."
3,graduate elaine elaine girl summer finn shinne...,"[-0.014204759, 0.019424438, -0.08702557, 0.122..."
4,shed loved two things,"[0.067993164, 0.06994629, 0.019569397, 0.07000..."
...,...,...
283,wait one second,"[0.0764974, -0.0052083335, 0.12597656, 0.13720..."
284,okay settle,"[0.056518555, 0.13305664, 0.06585693, 0.143554..."
285,shes girl girl wants keep casual shes bed righ...,"[0.09719413, -0.11569432, -0.003060477, 0.1018..."
286,thats fine thats great,"[0.088012695, -0.04916382, 0.09298706, 0.09790..."


## compare similarity

In [8]:
from scipy.stats import spearmanr

corr, _ = spearmanr(score_vader['label'], score_roberta['label'])
print(f"Spearman correlation: {corr:.2f}")

Spearman correlation: 0.47


In [9]:
similarity = (score_vader['label'] == score_roberta['label']).mean()
print(f"Accuracy: {similarity:.2f}")

Accuracy: 0.66


In [10]:
from sklearn.metrics import cohen_kappa_score

kappa = cohen_kappa_score(score_vader['label'], score_roberta['label'])
print(f"Cohen's Kappa: {kappa:.2f}")

Cohen's Kappa: 0.47


## 3. sentiment analysis from audio

In [15]:
import k2

ModuleNotFoundError: No module named 'k2'

In [16]:
from speechbrain.inference.interfaces import foreign_class
classifier = foreign_class(source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP", pymodule_file="custom_interface.py", classname="CustomEncoderWav2vec2Classifier")
out_prob, score, index, text_lab = classifier.classify_file("speechbrain/emotion-recognition-wav2vec2-IEMOCAP/anger.wav")
print(text_lab)


ModuleNotFoundError: No module named 'speechbrain'