In [1]:
!pip install transformers



In [140]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import math
import torch 

In [2]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
 
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [30]:
# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

tasks = ['emoji', 'emotion', 'hate', 'irony', 'offensive', 'sentiment']
model_paths = []
tokenizers = []
model_mapping = {task: i for i, task in enumerate(tasks)}

for task in tasks:
  model_path = f"cardiffnlp/twitter-roberta-base-{task}"
  model_paths.append(model_path)


In [31]:
print(model_paths)

['cardiffnlp/twitter-roberta-base-emoji', 'cardiffnlp/twitter-roberta-base-emotion', 'cardiffnlp/twitter-roberta-base-hate', 'cardiffnlp/twitter-roberta-base-irony', 'cardiffnlp/twitter-roberta-base-offensive', 'cardiffnlp/twitter-roberta-base-sentiment']


# Loading data

In [133]:
import pandas as pd
import os

In [143]:
#df = pd.read_csv('IMDB Dataset.csv')

df_sad = pd.read_excel('sad.xlsx')
df_sad = df_sad[df_sad['Language'] == 'en']
df_sad = df_sad[['Text']]
df_sad['label'] = 'sad'
df_sad = df_sad.loc[:10, :]

df_happy = pd.read_excel('vicinitas_search_results.xlsx')
df_happy = df_happy[df_happy['Language'] == 'en']
df_happy = df_happy[['Text']]
df_happy['label'] = 'happy'
df_happy = df_happy.loc[:10, :]

In [167]:
df = pd.concat([df_sad, df_happy]).reset_index(drop=True)
df.head()

Unnamed: 0,Text,label
0,RT @thingsiwantlol : Satan: Hey I bought your ...,sad
1,RT @thedeepestmsgs : Comforting yourself when ...,sad
2,"RT @5Sd2gYGxnkFK7eb : ""I asked the nurses to g...",sad
3,RT @Srkians_77 : Feeling sad for Sonakshi Sinh...,sad
4,"im sad but this one took me out, namu face is ...",sad


In [168]:
#X = df['review'][:50]
#y = df['sentiment'][:50]
X = df['Text']
y = df['label']

In [169]:
def predict(model, tokenizer, preprocess, X, emb_max_size=512):
  X = X.apply(preprocess)
  encoded_input = tokenizer(X.to_list(), return_tensors='pt', padding=True)
  encoded_input['input_ids'] = encoded_input['input_ids'][:, :emb_max_size]
  encoded_input['attention_mask'] = encoded_input['attention_mask'][:, :emb_max_size]

  output = model(**encoded_input)
  return output

In [170]:
for m_type in model_mapping.keys():
    model_path = model_paths[model_mapping[m_type]]

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    
    labels=[]
    mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{m_type}/mapping.txt"
    with urllib.request.urlopen(mapping_link) as f:
        html = f.read().decode('utf-8').split("\n")
        csvreader = csv.reader(html, delimiter='\t')
    labels = [row[1] for row in csvreader if len(row) > 1]
    
    batch_size = 5
    outputs = []
    for i in range(math.ceil(X.shape[0] / batch_size)):
      print(i)
      x = X[i*batch_size: (i+1)*batch_size]
      out = predict(model, tokenizer, preprocess, x)
      out['logits'] = out['logits'].cpu().detach()
      outputs.append(out)
    
    output = {}
    output['logits'] = torch.cat([out['logits'] for out in outputs], axis=0)
    print(output['logits'].shape)
    
    scores = output['logits'].detach().numpy()
    scores = softmax(scores)
    print(scores.shape)
    
    
    scores = scores / scores.sum(axis=1, keepdims=True)
    for i in range(scores.shape[0]):
      for j in range(scores.shape[1]):
        l = labels[j]
        if l not in df.columns:
            df[l] = 0.0
        df.loc[i, l] = scores[i, j]

0
1
2
3
torch.Size([18, 20])
(18, 20)
0
1
2
3
torch.Size([18, 4])
(18, 4)


Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

0
1
2
3
torch.Size([18, 2])
(18, 2)


Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

0
1
2
3
torch.Size([18, 2])
(18, 2)


Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

0
1
2
3
torch.Size([18, 2])
(18, 2)
0
1
2
3
torch.Size([18, 3])
(18, 3)


In [171]:
df.to_excel('sad_happy_predictions.xlsx')

In [158]:
l = labels[0]
df.loc[i, l] = scores[i, j]

ValueError: cannot reindex from a duplicate axis

In [None]:
df.

In [23]:
import numpy as np
l = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])
mapping = {label: l[i] for i, label in enumerate(labels)}
mapping

{'negative': array([1., 0., 0.]),
 'neutral': array([0., 1., 0.]),
 'positive': array([0., 0., 1.])}

In [24]:
y

0     positive
1     positive
2     positive
3     negative
4     positive
5     positive
6     positive
7     negative
8     negative
9     positive
10    negative
11    negative
12    negative
13    negative
14    positive
15    negative
16    positive
17    negative
18    positive
19    negative
20    positive
21    negative
22    positive
23    negative
24    negative
25    positive
26    positive
27    negative
28    negative
29    positive
30    positive
31    positive
32    negative
33    positive
34    negative
35    negative
36    negative
37    negative
38    positive
39    negative
40    negative
41    positive
42    negative
43    negative
44    positive
45    positive
46    negative
47    negative
48    positive
49    negative
Name: sentiment, dtype: object

In [25]:
real_labels = pd.DataFrame(y.map(mapping).tolist()).to_numpy()
real_labels

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0

In [26]:
print('MAE:', np.mean(np.abs(scores - real_labels)))
print('MSE:', np.mean((scores - real_labels) ** 2))
print('Acc:', sum(np.argmax(real_labels, axis=1) == np.argmax(scores, axis=1)) / len(scores))

MAE: 0.24148448864463717
MSE: 0.11238429428269295
Acc: 0.76


In [None]:
real_labels

In [None]:
real_labels.shape

In [None]:
np.argmax(real_labels, axis=1)

In [None]:
np.argmax(scores, axis=1)

In [103]:
string = X[1][:100]
ord(string[26])

127874

In [104]:
string[26] == chr(127874)

True