In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import math
import torch 

In [2]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Mask user mentions and links in a tweet with fixed placeholders.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; a token that starts with
    'http' becomes 'http'. Tokens are then re-joined with single spaces, so
    runs of spaces in the input are preserved.
    """
    def _mask(token):
        # A lone '@' is left untouched; only real handles are replaced.
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [3]:
# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

tasks = ['emotion', 'hate', 'irony', 'offensive', 'sentiment']

# Hugging Face model id for each task, in the same order as `tasks`.
model_paths = [f"cardiffnlp/twitter-roberta-base-{task}" for task in tasks]

# Lookup table: task name -> model id.
model_mapping = dict(zip(tasks, model_paths))

In [4]:
# Sanity check: show the model id that was built for each task.
print(model_paths)

['cardiffnlp/twitter-roberta-base-emotion', 'cardiffnlp/twitter-roberta-base-hate', 'cardiffnlp/twitter-roberta-base-irony', 'cardiffnlp/twitter-roberta-base-offensive', 'cardiffnlp/twitter-roberta-base-sentiment']


# Loading data

In [5]:
import pandas as pd
import os

In [6]:
# Load the labelled tweets and keep only the columns used by the models.
df = pd.read_csv('dataset_.csv')
df = df[['Text', 'label', 'emoi', 'hashtags', 'Media URLs']]
# Work on a 2% subsample. The fixed random_state makes the subsample (and
# every result derived from it) identical across kernel restarts.
df = df.sample(frac=0.02, random_state=42)
df.shape

(147, 5)

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sampled tweets for evaluation. The fixed random_state
# keeps the split identical across kernel restarts.
y = df['label']
df_train, df_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)
# Re-number the rows 0..n-1 so positional and label-based row access agree.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
X_train = df_train['Text']
X_test = df_test['Text']

In [8]:
# Peek at the first training rows to check the selected columns.
df_train.head()

Unnamed: 0,Text,label,emoi,hashtags,Media URLs
0,RT @FranticKL : This is truly sad and most unk...,sad,[],[],https://pbs.twimg.com/media/FJJAdobagAEra9k.jpg
1,@imVkohli Sad to get this news but you are rhe...,sad,[],[],
2,I'm sad don't touch me https://t.co/ayNqChhXmP...,sad,[],[],https://pbs.twimg.com/media/FJKQ1AlXIAMujwA.jpg
3,RT @AshaRangappa_ : STEP 5: Plan for all of th...,angry,[],[],https://pbs.twimg.com/media/FJKCOhAWQAUEBCE.jpg
4,RT @_EL_PsyCongroo_ : #GenshinImpact #zhongxia...,angry,[],"['GenshinImpact', 'zhongxiao', '鍾魈']",https://pbs.twimg.com/media/FG4kHYxagAAlDAl.jpg


In [9]:
import ast
from tensorflow import keras
def _flatten_list_column(column):
    """Strip the surrounding brackets and quotes from a stringified list and lowercase it."""
    return column.str.strip('[]').str.replace("'", '').str.lower()


# The hashtag and emoji columns hold stringified lists such as "['A', 'B']";
# flatten both to plain comma-separated lowercase text.
hashtags = _flatten_list_column(df_train['hashtags'])

emojis = _flatten_list_column(df_train['emoi'])
# Pre-trained Keras model loaded from disk (not referenced again in the visible notebook).
emoticons_model = keras.models.load_model('emoticons.h5')

# One media URL per training tweet.
images = df_train['Media URLs']

In [10]:
# The hashtag series must have one entry per training text.
print(hashtags.shape, '==', X_train.shape)

(117,) == (117,)


In [11]:
def predict(model, tokenizer, preprocess, X, emb_max_size=512):
    """Run a sequence-classification model over a batch of texts.

    Args:
        model: Callable invoked as ``model(**encoded_input)``.
        tokenizer: Callable that turns a list of strings into the model's
            keyword inputs. It is called with ``return_tensors='pt'``,
            ``padding=True``, ``truncation=True`` and
            ``max_length=emb_max_size``.
        preprocess: Function applied to every text before tokenization.
        X: pandas Series of raw texts.
        emb_max_size: Maximum number of tokens kept per text.

    Returns:
        Whatever ``model`` returns for the encoded batch.
    """
    texts = X.apply(preprocess).to_list()
    # Let the tokenizer enforce the length limit. Slicing the tensors after
    # the fact cuts sequences blindly and leaves any other field the
    # tokenizer returns at full length.
    encoded_input = tokenizer(
        texts,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=emb_max_size,
    )

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(**encoded_input)
    return output

In [12]:
class TextModel:
    """Scores texts with several task-specific sequence-classification models.

    For every requested task the tokenizer and model are loaded from the path
    registered for that task, the task's label names are downloaded from the
    tweeteval repository, and one softmax-probability column per label is
    added to the result.
    """

    def __init__(self, model_path_mapping, batch_size=10):
        """
        Args:
            model_path_mapping: Mapping from task name to model path/id.
            batch_size: Number of texts sent through the model at once.
        """
        self.batch_size = batch_size
        self.model_path_mapping = model_path_mapping

    def _load_labels(self, task):
        """Download the label names for `task`, in class-index order."""
        mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
        with urllib.request.urlopen(mapping_link) as f:
            lines = f.read().decode('utf-8').split("\n")
        # Each useful line is "<index>\t<label>"; blank lines are skipped.
        return [row[1] for row in csv.reader(lines, delimiter='\t') if len(row) > 1]

    def predict(self, X, tasks_list, prefix='', verbose=1):
        """Return a DataFrame of per-label probabilities for every text in X.

        Args:
            X: pandas Series of texts.
            tasks_list: Task names; each must be a key of the model mapping.
            prefix: String prepended to every output column name.
            verbose: 0 is silent, 1 prints per-task progress, values above 1
                also print per-batch progress.

        Returns:
            DataFrame with one row per text and one column per label of every
            task, named ``prefix + label``.
        """
        df = pd.DataFrame()
        n_batches = math.ceil(X.shape[0] / self.batch_size)

        for step, task in enumerate(tasks_list):
            if verbose:
                print('X:', X.shape)
                print(f'Step {step}/{len(tasks_list)}, Task: {task}')
            model_path = self.model_path_mapping[task]

            tokenizer = AutoTokenizer.from_pretrained(model_path)
            model = AutoModelForSequenceClassification.from_pretrained(model_path)
            # Write both to a local directory named after the model path.
            tokenizer.save_pretrained(model_path)
            model.save_pretrained(model_path)

            labels = self._load_labels(task)

            batch_logits = []
            for batch_idx in range(n_batches):
                if verbose > 1:
                    print(batch_idx, '/', n_batches)
                start = batch_idx * self.batch_size
                # Positional slice, independent of the Series' index labels.
                x = X.iloc[start:start + self.batch_size]

                out = predict(model, tokenizer, preprocess, x)
                batch_logits.append(out['logits'].cpu().detach())

            scores = softmax(torch.cat(batch_logits, dim=0).numpy(), axis=1)
            if verbose:
                print('Output shape:', scores.shape)

            for col in range(scores.shape[1]):
                df[prefix + labels[col]] = scores[:, col]
        return df

class HashtagModel(TextModel):
    """TextModel variant that zeroes the scores of rows with no hashtags."""

    def predict(self, X, *args, **kwargs):
        """Score X like TextModel.predict, then blank rows whose text is empty.

        Args:
            X: pandas Series of hashtag strings; an empty string means the
                tweet had no hashtags.
            *args, **kwargs: Forwarded unchanged to TextModel.predict.

        Returns:
            The DataFrame from TextModel.predict with every column set to 0.0
            on the rows where X is an empty string.
        """
        df = super().predict(X, *args, **kwargs)
        # The parent builds its frame with a fresh 0..n-1 index, so use a
        # positional (NumPy) mask. A boolean Series would be aligned on X's
        # index labels and fail whenever X is not indexed 0..n-1.
        empty_rows = (X.str.len() == 0).to_numpy()
        print(empty_rows.shape, df.shape)
        df.loc[empty_rows, :] = 0.0
        return df

In [13]:
# Score the training tweets' text with every task-specific model.
text_model = TextModel(model_mapping)
text_preds = text_model.predict(X_train, tasks)

X: (117,)
Step 0/5, Task: emotion
Output shape: (117, 4)
X: (117,)
Step 1/5, Task: hate
Output shape: (117, 2)
X: (117,)
Step 2/5, Task: irony
Output shape: (117, 2)
X: (117,)
Step 3/5, Task: offensive
Output shape: (117, 2)
X: (117,)
Step 4/5, Task: sentiment
Output shape: (117, 3)


In [14]:
# Score the flattened hashtag strings with the same models. Columns get an
# 'hm_' prefix, and rows with an empty hashtag string are zeroed by HashtagModel.
hashtag_model = HashtagModel(model_mapping)
hashtag_preds = hashtag_model.predict(hashtags, tasks, prefix='hm_')

X: (117,)
Step 0/5, Task: emotion
Output shape: (117, 4)
X: (117,)
Step 1/5, Task: hate
Output shape: (117, 2)
X: (117,)
Step 2/5, Task: irony
Output shape: (117, 2)
X: (117,)
Step 3/5, Task: offensive
Output shape: (117, 2)
X: (117,)
Step 4/5, Task: sentiment
Output shape: (117, 3)
(117,) (117, 13)


In [15]:
from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder
import pickle
import os
import ast

class EmojiModel():
    """Dense Keras classifier over a multi-hot encoding of a tweet's emojis.

    Inputs are pandas Series of strings in which emojis are separated by
    ', ' (an empty string means the tweet has no emoji). Each string is
    encoded as a 0/1 vector with one position per key of the pickled emoji
    table; rows without any emoji get all-zero predictions.
    """

    def __init__(self, emoji_data_path, optimizer='adam', loss='bce', metrics=None, n_classes=4):
        """
        Args:
            emoji_data_path: Path to a pickled mapping whose keys define the
                emoji vocabulary.
            optimizer, loss, metrics: Passed to ``model.compile``; metrics
                defaults to ``['acc']``.
            n_classes: Width of the softmax output layer.
        """
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # this at a trusted file.
        with open(emoji_data_path, 'rb') as f:
            self.emoji_data = pickle.load(f)
        self.keys = np.array(list(self.emoji_data.keys()))

        if metrics is None:
            metrics = ['acc']

        self.n_classes = n_classes
        self.model = self.build_model(optimizer, loss, metrics)
        self.encoder = None
        self.categories = None

    def build_model(self, optimizer, loss, metrics):
        """Build and compile the dropout/dense network over the emoji vocabulary."""
        model = keras.models.Sequential()
        model.add(keras.layers.Input((len(self.keys),)))
        model.add(keras.layers.Dropout(0.5))
        model.add(keras.layers.Dense(256, activation='relu'))
        model.add(keras.layers.Dropout(0.5))
        model.add(keras.layers.Dense(256, activation='relu'))
        model.add(keras.layers.Dense(getattr(self, 'n_classes', 4), activation='softmax'))

        model.compile(optimizer=optimizer, metrics=metrics, loss=loss)
        return model

    @staticmethod
    def _has_emoji(tokens):
        """Return True when a split emoji string holds at least one emoji.

        Splitting an empty string yields [''], so a single empty or blank
        token means "no emoji"; any list with two or more tokens has emojis.
        """
        return len(tokens) > 1 or (len(tokens) == 1 and tokens[0] not in ('', ' '))

    def _emoji_mask(self, X):
        """Positional boolean array: True for the rows of X that contain emojis."""
        return X.str.split(', ').map(self._has_emoji).astype(bool).to_numpy()

    def preprocess(self, X):
        """Encode every row of X as a 0/1 vector over the emoji vocabulary.

        Returns:
            Array of shape (len(X), number of vocabulary keys); rows without
            emojis, and tokens missing from the vocabulary, stay zero.
        """
        encoded = np.zeros((len(X), self.keys.shape[0]))

        # Iterate positionally so the result does not depend on X's index labels.
        for row, tokens in enumerate(X.str.split(', ')):
            if self._has_emoji(tokens):
                idx = self._find_indices(tokens)
                if len(idx) > 0:
                    encoded[row, idx] = 1

        return encoded

    def fit(self, X, y, *args, validation_data=None, **kwargs):
        """Train on the rows of X that contain emojis.

        Args:
            X: Series of emoji strings.
            y: Labels aligned with X.
            validation_data: Optional ``(X_val, y_val)`` pair; it is encoded
                but not filtered.
            *args, **kwargs: Forwarded to ``model.fit``.

        Returns:
            The return value of ``model.fit``.
        """
        keep = self._emoji_mask(X)
        X = self.preprocess(X)[keep]
        # The training labels are encoded first so the encoder is fitted on them.
        y = self.encode_y(y)[keep]

        if validation_data is not None:
            validation_data = (self.preprocess(validation_data[0]), self.encode_y(validation_data[1]))

        return self.model.fit(X, y, *args, validation_data=validation_data, **kwargs)

    def predict(self, X):
        """Return class probabilities per row; rows without emojis are all zeros."""
        has_emoji = self._emoji_mask(X)
        preds = self.model.predict(self.preprocess(X))
        preds[~has_emoji] = 0
        return preds

    def encode_y(self, y):
        """One-hot encode labels, fitting the encoder on the first call."""
        if self.encoder is None:
            self.classes = np.unique(y)
            print(self.classes)
            self.encoder = OneHotEncoder(handle_unknown='ignore')
            self.encoder.fit(pd.DataFrame(y))

        return self.encoder.transform(pd.DataFrame(y)).toarray()

    def evaluate(self, X, y):
        """Return MSE, MAE and argmax accuracy of the predictions for X against y."""
        y = self.encode_y(y)
        preds = self.predict(X)

        d = {
            'MSE': ((y - preds) ** 2).mean(),
            'MAE': np.abs((y - preds)).mean(),
            'accuracy': np.mean(np.argmax(preds, axis=1) == np.argmax(y, axis=1))
        }

        return d

    def _find_indices(self, values):
        """Return the vocabulary positions of the tokens in `values`.

        Tokens missing from the vocabulary are ignored. The result is a
        sorted integer array without repeats (empty when nothing matches).
        """
        columns = self.keys.flatten()
        return np.flatnonzero(np.isin(columns, list(values)))
        

In [16]:
# Show the training rows that contain at least one emoji. Splitting an empty
# string yields [''], so a single empty/blank token means "no emoji". The
# length test is `> 1` because a two-element list is a tweet with two emojis.
mask = emojis.str.split(', ').map(lambda d: len(d) > 1 or (len(d) == 1 and d[0] != '' and d[0] != ' '))
emojis[mask]

5                        🎶
17              🤬, 🧿, 🧿, 🧿
18                 ✋, ✋, ✋
20                 😨, 😱, 😱
30                      🇮🇳
32                      🇮🇳
34                 😂, 😂, 😂
36                       🥲
39             ✊🏻, ♥, 🥺, 🧿
40                       🌚
45                 😭, 😭, 😭
56                       😔
60                       🤍
63                       🦌
68              😂, 😂, 😂, 😂
73           😭, 😭, 😭, 😭, 😭
74                       🤲
76        🚀, 🚀, 📆, 👑, 👑, 🚀
81                       ☺
84                 👏, 👏, 👏
92                       🫀
93                       💌
96                       ❤
98                       ☺
100              🤷\u200d♀️
101    😭, 😭, 😭, 😭, 😭, 😭, ❤
102                     🇮🇳
106             🤷🏽\u200d♀️
108                😘, ❤, ❤
109                      😂
110                      🌚
112                ✋, ✋, ✋
115                🎉, 🎉, 🎉
Name: emoi, dtype: object

In [17]:
# Splitting an empty string gives [''] rather than [], which is why the emoji
# masks treat a single empty token as "no emoji".
''.split('a')

['']

In [19]:
# Build the emoji classifier from the pickled emoji table.
emoji_model = EmojiModel('emoji_data/UNICODE_EMOJI_ALIAS.pkl')
# Flatten the test split's emoji column the same way as the training column.
emojis_test = df_test['emoi'].str.strip('[]').str.replace("'", '').str.lower()
emoji_model.fit(emojis, y=df_train['label'], batch_size=16, epochs=10, validation_data=(emojis_test, df_test['label']))

# Predictions for the training rows themselves.
emoji_preds = emoji_model.predict(emojis)

# Metrics on the held-out test split.
score = emoji_model.evaluate(emojis_test, df_test['label'])

['angry' 'disappointed' 'happy' 'sad']
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
# Display the emoji model's metrics on the test split.
score

{'MSE': 0.2389544735940867, 'MAE': 0.25668826488157115, 'accuracy': 0.4}

In [23]:
import cv2
import fer
import requests

class ImageModel():
    """Per-tweet facial-emotion scores computed from the tweet's media URL.

    Each URL is downloaded, decoded with OpenCV and passed to the `fer`
    detector. The scores of all detected faces are averaged; rows with no
    URL, no decodable image, no detected face or a failed download get an
    all-zero score row.
    """

    def __init__(self, prefix='im_'):
        """
        Args:
            prefix: String prepended to every output column name.
        """
        # Zero-valued score row; also fixes the set and order of output columns.
        self.template = {'angry': 0.0, 'disgust': 0.0, 'fear': 0.0, 'happy': 0.0, 'sad': 0.0, 'surprise': 0.0, 'neutral': 0.0}
        self.detector = fer.FER()
        self.prefix = prefix

    def _load_image(self, url, timeout=30):
        """Download `url` and decode it in memory; return None if it is not an image."""
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        if not response.content:
            return None
        buffer = np.frombuffer(response.content, dtype=np.uint8)
        # imdecode returns None for data it cannot decode (e.g. a video file).
        return cv2.imdecode(buffer, cv2.IMREAD_COLOR)

    def _average_emotions(self, detections):
        """Average the 'emotions' scores over all detected faces."""
        averaged = self.template.copy()
        for face in detections:
            for key, value in face['emotions'].items():
                averaged[key] += value
        for key in averaged:
            averaged[key] /= len(detections)
        return averaged

    def predict(self, X):
        """Return a DataFrame with one row of emotion scores per URL in X.

        Args:
            X: Iterable of media URLs; missing values (NaN/None) are allowed.

        Returns:
            DataFrame with one row per element of X and one column per
            template key, each prefixed with ``self.prefix``.
        """
        preds = []
        for i, url in enumerate(X):
            print(f'{i}: {url}')
            if pd.isna(url):
                preds.append(self.template.copy())
                continue

            try:
                image = self._load_image(url)
                detections = self.detector.detect_emotions(image) if image is not None else []
            except Exception as exc:
                # Best effort: one bad URL or image must not abort the batch,
                # but say what was skipped instead of hiding it.
                print(f'  skipped: {type(exc).__name__}: {exc}')
                detections = []

            if len(detections) > 0:
                final_pred = self._average_emotions(detections)
                print(final_pred)
                preds.append(final_pred)
            else:
                preds.append(self.template.copy())

        df = pd.DataFrame(preds)
        df.columns = [self.prefix + col for col in df.columns]
        return df
                
                
            
            

In [24]:
# Score the media attached to every training tweet.
image_model = ImageModel()
image_preds = image_model.predict(images)

0: https://pbs.twimg.com/media/FJJAdobagAEra9k.jpg
{'angry': 0.02, 'disgust': 0.0, 'fear': 0.04, 'happy': 0.0, 'sad': 0.66, 'surprise': 0.0, 'neutral': 0.28}
1: nan
2: https://pbs.twimg.com/media/FJKQ1AlXIAMujwA.jpg
{'angry': 0.075, 'disgust': 0.0, 'fear': 0.075, 'happy': 0.365, 'sad': 0.16, 'surprise': 0.09999999999999999, 'neutral': 0.22999999999999998}
3: https://pbs.twimg.com/media/FJKCOhAWQAUEBCE.jpg
4: https://pbs.twimg.com/media/FG4kHYxagAAlDAl.jpg
{'angry': 0.03, 'disgust': 0.0, 'fear': 0.68, 'happy': 0.0, 'sad': 0.04, 'surprise': 0.23, 'neutral': 0.01}
5: https://video.twimg.com/ext_tw_video/1482346078628642822/pu/vid/320x320/rrNx4OG9HdEMAkiD.mp4?tag=12
6: nan
7: nan
8: nan
9: https://video.twimg.com/ext_tw_video/1325479794671296514/pu/vid/320x320/t67Lav6SVwiWtFJ3.mp4?tag=10
10: nan
11: nan
12: nan
13: https://pbs.twimg.com/media/FJJWa5IWYAAfUtz.jpg
{'angry': 0.0, 'disgust': 0.0, 'fear': 0.0, 'happy': 1.0, 'sad': 0.0, 'surprise': 0.0, 'neutral': 0.0}
14: nan
15: https://video.

In [25]:
# Inspect the full text of training row 38.
df_train.loc[38, 'Text']

'RT @saarthaksing : @_dark_crusader And rakha also said correctly  that KK needs to come out of his shell and  play for himself not for beja. That’s why SK was angry with her because she was guiding Karan in the right direction. \r\n\r\n#KaranKundrra \r\n\r\nKARAN IS AN EMOTION'

In [26]:
# Inspect the full text of training row 60.
df_train.loc[60, 'Text']

'RT @hyungwon_india : #HYUNGWON\r\n#MONSTAX #MONSTA_X #몬스타엑스 #형원 #푸르고아름다운_형원이의하루 \r\n#HBDtoHYUNGWON\r\n\r\n115 It was a happy and meaningful day with Monbebes &amp; MONSTA X , Hope you will always be surrounded by the people who love you and care for you. \r\nHappy Birthday once again Hyungwonie 🤍 https://t.co/JkkahZjSGC'

In [27]:
# Re-display the first training rows for reference.
df_train.head()

Unnamed: 0,Text,label,emoi,hashtags,Media URLs
0,RT @FranticKL : This is truly sad and most unk...,sad,[],[],https://pbs.twimg.com/media/FJJAdobagAEra9k.jpg
1,@imVkohli Sad to get this news but you are rhe...,sad,[],[],
2,I'm sad don't touch me https://t.co/ayNqChhXmP...,sad,[],[],https://pbs.twimg.com/media/FJKQ1AlXIAMujwA.jpg
3,RT @AshaRangappa_ : STEP 5: Plan for all of th...,angry,[],[],https://pbs.twimg.com/media/FJKCOhAWQAUEBCE.jpg
4,RT @_EL_PsyCongroo_ : #GenshinImpact #zhongxia...,angry,[],"['GenshinImpact', 'zhongxiao', '鍾魈']",https://pbs.twimg.com/media/FG4kHYxagAAlDAl.jpg


In [28]:
# Every model output must have one row per training tweet before combining them.
print(text_preds.shape, hashtag_preds.shape, emoji_preds.shape, image_preds.shape)

(117, 13) (117, 13) (117, 4) (117, 7)


In [29]:
# Check the container types: the emoji predictions come from EmojiModel.predict
# rather than a DataFrame-building model, so they may need wrapping before concatenation.
print(type(text_preds), type(hashtag_preds), type(emoji_preds), type(image_preds))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'> <class 'numpy.ndarray'> <class 'pandas.core.frame.DataFrame'>


In [30]:
# Put the text, hashtag, emoji and image scores side by side, one row per
# training tweet. The emoji predictions are wrapped in a DataFrame without
# column names, so their columns get default integer labels.
prediction_frames = [text_preds, hashtag_preds, pd.DataFrame(emoji_preds), image_preds]
df = pd.concat(prediction_frames, axis=1)

# The Excel export keeps the row index; the CSV export omits it.
df.to_excel('dataset_preds_all_models.xlsx')
df.to_csv('dataset_preds_all_models.csv', index=False)