In [None]:
!pip install transformers==2.11.0

In [None]:
import torch

# Use the GPU when torch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Bare expression kept from the original; it is not rendered, because only the
# final expression of a cell is displayed.
device
# Cell output: the value torch.cuda.memory_allocated reports for `device`,
# divided by 1,000,000 and suffixed with 'M'.
f"{torch.cuda.memory_allocated(device) / 1000000}M"

In [None]:
import transformers
import os
from transformers import BertTokenizer
from GoEmotions_One.model import BertForMultiLabelClassification
from GoEmotions_One.multilabel_pipeline import MultiLabelPipeline

# Tokenizer and multi-label classifier weights for the "original" GoEmotions
# checkpoint, fetched by name via from_pretrained.
tokenizer = BertTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-original")
model = BertForMultiLabelClassification.from_pretrained("monologg/bert-base-cased-goemotions-original")
#model = BertForMultiLabelClassification.from_pretrained('../emo-para/GoEmotions/ckpt/original/bert-base-cased-goemotions-original/checkpoint-27000')

# Pick the pipeline device to match what is actually available instead of
# hard-coding GPU 0: transformers pipelines take a CUDA ordinal, or -1 for CPU.
# NOTE(review): assumes MultiLabelPipeline forwards `device` unchanged to
# transformers.Pipeline - confirm against GoEmotions_One.multilabel_pipeline.
pipeline_device = 0 if torch.cuda.is_available() else -1

goemotions = MultiLabelPipeline(
    model=model,
    tokenizer=tokenizer,
    threshold=0.3,
    device=pipeline_device
)

In [None]:
import numpy as np
import pandas as pd

# Read the tab-separated pairs, cast every cell to str, and keep only the two
# text columns that get labelled below.
train_df = (
    pd.read_csv('compiled-data/twit0.825/twit0.825-data.tsv', sep="\t")
    .astype(str)
    [["input_text", "target_text"]]
)

# Minimum sigmoid score a label must exceed to be considered by
# Top_Score_Label, which reads this module-level name.
threshold = 0.5

def Top_Score_Label(outputs, id2label=None, min_score=None):
    """Return the name of the single highest-scoring label above a cut-off.

    The values in ``outputs`` are passed through a sigmoid, and the label whose
    score is the largest one strictly greater than ``min_score`` is returned.
    One running maximum is kept across all rows of ``outputs``, so a single
    label is returned for the whole call. On ties the first label encountered
    wins.

    Args:
        outputs: Numeric array iterated as rows of per-label values.
            Presumably raw logits of shape (batch, num_labels) as returned by
            the ``goemotions`` pipeline - TODO confirm against
            MultiLabelPipeline.
        id2label: Optional mapping from column index to label name. Defaults
            to the global ``model.config.id2label``.
        min_score: Optional cut-off a sigmoid score must exceed. Defaults to
            the global ``threshold``.

    Returns:
        The winning label name, or ``""`` when no score exceeds the cut-off.
    """
    if id2label is None:
        id2label = model.config.id2label
    if min_score is None:
        min_score = threshold

    scores = 1 / (1 + np.exp(-outputs))  # Sigmoid
    top_score = 0
    top_label = ""
    for item in scores:
        for idx, s in enumerate(item):
            if s > min_score and s > top_score:
                # Record the new maximum. The original never updated
                # top_score, so the *last* label above the cut-off won
                # instead of the best one.
                top_score = s
                top_label = id2label[idx]
    return top_label

def _clip_text(text):
    """Return `text` as-is if it has at most 512 characters, else its first 512 characters plus '..'."""
    return text if len(text) <= 512 else text[:512] + '..'


input_labels = []
target_labels = []

for row in train_df.itertuples():
    # Progress marker every 1000 rows.
    if row.Index % 1000 == 0:
        print("Row : " + str(row.Index))

    # Run the classifier on both sides of the pair (input first, then target).
    input_emo = goemotions(_clip_text(row.input_text))
    target_emo = goemotions(_clip_text(row.target_text))

    # Reduce each pipeline output to a single label name ("" when none qualifies).
    input_labels.append(Top_Score_Label(input_emo))
    target_labels.append(Top_Score_Label(target_emo))

# Attach the labels as new columns and write the result; the DataFrame index
# is written too (default to_csv behaviour).
train_df["input_emo"] = input_labels
train_df["target_emo"] = target_labels
train_df.to_csv("compiled-data/twit0.825/twit0.825-primary_emo.tsv", sep="\t")

In [None]:
import os
import pandas as pd

# Load the primary-emotion labels written by the labelling cell. The path now
# points at the twit0.825 file this notebook produces; it previously read a
# twit0.775 file while writing its results under twit0.825.
training_df = pd.read_csv("compiled-data/twit0.825/twit0.825-primary_emo.tsv", sep="\t").astype(str)

# Keep only rows where both sides carry a real, non-neutral label and the two
# labels differ. Empty labels are read back from the TSV as NaN, which
# astype(str) turns into the literal string "nan".
unusable_labels = ["nan", "neutral"]
keep_row = (
    ~training_df['input_emo'].isin(unusable_labels)
    & ~training_df['target_emo'].isin(unusable_labels)
    & (training_df['input_emo'] != training_df['target_emo'])
)

training_emo_diff_df = (
    training_df[keep_row]
    .reset_index(drop=True)
    # 'Unnamed: 0' is the index column that to_csv wrote alongside the data.
    .drop(['Unnamed: 0'], axis=1)
)

prefixes = list()

# Maps each emotion label name to a short numeric code, stored as a string.
# The code is the label's position in the tuple below.
emotion_shorthand_mapping = {
    emotion: str(code)
    for code, emotion in enumerate((
        'neutral',         # 0
        'amusement',       # 1
        'excitement',      # 2
        'joy',             # 3
        'love',            # 4
        'optimism',        # 5
        'desire',          # 6
        'caring',          # 7
        'pride',           # 8
        'admiration',      # 9
        'gratitude',       # 10
        'relief',          # 11
        'approval',        # 12
        'realization',     # 13
        'surprise',        # 14
        'curiosity',       # 15
        'confusion',       # 16
        'fear',            # 17
        'nervousness',     # 18
        'remorse',         # 19
        'embarrassment',   # 20
        'disappointment',  # 21
        'sadness',         # 22
        'grief',           # 23
        'disgust',         # 24
        'anger',           # 25
        'annoyance',       # 26
        'disapproval',     # 27
    ))
}

# Build one "<input code> to <target code>" prefix per row, in row order.
# The list is rebuilt from scratch rather than appended to, and the original
# `prefix = prefixes.append(prefix)` (which rebound `prefix` to None) is gone.
# A label missing from emotion_shorthand_mapping still raises KeyError.
prefixes = [
    emotion_shorthand_mapping[input_emotion] + " to " + emotion_shorthand_mapping[target_emotion]
    for input_emotion, target_emotion in zip(
        training_emo_diff_df['input_emo'], training_emo_diff_df['target_emo']
    )
]
    
# Drop the two label columns (now encoded in the prefix) and put the prefix
# first, ahead of the remaining columns.
training_t5_df = training_emo_diff_df.drop(columns=['input_emo', 'target_emo'])
training_t5_df.insert(loc=0, column="prefix", value=prefixes)

# Written with the default index column, which the next cell drops as 'Unnamed: 0'.
training_t5_df.to_csv("compiled-data/twit0.825/twit0.825-prefix_emo.tsv", sep="\t")

In [None]:
from sklearn.model_selection import train_test_split

df = pd.read_csv("compiled-data/twit0.825/twit0.825-prefix_emo.tsv", sep="\t")

# Seed the shuffle so re-running the notebook reproduces the same train/test
# partition; the split proportions are left at train_test_split's defaults.
training_df, testing_df = train_test_split(df, random_state=42)

# 'Unnamed: 0' is the index column written by the previous cell's to_csv.
training_df = training_df.drop(columns=['Unnamed: 0'])
testing_df = testing_df.drop(columns=['Unnamed: 0'])

# Training filename corrected from "twit0.8255-..." to match the testing file
# and the twit0.825 dataset name used everywhere else in this notebook.
# NOTE(review): update any downstream script that still reads the old name.
training_df.to_csv("emotion-labeled-data/twit0.825/twit0.825-training_t5.tsv", sep="\t")
testing_df.to_csv("emotion-labeled-data/twit0.825/twit0.825-testing_t5.tsv", sep="\t")