In [None]:
# Installs the older, pinned transformers release (2.11.0) for GoEmotions compatibility
!pip install transformers==2.11.0

In [None]:
# Selects the CUDA device for GoEmotions when one is available, otherwise the CPU
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# A bare expression in the middle of a cell is not displayed, so print the choice
print(f"Using device: {device}")

# Only query the CUDA allocator for a CUDA device: passing a CPU device to
# torch.cuda.memory_allocated can raise on some PyTorch releases.
allocated_bytes = torch.cuda.memory_allocated(device) if device.type == "cuda" else 0

# Cell output: memory currently allocated on the device, in millions of bytes
str(allocated_bytes / 1000000) + 'M'

In [None]:
# Allows user input for data set to label emotions
# The name is normalized (trimmed, lower-cased) because every later cell builds
# file paths from it, and an unknown name stops the notebook here instead of
# failing later with a missing-file error.
valid_dataset_names = ['google', 'msr', 'quora', 'mix', 'twit0.825']

dataset_name = input('Dataset to label emotions (google, msr, quora, mix, twit0.825):  ').strip().lower()

if dataset_name not in valid_dataset_names:
    raise ValueError('Please enter a valid dataset name')

print('Dataset selected: ' + dataset_name)

In [None]:
# Initializes a GoEmotions model instance (group taxonomy checkpoint)
import transformers
import os
import torch
from transformers import BertTokenizer
from GoEmotions.model import BertForMultiLabelClassification
from GoEmotions.multilabel_pipeline import MultiLabelPipeline

tokenizer = BertTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-group")
model = BertForMultiLabelClassification.from_pretrained("monologg/bert-base-cased-goemotions-group")

goemotions = MultiLabelPipeline(
    model=model,
    tokenizer=tokenizer,
    threshold=0.3,
    # First GPU when CUDA is available, otherwise -1 (CPU).
    # NOTE(review): assumes MultiLabelPipeline follows the transformers Pipeline
    # convention where a negative device index means CPU - confirm.
    device=0 if torch.cuda.is_available() else -1
)

In [None]:
# Emotion labeling of both input and target texts on the selected training set with the Group model
import numpy as np
import pandas as pd

# Read the training split as strings and keep only the two text columns
training_path = f'compiled-data/{dataset_name}/{dataset_name}-training_data.tsv'
train_df = pd.read_csv(training_path, sep="\t").astype(str)[["input_text", "target_text"]]


# Minimum sigmoid score a label must exceed to be reported as the dominant
# emotion; read by Top_Score_Label at call time
threshold = 0.5

def Top_Score_Label(outputs):
    """Pick the single highest-scoring emotion above the global ``threshold``.

    ``outputs`` is passed through a sigmoid and then walked row by row
    (assumes a 2-D numpy array of per-label values - TODO confirm against the
    pipeline's return value). Label names are looked up in the global
    ``model.config.id2label``.

    Returns a ``(label, score)`` tuple; ``("", 0)`` when no score exceeds the
    threshold. On equal scores the first one encountered wins.
    """
    probabilities = 1 / (1 + np.exp(-outputs))
    best_label, best_score = "", 0
    for row in probabilities:
        for label_id, probability in enumerate(row):
            # A candidate must beat both the threshold and the current best
            if probability > threshold and probability > best_score:
                best_label = model.config.id2label[label_id]
                best_score = probability
    return best_label, best_score


# Runs the GoEmotions pipeline over every input/target pair of the training set
input_labels, input_scores = [], []
target_labels, target_scores = [], []

for index, source_text, paraphrase_text in zip(train_df.index, train_df["input_text"], train_df["target_text"]):
    # Progress message every 1000 rows
    if index % 1000 == 0:
        print("Processing " + str(index))

    # Long texts are cut to their first 512 characters (not tokens) and marked
    # with a trailing '..' before being handed to the model
    i_text = source_text if len(source_text) <= 512 else source_text[:512] + '..'
    t_text = paraphrase_text if len(paraphrase_text) <= 512 else paraphrase_text[:512] + '..'

    # Pipeline output for each side of the pair
    input_emo = goemotions(i_text)
    target_emo = goemotions(t_text)

    # Dominant emotion and its score; the label is "" when nothing clears the threshold
    input_label, input_score = Top_Score_Label(input_emo)
    target_label, target_score = Top_Score_Label(target_emo)

    input_labels.append(input_label)
    input_scores.append(input_score)
    target_labels.append(target_label)
    target_scores.append(target_score)


# Attaches the group-model labels and scores to the data set
for column, values in (
    ("input_emo_g", input_labels),
    ("input_score", input_scores),
    ("target_emo_g", target_labels),
    ("target_score", target_scores),
):
    train_df[column] = values

# Saves the labeled data in a new tsv file (row index included)
train_df.to_csv(f'compiled-data/{dataset_name}/{dataset_name}-training_emolabel_g.tsv', sep="\t")

In [None]:
# Repeats same emotion labeling process on the selected test set with the group model
import numpy as np
import pandas as pd

# Read the test split as strings and keep only the two text columns
testing_path = f'compiled-data/{dataset_name}/{dataset_name}-testing_data.tsv'
eval_df = pd.read_csv(testing_path, sep="\t").astype(str)[["input_text", "target_text"]]

# Minimum sigmoid score a label must exceed to be reported as the dominant emotion
threshold = 0.5

def Top_Score_Label(outputs):
    """Pick the single highest-scoring emotion above the global ``threshold``.

    ``outputs`` is passed through a sigmoid and then walked row by row
    (assumes a 2-D numpy array of per-label values - TODO confirm against the
    pipeline's return value). Label names are looked up in the global
    ``model.config.id2label``.

    Returns a ``(label, score)`` tuple; ``("", 0)`` when no score exceeds the
    threshold. On equal scores the first one encountered wins.
    """
    probabilities = 1 / (1 + np.exp(-outputs))
    best_label, best_score = "", 0
    for row in probabilities:
        for label_id, probability in enumerate(row):
            # A candidate must beat both the threshold and the current best
            if probability > threshold and probability > best_score:
                best_label = model.config.id2label[label_id]
                best_score = probability
    return best_label, best_score

# Runs the GoEmotions pipeline over every input/target pair of the test set
input_labels, input_scores = [], []
target_labels, target_scores = [], []

for index, source_text, paraphrase_text in zip(eval_df.index, eval_df["input_text"], eval_df["target_text"]):
    # Progress message every 1000 rows
    if index % 1000 == 0:
        print("Processing " + str(index))

    # Long texts are cut to their first 512 characters (not tokens) and marked
    # with a trailing '..' before being handed to the model
    i_text = source_text if len(source_text) <= 512 else source_text[:512] + '..'
    t_text = paraphrase_text if len(paraphrase_text) <= 512 else paraphrase_text[:512] + '..'

    # Pipeline output for each side of the pair
    input_emo = goemotions(i_text)
    target_emo = goemotions(t_text)

    # Dominant emotion and its score; the label is "" when nothing clears the threshold
    input_label, input_score = Top_Score_Label(input_emo)
    target_label, target_score = Top_Score_Label(target_emo)

    input_labels.append(input_label)
    input_scores.append(input_score)
    target_labels.append(target_label)
    target_scores.append(target_score)

# Attaches the group-model labels and scores to the data set
for column, values in (
    ("input_emo_g", input_labels),
    ("input_score", input_scores),
    ("target_emo_g", target_labels),
    ("target_score", target_scores),
):
    eval_df[column] = values

# Saves the labeled data in a new tsv file (row index included)
eval_df.to_csv(f'compiled-data/{dataset_name}/{dataset_name}-testing_emolabel_g.tsv', sep="\t")

In [None]:
# Initializes a GoEmotions model instance (Ekman taxonomy checkpoint)
import transformers
import os
import torch
from transformers import BertTokenizer
from GoEmotions.model import BertForMultiLabelClassification
from GoEmotions.multilabel_pipeline import MultiLabelPipeline

tokenizer = BertTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-ekman")
model = BertForMultiLabelClassification.from_pretrained("monologg/bert-base-cased-goemotions-ekman")

goemotions = MultiLabelPipeline(
    model=model,
    tokenizer=tokenizer,
    threshold=0.3,
    # First GPU when CUDA is available, otherwise -1 (CPU).
    # NOTE(review): assumes MultiLabelPipeline follows the transformers Pipeline
    # convention where a negative device index means CPU - confirm.
    device=0 if torch.cuda.is_available() else -1
)

In [None]:
# Emotion labeling of both input and target texts on the selected training set with the Ekman model
import numpy as np
import pandas as pd

# The group-model stage saved this file together with its row index. Reading
# that first column back as the index (index_col=0) keeps it from turning into
# a stray "Unnamed: 0" data column that the next save would write out again.
train_df = pd.read_csv(
    f'compiled-data/{dataset_name}/{dataset_name}-training_emolabel_g.tsv',
    sep="\t",
    index_col=0,
).astype(str)


# Minimum sigmoid score a label must exceed to be reported as the dominant
# emotion; read by Top_Score_Label at call time
threshold = 0.5
def Top_Score_Label(outputs):
    """Name the single highest-scoring emotion above the global ``threshold``.

    ``outputs`` is passed through a sigmoid and then walked row by row
    (assumes a 2-D numpy array of per-label values - TODO confirm against the
    pipeline's return value). Label names are looked up in the global
    ``model.config.id2label``.

    Returns the label string, or ``""`` when no score exceeds the threshold.
    On equal scores the first one encountered wins.
    """
    probabilities = 1 / (1 + np.exp(-outputs))
    best_label, best_score = "", 0
    for row in probabilities:
        for label_id, probability in enumerate(row):
            # A candidate must beat both the threshold and the current best
            if probability > threshold and probability > best_score:
                best_label = model.config.id2label[label_id]
                best_score = probability
    return best_label


# Runs the GoEmotions pipeline over every input/target pair of the training set
input_labels, target_labels = [], []

for index, source_text, paraphrase_text in zip(train_df.index, train_df["input_text"], train_df["target_text"]):
    # Progress message every 1000 rows
    if index % 1000 == 0:
        print("Processing " + str(index))

    # Long texts are cut to their first 512 characters (not tokens) and marked
    # with a trailing '..' before being handed to the model
    i_text = source_text if len(source_text) <= 512 else source_text[:512] + '..'
    t_text = paraphrase_text if len(paraphrase_text) <= 512 else paraphrase_text[:512] + '..'

    # Pipeline output for each side of the pair
    input_emo = goemotions(i_text)
    target_emo = goemotions(t_text)

    # Dominant emotion label; "" when nothing clears the threshold
    input_label = Top_Score_Label(input_emo)
    target_label = Top_Score_Label(target_emo)

    input_labels.append(input_label)
    target_labels.append(target_label)


# Attaches the Ekman-model labels to the data set
for column, values in (("input_emo_e", input_labels), ("target_emo_e", target_labels)):
    train_df[column] = values

# Saves the labeled data in a new tsv file (row index included)
train_df.to_csv(f'compiled-data/{dataset_name}/{dataset_name}-training_emolabel_e.tsv', sep="\t")

In [None]:
# Repeats same emotion labeling process on the selected test set with the Ekman model
import numpy as np
import pandas as pd

# The group-model stage saved this file together with its row index. Reading
# that first column back as the index (index_col=0) keeps it from turning into
# a stray "Unnamed: 0" data column that the next save would write out again.
eval_df = pd.read_csv(
    f'compiled-data/{dataset_name}/{dataset_name}-testing_emolabel_g.tsv',
    sep="\t",
    index_col=0,
).astype(str)

# Minimum sigmoid score a label must exceed to be reported as the dominant emotion
threshold = 0.5

def Top_Score_Label(outputs):
    """Name the single highest-scoring emotion above the global ``threshold``.

    ``outputs`` is passed through a sigmoid and then walked row by row
    (assumes a 2-D numpy array of per-label values - TODO confirm against the
    pipeline's return value). Label names are looked up in the global
    ``model.config.id2label``.

    Returns the label string, or ``""`` when no score exceeds the threshold.
    On equal scores the first one encountered wins.
    """
    probabilities = 1 / (1 + np.exp(-outputs))
    best_label, best_score = "", 0
    for row in probabilities:
        for label_id, probability in enumerate(row):
            # A candidate must beat both the threshold and the current best
            if probability > threshold and probability > best_score:
                best_label = model.config.id2label[label_id]
                best_score = probability
    return best_label

# Runs the GoEmotions pipeline over every input/target pair of the test set
input_labels, target_labels = [], []

for index, source_text, paraphrase_text in zip(eval_df.index, eval_df["input_text"], eval_df["target_text"]):
    # Progress message every 1000 rows
    if index % 1000 == 0:
        print("Processing " + str(index))

    # Long texts are cut to their first 512 characters (not tokens) and marked
    # with a trailing '..' before being handed to the model
    i_text = source_text if len(source_text) <= 512 else source_text[:512] + '..'
    t_text = paraphrase_text if len(paraphrase_text) <= 512 else paraphrase_text[:512] + '..'

    # Pipeline output for each side of the pair
    input_emo = goemotions(i_text)
    target_emo = goemotions(t_text)

    # Dominant emotion label; "" when nothing clears the threshold
    input_label = Top_Score_Label(input_emo)
    target_label = Top_Score_Label(target_emo)

    input_labels.append(input_label)
    target_labels.append(target_label)

# Attaches the Ekman-model labels to the data set
for column, values in (("input_emo_e", input_labels), ("target_emo_e", target_labels)):
    eval_df[column] = values

# Saves the labeled data in a new tsv file (row index included)
eval_df.to_csv(f'compiled-data/{dataset_name}/{dataset_name}-testing_emolabel_e.tsv', sep="\t")

In [None]:
# Initializes a GoEmotions model instance (original taxonomy checkpoint)
import transformers
import os
import torch
from transformers import BertTokenizer
from GoEmotions.model import BertForMultiLabelClassification
from GoEmotions.multilabel_pipeline import MultiLabelPipeline

tokenizer = BertTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-original")
model = BertForMultiLabelClassification.from_pretrained("monologg/bert-base-cased-goemotions-original")

goemotions = MultiLabelPipeline(
    model=model,
    tokenizer=tokenizer,
    threshold=0.3,
    # First GPU when CUDA is available, otherwise -1 (CPU).
    # NOTE(review): assumes MultiLabelPipeline follows the transformers Pipeline
    # convention where a negative device index means CPU - confirm.
    device=0 if torch.cuda.is_available() else -1
)

In [None]:
# Emotion labeling of both input and target texts on the selected training set with the original model
import numpy as np
import pandas as pd

# The Ekman stage saved this file together with its row index. Reading that
# first column back as the index (index_col=0) keeps it from turning into a
# stray "Unnamed: 0" data column that the next save would write out again.
train_df = pd.read_csv(
    f'compiled-data/{dataset_name}/{dataset_name}-training_emolabel_e.tsv',
    sep="\t",
    index_col=0,
).astype(str)


# Minimum sigmoid score a label must exceed to be reported as the dominant
# emotion; read by Top_Score_Label at call time
threshold = 0.5
def Top_Score_Label(outputs):
    """Name the single highest-scoring emotion above the global ``threshold``.

    ``outputs`` is passed through a sigmoid and then walked row by row
    (assumes a 2-D numpy array of per-label values - TODO confirm against the
    pipeline's return value). Label names are looked up in the global
    ``model.config.id2label``.

    Returns the label string, or ``""`` when no score exceeds the threshold.
    On equal scores the first one encountered wins.
    """
    probabilities = 1 / (1 + np.exp(-outputs))
    best_label, best_score = "", 0
    for row in probabilities:
        for label_id, probability in enumerate(row):
            # A candidate must beat both the threshold and the current best
            if probability > threshold and probability > best_score:
                best_label = model.config.id2label[label_id]
                best_score = probability
    return best_label


# Runs the GoEmotions pipeline over every input/target pair of the training set
input_labels, target_labels = [], []

for index, source_text, paraphrase_text in zip(train_df.index, train_df["input_text"], train_df["target_text"]):
    # Progress message every 1000 rows
    if index % 1000 == 0:
        print("Processing " + str(index))

    # Long texts are cut to their first 512 characters (not tokens) and marked
    # with a trailing '..' before being handed to the model
    i_text = source_text if len(source_text) <= 512 else source_text[:512] + '..'
    t_text = paraphrase_text if len(paraphrase_text) <= 512 else paraphrase_text[:512] + '..'

    # Pipeline output for each side of the pair
    input_emo = goemotions(i_text)
    target_emo = goemotions(t_text)

    # Dominant emotion label; "" when nothing clears the threshold
    input_label = Top_Score_Label(input_emo)
    target_label = Top_Score_Label(target_emo)

    input_labels.append(input_label)
    target_labels.append(target_label)


# Attaches the original-taxonomy labels to the data set
for column, values in (("input_emo", input_labels), ("target_emo", target_labels)):
    train_df[column] = values

# Saves the labeled data in a new tsv file (row index included)
train_df.to_csv(f'compiled-data/{dataset_name}/{dataset_name}-training_emolabel.tsv', sep="\t")

In [None]:
# Repeats same emotion labeling process on the selected test set with the original model
import numpy as np
import pandas as pd

# The Ekman stage saved this file together with its row index. Reading that
# first column back as the index (index_col=0) keeps it from turning into a
# stray "Unnamed: 0" data column that the next save would write out again.
eval_df = pd.read_csv(
    f'compiled-data/{dataset_name}/{dataset_name}-testing_emolabel_e.tsv',
    sep="\t",
    index_col=0,
).astype(str)

# Minimum sigmoid score a label must exceed to be reported as the dominant emotion
threshold = 0.5

def Top_Score_Label(outputs):
    """Name the single highest-scoring emotion above the global ``threshold``.

    ``outputs`` is passed through a sigmoid and then walked row by row
    (assumes a 2-D numpy array of per-label values - TODO confirm against the
    pipeline's return value). Label names are looked up in the global
    ``model.config.id2label``.

    Returns the label string, or ``""`` when no score exceeds the threshold.
    On equal scores the first one encountered wins.
    """
    probabilities = 1 / (1 + np.exp(-outputs))
    best_label, best_score = "", 0
    for row in probabilities:
        for label_id, probability in enumerate(row):
            # A candidate must beat both the threshold and the current best
            if probability > threshold and probability > best_score:
                best_label = model.config.id2label[label_id]
                best_score = probability
    return best_label

# Runs the GoEmotions pipeline over every input/target pair of the test set
input_labels, target_labels = [], []

for index, source_text, paraphrase_text in zip(eval_df.index, eval_df["input_text"], eval_df["target_text"]):
    # Progress message every 1000 rows
    if index % 1000 == 0:
        print("Processing " + str(index))

    # Long texts are cut to their first 512 characters (not tokens) and marked
    # with a trailing '..' before being handed to the model
    i_text = source_text if len(source_text) <= 512 else source_text[:512] + '..'
    t_text = paraphrase_text if len(paraphrase_text) <= 512 else paraphrase_text[:512] + '..'

    # Pipeline output for each side of the pair
    input_emo = goemotions(i_text)
    target_emo = goemotions(t_text)

    # Dominant emotion label; "" when nothing clears the threshold
    input_label = Top_Score_Label(input_emo)
    target_label = Top_Score_Label(target_emo)

    input_labels.append(input_label)
    target_labels.append(target_label)

# Attaches the original-taxonomy labels to the data set
for column, values in (("input_emo", input_labels), ("target_emo", target_labels)):
    eval_df[column] = values

# Saves the labeled data in a new tsv file (row index included)
eval_df.to_csv(f'compiled-data/{dataset_name}/{dataset_name}-testing_emolabel.tsv', sep="\t")

In [None]:
# Filtering and data cleansing of the Mixed data set
import os
import pandas as pd

training_df = pd.read_csv("compiled-data/mix/mix-training_emolabel.tsv", sep="\t").astype(str)
testing_df = pd.read_csv("compiled-data/mix/mix-testing_emolabel.tsv", sep="\t").astype(str)


def _emotion_shift_pairs(labeled_df):
    """Keep only paraphrase pairs whose emotion actually changes.

    Removes rows where either ``input_emo`` or ``target_emo`` is blank (the
    string "nan" after ``astype(str)``) or "neutral", and rows where both
    labels match. The result gets a fresh 0..n-1 index.

    Every column whose name starts with "Unnamed" is dropped as well. These
    are row indexes written by earlier ``to_csv`` calls; there can be several
    of them ("Unnamed: 0", "Unnamed: 0.1", ...) or none, so they are removed
    by prefix rather than by one hard-coded name.
    """
    unusable_labels = ["nan", "neutral"]
    both_usable = ~labeled_df[["input_emo", "target_emo"]].isin(unusable_labels).any(axis=1)
    emotion_changes = labeled_df["input_emo"] != labeled_df["target_emo"]
    kept = labeled_df[both_usable & emotion_changes].reset_index(drop=True)
    stray_index_columns = [col for col in kept.columns if str(col).startswith("Unnamed")]
    return kept.drop(stray_index_columns, axis=1)


# Removes paraphrase pairs that include
# blanks, neutral labels, and matching emotion labels
training_emo_diff_df = _emotion_shift_pairs(training_df)
testing_emo_diff_df = _emotion_shift_pairs(testing_df)


# Emotion labels listed in the order of their shorthand codes:
# position i in this list is mapped to the string str(i)
emotion_code_order = [
    'neutral', 'amusement', 'excitement', 'joy', 'love', 'optimism', 'desire',
    'caring', 'pride', 'admiration', 'gratitude', 'relief', 'approval',
    'realization', 'surprise', 'curiosity', 'confusion', 'fear', 'nervousness',
    'remorse', 'embarrassment', 'disappointment', 'sadness', 'grief',
    'disgust', 'anger', 'annoyance', 'disapproval',
]

# Mapping of emotions to designated numbers (stored as strings)
emotion_shorthand_mapping = {
    emotion: str(code) for code, emotion in enumerate(emotion_code_order)
}


# Builds the "<input code> to <target code>" prefix for every training pair
# from the predicted emotions and their shorthand numbers
prefixes = [
    emotion_shorthand_mapping[source_emotion] + " to " + emotion_shorthand_mapping[goal_emotion]
    for source_emotion, goal_emotion in zip(
        training_emo_diff_df["input_emo"], training_emo_diff_df["target_emo"]
    )
]

# training_t5_df is another name for the same frame, so the prefix column is
# added to training_emo_diff_df as well; it becomes the first column
training_t5_df = training_emo_diff_df
training_t5_df.insert(0, "prefix", prefixes)

# Same steps for the Mix testing set
prefixes = [
    emotion_shorthand_mapping[source_emotion] + " to " + emotion_shorthand_mapping[goal_emotion]
    for source_emotion, goal_emotion in zip(
        testing_emo_diff_df["input_emo"], testing_emo_diff_df["target_emo"]
    )
]

testing_t5_df = testing_emo_diff_df
testing_t5_df.insert(0, "prefix", prefixes)


# Ensures directory exists for save location
# makedirs also creates the missing parent folder ('emotion-labeled-data'),
# which os.mkdir would fail on, and exist_ok avoids the separate existence check
path = 'emotion-labeled-data/mix'
os.makedirs(path, exist_ok=True)

# Saves data with prefixes in new location for fine-tuning and prediction use
training_t5_df.to_csv("emotion-labeled-data/mix/mix-training_t5.tsv", sep="\t")
testing_t5_df.to_csv("emotion-labeled-data/mix/mix-testing_t5.tsv", sep="\t")