In [None]:
# Emotion labels grouped into five valence/intensity ranges.
# Each label appears in exactly one group.
high_neg_emo = {
    'anger',
    'disgust',
    'fear',
    'grief',
    'sadness',
}
low_neg_emo = {
    'annoyance',
    'disappointment',
    'disapproval',
    'embarrassment',
    'nervousness',
    'remorse',
}
neutral_emo = {
    'confusion',
    'curiosity',
    'neutral',
    'realization',
    'surprise',
}
low_pos_emo = {
    'approval',
    'caring',
    'desire',
    'relief',
}
high_pos_emo = {
    'admiration',
    'amusement',
    'excitement',
    'gratitude',
    'joy',
    'love',
    'optimism',
    'pride',
}

In [None]:
# Ask which dataset to process. The answer is normalized (surrounding
# whitespace removed, lowercased) so the value that was validated is the same
# value later used to build file paths.
VALID_DATASETS = ('google', 'msr', 'quora', 'mix', 'twit0.825')

dataset_name = input('Dataset to label emotions (google, msr, quora, mix, twit0.825):  ').strip().lower()

# Stop here on an unknown name instead of continuing and failing later with a
# less helpful file-not-found error when the TSV files are loaded.
if dataset_name not in VALID_DATASETS:
    raise ValueError(
        'Please enter a valid dataset name '
        f'(one of {", ".join(VALID_DATASETS)}); got {dataset_name!r}'
    )

print('Dataset selected: ' + dataset_name)

# Per-split stats records, keyed by a human-readable dataset/split name.
training_stats = {'dataset name': dataset_name + '-training'}
eval_stats = {'dataset name': dataset_name + '-testing'}

In [None]:
# Loading train_df and eval_df from files instead of rerunning vader
import numpy as np
import pandas as pd
import os

# Read the emotion-labelled training and testing splits (tab-separated, UTF-8)
# for the selected dataset.
train_df = pd.read_csv(
    f'compiled-data/{dataset_name}/{dataset_name}-training_emolabel_s.tsv',
    sep="\t",
    encoding='utf-8',
)
eval_df = pd.read_csv(
    f'compiled-data/{dataset_name}/{dataset_name}-testing_emolabel_s.tsv',
    sep="\t",
    encoding='utf-8',
)

In [None]:
def remove_unusable_pairs(df):
    """Drop paraphrase pairs that cannot be used as an emotion transition.

    A pair is removed when either emotion label is blank, either label is
    'neutral', or the input and target labels match.

    Args:
        df: DataFrame with 'input_emo' and 'target_emo' columns.

    Returns:
        A new DataFrame with the remaining rows, a fresh 0..n-1 index, and the
        'Unnamed: 0' column (a saved index from an earlier export) removed if
        it is present.
    """
    labels = df[['input_emo', 'target_emo']]
    # pandas.read_csv turns empty fields and the text "nan" into real NaN
    # values by default, so comparing against the string "nan" alone never
    # removes blanks. Check for missing values as well as the literal string.
    unusable = labels.isna() | labels.isin(['nan', 'neutral'])
    keep = ~unusable.any(axis=1) & (df['input_emo'] != df['target_emo'])
    return (
        df[keep]
        .reset_index(drop=True)
        # errors='ignore' keeps this cell re-runnable once the column is gone.
        .drop(columns=['Unnamed: 0'], errors='ignore')
    )


# Removes paraphrase pairs that include
# blanks, neutral labels, and matching emotion labels
train_df = remove_unusable_pairs(train_df)
eval_df = remove_unusable_pairs(eval_df)

In [None]:
# Combine the filtered training and testing pairs into a single frame.
# Both inputs carry their own 0..n-1 index, so ignore_index=True is used to
# give the combined frame unique row labels instead of repeated ones.
full_df = pd.concat([train_df, eval_df], ignore_index=True)

In [None]:
def _emo_range(emo):
    """Return the range name for an emotion label, or 'unknown' if ungrouped."""
    # Checked in the same order as the module-level sets are declared, so the
    # first matching group wins.
    for range_name, members in (
        ('high_neg', high_neg_emo),
        ('low_neg', low_neg_emo),
        ('neutral', neutral_emo),
        ('low_pos', low_pos_emo),
        ('high_pos', high_pos_emo),
    ):
        if emo in members:
            return range_name
    return 'unknown'


def _print_counts(counts):
    """Print one 'label: count' line per entry, in insertion order."""
    for label, count in counts.items():
        print(f'{label}: {count}')


def calculate_emos_trans(full_df):
    """Print emotion, transition, and range-transition counts for a frame.

    Four tallies are printed, each in order of first appearance:
    input emotions, target emotions, "input to target" emotion transitions,
    and "input range to target range" transitions.

    A label that belongs to none of the five emotion groups is reported under
    the range name 'unknown', so it can neither raise an unbound-variable
    error nor silently reuse the range computed for the previous row.

    Args:
        full_df: DataFrame with 'input_emo' and 'target_emo' columns.

    Returns:
        None. The counts are written to standard output.
    """
    input_emo_counts = {}
    target_emo_counts = {}
    emo_trans_counts = {}
    emo_range_counts = {}

    for row in full_df.itertuples(index=False):
        input_emo, target_emo = row.input_emo, row.target_emo

        input_emo_counts[input_emo] = input_emo_counts.get(input_emo, 0) + 1
        target_emo_counts[target_emo] = target_emo_counts.get(target_emo, 0) + 1

        key = f'{input_emo} to {target_emo}'
        emo_trans_counts[key] = emo_trans_counts.get(key, 0) + 1

        rg_key = f'{_emo_range(input_emo)} to {_emo_range(target_emo)}'
        emo_range_counts[rg_key] = emo_range_counts.get(rg_key, 0) + 1

    print("# Input Emotions: " + str(len(input_emo_counts)))
    _print_counts(input_emo_counts)

    print('\n')
    print("# Target Emotions: " + str(len(target_emo_counts)))
    _print_counts(target_emo_counts)

    print('\n')
    _print_counts(emo_trans_counts)

    print('\n')
    _print_counts(emo_range_counts)

In [None]:
# Print emotion, transition, and range counts for the combined train + eval data.
calculate_emos_trans(full_df)

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Re-split the combined pairs 80/20 with a fixed seed so the split is repeatable.
new_train_df, new_eval_df = train_test_split(
    full_df,
    train_size=0.8,
    random_state=50,
)

# Replace the shuffled row labels with a fresh 0..n-1 index and discard the
# 'index' column that reset_index() creates from the old labels.
new_train_df = new_train_df.reset_index().drop(['index'], axis=1)
new_eval_df = new_eval_df.reset_index().drop(['index'], axis=1)

# Report the label distribution of each new split.
calculate_emos_trans(new_train_df)
print('\n')
calculate_emos_trans(new_eval_df)

# Save both splits as tab-separated UTF-8 files.
new_train_df.to_csv(
    f'compiled-data/{dataset_name}/{dataset_name}-training_emolabel_f.tsv',
    sep='\t',
    encoding='utf-8',
)
new_eval_df.to_csv(
    f'compiled-data/{dataset_name}/{dataset_name}-testing_emolabel_f.tsv',
    sep='\t',
    encoding='utf-8',
)

In [None]:
def cap_emo_trans(df, cap):
    """Keep at most `cap` rows per "input to target" emotion transition.

    Rows are scanned in order and the earliest rows of each transition are the
    ones kept. The first row of a transition is always kept, so every
    transition present in `df` retains at least one row even when `cap` is
    less than 1.

    The selection is done once with positional indexing rather than by
    appending rows one at a time: DataFrame.append was removed in pandas 2.0,
    and row-by-row growth is quadratic and discards column dtypes.

    Args:
        df: DataFrame with string 'input_emo' and 'target_emo' columns.
        cap: Maximum number of rows to keep per transition.

    Returns:
        A new DataFrame containing the kept rows with their original index
        labels, column order, and dtypes.
    """
    if df.empty:
        return df.copy()

    emo_trans_counts = {}
    kept_positions = []

    pairs = zip(df['input_emo'], df['target_emo'])
    for position, (input_emo, target_emo) in enumerate(pairs):
        key = input_emo + " to " + target_emo
        seen = emo_trans_counts.get(key, 0)
        # seen == 0 mirrors the rule that a transition's first row is always kept.
        if seen == 0 or seen < cap:
            emo_trans_counts[key] = seen + 1
            kept_positions.append(position)

    return df.iloc[kept_positions]

In [None]:
# Limit each emotion transition to 20 training pairs and 5 evaluation pairs.
capped_train_df = cap_emo_trans(new_train_df, cap=20)
capped_eval_df = cap_emo_trans(new_eval_df, cap=5)

# Report the label distribution of each capped split.
calculate_emos_trans(capped_train_df)
print('\n')
calculate_emos_trans(capped_eval_df)

# Save the capped splits as tab-separated UTF-8 files.
capped_train_df.to_csv(
    f'compiled-data/{dataset_name}/{dataset_name}-training_emolabel_f.tsv',
    sep='\t',
    encoding='utf-8',
)
capped_eval_df.to_csv(
    f'compiled-data/{dataset_name}/{dataset_name}-testing_emolabel_f.tsv',
    sep='\t',
    encoding='utf-8',
)

In [None]:
# Stage 1: keep at most `threshold` pairs per exact "input to target" emotion
# transition, scanning full_df in order. Kept rows are selected once by
# position; DataFrame.append was removed in pandas 2.0 and growing a frame
# row by row is quadratic.
threshold = 25
emo_trans_counts = {}
kept_positions = []

for position, (input_emo, target_emo) in enumerate(
    zip(full_df['input_emo'], full_df['target_emo'])
):
    key = input_emo + " to " + target_emo
    seen = emo_trans_counts.get(key, 0)
    # The first pair of every transition is always kept.
    if seen == 0 or seen < threshold:
        emo_trans_counts[key] = seen + 1
        kept_positions.append(position)

new_full_df = full_df.iloc[kept_positions]

# Stage 2: from the stage-1 result, keep at most `threshold_rg` pairs per
# "input range to target range" transition.
threshold_rg = 30
emo_range_counts = {}

# Emotion label -> range name. setdefault keeps the first group a label is
# found in. Labels in no group map to 'unknown' below, so a row can never
# reuse the range computed for the previous row.
emo_to_range = {}
for range_name, emos in (
    ('high_neg', high_neg_emo),
    ('low_neg', low_neg_emo),
    ('neutral', neutral_emo),
    ('low_pos', low_pos_emo),
    ('high_pos', high_pos_emo),
):
    for emo in emos:
        emo_to_range.setdefault(emo, range_name)

kept_positions_rg = []

for position, (input_emo, target_emo) in enumerate(
    zip(new_full_df['input_emo'], new_full_df['target_emo'])
):
    input_emo_range = emo_to_range.get(input_emo, 'unknown')
    target_emo_range = emo_to_range.get(target_emo, 'unknown')

    key = input_emo_range + " to " + target_emo_range
    seen = emo_range_counts.get(key, 0)
    if seen == 0 or seen < threshold_rg:
        emo_range_counts[key] = seen + 1
        kept_positions_rg.append(position)

new_full_df_rg = new_full_df.iloc[kept_positions_rg]


In [None]:
# Print counts for the transition-capped data (new_full_df); the range-capped
# new_full_df_rg is not reported here.
calculate_emos_trans(new_full_df)

In [None]:
# Save the filtered (not re-split, not capped) train/eval frames as
# tab-separated UTF-8 files.
# NOTE(review): these are the same output paths written by the re-split and
# capped cells earlier in the notebook, so whichever of those cells runs last
# determines the file contents - confirm which output is intended.
train_df.to_csv(
    f'compiled-data/{dataset_name}/{dataset_name}-training_emolabel_f.tsv',
    sep='\t',
    encoding='utf-8',
)
eval_df.to_csv(
    f'compiled-data/{dataset_name}/{dataset_name}-testing_emolabel_f.tsv',
    sep='\t',
    encoding='utf-8',
)