In [None]:
# Combining GoEmotions Dataset
import numpy as np
import pandas as pd

# Load the three GoEmotions splits and cast every column to str.
# keep_default_na=False stops pandas from parsing literal comment text such as
# "NA", "N/A" or "null" as missing; without it, .astype(str) would rewrite
# those comments as the string "nan".
df1 = pd.read_csv("original-data/goemotions/dev.tsv", encoding='utf-8', sep="\t", keep_default_na=False).astype(str)
df2 = pd.read_csv("original-data/goemotions/train.tsv", encoding='utf-8', sep="\t", keep_default_na=False).astype(str)
df3 = pd.read_csv("original-data/goemotions/test.tsv", encoding='utf-8', sep="\t", keep_default_na=False).astype(str)

# ignore_index=True gives the combined frame one unique 0..N-1 index instead
# of three overlapping per-file ranges.
all_df = pd.concat([df1, df2, df3], ignore_index=True)
all_df = all_df.drop(columns=['etc'])

# The index is still written on purpose: the VADER cell reads it back as
# 'Unnamed: 0' and drops that column by name.
all_df.to_csv("original-data/goemotions/all_goemotions.tsv", encoding='utf-8', sep='\t')

In [None]:
# Applying VADER Sentiment scores to GoEmotions Dataset

import nltk
# Fetch the VADER lexicon before the analyzer is constructed below.
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Reload the combined dataset with every column cast to str.
# keep_default_na=False keeps comment text such as "NA", "N/A" or "null"
# intact; with the default NA parsing those values become NaN and
# .astype(str) then turns them into the string "nan".
df = pd.read_csv(
    "original-data/goemotions/all_goemotions.tsv",
    encoding='utf-8',
    sep='\t',
    keep_default_na=False,
).astype(str)

# Maps a label id (kept as a string, matching the str-cast `emotion` column)
# to its emotion name. The position of a name in the list below is its id.
number_emotion = {
    str(label_id): emotion_name
    for label_id, emotion_name in enumerate([
        'admiration',
        'amusement',
        'anger',
        'annoyance',
        'approval',
        'caring',
        'confusion',
        'curiosity',
        'desire',
        'disappointment',
        'disapproval',
        'disgust',
        'embarrassment',
        'excitement',
        'fear',
        'gratitude',
        'grief',
        'joy',
        'love',
        'nervousness',
        'optimism',
        'pride',
        'realization',
        'relief',
        'remorse',
        'sadness',
        'surprise',
        'neutral',
    ])
}

# Score every comment with VADER, keeping only the 'compound' value.
# Iterating the text column directly avoids the per-row Series that
# DataFrame.iterrows() builds just to read a single field.
all_sids = [sid.polarity_scores(text)['compound'] for text in df['text']]

df['sentiment_score'] = all_sids

# 'Unnamed: 0' is the index column written by the earlier to_csv call.
# errors='ignore' lets this cell still run if the file was saved without it.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# NOTE: this overwrites the combined file in place, now with the added
# sentiment_score column (the index is again written as the first column).
df.to_csv("original-data/goemotions/all_goemotions.tsv", encoding='utf-8', sep='\t')

In [None]:
# Compiling VADER scores for all 28 Emotions (Neutral Included)
# Reload the scored TSV, then cast every column to str so that `emotion`
# and `sentiment_score` are both plain strings for the loop below.
df = pd.read_csv(
    "original-data/goemotions/all_goemotions.tsv",
    sep='\t',
    encoding='utf-8',
)
df = df.astype(str)

# One independent, initially empty list of sentiment scores per label id
# ('0' through '27'); keys are strings to match the str-cast `emotion` column.
score_totals = {str(label_id): [] for label_id in range(28)}

# Collect the compound score of every comment that carries exactly one
# emotion label. Multi-label rows (comma-separated ids) are skipped.
# Zipping the two columns avoids the per-row Series built by iterrows().
for emotion, score in zip(df['emotion'], df['sentiment_score']):
    if ',' not in emotion:
        score_totals[emotion].append(float(score))

# Show how many scores each label collected instead of dumping every raw
# score into the cell output.
print({label: len(scores) for label, scores in score_totals.items()})

In [None]:
# Median
import statistics as stats

# Median compound score per emotion, keyed by the emotion's name.
emotion_median = {
    number_emotion[label]: stats.median(scores)
    for label, scores in score_totals.items()
}

# Re-key in ascending order of median (dicts preserve insertion order).
sort = dict(sorted(emotion_median.items(), key=lambda pair: pair[1]))

# Parallel lists in sorted order: emotion names and their medians.
final_emotions = list(sort)
median_scores = list(sort.values())

for item in final_emotions:
    print(f'Emotion: {item}, Median: {sort[item]} \n')

# Note: `df` is rebound here to the two-column summary table.
df = pd.DataFrame({'emotions': final_emotions, 'median_scores': median_scores})

df.to_csv('original-data/goemotions/median-w-outliers.tsv', sep='\t')

In [None]:
# Median Without Outliers
import copy
import statistics as stats

# Trim a copy so the untrimmed per-emotion scores in score_totals stay intact.
temp_score_totals = copy.deepcopy(score_totals)

# Keep only scores inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for each emotion.
for label, scores in temp_score_totals.items():
    q1, q3 = np.percentile(scores, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    temp_score_totals[label] = [x for x in scores if lower_bound <= x <= upper_bound]

# Median of the trimmed scores, keyed by the emotion's name.
emotion_median = {
    number_emotion[label]: stats.median(scores)
    for label, scores in temp_score_totals.items()
}

# Re-key in ascending order of median (dicts preserve insertion order).
sort = dict(sorted(emotion_median.items(), key=lambda pair: pair[1]))

# Parallel lists in sorted order: emotion names and their medians.
final_emotions = list(sort)
median_scores = list(sort.values())

for item in final_emotions:
    print(f'Emotion: {item}, Median: {sort[item]} \n')

# Note: `df` is rebound here to the two-column summary table.
df = pd.DataFrame({'emotions': final_emotions, 'median_scores': median_scores})

df.to_csv('original-data/goemotions/median-wo-outliers.tsv', sep='\t')

In [None]:
# Mean
import statistics as stats

# Mean compound score per emotion, keyed by the emotion's name.
emotion_mean = {
    number_emotion[label]: stats.mean(scores)
    for label, scores in score_totals.items()
}

# Re-key in ascending order of mean (dicts preserve insertion order).
sort = dict(sorted(emotion_mean.items(), key=lambda pair: pair[1]))

# Parallel lists in sorted order: emotion names and their means.
final_emotions = list(sort)
mean_scores = list(sort.values())

for item in final_emotions:
    print(f'Emotion: {item}, Mean: {sort[item]} \n')

# Note: `df` is rebound here to the two-column summary table.
df = pd.DataFrame({'emotions': final_emotions, 'mean_scores': mean_scores})

df.to_csv('original-data/goemotions/mean-w-outliers.tsv', sep='\t')

In [None]:
# Mean Without Outliers
import copy
import statistics as stats

# Trim a copy so the untrimmed per-emotion scores in score_totals stay intact.
temp_score_totals = copy.deepcopy(score_totals)

# Keep only scores inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for each emotion.
for label, scores in temp_score_totals.items():
    q1, q3 = np.percentile(scores, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    temp_score_totals[label] = [x for x in scores if lower_bound <= x <= upper_bound]

# Mean of the trimmed scores, keyed by the emotion's name.
emotion_mean = {
    number_emotion[label]: stats.mean(scores)
    for label, scores in temp_score_totals.items()
}

# Re-key in ascending order of mean (dicts preserve insertion order).
sort = dict(sorted(emotion_mean.items(), key=lambda pair: pair[1]))

# Parallel lists in sorted order: emotion names and their means.
final_emotions = list(sort)
mean_scores = list(sort.values())

for item in final_emotions:
    print(f'Emotion: {item}, Mean: {sort[item]} \n')

# Note: `df` is rebound here to the two-column summary table.
df = pd.DataFrame({'emotions': final_emotions, 'mean_scores': mean_scores})

df.to_csv('original-data/goemotions/mean-wo-outliers.tsv', sep='\t')