In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the raw dataset CSV (path is relative to the notebook's working directory).
raw_csv_path = '../data/go_emotions_dataset.csv'
data = pd.read_csv(raw_csv_path)

In [4]:
# Every column other than these three is treated as a sentiment label column.
non_sentiment_cols = ['id', 'text', 'example_very_unclear']
sentiment_cols = [col for col in data.columns if col not in non_sentiment_cols]

# Row mask 1: drop rows flagged as very unclear. Kept as an explicit
# comparison (rather than `~column`) so the filter does not depend on the
# column being strictly boolean.
is_clear = data['example_very_unclear'] != True

# Row mask 2: keep rows where at least one sentiment column is set
# (row-wise sum of the label columns is positive).
has_label = data[sentiment_cols].sum(axis=1) > 0

# Apply both masks in one selection and take an explicit copy, so that later
# cells can add or overwrite columns on `filtered_data` without pandas raising
# SettingWithCopyWarning about writing to a slice of `data`.
filtered_data = data[is_clear & has_label].copy()

# check the distribution of sentiment columns (per-label column totals)
sentiment_distribution = filtered_data[sentiment_cols].sum()
print("Sentiment Distribution:")
sentiment_distribution

Sentiment Distribution:


admiration        17131
amusement          9245
anger              8084
annoyance         13618
approval          17620
caring             5999
confusion          7359
curiosity          9692
desire             3817
disappointment     8469
disapproval       11424
disgust            5301
embarrassment      2476
excitement         5629
fear               3197
gratitude         11625
grief               673
joy                7983
love               8191
nervousness        1810
optimism           8715
pride              1302
realization        8785
relief             1289
remorse            2525
sadness            6758
surprise           5514
neutral           55298
dtype: int64

In [5]:
# Merge the sentiment columns based on distribution and meaning.
# Each key is the name of a merged column; its value lists the source columns
# that are OR-ed together to produce it. This mapping is the single source of
# truth for both the merge itself and for which original columns are dropped
# from the final dataset, so the two cannot drift apart.
merge_map = {
    'positive recognition': ['admiration', 'approval'],
    'anger': ['anger', 'annoyance'],
    'sadness': ['sadness', 'grief'],
    'anxiety': ['fear', 'nervousness'],
    'regret': ['remorse', 'disappointment'],
    'happiness': ['gratitude', 'joy', 'amusement', 'excitement',
                  'optimism', 'pride', 'relief'],
    'discomfort': ['embarrassment', 'confusion', 'disgust', 'disapproval'],
    'affection': ['love', 'caring'],
    'curiosity': ['desire', 'curiosity'],
    'surprise': ['realization', 'surprise'],
}


def _or_columns(frame, columns):
    """Return the element-wise `|` of the given columns of `frame`.

    Uses the `|` operator (not `.any()`) so the result keeps the dtype the
    source columns produce under `|`, matching a hand-written
    `frame[a] | frame[b] | ...` expression.
    """
    combined = frame[columns[0]]
    for col in columns[1:]:
        combined = combined | frame[col]
    return combined


# Compute every merged column from the current frame first. No merged column
# is built from another merged column's output, so evaluating them all up
# front gives the same values as assigning them one after another.
merged_values = {
    merged_name: _or_columns(filtered_data, sources)
    for merged_name, sources in merge_map.items()
}

# `assign` returns a new DataFrame instead of writing into a filtered slice,
# which avoids SettingWithCopyWarning. Existing columns (anger, sadness,
# curiosity, surprise) are overwritten in place; new ones are appended in
# mapping order.
filtered_data = filtered_data.assign(**merged_values)

# list of merged columns (same order as the mapping above)
merged_columns = list(merge_map)

# not merged columns: everything that is neither a merged column nor one of
# the source columns consumed by a merge
consumed_columns = {col for sources in merge_map.values() for col in sources}
unmerged_columns = [
    col for col in filtered_data.columns
    if col not in merged_columns and col not in consumed_columns]

# combine merged and unmerged columns for final dataset
final_columns = unmerged_columns + merged_columns
final_data = filtered_data[final_columns]

# see updated columns in the final dataset
final_data.columns

Index(['id', 'text', 'example_very_unclear', 'neutral', 'positive recognition',
       'anger', 'sadness', 'anxiety', 'regret', 'happiness', 'discomfort',
       'affection', 'curiosity', 'surprise'],
      dtype='object')

In [5]:
# Persist the merged/filtered frame as CSV; the DataFrame index is not written.
merged_csv_path = '../data/merged_filtered_data.csv'
final_data.to_csv(merged_csv_path, index=False)