In [76]:
from collections import Counter
import pandas as pd
import numpy as np

# MELD

In [77]:
# Load the MELD training split; each row's 'emotions' entry is iterated as a
# per-conversation sequence of labels, which are pooled into one flat list.
train_df = pd.read_json('data/EDiReF_train_data/MELD_train_efr.json')

flattened_emotions = []
for conversation_emotions in train_df['emotions']:
    flattened_emotions.extend(conversation_emotions)

# Tally how often each emotion label occurs in the training split.
emotion_count_train = Counter(flattened_emotions)

# Print one (label, count) tuple per line, in first-seen order.
for label, count in emotion_count_train.items():
    print((label, count))

('neutral', 15263)
('surprise', 4645)
('fear', 1114)
('sadness', 2648)
('joy', 6317)
('disgust', 1049)
('anger', 3964)


In [78]:
# Load the MELD validation split and pool every conversation's emotion labels
# into one flat list.
val_df = pd.read_json('data/EDiReF_val_data/MELD_val_efr.json')

flattened_emotions = []
for conversation_emotions in val_df['emotions']:
    flattened_emotions.extend(conversation_emotions)

# Tally how often each emotion label occurs in the validation split.
emotion_count_val = Counter(flattened_emotions)

# Print one (label, count) tuple per line, in first-seen order.
for label, count in emotion_count_val.items():
    print((label, count))

('neutral', 1360)
('joy', 597)
('sadness', 343)
('surprise', 520)
('anger', 482)
('disgust', 64)
('fear', 156)


In [79]:
# Load the MELD test split and pool every conversation's emotion labels into
# one flat list.
test_df = pd.read_json('data/EDiReF_test_data/MELD_test_efr.json')

flattened_emotions = []
for conversation_emotions in test_df['emotions']:
    flattened_emotions.extend(conversation_emotions)

# Tally how often each emotion label occurs in the test split.
emotion_count_test = Counter(flattened_emotions)

# Print one (label, count) tuple per line, in first-seen order.
for label, count in emotion_count_test.items():
    print((label, count))

('surprise', 1073)
('anger', 1215)
('neutral', 3784)
('joy', 1376)
('sadness', 712)
('fear', 177)
('disgust', 305)


In [80]:
# Combine the per-split tallies: Counter addition sums the counts label by label.
# Train and validation are merged first, then the test split is added on top.
total_emotion_count = emotion_count_train + emotion_count_val
total_emotion_count = total_emotion_count + emotion_count_test

# Print one (label, count) tuple per line for the combined tally.
for label, count in total_emotion_count.items():
    print((label, count))

('neutral', 20407)
('surprise', 6238)
('fear', 1447)
('sadness', 3703)
('joy', 8290)
('disgust', 1418)
('anger', 5661)


In [81]:
# Keep only conversations whose trigger list is fully annotated, i.e. contains no
# None/NaN entry. pd.isna() reports True for both None and NaN, so the 'triggers'
# column does not need to be rewritten with NaN placeholders beforehand; rows that
# survive the filter contain no missing entries, so their contents are unchanged.
# NOTE: train_df / val_df are rebound to the filtered frames, so every later cell
# that reads them (e.g. utterance counts) sees only the fully annotated conversations.
train_triggers_complete = train_df["triggers"].apply(lambda lst: not any(pd.isna(x) for x in lst))
# .copy() detaches the filtered frame from the original so that later column
# assignments or a re-run of this cell do not raise SettingWithCopyWarning.
train_df = train_df[train_triggers_complete].copy()
flattened_triggers_train = [sent for conv in train_df['triggers'] for sent in conv]

# Same filtering for the validation split.
val_triggers_complete = val_df["triggers"].apply(lambda lst: not any(pd.isna(x) for x in lst))
val_df = val_df[val_triggers_complete].copy()
flattened_triggers_val = [sent for conv in val_df['triggers'] for sent in conv]

In [82]:
# Count the trigger flags equal to the float 1.0 in the filtered train and
# validation splits, and print each count on its own line.
num_triggers_train = sum(1 for flag in flattened_triggers_train if flag == 1.0)
num_triggers_val = sum(1 for flag in flattened_triggers_val if flag == 1.0)
print(num_triggers_train)
print(num_triggers_val)

5558
492


In [83]:
# Pool the utterances of every conversation in train_df and report how many there
# are. train_df is the frame as it stands at this point, i.e. after conversations
# with missing trigger annotations were dropped in the earlier filtering cell.
flattened_conversations_train = []
for conversation in train_df['utterances']:
    flattened_conversations_train.extend(conversation)
print(len(flattened_conversations_train))

34897


In [84]:
# Pool the utterances of every conversation in val_df and report how many there
# are. val_df is the frame as it stands at this point, i.e. after conversations
# with missing trigger annotations were dropped in the earlier filtering cell.
flattened_conversations_val = []
for conversation in val_df['utterances']:
    flattened_conversations_val.extend(conversation)
print(len(flattened_conversations_val))

3513


In [85]:
# Pool the utterances of every conversation in test_df and report how many there
# are. Unlike train_df / val_df, test_df has not been filtered on triggers in the
# cells above, so this count covers the test split as loaded.
flattened_conversations_test = []
for conversation in test_df['utterances']:
    flattened_conversations_test.extend(conversation)
print(len(flattened_conversations_test))

8642


# MaSaC

In [86]:
# Load the MaSaC training split (rebinding train_df, previously the MELD frame)
# and pool every conversation's emotion labels into one flat list.
train_df = pd.read_json('data/EDiReF_train_data/MaSaC_train_efr.json')

flattened_emotions = []
for conversation_emotions in train_df['emotions']:
    flattened_emotions.extend(conversation_emotions)

# Tally how often each emotion label occurs in the training split.
emotion_count_train = Counter(flattened_emotions)

# Print one (label, count) tuple per line, in first-seen order.
for label, count in emotion_count_train.items():
    print((label, count))

('neutral', 44735)
('anger', 9715)
('surprise', 5233)
('fear', 6476)
('joy', 18446)
('contempt', 5468)
('sadness', 7184)
('disgust', 1520)


In [87]:
# Load the MaSaC validation split (rebinding val_df, previously the MELD frame)
# and pool every conversation's emotion labels into one flat list.
val_df = pd.read_json('data/EDiReF_val_data/MaSaC_val_efr.json')

flattened_emotions = []
for conversation_emotions in val_df['emotions']:
    flattened_emotions.extend(conversation_emotions)

# Tally how often each emotion label occurs in the validation split.
emotion_count_val = Counter(flattened_emotions)

# Print one (label, count) tuple per line, in first-seen order.
for label, count in emotion_count_val.items():
    print((label, count))

('neutral', 3159)
('anger', 639)
('surprise', 318)
('fear', 478)
('joy', 1801)
('contempt', 493)
('sadness', 487)
('disgust', 87)


In [88]:
# Load the MaSaC test split (rebinding test_df, previously the MELD frame)
# and pool every conversation's emotion labels into one flat list.
test_df = pd.read_json('data/EDiReF_test_data/MaSaC_test_efr.json')

flattened_emotions = []
for conversation_emotions in test_df['emotions']:
    flattened_emotions.extend(conversation_emotions)

# Tally how often each emotion label occurs in the test split.
emotion_count_test = Counter(flattened_emotions)

# Print one (label, count) tuple per line, in first-seen order.
for label, count in emotion_count_test.items():
    print((label, count))

('neutral', 3265)
('anger', 749)
('surprise', 348)
('fear', 445)
('joy', 1730)
('sadness', 536)
('contempt', 547)
('disgust', 70)


In [89]:
# Combine the per-split MaSaC tallies: Counter addition sums the counts label by
# label. Train and validation are merged first, then the test split is added.
total_emotion_count = emotion_count_train + emotion_count_val
total_emotion_count = total_emotion_count + emotion_count_test

# Print one (label, count) tuple per line for the combined tally.
for label, count in total_emotion_count.items():
    print((label, count))

('neutral', 51159)
('anger', 11103)
('surprise', 5899)
('fear', 7399)
('joy', 21977)
('contempt', 6508)
('sadness', 8207)
('disgust', 1677)


In [90]:
# Keep only conversations whose trigger list is fully annotated, i.e. contains no
# None/NaN entry. pd.isna() reports True for both None and NaN, so the 'triggers'
# column does not need to be rewritten with NaN placeholders beforehand; rows that
# survive the filter contain no missing entries, so their contents are unchanged.
# NOTE: train_df / val_df are rebound to the filtered frames, so every later cell
# that reads them (e.g. utterance counts) sees only the fully annotated conversations.
train_triggers_complete = train_df["triggers"].apply(lambda lst: not any(pd.isna(x) for x in lst))
# .copy() detaches the filtered frame from the original so that later column
# assignments or a re-run of this cell do not raise SettingWithCopyWarning.
train_df = train_df[train_triggers_complete].copy()
flattened_triggers_train = [sent for conv in train_df['triggers'] for sent in conv]

# Same filtering for the validation split.
val_triggers_complete = val_df["triggers"].apply(lambda lst: not any(pd.isna(x) for x in lst))
val_df = val_df[val_triggers_complete].copy()
flattened_triggers_val = [sent for conv in val_df['triggers'] for sent in conv]

In [91]:
# Count the trigger flags equal to the string '1' in the filtered train and
# validation splits, and print each count on its own line.
# NOTE(review): the MELD section compares against the float 1.0, whereas this cell
# compares against the string '1' — presumably the MaSaC JSON stores triggers as
# strings; confirm, because numeric 1 / 1.0 entries would not be counted here.
num_triggers_train = sum(1 for flag in flattened_triggers_train if flag == '1')
num_triggers_val = sum(1 for flag in flattened_triggers_val if flag == '1')
print(num_triggers_train)
print(num_triggers_val)

1991
382


In [92]:
# Pool the utterances of every conversation in train_df and report how many there
# are. train_df is the frame as it stands at this point, i.e. after the
# trigger-based filtering applied in the earlier cell.
flattened_conversations_train = []
for conversation in train_df['utterances']:
    flattened_conversations_train.extend(conversation)
print(len(flattened_conversations_train))

98777


In [93]:
# Pool the utterances of every conversation in val_df and report how many there
# are. val_df is the frame as it stands at this point, i.e. after the
# trigger-based filtering applied in the earlier cell.
flattened_conversations_val = []
for conversation in val_df['utterances']:
    flattened_conversations_val.extend(conversation)
print(len(flattened_conversations_val))

7462


In [94]:
# Pool the utterances of every conversation in test_df and report how many there
# are. test_df has not been filtered on triggers in the cells above, so this
# count covers the test split as loaded.
flattened_conversations_test = []
for conversation in test_df['utterances']:
    flattened_conversations_test.extend(conversation)
print(len(flattened_conversations_test))

7690
