In [1]:
import numpy as np
import pandas as pd
import os
import csv

## EmoBank

Load in EmoBank and AffectiveText.

In [49]:
# EmoBank
eb = pd.read_csv('../data/raw/emobank.csv')
print("EmoBank:")
print(eb.head())
# drop duplicate text values
eb = eb.drop_duplicates(subset=['text'], keep=False)
# save to csv
eb.to_csv('../data/clean/emobank.csv', index=False)

# AffectiveText trial
at = pd.read_xml('../data/raw/AffectiveText/AffectiveText.trial/affectivetext_trial.xml')
columns = ['id', 'anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']
at_em = pd.read_csv('../data/raw/AffectiveText/AffectiveText.trial/affectivetext_trial_emotions.csv'\
                    ,names=columns, sep=' ')
# add emotion labels to at
for emotion in columns[1:]:
    at[emotion] = at_em[emotion]

# AffectiveText test
at_test = pd.read_xml('../data/raw/AffectiveText/AffectiveText.test/affectivetext_test.xml')
at_test_em = pd.read_csv('../data/raw/AffectiveText/AffectiveText.test/affectivetext_test_emotions.csv'\
                    , names=columns, sep=' ')
# add emotion labels to at_test
for emotion in columns[1:]:
    at_test[emotion] = at_test_em[emotion]
    
# combine at and at_test
at = pd.concat([at, at_test], ignore_index=True)
print("AffectiveText:")
print(at.head())
# drop duplicate text values
at = at.drop_duplicates(subset=['instance'], keep=False)
# save to csv
at.to_csv('../data/clean/AffectiveText.csv', index=False)

EmoBank:
                    id  split     V     A     D  \
0  110CYL068_1036_1079  train  3.00  3.00  3.20   
1  110CYL068_1079_1110   test  2.80  3.10  2.80   
2  110CYL068_1127_1130  train  3.00  3.00  3.00   
3  110CYL068_1137_1188  train  3.44  3.00  3.22   
4  110CYL068_1189_1328  train  3.55  3.27  3.46   

                                                text  
0        Remember what she said in my last letter? "  
1                          If I wasn't working here.  
2                                                .."  
3  Goodwill helps people get off of public assist...  
4  Sherry learned through our Future Works class ...  
AffectiveText:
   id                                   instance  anger  disgust  fear  joy  \
0   1     Mortar assault leaves at least 18 dead     22        2    60    0   
1   2                     Goal delight for Sheva      0        0     0   93   
2   3       Nigeria hostage feared dead is freed     18        0    52   66   
3   4                  

Load in SemEval2018

In [39]:
# SemEval2018
se_train = pd.read_csv('../data/raw/SemEval2018/2018-E-c-En-train.txt', sep='\t', names=['ID', 'text', 'anger', 'anticipation', 'disgust', 'fear', 'joy',	'love',	'optimism', 'pessimism', 'sadness',	'surprise',	'trust'])
# drop the first row
se_train = se_train.drop([0])
# drop the ID column
se_train = se_train.drop(['ID'], axis=1)

se_train.to_csv('../data/clean/SemEval2018_train.csv', index=False)
print("SemEval2018:")
se_train.head()

SemEval2018:


Unnamed: 0,text,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
1,“Worry is a down payment on a problem you may ...,0,1,0,0,0,0,1,0,0,0,1
2,Whatever you decide to do make sure it makes y...,0,0,0,0,1,1,1,0,0,0,0
3,@Max_Kellerman it also helps that the majorit...,1,0,1,0,1,0,1,0,0,0,0
4,Accept the challenges so that you can literall...,0,0,0,0,1,0,1,0,0,0,0
5,My roommate: it's okay that we can't spell bec...,1,0,1,0,0,0,0,0,0,0,0


In [42]:
# SemEval2018
se_val = pd.read_csv('../data/raw/SemEval2018/2018-E-c-En-dev.txt', sep='\t', names=['ID', 'text', 'anger', 'anticipation', 'disgust', 'fear', 'joy',	'love',	'optimism', 'pessimism', 'sadness',	'surprise',	'trust'])
# drop the first row
se_val = se_val.drop([0])
# drop the ID column
se_val = se_val.drop(['ID'], axis=1)

se_val.to_csv('../data/clean/SemEval2018_val.csv', index=False)
print("SemEval2018:")
se_val.tail()

SemEval2018:


Unnamed: 0,text,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
882,@BadHombreNPS @SecretaryPerry If this didn't m...,1,0,1,0,0,0,0,0,0,0,0
883,Excited to watch #stateoforigin tonight! Come ...,0,0,0,0,1,0,1,0,0,0,0
884,"Blah blah blah Kyrie, IT, etc. @CJC9BOSS leavi...",1,0,1,0,0,0,0,0,1,0,0
885,#ThingsIveLearned The wise #shepherd never tru...,0,0,0,0,0,0,0,0,0,0,0
886,I am really flattered and happy to hear those ...,0,0,0,0,1,0,1,0,0,0,0


In [46]:
# SemEval2018
se_test = pd.read_csv('../data/raw/SemEval2018/2018-E-c-En-test.txt', sep='\t', names=['ID', 'text', 'anger', 'anticipation', 'disgust', 'fear', 'joy',	'love',	'optimism', 'pessimism', 'sadness',	'surprise',	'trust'])
# drop the first row
se_test = se_test.drop([0])
# drop the ID column
se_test = se_test.drop(['ID'], axis=1)

se_test.to_csv('../data/clean/SemEval2018_test.csv', index=False)
print("SemEval2018:")
se_test.head()

SemEval2018:


Unnamed: 0,text,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
1,@Adnan__786__ @AsYouNotWish Dont worry Indian ...,1,1,0,0,0,0,1,0,0,0,1
2,"Academy of Sciences, eschews the normally sobe...",0,0,1,0,0,0,0,0,0,0,0
3,I blew that opportunity -__- #mad,1,0,1,0,0,0,0,0,1,0,0
4,This time in 2 weeks I will be 30... 😥,0,0,0,0,1,0,0,0,1,0,0
5,#Deppression is real. Partners w/ #depressed p...,0,0,0,1,0,0,0,0,1,0,0


Identify and merge the overlap between the two datasets.

In [48]:
# clean up EmoBank ids to match AffectiveText
eb['id'] = eb['id'].str.replace('SemEval_', '')
at['id'] = at['id'].astype(str)
# merge EmoBank and AffectiveText
eb_at = pd.merge(eb, at, how='inner', left_on='id', right_on='id',validate='one_to_one')
# print number of rows
print("EmoBank + AffectiveText:", len(eb_at))
# check for repeated rows - should be 0
print("Repeated rows:",len(eb_at[eb_at['instance'].duplicated()]))
print(eb_at.head())
# check for any non-matching examples
assert len(eb_at[eb_at['text'] == eb_at['instance']]) == len(eb_at)-1\
    , "There are non-matching examples"
eb_at = eb_at.drop(columns=['instance'], axis=1)
eb_at.to_csv('../data/clean/EmoBank_AffectiveText.csv', index=False)

EmoBank + AffectiveText: 1149
Repeated rows: 0
     id  split     V     A     D  \
0     1  train  2.29  3.29  2.86   
1    10  train  3.50  2.88  3.00   
2   100  train  2.88  3.00  3.00   
3  1000  train  2.00  3.62  2.75   
4  1001  train  2.80  3.00  3.00   

                                                text  \
0             Mortar assault leaves at least 18 dead   
1  Alonso would be happy to retire with three titles   
2                Report criticises US press freedoms   
3  Terror officials see Al Qaeda chiefs regaining...   
4  Ivrea journal: In Italian town, a civics lesso...   

                                            instance  anger  disgust  fear  \
0             Mortar assault leaves at least 18 dead     22        2    60   
1  Alonso would be happy to retire with three titles      0        0     0   
2                Report criticises US press freedoms     25       24     6   
3  Terror officials see Al Qaeda chiefs regaining...     13       11    86   
4  Ivrea 

In [60]:
def normalise_labels(df, label_cols):
    # sum all label values in each row and normalise
    df[label_cols] = df[label_cols].div(df[label_cols].sum(axis=1), axis=0)
    return df

def categorise_labels(df, label_cols):
    # 1-hot encode rows with the highest label value
    df[label_cols] = df[label_cols].eq(df[label_cols].max(axis=1), axis=0).astype(int)
    return df

In [65]:
# normalise labels
eb_at = normalise_labels(eb_at, columns[1:])
# 1-hot encode labels
eb_at = categorise_labels(eb_at, columns[1:])

## Children's stories

Write full dataset to csv file.

In [129]:
DATA_PATH = '../data/raw/children/children' # relative to this notebook
authors = ['Grimms', 'HCAndersen', 'Potter']
FOLDER = 'emmood'

files = os.listdir(os.path.join(DATA_PATH, authors[0], FOLDER))

table = pd.DataFrame(columns=['author', 'story', 'annotator_1', 'annotator_2', 'sentence'])
for AUTHOR in authors:
    for f in os.listdir(os.path.join(DATA_PATH, AUTHOR, 'emmood')):
        try:
            new_table = pd.read_csv(os.path.join(DATA_PATH, AUTHOR, 'emmood', f), sep='\t'\
                                    , names=['annotator_1', 'annotator_2', 'sentence']\
                                    , usecols=[1, 2, 3])
            new_table['author'] = AUTHOR
            new_table['story'] = f.split('.')[0]
            table = pd.concat([table, new_table], ignore_index=True)
        except Exception as e:
            print(AUTHOR, f, e)
            continue
print(table.describe())
print(table.head())
table.to_csv('../data/raw/children.csv', index=False)

HCAndersen beetle.emmood Error tokenizing data. C error: EOF inside string starting at row 150
HCAndersen cock.emmood Error tokenizing data. C error: EOF inside string starting at row 32
            author     story annotator_1 annotator_2        sentence
count        14024     14024       14024       14024           14024
unique           3       174          61          64           13913
top     HCAndersen  goloshes         N:N         N:N  Goodbye, Hans.
freq          7042       495        8585        5421              11
   author                                 story annotator_1 annotator_2  \
0  Grimms  106_the_poor_millers_boy_and_the_cat         N:N         N:N   
1  Grimms  106_the_poor_millers_boy_and_the_cat         N:N         N:N   
2  Grimms  106_the_poor_millers_boy_and_the_cat         N:D         N:D   
3  Grimms  106_the_poor_millers_boy_and_the_cat         N:N         N:D   
4  Grimms  106_the_poor_millers_boy_and_the_cat         N:N         N:N   

                 