In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
def load_anew10():
    """Load the lexicon stored at the path in ``$DATA_BUECHEL_ANEW10``.

    The file is read as tab-separated text. Only the ``Word``, ``ValMn``,
    ``AroMn`` and ``DomMn`` columns are kept; they are renamed to ``word``,
    ``valence``, ``arousal`` and ``dominance``.

    Returns:
        pandas.DataFrame indexed by ``word`` with the columns ``valence``,
        ``arousal`` and ``dominance``.

    Raises:
        KeyError: if the environment variable is unset or one of the four
            source columns is missing from the file.
    """
    # Insertion order of this mapping fixes the column order of the result.
    source_to_target = {
        'Word': 'word',
        'ValMn': 'valence',
        'AroMn': 'arousal',
        'DomMn': 'dominance',
    }
    path = os.environ['DATA_BUECHEL_ANEW10']
    table = pd.read_csv(path, sep='\t')
    selected = table[list(source_to_target)]
    return selected.rename(columns=source_to_target).set_index('word')

In [3]:
def load_anew99():
    """Load the lexicon stored at the path in ``$DATA_BUECHEL_ANEW99``.

    The file is read as tab-separated text and its columns are relabelled
    positionally as ``word``, ``valence``, ``arousal``, ``dominance``; the
    assignment fails if the file does not have exactly four columns.

    Returns:
        pandas.DataFrame indexed by ``word`` with the columns ``valence``,
        ``arousal`` and ``dominance``.

    Raises:
        KeyError: if the environment variable is unset.
    """
    path = os.environ['DATA_BUECHEL_ANEW99']
    ratings = pd.read_csv(path, sep='\t')
    # Positional relabelling: whatever the header says, column 0 becomes the word.
    ratings.columns = ['word', 'valence', 'arousal', 'dominance']
    return ratings.set_index('word')

In [4]:
def load_warriner13():
    """Load the lexicon stored at the path in ``$DATA_BUECHEL_WARRINER13``.

    The file is read as comma-separated text. Only the ``Word``,
    ``V.Mean.Sum``, ``A.Mean.Sum`` and ``D.Mean.Sum`` columns are kept; they
    are renamed to ``word``, ``valence``, ``arousal`` and ``dominance``.
    Rows whose word is missing (NaN) or contains a space are removed; the
    remaining rows keep their file order.

    Returns:
        pandas.DataFrame indexed by ``word`` with the columns ``valence``,
        ``arousal`` and ``dominance``.

    Raises:
        KeyError: if the environment variable is unset or one of the four
            source columns is missing from the file.
    """
    source_to_target = {
        'Word': 'word',
        'V.Mean.Sum': 'valence',
        'A.Mean.Sum': 'arousal',
        'D.Mean.Sum': 'dominance',
    }
    raw = pd.read_csv(os.environ['DATA_BUECHEL_WARRINER13'], sep=',')
    df = raw[list(source_to_target)].rename(columns=source_to_target).set_index('word')

    words = df.index
    # A NaN label appears when read_csv parses a cell as a missing value
    # (its default NA strings include e.g. "null" and "NA").
    # NOTE(review): presumably a real lexicon entry is lost this way --
    # confirm whether that is intended before changing the parsing.
    # Masking (rather than df.drop(np.nan)) also works when no label is NaN.
    is_missing = words.isna()
    # One vectorised mask instead of one df.drop() call per multi-word entry;
    # the isinstance guard keeps a non-string label from raising.
    has_space = np.array(
        [isinstance(word, str) and ' ' in word for word in words],
        dtype=bool,
    )
    return df[~(is_missing | has_space)]

In [5]:
# Load the lexicon returned by load_anew10() and show summary statistics
# for its valence / arousal / dominance columns.
anew10 = load_anew10()
anew10.describe()

Unnamed: 0,valence,arousal,dominance
count,2476.0,2476.0,2476.0
mean,5.206103,5.104439,5.009386
std,1.691787,0.938531,0.964637
min,1.25,2.29,2.17
25%,3.93,4.41,4.4
50%,5.38,5.04,5.095
75%,6.54,5.73,5.67
max,8.82,8.17,7.88


In [6]:
# Load the lexicon returned by load_anew99() and show summary statistics
# for its valence / arousal / dominance columns.
anew99 = load_anew99()
anew99.describe()

Unnamed: 0,valence,arousal,dominance
count,1034.0,1034.0,1034.0
mean,5.153907,5.120087,5.007408
std,1.994206,1.056946,1.019771
min,1.25,2.39,2.27
25%,3.28,4.2825,4.3
50%,5.3,5.065,5.05
75%,6.9075,5.83,5.7275
max,8.82,8.17,7.88


In [7]:
# Load the lexicon returned by load_warriner13() (rows with a missing or
# multi-word entry already removed) and show summary statistics.
warriner = load_warriner13()
warriner.describe()

Unnamed: 0,valence,arousal,dominance
count,13812.0,13812.0,13812.0
mean,5.06088,4.210674,5.184528
std,1.27484,0.896097,0.938144
min,1.26,1.6,1.68
25%,4.25,3.57,4.58
50%,5.2,4.11,5.26
75%,5.95,4.76,5.84
max,8.53,7.79,7.9


In [8]:
def overlap(lex1, lex2):
    """Return how many distinct index labels `lex1` and `lex2` share."""
    shared = set(lex1.index) & set(lex2.index)
    return len(shared)

# Pairwise vocabulary overlap between the three lexicons, one count per line.
print(
    overlap(anew99, anew10),
    overlap(anew99, warriner),
    overlap(anew10, warriner),
    sep='\n',
)

1032
1031
2328


Test set is anew99 $\cap$ warriner. Dev set is (anew10 $\cap$ warriner) $\setminus$ test set. Train set is warriner $\setminus$ (dev set $\cup$ test set).

In [9]:
# Test split: the words indexed in both anew99 and warriner, sorted.
test = sorted(set(anew99.index) & set(warriner.index))
test[:10]

['abduction',
 'abortion',
 'absurd',
 'abundance',
 'abuse',
 'acceptance',
 'accident',
 'ace',
 'ache',
 'achievement']

In [10]:
# Dev split: the words indexed in both anew10 and warriner that are not
# already in the test split, sorted.
dev = sorted((set(anew10.index) & set(warriner.index)) - set(test))
dev[:10]

['able',
 'absent',
 'accept',
 'access',
 'accord',
 'accuse',
 'acre',
 'action',
 'actor',
 'ad']

In [11]:
# Train split: every warriner word that is in neither the dev nor the test
# split, sorted.
train = sorted(set(warriner.index) - set(dev) - set(test))
train[:10]

['AIDS',
 'Adidas',
 'Bacardi',
 'Budweiser',
 'Coke',
 'FALSE',
 'Gatorade',
 'Governor',
 'HIV',
 'Nike']

In [12]:
# Sizes of the train, dev and test splits, in that order.
print(*map(len, (train, dev, test)))

11484 1297 1031


In [13]:
# Write each split to "<name>.txt" in the current working directory,
# one word per line (no newline after the last word).
splits = {'train': train,
          'dev': dev,
          'test': test}

for split, words in splits.items():
    # Explicit UTF-8 so the written bytes do not depend on the platform's
    # default text encoding. The joined text is built inline so that
    # `words` keeps referring to the list of words.
    with open(split+'.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(words))
        

---