# Adding a Stratified Train-Dev-Test Split to EmoBank

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [6]:
# Load the EmoBank table; the first CSV column is used as the row index.
eb = pd.read_csv('emobank.csv', index_col=0)

In [7]:
# Show the loaded frame (the notebook renders a truncated view of it).
eb

Unnamed: 0_level_0,V,A,D,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
110CYL068_1036_1079,3.00,3.00,3.20,"Remember what she said in my last letter? """
110CYL068_1079_1110,2.80,3.10,2.80,If I wasn't working here.
110CYL068_1127_1130,3.00,3.00,3.00,".."""
110CYL068_1137_1188,3.44,3.00,3.22,Goodwill helps people get off of public assist...
110CYL068_1189_1328,3.55,3.27,3.46,Sherry learned through our Future Works class ...
...,...,...,...,...
wwf12_4531_4624,3.00,3.50,3.00,Please let it be a constant reminder of all yo...
wwf12_501_591,3.80,3.40,3.60,That’s why I want to extend my appreciation fo...
wwf12_592_691,3.00,3.00,3.10,And why I’m writing you today to ask you to re...
wwf12_702_921,3.33,3.44,3.44,"In fact, I want to urge you to strengthen your..."


In [8]:
# Load the tab-separated metadata file; the first column is used as the row
# index so that it can be aligned with `eb` by id.
meta = pd.read_csv('meta.tsv', sep='\t', index_col=0)

In [9]:
# Show the metadata frame (document, category, subcategory per id).
meta

Unnamed: 0_level_0,document,category,subcategory
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acephalous-Cant-believe_4_47,Acephalous-Cant-believe,blog,
Acephalous-Cant-believe_83_354,Acephalous-Cant-believe,blog,
Acephalous-Cant-believe_355_499,Acephalous-Cant-believe,blog,
Acephalous-Cant-believe_500_515,Acephalous-Cant-believe,blog,
Acephalous-Cant-believe_517_626,Acephalous-Cant-believe,blog,
...,...,...,...
SemEval_1495,SemEval,SemEval,
SemEval_1496,SemEval,SemEval,
SemEval_1497,SemEval,SemEval,
SemEval_1498,SemEval,SemEval,


In [13]:
# Number of metadata rows per category, most frequent first.
meta['category'].value_counts()

fiction          2893
letters          1479
newspaper        1381
blog             1378
SemEval          1250
essays           1196
travel-guides     971
Name: category, dtype: int64

In [22]:
# Attach the metadata columns to each row by matching on the index.
# how='inner' keeps only ids that are present in both frames.
# NOTE(review): the result is bound back to `eb`, so running this cell a second
# time would join the already-joined frame with `meta` again — presumably this
# fails on the overlapping column names; confirm before relying on re-runs.
eb = eb.join(meta, how='inner')

In [23]:
# Show the joined frame: the original columns plus document, category and
# subcategory.
eb

Unnamed: 0_level_0,V,A,D,text,document,category,subcategory
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
110CYL068_1036_1079,3.00,3.00,3.20,"Remember what she said in my last letter? """,110CYL068,letters,philanthropic-fundraising
110CYL068_1079_1110,2.80,3.10,2.80,If I wasn't working here.,110CYL068,letters,philanthropic-fundraising
110CYL068_1127_1130,3.00,3.00,3.00,"..""",110CYL068,letters,philanthropic-fundraising
110CYL068_1137_1188,3.44,3.00,3.22,Goodwill helps people get off of public assist...,110CYL068,letters,philanthropic-fundraising
110CYL068_1189_1328,3.55,3.27,3.46,Sherry learned through our Future Works class ...,110CYL068,letters,philanthropic-fundraising
...,...,...,...,...,...,...,...
wwf12_4531_4624,3.00,3.50,3.00,Please let it be a constant reminder of all yo...,wwf12,letters,solicitation-brochures
wwf12_501_591,3.80,3.40,3.60,That’s why I want to extend my appreciation fo...,wwf12,letters,solicitation-brochures
wwf12_592_691,3.00,3.00,3.10,And why I’m writing you today to ask you to re...,wwf12,letters,solicitation-brochures
wwf12_702_921,3.33,3.44,3.44,"In fact, I want to urge you to strengthen your...",wwf12,letters,solicitation-brochures


In [37]:
# Stage 1: hold out 1000 ids as the test set, stratified by category so the
# test set mirrors the category distribution of the full data.
tmp, test = train_test_split(
    eb.index,
    test_size=1000,
    stratify=eb['category'],
    random_state=42,
)

# Stage 2: split the remaining ids into train and a 1000-id dev set, again
# stratified by the category of the remaining rows.
train, dev = train_test_split(
    tmp,
    test_size=1000,
    stratify=eb.loc[tmp, 'category'],
    random_state=42,
)

In [38]:
# Sizes of the train, dev and test id sets, in that order.
print(*map(len, (train, dev, test)))

8062 1000 1000


In [47]:
# Split name -> ids assigned to that split.
splits = {'train': train, 'dev': dev, 'test': test}

# For every split, the share of its rows that falls into each category.
# Used to check that stratification kept the distributions aligned.
relfreqs = {
    name: eb.loc[ids].category.value_counts() / len(ids)
    for name, ids in splits.items()
}

# One column per split, rounded for readability.
pd.DataFrame(relfreqs).round(3)

Unnamed: 0,train,dev,test
fiction,0.274,0.274,0.274
letters,0.141,0.14,0.14
blog,0.133,0.133,0.133
newspaper,0.13,0.131,0.131
SemEval,0.119,0.118,0.118
essays,0.113,0.113,0.113
travel-guides,0.091,0.091,0.091


In [49]:
# Write the split name into a 'split' column for every id in that split.
for split_name, ids in splits.items():
    eb.loc[ids, 'split'] = split_name

In [52]:
# Remove the metadata columns again; they were only needed for stratification.
eb = eb.drop(['document', 'category', 'subcategory'], axis='columns')

In [60]:
# Put the split label first, followed by the original columns.
column_order = ['split', 'V', 'A', 'D', 'text']
eb = eb[column_order]

In [61]:
# Show the final frame: split label first, then V, A, D and text.
eb

Unnamed: 0_level_0,split,V,A,D,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
110CYL068_1036_1079,train,3.00,3.00,3.20,"Remember what she said in my last letter? """
110CYL068_1079_1110,test,2.80,3.10,2.80,If I wasn't working here.
110CYL068_1127_1130,train,3.00,3.00,3.00,".."""
110CYL068_1137_1188,train,3.44,3.00,3.22,Goodwill helps people get off of public assist...
110CYL068_1189_1328,train,3.55,3.27,3.46,Sherry learned through our Future Works class ...
...,...,...,...,...,...
wwf12_4531_4624,train,3.00,3.50,3.00,Please let it be a constant reminder of all yo...
wwf12_501_591,train,3.80,3.40,3.60,That’s why I want to extend my appreciation fo...
wwf12_592_691,train,3.00,3.00,3.10,And why I’m writing you today to ask you to re...
wwf12_702_921,train,3.33,3.44,3.44,"In fact, I want to urge you to strengthen your..."


In [62]:
# Persist the frame including the new 'split' column.
# NOTE(review): this writes to the same path the notebook read its input from,
# so the original 'emobank.csv' is replaced by this version.
eb.to_csv('emobank.csv')

---