# Train test split 

Respecting sentence boundaries, and stratifying rare entities as much as possible.

In [24]:
import pandas as pd
# Load the annotated dataset; the first CSV column is used as the row index.
df = pd.read_csv('data/latin_NER_dataset_improved.csv', index_col=0)
# Total number of rows.
print(df.shape[0])

134785


In [25]:
# Everything except the 'Ovid' rows counts as in-domain material.
in_domain = df.loc[df['orig_text'] != 'Ovid']

In [26]:
# Rows per source text in the in-domain data.
in_domain['orig_text'].value_counts()

GW              58621
PlinyElder      35672
PlinyYounger    18571
CW               4819
Name: orig_text, dtype: int64

In [27]:
# Identify the sentences that contain the rare entity tags (I-GRP, I-LOC).

# Per-sentence groups; kept as a top-level name because later cells iterate over it.
grouped = in_domain.groupby("sent_id")


def _sentences_with_tag(frame, tag):
    """Return the sorted list of ``sent_id`` values that have at least one row tagged ``tag``.

    Selecting the matching rows in one vectorised pass avoids materialising
    every sentence group just to test it for the tag.
    """
    matching_ids = frame.loc[frame['tag'] == tag, 'sent_id'].dropna()
    # Sorted so the order matches iteration over the (sorted) groupby keys.
    return sorted(matching_ids.unique())


I_GRP_sents = _sentences_with_tag(in_domain, 'I-GRP')
I_LOC_sents = _sentences_with_tag(in_domain, 'I-LOC')

print(f'sentences that contain I-GRP: \n{I_GRP_sents}')
print(f'sentences that contain I-LOC: \n{I_LOC_sents}')

sentences that contain I-GRP: 
['GW_1301', 'GW_1978', 'GW_2342', 'GW_2426', 'GW_2433', 'GW_2435', 'PlinyElder_1400', 'PlinyElder_3462']
sentences that contain I-LOC: 
['GW_1143', 'GW_1558', 'GW_169', 'GW_1717', 'GW_1933', 'GW_1971', 'GW_411', 'GW_412', 'GW_417', 'GW_46', 'GW_47', 'GW_63', 'GW_65', 'GW_723', 'GW_755', 'GW_761', 'PlinyElder_127', 'PlinyElder_183', 'PlinyElder_2360', 'PlinyElder_2395', 'PlinyElder_255', 'PlinyElder_2622', 'PlinyElder_3035', 'PlinyElder_3226', 'PlinyElder_3294', 'PlinyElder_335', 'PlinyElder_336', 'PlinyElder_3363', 'PlinyElder_3440', 'PlinyElder_3452', 'PlinyElder_3459', 'PlinyElder_3478', 'PlinyElder_3527', 'PlinyElder_3535', 'PlinyElder_3546', 'PlinyElder_3574', 'PlinyElder_3575', 'PlinyElder_3590', 'PlinyYounger_860']


In [28]:
# Number of sentences found for each rare tag (I-GRP first, then I-LOC).
for rare_sents in (I_GRP_sents, I_LOC_sents):
    print(len(rare_sents))

8
39


In [29]:
# Sentence ids that must stay out of the regular split.
forbidden_group = I_GRP_sents + I_LOC_sents

# Sets give constant-time membership tests while walking over all groups.
grp_ids = set(I_GRP_sents)
# A sentence carrying both rare tags is assigned to the I-GRP pool only, so the
# two pools stay disjoint and no sentence can end up in two different splits.
loc_ids = set(I_LOC_sents) - grp_ids
rare_ids = grp_ids | loc_ids

# Treat the rest of the dataset as usual.
normal_dataset = pd.concat([group for name, group in grouped if name not in rare_ids])
print(normal_dataset)

# Isolate the rare ones.
I_loc_df = pd.concat([group for name, group in grouped if name in loc_ids])

I_grp_df = pd.concat([group for name, group in grouped if name in grp_ids])

                  word     tag  sentence     orig_text           sent_id
0                   C.  B-PERS         0            CW              CW_0
1                 Iuli  I-PERS         0            CW              CW_0
2             Caesaris  I-PERS         0            CW              CW_0
3       Commentariorum       O         0            CW              CW_0
4                   De       O         0            CW              CW_0
...                ...     ...       ...           ...               ...
112784            vera       O       998  PlinyYounger  PlinyYounger_998
112785          amores       O       998  PlinyYounger  PlinyYounger_998
112786            ipse       O       999  PlinyYounger  PlinyYounger_999
112787           posui       O       999  PlinyYounger  PlinyYounger_999
112788               .       O       999  PlinyYounger  PlinyYounger_999

[115844 rows x 5 columns]


In [30]:
from sklearn.model_selection import GroupShuffleSplit

# Replace the index inherited from the source frame with a fresh 0..n-1 range.
normal_dataset = normal_dataset.reset_index(drop=True)

# GroupShuffleSplit keeps all rows of a sentence together while shuffling and
# splitting the dataset.

# First split: 75 percent to train, the remaining 25 percent is held out.
train_test_splitter = GroupShuffleSplit(test_size=.25, n_splits=2, random_state = 7)
train_inds, test_inds = next(
    train_test_splitter.split(normal_dataset, groups=normal_dataset['sent_id'])
)

train, test = (
    normal_dataset.iloc[inds].reset_index(drop=True)
    for inds in (train_inds, test_inds)
)

In [31]:
# Second split: halve the held-out part in `test` into an evaluation half and a
# test half (12.5 percent of the data each).
# NOTE: `test` is rebound below, so re-running this cell on its own would split
# the already-halved test set again.
splitter = GroupShuffleSplit(test_size=.5, n_splits=2, random_state = 7)
eval_inds, test_inds = next(splitter.split(test, groups=test['sent_id']))

# Both slices are taken from the old `test` before either name is rebound.
eva, test = test.iloc[eval_inds], test.iloc[test_inds]

In [32]:
# A cell only renders its last bare expression, so the three previews are
# passed to display() explicitly to show train, test and eval together.
display(train.head(), test.head(), eva.head())

Unnamed: 0,word,tag,sentence,orig_text,sent_id
0,C.,B-PERS,0,CW,CW_0
1,Iuli,I-PERS,0,CW,CW_0
2,Caesaris,I-PERS,0,CW,CW_0
3,Commentariorum,O,0,CW,CW_0
4,De,O,0,CW,CW_0


In [33]:
# Number of distinct sentences in train, test and eval (in that order).
for part in (train, test, eva):
    print(len(part.groupby('sent_id')))

5786
965
964


In [34]:
# Perform the same train/test/eval split for the I-LOC sentences.

I_loc_df = I_loc_df.reset_index(drop=True)

# Step 1: hold out roughly a third of the sentences.
splitter = GroupShuffleSplit(test_size=.33, n_splits=2, random_state = 7)
train_inds, test_inds = next(splitter.split(I_loc_df, groups=I_loc_df['sent_id']))

I_loc_train = I_loc_df.iloc[train_inds].reset_index(drop=True)
I_loc_holdout = I_loc_df.iloc[test_inds].reset_index(drop=True)

# Step 2: halve the held-out sentences into evaluation and test.
splitter = GroupShuffleSplit(test_size=.5, n_splits=2, random_state = 7)
eval_inds, test_inds = next(splitter.split(I_loc_holdout, groups=I_loc_holdout['sent_id']))

I_loc_eva = I_loc_holdout.iloc[eval_inds].reset_index(drop=True)
I_loc_test = I_loc_holdout.iloc[test_inds].reset_index(drop=True)

# Sentences per portion: train, test, eval.
for part in (I_loc_train, I_loc_test, I_loc_eva):
    print(len(part.groupby('sent_id')))

26
7
6


In [35]:
# Inspect the evaluation portion of the I-LOC sentences.
I_loc_eva

Unnamed: 0,word,tag,sentence,orig_text,sent_id
0,Caesari,B-PERS,46,GW,GW_46
1,cum,O,46,GW,GW_46
2,id,O,46,GW,GW_46
3,nuntiatum,O,46,GW,GW_46
4,esset,O,46,GW,GW_46
...,...,...,...,...,...
374,in,O,3363,PlinyElder,PlinyElder_3363
375,se,O,3363,PlinyElder,PlinyElder_3363
376,trahere,O,3363,PlinyElder,PlinyElder_3363
377,nubes,O,3363,PlinyElder,PlinyElder_3363


In [36]:
# And finally the I-GRP sentences; the proportions are adjusted so that eval and
# test each receive at least two sentences.

I_grp_df = I_grp_df.reset_index(drop=True)

# Step 1: hold out 60 percent of the sentences.
splitter = GroupShuffleSplit(test_size=.60, n_splits=2, random_state = 7)
train_inds, test_inds = next(splitter.split(I_grp_df, groups=I_grp_df['sent_id']))

I_grp_train = I_grp_df.iloc[train_inds].reset_index(drop=True)
I_grp_holdout = I_grp_df.iloc[test_inds].reset_index(drop=True)

# Step 2: halve the held-out sentences into evaluation and test.
splitter = GroupShuffleSplit(test_size=.5, n_splits=2, random_state = 7)
eval_inds, test_inds = next(splitter.split(I_grp_holdout, groups=I_grp_holdout['sent_id']))

I_grp_eva = I_grp_holdout.iloc[eval_inds].reset_index(drop=True)
I_grp_test = I_grp_holdout.iloc[test_inds].reset_index(drop=True)

# Sentences per portion: train, test, eval.
for part in (I_grp_train, I_grp_test, I_grp_eva):
    print(len(part.groupby('sent_id')))

3
3
2


In [37]:
# Inspect the evaluation portion of the I-GRP sentences.
I_grp_eva

Unnamed: 0,word,tag,sentence,orig_text,sent_id
0,Altera,O,2342,GW,GW_2342
1,ex,O,2342,GW,GW_2342
2,parte,O,2342,GW,GW_2342
3,Gabalos,B-GRP,2342,GW,GW_2342
4,proximosque,O,2342,GW,GW_2342
5,pagos,O,2342,GW,GW_2342
6,Arvernorum,B-GRP,2342,GW,GW_2342
7,in,O,2342,GW,GW_2342
8,Helvios,B-GRP,2342,GW,GW_2342
9,",",O,2342,GW,GW_2342


In [38]:
# Concatenate the regular split with the matching rare-entity portions,
# renumbering the rows of each result from zero.
train_final, test_final, eval_final = (
    pd.concat(parts, ignore_index=True)
    for parts in (
        [train, I_loc_train, I_grp_train],
        [test, I_loc_test, I_grp_test],
        [eva, I_loc_eva, I_grp_eva],
    )
)

In [39]:
# Inspect the size of each final split, counted in sentences.
for part in (train_final, test_final, eval_final):
    print(len(part.groupby('sent_id')))

5815
975
972


In [40]:
# Inspect the size of each final split, counted in rows (one row per token).
for part in (train_final, test_final, eval_final):
    print(len(part))

88165
14686
14832


In [41]:
# Tag distribution of train, test and eval (in that order).
for part in (train_final, test_final, eval_final):
    print(part['tag'].value_counts())

O         82696
B-PERS     2706
B-GRP      1271
B-LOC       839
I-PERS      618
I-LOC        31
I-GRP         4
Name: tag, dtype: int64
O         13638
B-PERS      474
B-GRP       247
B-LOC       218
I-PERS       98
I-LOC         8
I-GRP         3
Name: tag, dtype: int64
O         13846
B-PERS      473
B-GRP       207
B-LOC       169
I-PERS      125
I-LOC        10
I-GRP         2
Name: tag, dtype: int64


In [42]:
# Source-text distribution of train, test and eval (in that order).
for part in (train_final, test_final, eval_final):
    print(part['orig_text'].value_counts())

GW              43949
PlinyElder      26697
PlinyYounger    13827
CW               3692
Name: orig_text, dtype: int64
GW              7452
PlinyElder      4188
PlinyYounger    2454
CW               592
Name: orig_text, dtype: int64
GW              7220
PlinyElder      4787
PlinyYounger    2290
CW               535
Name: orig_text, dtype: int64


In [43]:
def df_sort_groups(df, by='sentence'):
    """Return the rows of ``df`` rearranged so that equal ``by`` values are contiguous.

    Groups appear in ascending key order (the ``groupby`` default); rows inside
    a group keep their original relative order and their original index labels.
    Rows whose key is missing are left out, as ``groupby`` skips them by default.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to reorder; must contain the column(s) named by ``by``.
    by : label or list of labels, default 'sentence'
        Grouping key passed to ``DataFrame.groupby``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns and dtypes of ``df``. When there is
        nothing to group, an empty frame with the columns of ``df`` is returned.
    """
    # Collect all groups first and concatenate once: growing a frame with
    # pd.concat inside the loop copies the accumulated rows on every iteration,
    # and seeding it with an empty frame can interfere with dtype resolution.
    pieces = [group for _, group in df.groupby(by)]
    if not pieces:
        # pd.concat raises on an empty list, so hand back an empty frame instead.
        return df.iloc[0:0].copy()
    return pd.concat(pieces)

def df_nested_sort_groups(df, by='orig_text'):
    """Return the rows of ``df`` grouped by ``by`` and, inside each group, by sentence.

    The frame is partitioned on ``by`` (groups in ascending key order, the
    ``groupby`` default); every partition is then reordered with
    ``df_sort_groups`` using that function's default key, and the partitions
    are concatenated. Original index labels are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to reorder; must contain the ``by`` column(s) and the column
        that ``df_sort_groups`` groups on by default.
    by : label or list of labels, default 'orig_text'
        Outer grouping key passed to ``DataFrame.groupby``.

    Returns
    -------
    pandas.DataFrame
        The reordered rows. When there is nothing to group, an empty frame
        with the columns of ``df`` is returned.
    """
    # Build every partition first and concatenate once, rather than copying the
    # accumulated frame on each loop iteration.
    pieces = [df_sort_groups(group) for _, group in df.groupby(by)]
    if not pieces:
        # pd.concat raises on an empty list, so hand back an empty frame instead.
        return df.iloc[0:0].copy()
    return pd.concat(pieces)

# Order each split by source text and, within it, by sentence.
train_final, eval_final, test_final = (
    df_nested_sort_groups(part) for part in (train_final, eval_final, test_final)
)

# The 'Ovid' rows are appended to the test split only.
# NOTE: re-running this cell appends them again, because `test_final` is rebound.
out_domain = df.loc[df['orig_text'] == 'Ovid']

test_final = pd.concat([test_final, out_domain]).reset_index(drop=True)

In [45]:
# Renumber the training rows from zero, discarding the previous index.
train_final = train_final.reset_index(drop=True)

In [47]:
# Renumber the evaluation rows from zero, discarding the previous index.
eval_final = eval_final.reset_index(drop=True)

In [48]:
# Write the three splits to disk, one CSV file each.
for path, frame in (
    ('data/Latin_NER_train.csv', train_final),
    ('data/Latin_NER_test.csv', test_final),
    ('data/Latin_NER_eval.csv', eval_final),
):
    frame.to_csv(path)