# Train

In [1]:
from datasets import load_dataset, load_metric, Dataset, DatasetDict, concatenate_datasets
import os

In [2]:
# Directory containing the training text files (relative to the notebook's working directory).
train_loc = r'flores101_dataset_train'

In [2]:
# Read the training files into a dictionary: language pair -> language code -> sentences.
# File names are expected to look like "<name>.<pair>.<lang>", e.g. "corpus.en-et.en".
data = {'en-et': {'en': [], 'et': []},
        'en-mk': {'en': [], 'mk': []},
        'et-mk': {'et': [], 'mk': []}}

# os.listdir returns names in arbitrary order. Sorting guarantees that, for every
# language pair, the files of both languages are appended in the same order, so
# sentence i of one side still matches sentence i of the other side.
for file_name in sorted(os.listdir(train_loc)):
    name_parts = file_name.split('.')
    # Skip names without the "<name>.<pair>.<lang>" layout instead of raising IndexError.
    if len(name_parts) < 3:
        continue
    pair, lang = name_parts[1], name_parts[2]
    # Keep only the wanted pairs, and only languages that belong to that pair.
    if pair not in data or lang not in data[pair]:
        continue

    with open(os.path.join(train_loc, file_name), encoding="utf-8") as f:
        # Iterate the file directly (no intermediate readlines() copy), dropping
        # blank lines and the trailing newline of every sentence.
        # NOTE(review): blank lines are removed per file; if only one side of a
        # pair contains a blank line the two sides shift against each other.
        data[pair][lang].extend(line.rstrip('\n') for line in f if line != '\n')

    # print progress
    print(len(data['en-et']['en']),len(data['en-et']['et']),
          len(data['en-mk']['en']),len(data['en-mk']['mk']),
          len(data['et-mk']['et']),len(data['et-mk']['mk']),
          end=' '*50+'\r')

# connect lang1 text with lang2 text
main = {}
for pair, sides in data.items():
    l1, l2 = pair.split('-')
    # zip() would silently truncate to the shorter side; fail loudly instead,
    # because a length mismatch means the parallel data is not aligned.
    if len(sides[l1]) != len(sides[l2]):
        raise ValueError('{}: {} "{}" sentences but {} "{}" sentences'.format(
            pair, len(sides[l1]), l1, len(sides[l2]), l2))
    main[pair] = [{l1: text1, l2: text2} for text1, text2 in zip(sides[l1], sides[l2])]

# number of sentence pairs per language pair
len(main['en-et']), len(main['en-mk']), len(main['et-mk'])

35710108 35710108 2714368 2714368 3064902 3064902                                                  

(35710108, 2714368, 3064902)

In [28]:
# create Dataset from dictionary
dataset_et_mk = Dataset.from_dict({'translation': main['et-mk']})

dataset_en_mk = Dataset.from_dict({'translation': main['en-mk']})

# en-et is converted in `split` near-equal chunks because of its large size,
# and the chunks are concatenated back into a single Dataset afterwards.
split = 2
en_et_pairs = main['en-et']
# Chunk boundaries computed with integer arithmetic: 0, n/split, 2n/split, ..., n.
bounds = [len(en_et_pairs) * i // split for i in range(split + 1)]

dataset_en_et = concatenate_datasets([
    Dataset.from_dict({'translation': en_et_pairs[start:stop]})
    for start, stop in zip(bounds, bounds[1:])
])

# create one Dataset containing all training examples
d = DatasetDict()
d['et-mk'] = dataset_et_mk
d['en-mk'] = dataset_en_mk
d['en-et'] = dataset_en_et

d.save_to_disk("dataset_train")

# Test

In [86]:
# Locations of the dev and devtest folders, built with os.path.join so the
# path separator matches the operating system instead of being fixed to '\'.
test_dev = os.path.join('flores101_dataset_dev', 'dev')
test_devtest = os.path.join('flores101_dataset_dev', 'devtest')

['eng.devtest', 'est.devtest', 'mkd.devtest']

In [105]:
# Read the evaluation files into a dictionary: split name -> language code -> sentences.
test_data = {'dev':{}, 'devtest':{} }
for folder in [test_dev, test_devtest]:
    # Accept '\' as well as the native separator so the configured paths
    # resolve on any operating system, not only on Windows.
    folder_path = os.path.normpath(folder.replace('\\', os.sep))
    # The last path component ('dev' or 'devtest') names the split.
    split_name = os.path.basename(folder_path)
    # Sorted so the files are read (and printed) in a reproducible order.
    for file in sorted(os.listdir(folder_path)):
        language = file.split('.')[0]
        if language not in ('eng', 'est', 'mkd'):
            continue
        print(file)
        with open(os.path.join(folder_path, file), encoding="utf-8") as f:
            # Drop blank lines and the trailing newline of every sentence.
            test_data[split_name][language] = [line.rstrip('\n') for line in f if line != '\n']

eng.dev
est.dev
mkd.dev
eng.devtest
est.devtest
mkd.devtest


In [136]:
# Combine the three languages line by line: every example is one dict holding
# the English, Estonian and Macedonian version of the same sentence.
test_main = {
    split_name: [
        {'en': eng_line, 'et': est_line, 'mk': mkd_line}
        for eng_line, est_line, mkd_line in zip(
            sentences['eng'], sentences['est'], sentences['mkd'])
    ]
    for split_name, sentences in test_data.items()
}
len(test_main['dev']), len(test_main['devtest'])

(997, 1012)

In [145]:
# Wrap the examples of each split in a Dataset with a single 'translation' column.
dataset_dev, dataset_devtest = (
    Dataset.from_dict({'translation': test_main[split_name]})
    for split_name in ('dev', 'devtest')
)

# Bundle the validation and test splits into one DatasetDict and write it to disk.
d_dev = DatasetDict({'dev': dataset_dev, 'devtest': dataset_devtest})
d_dev.save_to_disk("dataset_dev")