In [1]:
import json as js
import pandas as pd
from datasets import load_dataset, Dataset
from tqdm import tqdm
tqdm.pandas()
from ast import literal_eval

In [2]:
with open("meta_infos.json", "r") as f:
    meta_infos = js.load(f)

In [3]:
template_list = list()

for task, languages in meta_infos["task_language_mapping"].items():
    if len(languages)>1:
        languages.append('all')
    for lang in languages:
        for split in ['train', 'validation', 'test']:
            template_list.append({'finetuning_task':task, 'split':split, 'language': lang})
template = pd. DataFrame(template_list)
#template = template[template.finetuning_task.str.contains('multi_eurlex')==False] # multi_eurlex we need to perform language split

In [4]:
def split_into_languages(dataset):
    dataset_new = list()

    dataset_df = pd.DataFrame(dataset)

    for item in dataset_df.to_dict(orient='records'):
        labels = item['label']
        for language, document in literal_eval(item['input']).items():
            if document is not None:
                item_new = dict()
                item_new['language'] = language
                item_new['input'] = str(document)
                item_new['label'] = labels
                dataset_new.append(item_new)

    dataset_new = pd.DataFrame(dataset_new)

    dataset_new = Dataset.from_pandas(dataset_new)
    
    return dataset_new

In [5]:
def get_sample_size(finetuning_task, split, language):
    dataset = load_dataset("joelito/lextreme", finetuning_task, split=split)
    
    if 'multi_eurlex' in finetuning_task:
        dataset = split_into_languages(dataset)
    
    if language=="all":
        return len(dataset['input'])
    else:
        dataset = pd.DataFrame(dataset)
        dataset = dataset[dataset.language==language]
        return dataset.shape[0]

In [None]:
template['number_of_samples'] = template.progress_apply(lambda x: get_sample_size(x["finetuning_task"], x["split"], x["language"]), axis=1)

  0%|                                                   | 0/441 [00:00<?, ?it/s]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/brazilian_court_decisions_judgment/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
  0%|▏                                          | 2/441 [00:03<12:33,  1.72s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/brazilian_court_decisions_judgment/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
  1%|▎                                          | 3/441 [00:06<16:44,  2.29s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/brazilian_court_decisions_judgment/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
  1%|▍                                          | 4/441 [00:09<18:26,  2.53s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme

  7%|███                                       | 32/441 [02:00<22:27,  3.29s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/online_terms_of_service_unfairness_levels/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
  7%|███▏                                      | 33/441 [02:03<21:45,  3.20s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/online_terms_of_service_unfairness_levels/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
  8%|███▏                                      | 34/441 [02:06<21:16,  3.14s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/online_terms_of_service_unfairness_levels/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
  8%|███▎                                      | 35/441 [02:09<20:56,  3.09s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datase

 14%|█████▊                                    | 61/441 [03:29<19:10,  3.03s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/covid19_emergency_event/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
 14%|█████▉                                    | 62/441 [03:32<19:10,  3.04s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/covid19_emergency_event/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
 14%|██████                                    | 63/441 [03:35<18:56,  3.01s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/covid19_emergency_event/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
 15%|██████                                    | 64/441 [03:38<18:46,  2.99s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/covid19_emergency_event/1.0.0/43

 21%|███████▉                              | 92/441 [45:25<34:36:55, 357.06s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
 21%|████████                              | 93/441 [46:15<25:36:27, 264.91s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
 21%|████████                              | 94/441 [47:30<20:02:57, 208.01s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
 22%|████████▏                             | 95/441 [59:10<34:09:38, 355.43s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ff

 28%|█████████▊                         | 123/441 [3:02:20<23:23:20, 264.78s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
 28%|█████████▊                         | 124/441 [3:03:31<18:11:25, 206.58s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
 28%|█████████▉                         | 125/441 [3:14:58<30:47:51, 350.86s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
 29%|██████████                         | 126/441 [3:15:47<22:46:30, 260.29s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ff

 35%|████████████▏                      | 154/441 [5:20:00<16:22:19, 205.36s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
 35%|████████████▎                      | 155/441 [5:30:45<26:47:12, 337.18s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
 35%|████████████▍                      | 156/441 [5:31:33<19:49:36, 250.45s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
 36%|████████████▍                      | 157/441 [5:32:42<15:27:38, 195.98s/it]

Downloading and preparing dataset lextreme/multi_eurlex_level_2 to /Users/test/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_2/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415...


Generating train split: 0 examples [00:00, ? examples/s]

Found cached dataset multi_eurlex (/Users/test/.cache/huggingface/datasets/multi_eurlex/all_languages/1.0.0/e50b15921a24b8e33941240fe19542e492392357f145d933bd497812181d610d)


Generating validation split: 0 examples [00:00, ? examples/s]

Found cached dataset multi_eurlex (/Users/test/.cache/huggingface/datasets/multi_eurlex/all_languages/1.0.0/e50b15921a24b8e33941240fe19542e492392357f145d933bd497812181d610d)


Generating test split: 0 examples [00:00, ? examples/s]

Found cached dataset multi_eurlex (/Users/test/.cache/huggingface/datasets/multi_eurlex/all_languages/1.0.0/e50b15921a24b8e33941240fe19542e492392357f145d933bd497812181d610d)


Dataset lextreme downloaded and prepared to /Users/test/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_2/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415. Subsequent calls will reuse this data.


 36%|████████████▌                      | 158/441 [5:48:27<33:04:06, 420.66s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_2/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
 36%|████████████▌                      | 159/441 [5:49:15<24:11:56, 308.92s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_2/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
 36%|████████████▋                      | 160/441 [5:50:40<18:52:39, 241.85s/it]Found cached dataset lextreme (/Users/test/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_2/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)


In [None]:
template.to_excel('/Users/test/Downloads/overview_sizes.xlsx', index=False)