In [10]:
import json as js
import pandas as pd
from datasets import load_dataset, Dataset
from tqdm import tqdm
tqdm.pandas()
from ast import literal_eval
from traceback import print_exc

In [2]:
# Load the task metadata used by the cells below.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default text encoding.
with open("meta_infos.json", "r", encoding="utf-8") as f:
    meta_infos = js.load(f)

In [3]:
def split_into_languages(dataset):
    """Explode a multilingual dataset into one row per (language, document).

    Every record's ``input`` field is parsed with ``literal_eval`` and its
    ``.items()`` are iterated as (language, document) pairs. Pairs whose
    document is ``None`` are skipped; the record's ``label`` is copied
    unchanged onto every row produced from it.

    Parameters
    ----------
    dataset
        Anything ``pd.DataFrame`` can consume that yields ``input`` and
        ``label`` columns.

    Returns
    -------
    Dataset
        Built via ``Dataset.from_pandas`` from rows with the keys
        ``language``, ``input`` (document converted with ``str``) and ``label``.
    """

    def _exploded_rows(records):
        # Yield one flat row for every non-missing translation of a record.
        for record in records:
            record_label = record['label']
            translations = literal_eval(record['input'])
            for lang_code, text in translations.items():
                if text is None:
                    continue
                yield {
                    'language': lang_code,
                    'input': str(text),
                    'label': record_label,
                }

    source_records = pd.DataFrame(dataset).to_dict(orient='records')
    exploded_frame = pd.DataFrame(list(_exploded_rows(source_records)))

    return Dataset.from_pandas(exploded_frame)

In [21]:
# Exploratory load of a single task/split to inspect its structure.
dataset = load_dataset(
    path="joelito/lextreme",
    name='brazilian_court_decisions_judgment',
    split='train',
)
dataset

Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/brazilian_court_decisions_judgment/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)


Dataset({
    features: ['input', 'label', 'language'],
    num_rows: 3234
})

In [23]:
# Peek at the label of the first example.
dataset[0]['label']

1

In [33]:
def get_number_of_labels(dataset_object):
    """Return the set of distinct labels in ``dataset_object['label']``.

    Despite its name, this returns the *set* of labels rather than a count;
    callers take ``len()`` of the result.

    Each entry of the label column is handled on its own: a ``list`` entry
    (multi-label row) contributes every element it contains, any other entry
    is treated as a single label. An empty label column yields an empty set
    instead of raising ``IndexError``.

    Parameters
    ----------
    dataset_object
        Any object supporting ``dataset_object['label']`` that yields an
        iterable of labels or of lists of labels.

    Returns
    -------
    set
        The distinct label values.
    """
    distinct_labels = set()
    for entry in dataset_object['label']:
        # Only ``list`` marks a multi-label row; other hashable values
        # (ints, strings, tuples) are kept as single labels.
        if isinstance(entry, list):
            distinct_labels.update(entry)
        else:
            distinct_labels.add(entry)
    return distinct_labels


In [36]:
def get_sample_size(finetuning_task, language, template):
    """Record example and label counts for every split of one task.

    For each of the ``train``, ``validation`` and ``test`` splits, the task is
    loaded from ``joelito/lextreme`` and one summary dict is appended to
    ``template``. Previously ``language == "all"`` returned on the first split
    without appending anything, so "all" rows never reached the overview;
    now every split is recorded for "all" as well.

    Parameters
    ----------
    finetuning_task : str
        Configuration name passed to ``load_dataset``. Tasks whose name
        contains ``'multi_eurlex'`` are first expanded with
        ``split_into_languages``.
    language : str
        Value of the ``language`` column to keep, or ``"all"`` to use every
        row of the split.
    template : list
        Mutated in place. One dict with the keys ``finetuning_task``,
        ``split``, ``language``, ``number_of_examples`` and
        ``number_of_labels`` is appended per successfully processed split.

    Returns
    -------
    int or None
        For ``language == "all"``, the size of the first split that loaded
        successfully (the value the earlier implementation returned, kept for
        backward compatibility); otherwise ``None``.

    Notes
    -----
    Errors are deliberately not propagated: a failing split is reported with
    the task/split/language and a traceback, and processing continues.
    """
    first_all_size = None

    for split in ['train', 'validation', 'test']:
        try:
            dataset = load_dataset("joelito/lextreme", finetuning_task, split=split)

            if 'multi_eurlex' in finetuning_task:
                dataset = split_into_languages(dataset)

            if language == "all":
                number_of_examples = len(dataset['input'])
                if first_all_size is None:
                    first_all_size = number_of_examples
            else:
                filtered = pd.DataFrame(dataset)
                filtered = filtered[filtered.language == language]
                number_of_examples = filtered.shape[0]
                # Convert back so the label column is read the same way as
                # for an unfiltered dataset.
                dataset = Dataset.from_pandas(filtered)

            labels = get_number_of_labels(dataset)

            template.append({
                'finetuning_task': finetuning_task,
                'split': split,
                'language': language,
                'number_of_examples': number_of_examples,
                'number_of_labels': len(labels),
            })
        except Exception:
            # Best effort: report the failing combination and keep going.
            print(finetuning_task, split, language)
            print_exc()

    return first_all_size

In [None]:
# Collect one overview row per (task, split, language) combination.
template_list = []

for task, languages in meta_infos["task_language_mapping"].items():
    print(task)
    # Work on a copy: appending 'all' to the list stored inside meta_infos
    # made this cell non-idempotent (each re-run added another 'all' entry
    # and therefore duplicated work and rows).
    languages_to_process = list(languages)
    if len(languages_to_process) > 1 and 'all' not in languages_to_process:
        languages_to_process.append('all')
    for lang in languages_to_process:
        get_sample_size(task, lang, template_list)

brazilian_court_decisions_judgment


Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/brazilian_court_decisions_judgment/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/brazilian_court_decisions_judgment/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/brazilian_court_decisions_judgment/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)


brazilian_court_decisions_unanimity


Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/brazilian_court_decisions_unanimity/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/brazilian_court_decisions_unanimity/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/brazilian_court_decisions_unanimity/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)


german_argument_mining


Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/german_argument_mining/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/german_argument_mining/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/german_argument_mining/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)


greek_legal_code_chapter


Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/greek_legal_code_chapter/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/greek_legal_code_chapter/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/greek_legal_code_chapter/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)


greek_legal_code_subject


Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/greek_legal_code_subject/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/greek_legal_code_subject/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/greek_legal_code_subject/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)


greek_legal_code_volume


Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/greek_legal_code_volume/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/greek_legal_code_volume/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/greek_legal_code_volume/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)


swiss_judgment_prediction


Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/swiss_judgment_prediction/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/swiss_judgment_prediction/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/swiss_judgment_prediction/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/swiss_judgment_prediction/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/swiss_judgment_prediction/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset

online_terms_of_service_unfairness_levels


Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/online_terms_of_service_unfairness_levels/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/online_terms_of_service_unfairness_levels/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/online_terms_of_service_unfairness_levels/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/online_terms_of_service_unfairness_levels/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/online_terms_of_service_unfairness_levels/1.0.0/43e540

online_terms_of_service_clause_topics


Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/online_terms_of_service_clause_topics/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/online_terms_of_service_clause_topics/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/online_terms_of_service_clause_topics/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/online_terms_of_service_clause_topics/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/online_terms_of_service_clause_topics/1.0.0/43e5402e22c4ffbf0de470c8e8

covid19_emergency_event


Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/covid19_emergency_event/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/covid19_emergency_event/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/covid19_emergency_event/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/covid19_emergency_event/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/covid19_emergency_event/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme 

multi_eurlex_level_1


Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonma

Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_1/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonma

multi_eurlex_level_2


Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_2/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_2/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_2/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_2/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonmatoshi/.cache/huggingface/datasets/joelito___lextreme/multi_eurlex_level_2/1.0.0/43e5402e22c4ffbf0de470c8e84115dfed02ba82d1f3dad643bbf67fd9b8b415)
Found cached dataset lextreme (/Users/vetonma

In [None]:
# Turn the collected per-split summary dicts into a table.
template = pd.DataFrame(data=template_list)

In [None]:
# Write the overview relative to the current working directory (the same
# convention used to read meta_infos.json) instead of a machine-specific
# absolute home-directory path, so the cell runs on any machine.
OUTPUT_FILE = 'overview_sizes.xlsx'
template.to_excel(OUTPUT_FILE, index=False)

