In [1]:
from dataclasses import dataclass
from typing import Optional

import pandas as pd

from nusantara import NusantaraMetadata, NusantaraConfigHelper
from nusantara.utils.constants import Tasks, TASK_TO_SCHEMA

In [2]:
# Build the helper object and dump everything it knows about.
conhelps = NusantaraConfigHelper()
print('All Configs')
print(conhelps)


# Filter and load datasets
# --------------------------------------------------------------------
def _is_smsa_nusantara(meta):
    """Select nusantara-schema configs whose dataset name contains "smsa"."""
    return "smsa" in meta.dataset_name and meta.is_nusantara_schema


print('Retrieve SMSA')
print(list(conhelps.filtered(_is_smsa_nusantara)))
smsa_datasets = []
for smsa_helper in conhelps.filtered(_is_smsa_nusantara):
    smsa_datasets.append(smsa_helper.load_dataset())
print(smsa_datasets)


# Examples of other filters
# --------------------------------------------------------------------

# Every config helper that uses the "source" schema.
print('Source datasets')
source_helpers = conhelps.filtered(lambda meta: meta.config.schema == "source")
print(source_helpers)

# Every config helper flagged as a nusantara schema.
print('Nusantara datasets')
nusantara_helpers = conhelps.filtered(lambda meta: meta.is_nusantara_schema)
print(nusantara_helpers)


def _is_public_nusantara_ner(meta):
    """Select non-local nusantara-schema configs that list the NER task."""
    if not meta.is_nusantara_schema:
        return False
    if Tasks.NAMED_ENTITY_RECOGNITION not in meta.tasks:
        return False
    return not meta.is_local


# Nusantara-schema NER configs that are not local.
print('Nusantara NER public datasets')
nc_ner_public_helpers = conhelps.filtered(_is_public_nusantara_ner)
print(nc_ner_public_helpers)

# Nusantara-schema configs whose dataset name contains "indolem".
print('IndoLEM datasets')
nc_indolem_helpers = conhelps.filtered(
    lambda meta: "indolem" in meta.dataset_name and meta.is_nusantara_schema
)
print(nc_indolem_helpers)



All Configs
NusantaraMetadata(script='/home/samuel/nusantara-datasets/nusantara/nusa_datasets/bible_en_id/bible_en_id.py', dataset_name='bible_en_id', tasks=[<Tasks.MACHINE_TRANSLATION: 'MT'>], languages=['ind', 'eng'], config=NusantaraConfig(name='bible_en_id_source', version=1.0.0, data_dir=None, data_files=None, description='Bible En-Id source schema', schema='source', subset_id='bible_en_id'), is_local=False, is_nusantara_schema=False, nusantara_schema_caps=None, is_large=False, is_resource=False, is_default=True, is_broken=False, nusantara_version='1.0.0', source_version='1.0.0', citation='@inproceedings{cahyawijaya-etal-2021-indonlg,\n    title = "{I}ndo{NLG}: Benchmark and Resources for Evaluating {I}ndonesian Natural Language Generation",\n    author = "Cahyawijaya, Samuel  and\n      Winata, Genta Indra  and\n      Wilie, Bryan  and\n      Vincentio, Karissa  and\n      Li, Xiaohong  and\n      Kuncoro, Adhiguna  and\n      Ruder, Sebastian  and\n      Lim, Zhi Yuan  and\n    

  0%|          | 0/3 [00:00<?, ?it/s]

[DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 11000
    })
    validation: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 500
    })
})]
Source datasets
NusantaraMetadata(script='/home/samuel/nusantara-datasets/nusantara/nusa_datasets/bible_en_id/bible_en_id.py', dataset_name='bible_en_id', tasks=[<Tasks.MACHINE_TRANSLATION: 'MT'>], languages=['ind', 'eng'], config=NusantaraConfig(name='bible_en_id_source', version=1.0.0, data_dir=None, data_files=None, description='Bible En-Id source schema', schema='source', subset_id='bible_en_id'), is_local=False, is_nusantara_schema=False, nusantara_schema_caps=None, is_large=False, is_resource=False, is_default=True, is_broken=False, nusantara_version='1.0.0', source_version='1.0.0', citation='@inproceedings{cahyawijaya-etal-2021-indonlg,\n    title = "{I}ndo{NLG}: Benchmark and Re

In [3]:
# Spreadsheet header -> snake_case column name used throughout the notebook.
# The sheet's columns, in their original order, are:
#   'No.', 'Name', 'Subsets', 'Link', 'HF Link', 'License', 'Year',
#   'Language', 'Dialect', 'Domain', 'Form', 'Collection Style',
#   'Description', 'Volume', 'Unit', 'Ethical Risks', 'Provider',
#   'Paper Title', 'Paper Link', 'Access', 'Derived From', 'Tasks',
#   'Test Split', 'Notes', 'Dataloader', 'Implemented'
sheet_column_names = {
    'No.': 'id',
    'Name': 'name',
    'Subsets': 'subsets',
    'Link': 'source_link',
    'Description': 'description',
    'HF Link': 'hf_link',
    'License': 'license',
    'Year': 'year',
    'Collection Style': 'collection_style',
    'Language': 'language',
    'Dialect': 'dialect',
    'Domain': 'domain',
    'Form': 'modality',
    'Tasks': 'tasks',
    'Volume': 'volume',
    'Unit': 'unit',
    'Ethical Risks': 'ethical_risk',
    'Provider': 'provider',
    'Paper Title': 'paper_title',
    'Paper Link': 'paper_link',
    'Access': 'access',
    'Derived From': 'derived_from',
    'Test Split': 'is_splitted',
    'Notes': 'notes',
    'Dataloader': 'dataloader',
    'Implemented': 'implemented',
}

# Read the CSV export of the sheet, skipping its first line.
sheet_df = pd.read_csv('https://docs.google.com/spreadsheets/d/17o83IvWxmtGLYridZis0nEprHhsZIMeFtHGtXV35h6M/export?format=csv&gid=879729812', skiprows=1)

# Keep only rows with a positive 'Implemented' value, rename the columns, and
# turn the 'Test Split' text into a boolean (True only for the exact value 'Yes').
meta_df = (
    sheet_df
    .loc[sheet_df['Implemented'] > 0]
    .rename(columns=sheet_column_names)
    .assign(is_splitted=lambda frame: frame['is_splitted'].eq('Yes'))
)

In [4]:
@dataclass
class MetaDict:
    """Thin wrapper that holds a dict in a single attribute.

    The notebook stores one instance per row in the ``metadata`` column of
    ``meta_df``; presumably the wrapper exists so pandas treats the mapping as
    one scalar cell value rather than trying to unpack it (TODO confirm).
    """

    # Mapping carried by the wrapper; None when nothing has been attached.
    data: Optional[dict] = None

In [5]:
# Columns of meta_df that are filled from the dataloader configs.
config_columns = [
    'is_large', 'is_resource', 'is_default', 'is_broken',
    'is_local', 'citation', 'license', 'homepage', 'tasks'
]

# dataset name -> schema name -> list of config metadata objects
name_to_meta_map = {}
for cfg_meta in conhelps:
    # Copy the config's attributes onto every sheet row whose dataloader
    # matches this config's dataset name; tasks are stored as '|'-joined codes.
    row_mask = meta_df.dataloader == cfg_meta.dataset_name
    task_codes = '|'.join(task.value for task in cfg_meta.tasks)
    meta_df.loc[row_mask, config_columns] = [
        cfg_meta.is_large,
        cfg_meta.is_resource,
        cfg_meta.is_default,
        cfg_meta.is_broken,
        cfg_meta.is_local,
        cfg_meta.citation,
        cfg_meta.license,
        cfg_meta.homepage,
        task_codes,
    ]

    # Group the config under its dataset name and schema.
    schema_to_metas = name_to_meta_map.setdefault(cfg_meta.dataset_name, {})
    schema_to_metas.setdefault(cfg_meta.config.schema, []).append(cfg_meta)

# Attach each dataset's grouped configs to its sheet rows, wrapped in MetaDict.
for dset_name, grouped_metas in name_to_meta_map.items():
    meta_df.loc[meta_df.dataloader == dset_name, 'metadata'] = MetaDict(data=grouped_metas)

In [6]:
# Filter & load all Indonesian sentiment analysis task
lang = 'ind'
task = Tasks.SENTIMENT_ANALYSIS

filtered_df = meta_df.loc[
    (meta_df.tasks.str.contains(task.value)) & 
    (meta_df.language.str.contains(lang)) & meta_df.is_splitted
]

schema = f'nusantara_{TASK_TO_SCHEMA[task].lower()}'
datasets = {}
for metas in filtered_df.metadata:
    if schema in metas.data:
        for meta in metas.data[schema]:
            if len(meta.languages) > 1:
                if lang in meta.config.name:
                    datasets[meta.config.name] = meta.load_dataset()
            else:
                datasets[meta.config.name] = meta.load_dataset()

datasets



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

{'indolem_sentiment_nusantara_text': DatasetDict({
     train: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 3638
     })
     test: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 1011
     })
     validation: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 399
     })
 }),
 'nusax_senti_ind_nusantara_text': DatasetDict({
     train: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 500
     })
     validation: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 100
     })
     test: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 400
     })
 }),
 'smsa_nusantara_text': DatasetDict({
     train: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 11000
     })
     validation: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 1260
     })
     test: Dataset({
         features: ['id', 'text', 'la

In [7]:
# Filter & load all splitted English sentiment analysis task
lang = 'eng'
task = Tasks.MACHINE_TRANSLATION

filtered_df = meta_df.loc[
    (meta_df.tasks.str.contains(task.value)) & 
    (meta_df.language.str.contains(lang)) & meta_df.is_splitted
]

schema = f'nusantara_{TASK_TO_SCHEMA[task].lower()}'
datasets = {}
for metas in filtered_df.metadata:
    if schema in metas.data:
        for meta in metas.data[schema]:
            if len(meta.languages) > 1:
                if lang in meta.config.name:
                    datasets[meta.config.name] = meta.load_dataset()
            else:
                datasets[meta.config.name] = meta.load_dataset()
                
datasets



  0%|          | 0/3 [00:00<?, ?it/s]

{'indo_general_mt_en_id_nusantara_t2t': DatasetDict({
     train: Dataset({
         features: ['id', 'text_1', 'text_2', 'text_1_name', 'text_2_name'],
         num_rows: 1821716
     })
     test: Dataset({
         features: ['id', 'text_1', 'text_2', 'text_1_name', 'text_2_name'],
         num_rows: 2000
     })
     validation: Dataset({
         features: ['id', 'text_1', 'text_2', 'text_1_name', 'text_2_name'],
         num_rows: 2000
     })
 })}

In [12]:
 # Show the sheet rows whose dataset name contains "CSUI". na=False keeps the
 # mask boolean when a name is missing; regex=False matches the text literally.
 meta_df.loc[meta_df.name.str.contains('CSUI', na=False, regex=False)]


26    POS
Name: tasks, dtype: object

In [15]:
# Print the config name of every config that declares more than one task.
# The metadata object exposes `dataset_name` and `config.name` (both used
# elsewhere in this notebook) but no top-level `name` in its printed fields,
# so the config name is read through `config`.
for cfg_meta in conhelps:
    if len(cfg_meta.tasks) > 1:
        print(cfg_meta.config.name)

In [None]:
# Filter & load every Indonesian sentiment-analysis dataset marked as split.
# NOTE(review): this cell repeats the earlier Indonesian sentiment-analysis
# cell; consider keeping only one of them.
lang = 'ind'
task = Tasks.SENTIMENT_ANALYSIS

# na=False keeps the mask strictly boolean when a tasks/language cell is
# missing; regex=False matches the code as a literal substring.
filtered_df = meta_df.loc[
    meta_df.tasks.str.contains(task.value, na=False, regex=False)
    & meta_df.language.str.contains(lang, na=False, regex=False)
    & meta_df.is_splitted
]

# Name of the nusantara schema that corresponds to the selected task.
schema = f'nusantara_{TASK_TO_SCHEMA[task].lower()}'

# config name -> loaded dataset
datasets = {}
for metas in filtered_df.metadata:
    if schema not in metas.data:
        continue
    for meta in metas.data[schema]:
        # Configs listing several languages are loaded only when the language
        # code appears in the config name; single-language configs always load.
        if len(meta.languages) > 1 and lang not in meta.config.name:
            continue
        datasets[meta.config.name] = meta.load_dataset()

datasets

In [None]:
If a dataset has multiple tasks, split them and filter by schema, so that the result is
the config metadata derived from the original dataset metadata.