In [1]:
from dataclasses import dataclass
from typing import Optional

import pandas as pd

from nusacrowd import NusantaraMetadata, NusantaraConfigHelper, NusantaraMetadataHelper
from nusacrowd.utils.constants import Tasks, TASK_TO_SCHEMA

In [2]:
# Config helper used by the rest of the notebook; printing it lists one
# NusantaraMetadata entry per config.
conhelps = NusantaraConfigHelper()
print('All Configs')
print(conhelps)

# filter and load datasets
# ====================================================================
# Evaluate the SMSA filter once and reuse the result for both the listing and
# the loading step, instead of writing (and running) the same predicate twice.
print('Retrieve SMSA')
smsa_helpers = list(
    conhelps.filtered(
        lambda x: ("smsa" in x.dataset_name and x.is_nusantara_schema)
    )
)
print(smsa_helpers)
smsa_datasets = [helper.load_dataset() for helper in smsa_helpers]
print(smsa_datasets)

# Further filter examples
# ====================================================================
# Each predicate receives one config metadata entry and is handed to
# `conhelps.filtered(...)`.


def _has_source_schema(entry):
    """Match entries whose config schema is ``"source"``."""
    return entry.config.schema == "source"


def _has_nusantara_schema(entry):
    """Match entries flagged with ``is_nusantara_schema``."""
    return entry.is_nusantara_schema


def _is_public_nusantara_ner(entry):
    """Match nusantara-schema entries that list the NER task and are not local."""
    return (
        entry.is_nusantara_schema
        and Tasks.NAMED_ENTITY_RECOGNITION in entry.tasks
        and not entry.is_local
    )


def _is_nusantara_indolem(entry):
    """Match nusantara-schema entries whose dataset name contains ``indolem``."""
    return "indolem" in entry.dataset_name and entry.is_nusantara_schema


# every config exposed in the source schema
print('Source datasets')
source_helpers = conhelps.filtered(_has_source_schema)
print(source_helpers)

# every config exposed in a nusantara schema
print('Nusantara datasets')
nusantara_helpers = conhelps.filtered(_has_nusantara_schema)
print(nusantara_helpers)

# nusantara-schema NER configs that are not local
print('Nusantara NER public datasets')
nc_ner_public_helpers = conhelps.filtered(_is_public_nusantara_ner)
print(nc_ner_public_helpers)

# nusantara-schema configs of the IndoLEM datasets
print('IndoLEM datasets')
nc_indolem_helpers = conhelps.filtered(_is_nusantara_indolem)
print(nc_indolem_helpers)



All Configs
NusantaraMetadata(script='/home/samuel/nusantara-datasets/nusantara/nusa_datasets/bible_en_id/bible_en_id.py', dataset_name='bible_en_id', tasks=[<Tasks.MACHINE_TRANSLATION: 'MT'>], languages=['ind', 'eng'], config=NusantaraConfig(name='bible_en_id_source', version=1.0.0, data_dir=None, data_files=None, description='Bible En-Id source schema', schema='source', subset_id='bible_en_id'), is_local=False, is_nusantara_schema=False, nusantara_schema_caps=None, is_large=False, is_resource=False, is_default=True, is_broken=False, nusantara_version='1.0.0', source_version='1.0.0', citation='@inproceedings{cahyawijaya-etal-2021-indonlg,\n    title = "{I}ndo{NLG}: Benchmark and Resources for Evaluating {I}ndonesian Natural Language Generation",\n    author = "Cahyawijaya, Samuel  and\n      Winata, Genta Indra  and\n      Wilie, Bryan  and\n      Vincentio, Karissa  and\n      Li, Xiaohong  and\n      Kuncoro, Adhiguna  and\n      Ruder, Sebastian  and\n      Lim, Zhi Yuan  and\n    

  0%|          | 0/3 [00:00<?, ?it/s]

[DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 11000
    })
    validation: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 500
    })
})]
Source datasets
NusantaraMetadata(script='/home/samuel/nusantara-datasets/nusantara/nusa_datasets/bible_en_id/bible_en_id.py', dataset_name='bible_en_id', tasks=[<Tasks.MACHINE_TRANSLATION: 'MT'>], languages=['ind', 'eng'], config=NusantaraConfig(name='bible_en_id_source', version=1.0.0, data_dir=None, data_files=None, description='Bible En-Id source schema', schema='source', subset_id='bible_en_id'), is_local=False, is_nusantara_schema=False, nusantara_schema_caps=None, is_large=False, is_resource=False, is_default=True, is_broken=False, nusantara_version='1.0.0', source_version='1.0.0', citation='@inproceedings{cahyawijaya-etal-2021-indonlg,\n    title = "{I}ndo{NLG}: Benchmark and Re

In [14]:
# Instantiate the metadata helper imported from nusacrowd.
# NOTE(review): apart from the display cell that follows, no cell visible in
# this notebook reads `metadata_helper` — confirm it is still needed.
metadata_helper = NusantaraMetadataHelper()

In [15]:
# Echo the helper object; the recorded output is only the default
# `<... object at 0x...>` repr, so this cell shows no metadata content.
metadata_helper

<nusantara.config_helper.NusantaraMetadataHelper at 0x7f7ad20432b0>

In [16]:
def _is_t2t_config(entry):
    """Select nusantara-schema, non-resource configs whose name contains ``t2t``.

    For datasets whose name contains ``nusax_mt`` the config name must also
    contain ``ind``.
    """
    config_name = entry.config.name
    if 't2t' not in config_name or not entry.is_nusantara_schema:
        return False
    if 'nusax_mt' in entry.dataset_name and 'ind' not in config_name:
        return False
    return not entry.is_resource


t2t_config_names = [entry.config.name for entry in conhelps.filtered(_is_t2t_config)]
print(t2t_config_names)

['bible_en_id_nusantara_t2t', 'bible_jv_id_nusantara_t2t', 'bible_su_id_nusantara_t2t', 'covost2_ind_eng_nusantara_t2t', 'covost2_eng_ind_nusantara_t2t', 'id_panl_bppt_nusantara_t2t', 'id_qqp_nusantara_t2t', 'id_wiki_parallel_jav_ind_nusantara_t2t', 'id_wiki_parallel_min_ind_nusantara_t2t', 'id_wiki_parallel_sun_ind_nusantara_t2t', 'indo_general_mt_en_id_nusantara_t2t', 'indo_religious_mt_en_id_nusantara_t2t', 'indosum_fold0_nusantara_t2t', 'indosum_fold1_nusantara_t2t', 'indosum_fold2_nusantara_t2t', 'indosum_fold3_nusantara_t2t', 'indosum_fold4_nusantara_t2t', 'korpus_nusantara_ind_jav_nusantara_t2t', 'korpus_nusantara_ind_day_nusantara_t2t', 'korpus_nusantara_ind_bug_nusantara_t2t', 'korpus_nusantara_ind_sun_nusantara_t2t', 'korpus_nusantara_ind_mad_nusantara_t2t', 'korpus_nusantara_ind_bin_nusantara_t2t', 'korpus_nusantara_ind_bbc_nusantara_t2t', 'korpus_nusantara_ind_khek_nusantara_t2t', 'korpus_nusantara_ind_msa_nusantara_t2t', 'korpus_nusantara_ind_min_nusantara_t2t', 'korpus_nu

In [17]:
def _is_text_or_pair_config(entry):
    """Select nusantara-schema, non-resource configs named ``*_text*`` or ``*_pair*``."""
    config_name = entry.config.name
    if '_text' not in config_name and '_pair' not in config_name:
        return False
    if not entry.is_nusantara_schema:
        return False
    return not entry.is_resource


text_or_pair_config_names = [
    entry.config.name for entry in conhelps.filtered(_is_text_or_pair_config)
]
print(text_or_pair_config_names)

['casa_nusantara_text_multi', 'emot_nusantara_text', 'emotcmt_nusantara_text', 'emotion_id_opinion_nusantara_text', 'hoasa_nusantara_text_multi', 'id_abusive_nusantara_text', 'id_abusive_news_comment_nusantara_text', 'id_clickbait_nusantara_text', 'id_google_play_review_nusantara_text', 'id_google_play_review_posneg_nusantara_text', 'id_hatespeech_nusantara_text', 'id_hoax_news_nusantara_text', 'id_multilabel_hs_nusantara_text_multi', 'id_short_answer_grading_nusantara_pairs', 'id_stance_nusantara_pairs', 'id_sts_nusantara_pairs_score', 'imdb_jv_nusantara_text', 'indolem_ntp_nusantara_pairs', 'indolem_sentiment_nusantara_text', 'indonli_nusantara_pairs', 'jadi_ide_nusantara_text', 'local_id_abusive_jav_nusantara_text_multi', 'local_id_abusive_sun_nusantara_text_multi', 'netifier_nusantara_text_multi', 'nusax_senti_ace_nusantara_text', 'nusax_senti_ban_nusantara_text', 'nusax_senti_bjn_nusantara_text', 'nusax_senti_bug_nusantara_text', 'nusax_senti_eng_nusantara_text', 'nusax_senti_ind_

In [18]:
@dataclass
class MetaDict:
    """Single-attribute wrapper around a dict.

    Instances are stored as the values of the ``metadata`` column of
    ``meta_df``.
    NOTE(review): presumably the wrapper exists so pandas treats the mapping as
    one opaque cell value instead of unpacking it on assignment — confirm.
    """

    # Wrapped mapping; None when no mapping was supplied.
    data: Optional[dict] = None

In [41]:
# CSV export of the dataset catalogue spreadsheet.
META_SHEET_CSV_URL = 'https://docs.google.com/spreadsheets/d/17o83IvWxmtGLYridZis0nEprHhsZIMeFtHGtXV35h6M/export?format=csv&gid=879729812'

# Spreadsheet header -> column name used by the rest of the notebook.
# Original sheet columns, for reference:
#  'No.', 'Name', 'Subsets', 'Link', 'HF Link', 'License', 'Year',
#  'Language', 'Dialect', 'Domain', 'Form', 'Collection Style',
#  'Description', 'Volume', 'Unit', 'Ethical Risks', 'Provider',
#  'Paper Title', 'Paper Link', 'Access', 'Derived From', 'Tasks',
#  'Test Split', 'Notes', 'Dataloader', 'Implemented'
META_COLUMN_RENAMES = {
    'No.': 'id', 'Name': 'name', 'Subsets': 'subsets', 'Link': 'source_link', 'Description': 'description',
    'HF Link': 'hf_link', 'License': 'license', 'Year': 'year', 'Collection Style': 'collection_style',
    'Language': 'language', 'Dialect': 'dialect', 'Domain': 'domain', 'Form': 'modality', 'Tasks': 'tasks',
    'Volume': 'volume', 'Unit': 'unit', 'Ethical Risks': 'ethical_risk', 'Provider': 'provider',
    'Paper Title': 'paper_title', 'Paper Link': 'paper_link', 'Access': 'access', 'Derived From': 'derived_from',
    'Test Split': 'is_splitted', 'Notes': 'notes', 'Dataloader': 'dataloader', 'Implemented': 'implemented'
}

# Built as one chain from the raw download so re-running the cell always gives
# the same frame (no in-place mutation of an earlier `meta_df`).
meta_df = (
    # skiprows=1: the first line of the export is skipped; the next line is the header.
    pd.read_csv(META_SHEET_CSV_URL, skiprows=1)
    # Drop rows whose 'Implemented' value is 0.
    .loc[lambda df: df['Implemented'] != 0]
    .rename(columns=META_COLUMN_RENAMES)
    # Vectorised comparison: True only where the sheet says exactly 'Yes'
    # (anything else, including missing values, becomes False).
    .assign(is_splitted=lambda df: df['is_splitted'] == 'Yes')
)

In [23]:
# meta_df columns that are filled from each config's metadata.
_CONFIG_DERIVED_COLUMNS = [
    'is_large', 'is_resource', 'is_default', 'is_broken',
    'is_local', 'citation', 'license', 'homepage', 'tasks'
]

# dataset name -> schema -> list of config metadata entries
name_to_meta_map = {}
for entry in conhelps:
    # Copy this config's flags onto the catalogue rows whose dataloader matches.
    # The assignment runs once per config, so for a dataset with several
    # configs the last one iterated determines the stored values.
    rows_for_dataset = meta_df.dataloader == entry.dataset_name
    meta_df.loc[rows_for_dataset, _CONFIG_DERIVED_COLUMNS] = [
        entry.is_large, entry.is_resource, entry.is_default, entry.is_broken,
        entry.is_local, entry.citation, entry.license, entry.homepage,
        '|'.join(task.value for task in entry.tasks),
    ]

    # Group the entry under its dataset name and schema.
    per_schema = name_to_meta_map.setdefault(entry.dataset_name, {})
    per_schema.setdefault(entry.config.schema, []).append(entry)

# Rows that matched no config keep missing values in the new columns; turn
# every missing value in the frame into False.
meta_df = meta_df.fillna(False)

# Attach the grouped entries, wrapped in MetaDict, to the matching rows.
for dset_name, schema_map in name_to_meta_map.items():
    meta_df.loc[meta_df.dataloader == dset_name, 'metadata'] = MetaDict(data=schema_map)

In [10]:
# Filter & load all Indonesian sentiment analysis datasets that have a test
# split and are not flagged as resources.
lang = 'ind'
task = Tasks.SENTIMENT_ANALYSIS

# na=False: cells that are not strings (e.g. the False left by fillna) count as
# "no match" instead of producing NaN in the mask.
# astype(bool): the flag columns may be object-dtype after fillna(False); casting
# makes `~` a logical NOT rather than a bitwise invert of Python bools.
# NOTE(review): str.contains is a substring match on the '|'-joined task codes,
# so a code that merely contains task.value would also match — verify against Tasks.
filtered_df = meta_df.loc[
    meta_df.tasks.str.contains(task.value, na=False)
    & ~meta_df.is_resource.astype(bool)
    & meta_df.language.str.contains(lang, na=False)
    & meta_df.is_splitted.astype(bool)
]

schema = f'nusantara_{TASK_TO_SCHEMA[task].lower()}'
datasets = {}
for metas in filtered_df.metadata:
    # Rows without a matching dataloader never received a MetaDict; treat
    # anything lacking a `.data` mapping as having no configs.
    schema_map = getattr(metas, 'data', None) or {}
    for meta in schema_map.get(schema, []):
        # Multi-language datasets: only load configs whose name mentions `lang`.
        if len(meta.languages) > 1 and lang not in meta.config.name:
            continue
        datasets[meta.config.name] = meta.load_dataset()
datasets



  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

{'sentiment_nathasa_review_nusantara_text': DatasetDict({
     train: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 62132
     })
     test: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 62131
     })
 }),
 'indolem_sentiment_nusantara_text': DatasetDict({
     train: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 3638
     })
     test: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 1011
     })
     validation: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 399
     })
 }),
 'nusax_senti_ind_nusantara_text': DatasetDict({
     train: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 500
     })
     validation: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 100
     })
     test: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 400
     })
 }),
 'smsa_nusantara_text': DatasetDict({


In [14]:
# Filter & load all machine translation datasets that involve English and have
# a test split.
lang = 'eng'
task = Tasks.MACHINE_TRANSLATION

# na=False: cells that are not strings (e.g. the False left by fillna) count as
# "no match" instead of producing NaN in the mask.
# NOTE(review): str.contains is a substring match on the '|'-joined task codes,
# so a code that merely contains task.value would also match — verify against Tasks.
filtered_df = meta_df.loc[
    meta_df.tasks.str.contains(task.value, na=False)
    & meta_df.language.str.contains(lang, na=False)
    & meta_df.is_splitted.astype(bool)
]

schema = f'nusantara_{TASK_TO_SCHEMA[task].lower()}'
datasets = {}
for metas in filtered_df.metadata:
    # Rows without a matching dataloader never received a MetaDict; treat
    # anything lacking a `.data` mapping as having no configs.
    schema_map = getattr(metas, 'data', None) or {}
    for meta in schema_map.get(schema, []):
        # Multi-language datasets: only load configs whose name mentions `lang`.
        if len(meta.languages) > 1 and lang not in meta.config.name:
            continue
        datasets[meta.config.name] = meta.load_dataset()

datasets



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

{'indo_general_mt_en_id_nusantara_t2t': DatasetDict({
     train: Dataset({
         features: ['id', 'text_1', 'text_2', 'text_1_name', 'text_2_name'],
         num_rows: 1821716
     })
     test: Dataset({
         features: ['id', 'text_1', 'text_2', 'text_1_name', 'text_2_name'],
         num_rows: 2000
     })
     validation: Dataset({
         features: ['id', 'text_1', 'text_2', 'text_1_name', 'text_2_name'],
         num_rows: 2000
     })
 }),
 'nusax_mt_ace_eng_nusantara_t2t': DatasetDict({
     train: Dataset({
         features: ['id', 'text_1', 'text_2', 'text_1_name', 'text_2_name'],
         num_rows: 500
     })
     validation: Dataset({
         features: ['id', 'text_1', 'text_2', 'text_1_name', 'text_2_name'],
         num_rows: 100
     })
     test: Dataset({
         features: ['id', 'text_1', 'text_2', 'text_1_name', 'text_2_name'],
         num_rows: 400
     })
 }),
 'nusax_mt_ban_eng_nusantara_t2t': DatasetDict({
     train: Dataset({
         features: ['i

In [15]:
# Show the catalogue rows whose dataset name contains the substring 'CSUI'.
csui_name_mask = meta_df['name'].str.contains('CSUI')
meta_df.loc[csui_name_mask]

Unnamed: 0,id,name,subsets,source_link,hf_link,license,year,language,dialect,domain,...,dataloader,implemented,is_large,is_resource,is_default,is_broken,is_local,citation,homepage,metadata
29,27,Idn-tagged-corpus-CSUI,False,https://github.com/ir-nlp-csui/idn-tagged-corp...,https://huggingface.co/datasets/indonlu,Creative Commons Attribution Share-Alike 4.0 I...,2014.0,ind,formal,news articles,...,idn_tagged_corpus_csui,17.0,False,False,False,False,False,"@inproceedings{dinakaramani2014designing,\n t...",https://bahasa.cs.ui.ac.id/postag/corpus,MetaDict(data={'source': [NusantaraMetadata(sc...
153,116,UD_Indonesian-CSUI,False,https://github.com/UniversalDependencies/UD_In...,False,CC BY-SA 4.0,2020.0,ind,False,"General, News articles",...,ud_id_csui,1.0,False,False,False,False,False,"@article {10.3844/jcssp.2020.1585.1597,\nautho...",https://github.com/UniversalDependencies/UD_In...,MetaDict(data={'source': [NusantaraMetadata(sc...


In [19]:
# Filter & load all Indonesian sentiment analysis datasets that have a test
# split (resources are NOT excluded here).
lang = 'ind'
task = Tasks.SENTIMENT_ANALYSIS

# na=False: cells that are not strings (e.g. the False left by fillna) count as
# "no match" instead of producing NaN in the mask.
# NOTE(review): str.contains is a substring match on the '|'-joined task codes,
# so a code that merely contains task.value would also match — verify against Tasks.
filtered_df = meta_df.loc[
    meta_df.tasks.str.contains(task.value, na=False)
    & meta_df.language.str.contains(lang, na=False)
    & meta_df.is_splitted.astype(bool)
]

schema = f'nusantara_{TASK_TO_SCHEMA[task].lower()}'
datasets = {}
for metas in filtered_df.metadata:
    # Rows without a matching dataloader never received a MetaDict; treat
    # anything lacking a `.data` mapping as having no configs.
    schema_map = getattr(metas, 'data', None) or {}
    for meta in schema_map.get(schema, []):
        # Multi-language datasets: only load configs whose name mentions `lang`.
        if len(meta.languages) > 1 and lang not in meta.config.name:
            continue
        datasets[meta.config.name] = meta.load_dataset()

datasets



  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

{'sentiment_nathasa_review_nusantara_text': DatasetDict({
     train: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 62132
     })
     test: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 62131
     })
 }),
 'indolem_sentiment_nusantara_text': DatasetDict({
     train: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 3638
     })
     test: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 1011
     })
     validation: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 399
     })
 }),
 'inset_lexicon_nusantara_text': DatasetDict({
     train: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 10218
     })
 }),
 'nusax_senti_ind_nusantara_text': DatasetDict({
     train: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 500
     })
     validation: Dataset({
         features: ['id', 'text', 'label'],
         num_rows: 

If a dataset has multiple tasks, split its task list and filter by schema, so that
the matching config metadata is obtained from the original dataset metadata.

In [23]:
# Load by schema only: keep every catalogue row that has a test split and let
# the schema derived from `task` (plus the language code in multi-language
# config names) pick the configs. Rows are NOT filtered by task or language
# here, so single-language datasets of any language with that schema are loaded.
lang = 'ind'
task = Tasks.SENTIMENT_ANALYSIS

filtered_df = meta_df.loc[meta_df.is_splitted.astype(bool)]

schema = f'nusantara_{TASK_TO_SCHEMA[task].lower()}'
datasets = {}
for metas in filtered_df.metadata:
    # Rows without a matching dataloader never received a MetaDict (their
    # `metadata` cell is empty), so `.data` would raise on them; treat anything
    # lacking a `.data` mapping as having no configs.
    schema_map = getattr(metas, 'data', None) or {}
    for meta in schema_map.get(schema, []):
        # Multi-language datasets: only load configs whose name mentions `lang`.
        if len(meta.languages) > 1 and lang not in meta.config.name:
            continue
        datasets[meta.config.name] = meta.load_dataset()
datasets

Unnamed: 0,id,name,subsets,source_link,hf_link,license,year,language,dialect,domain,...,dataloader,implemented,is_large,is_resource,is_default,is_broken,is_local,citation,homepage,metadata
3,4,CC100,False,https://data.statmt.org/cc-100/,https://huggingface.co/datasets/cc100,MIT,2020.0,"ind, sun, jav",other,multi domain,...,cc100,1.0,False,False,False,False,False,@inproceedings{conneau-etal-2020-unsup...,https://data.statmt.org/cc-100/,MetaDict(data={'source': [NusantaraMetadata(sc...
4,4,CC100,Indonesian,https://data.statmt.org/cc-100/,https://huggingface.co/datasets/cc100,Common Crawl's license,2020.0,ind,other,multi domain,...,False,False,False,False,False,False,False,False,False,
5,4,CC100,Javanese,https://data.statmt.org/cc-100/,https://huggingface.co/datasets/cc100,Common Crawl's license,2020.0,jav,other,multi domain,...,False,False,False,False,False,False,False,False,False,
6,4,CC100,Sundanese,https://data.statmt.org/cc-100/,https://huggingface.co/datasets/cc100,Common Crawl's license,2020.0,sun,other,multi domain,...,False,False,False,False,False,False,False,False,False,
11,9,Customer Review (Natasha Skincare),False,https://drive.google.com/file/d/1D1pHX7CxrI-eI...,False,Unknown,2017.0,ind,False,Social media,...,sentiment_nathasa_review,1.0,False,False,False,False,False,"@article{nurlaila2018classification,\n title=...",https://jurnal.uns.ac.id/itsmart/article/viewF...,MetaDict(data={'source': [NusantaraMetadata(sc...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,116,UD_Indonesian-CSUI,False,https://github.com/UniversalDependencies/UD_In...,False,CC BY-SA 4.0,2020.0,ind,False,"General, News articles",...,ud_id_csui,1.0,False,False,False,False,False,"@article {10.3844/jcssp.2020.1585.1597,\nautho...",https://github.com/UniversalDependencies/UD_In...,MetaDict(data={'source': [NusantaraMetadata(sc...
158,121,WikiAnn,"ind, jav, bug, min, bjn, sun, ace, tet, ms, ma...",https://drive.google.com/drive/folders/1Q-xdT9...,https://huggingface.co/datasets/wikiann,Apache-2.0 license,2017.0,"ind, eng, sun, jav, min, bug, bjn, tpi, ace, t...",Banyumasan,Wiki articles,...,wikiann,1.0,False,False,False,False,False,"@inproceedings{pan-etal-2017-cross,\n title...",https://github.com/afshinrahimi/mmner,MetaDict(data={'source': [NusantaraMetadata(sc...
162,125,XCOPA,Indonesian,https://github.com/cambridgeltl/xcopa,https://huggingface.co/datasets/xcopa,Unknown,2021.0,ind,False,General,...,xcopa,1.0,False,False,False,False,False,"@inproceedings{ponti2020xcopa,\n title={{XCOP...",https://github.com/cambridgeltl/xcopa,MetaDict(data={'source': [NusantaraMetadata(sc...
163,126,XL-Sum,Indonesian,https://github.com/csebuetnlp/xl-sum,https://huggingface.co/datasets/csebuetnlp/xlsum,CC-BY-NC-SA 4.0,2021.0,ind,False,News articles,...,xl_sum,1.0,False,False,False,False,False,"@inproceedings{hasan2021xl,\n title={XL-Sum: ...",https://github.com/csebuetnlp/xl-sum,MetaDict(data={'source': [NusantaraMetadata(sc...


In [20]:
# Display the assembled catalogue frame (the notebook renders a truncated
# preview of its rows and columns).
meta_df

Unnamed: 0,id,name,subsets,source_link,hf_link,license,year,language,dialect,domain,...,dataloader,implemented,is_large,is_resource,is_default,is_broken,is_local,citation,homepage,metadata
3,4,CC100,False,https://data.statmt.org/cc-100/,https://huggingface.co/datasets/cc100,MIT,2020.0,"ind, sun, jav",other,multi domain,...,cc100,1.0,False,False,False,False,False,@inproceedings{conneau-etal-2020-unsup...,https://data.statmt.org/cc-100/,MetaDict(data={'source': [NusantaraMetadata(sc...
4,4,CC100,Indonesian,https://data.statmt.org/cc-100/,https://huggingface.co/datasets/cc100,Common Crawl's license,2020.0,ind,other,multi domain,...,False,False,False,False,False,False,False,False,False,
5,4,CC100,Javanese,https://data.statmt.org/cc-100/,https://huggingface.co/datasets/cc100,Common Crawl's license,2020.0,jav,other,multi domain,...,False,False,False,False,False,False,False,False,False,
6,4,CC100,Sundanese,https://data.statmt.org/cc-100/,https://huggingface.co/datasets/cc100,Common Crawl's license,2020.0,sun,other,multi domain,...,False,False,False,False,False,False,False,False,False,
11,9,Customer Review (Natasha Skincare),False,https://drive.google.com/file/d/1D1pHX7CxrI-eI...,False,Unknown,2017.0,ind,False,Social media,...,sentiment_nathasa_review,1.0,False,False,False,False,False,"@article{nurlaila2018classification,\n title=...",https://jurnal.uns.ac.id/itsmart/article/viewF...,MetaDict(data={'source': [NusantaraMetadata(sc...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,116,UD_Indonesian-CSUI,False,https://github.com/UniversalDependencies/UD_In...,False,CC BY-SA 4.0,2020.0,ind,False,"General, News articles",...,ud_id_csui,1.0,False,False,False,False,False,"@article {10.3844/jcssp.2020.1585.1597,\nautho...",https://github.com/UniversalDependencies/UD_In...,MetaDict(data={'source': [NusantaraMetadata(sc...
158,121,WikiAnn,"ind, jav, bug, min, bjn, sun, ace, tet, ms, ma...",https://drive.google.com/drive/folders/1Q-xdT9...,https://huggingface.co/datasets/wikiann,Apache-2.0 license,2017.0,"ind, eng, sun, jav, min, bug, bjn, tpi, ace, t...",Banyumasan,Wiki articles,...,wikiann,1.0,False,False,False,False,False,"@inproceedings{pan-etal-2017-cross,\n title...",https://github.com/afshinrahimi/mmner,MetaDict(data={'source': [NusantaraMetadata(sc...
162,125,XCOPA,Indonesian,https://github.com/cambridgeltl/xcopa,https://huggingface.co/datasets/xcopa,Unknown,2021.0,ind,False,General,...,xcopa,1.0,False,False,False,False,False,"@inproceedings{ponti2020xcopa,\n title={{XCOP...",https://github.com/cambridgeltl/xcopa,MetaDict(data={'source': [NusantaraMetadata(sc...
163,126,XL-Sum,Indonesian,https://github.com/csebuetnlp/xl-sum,https://huggingface.co/datasets/csebuetnlp/xlsum,CC-BY-NC-SA 4.0,2021.0,ind,False,News articles,...,xl_sum,1.0,False,False,False,False,False,"@inproceedings{hasan2021xl,\n title={XL-Sum: ...",https://github.com/csebuetnlp/xl-sum,MetaDict(data={'source': [NusantaraMetadata(sc...


In [2]:
# Language codes of the per-language NusaX sentiment configs, in list order.
_NUSAX_SENTI_LANGS = (
    'ace', 'ban', 'bjn', 'bug', 'eng', 'ind',
    'jav', 'mad', 'min', 'nij', 'sun', 'bbc',
)

# Single Label Classification
_SINGLE_LABEL_CONFIGS = [
    'emot_nusantara_text',
    'emotcmt_nusantara_text',
    'emotion_id_opinion_nusantara_text',
    'id_abusive_nusantara_text',
    'id_clickbait_nusantara_text',
    'id_google_play_review_nusantara_text',
    'id_google_play_review_posneg_nusantara_text',
    'id_hatespeech_nusantara_text',
    'imdb_jv_nusantara_text',
    'indolem_sentiment_nusantara_text',
    'jadi_ide_nusantara_text',
    # one config per NusaX language, followed by the config without a language code
    *(f'nusax_senti_{code}_nusantara_text' for code in _NUSAX_SENTI_LANGS),
    'nusax_senti_nusantara_text',
    'sentiment_nathasa_review_nusantara_text',
    'smsa_nusantara_text',
]

# Pair Sentence Classification
_SENTENCE_PAIR_CONFIGS = [
    'id_stance_nusantara_pairs',
    'indolem_ntp_nusantara_pairs',
    'indonli_nusantara_pairs',
    'id_sts_nusantara_pairs_score',
]

# Multilabel Classification
_MULTI_LABEL_CONFIGS = [
    'casa_nusantara_text_multi',
    'hoasa_nusantara_text_multi',
    'netifier_nusantara_text_multi',
    'id_multilabel_hs_nusantara_text_multi',
]

# Config names of every text classification dataset to load, grouped by kind.
TEXT_CLASSIFICATION_TASKS = [
    *_SINGLE_LABEL_CONFIGS,
    *_SENTENCE_PAIR_CONFIGS,
    *_MULTI_LABEL_CONFIGS,
]

# NusaX language codes, split around where the 'ind'-first pairs sit in the list.
_NUSAX_MT_LANGS_HEAD = ('ace', 'ban', 'bjn', 'bug', 'eng')
_NUSAX_MT_LANGS_TAIL = ('jav', 'mad', 'min', 'nij', 'sun', 'bbc')

# MT
_MACHINE_TRANSLATION_CONFIGS = [
    'bible_en_id_nusantara_t2t',
    'bible_jv_id_nusantara_t2t',
    'bible_su_id_nusantara_t2t',
    'id_panl_bppt_nusantara_t2t',
    'indo_general_mt_en_id_nusantara_t2t',
    'indo_religious_mt_en_id_nusantara_t2t',
    'minangnlp_mt_nusantara_t2t',
    'news_en_id_nusantara_t2t',
    # NusaX pairs with 'ind' on one side: <head>_ind, then ind_<all>, then <tail>_ind
    *(f'nusax_mt_{code}_ind_nusantara_t2t' for code in _NUSAX_MT_LANGS_HEAD),
    *(
        f'nusax_mt_ind_{code}_nusantara_t2t'
        for code in _NUSAX_MT_LANGS_HEAD + _NUSAX_MT_LANGS_TAIL
    ),
    *(f'nusax_mt_{code}_ind_nusantara_t2t' for code in _NUSAX_MT_LANGS_TAIL),
    'parallel_su_id_nusantara_t2t',
    'ted_en_id_nusantara_t2t',
    'ud_id_csui_nusantara_t2t',
]

# Paraphrasing
_PARAPHRASING_CONFIGS = [
    # currently disabled:
    # 'id_qqp_nusantara_t2t',
    # 'paracotta_id_nusantara_t2t',
    'stif_indonesia_nusantara_t2t',
]

# Summarization
_SUMMARIZATION_CONFIGS = [
    'indosum_fold0_nusantara_t2t',
    'xl_sum_nusantara_t2t',
]

# Dialogue System
_DIALOGUE_CONFIGS = [
    'xpersona_id_nusantara_t2t',
]

# Config names of every text generation dataset to load, grouped by kind.
TEXT_GENERATION_TASKS = [
    *_MACHINE_TRANSLATION_CONFIGS,
    *_PARAPHRASING_CONFIGS,
    *_SUMMARIZATION_CONFIGS,
    *_DIALOGUE_CONFIGS,
]

In [3]:
def _is_text_classification_config(entry):
    """True when the entry's config name is listed in TEXT_CLASSIFICATION_TASKS."""
    return entry.config.name in TEXT_CLASSIFICATION_TASKS


# Load every listed text classification config, keyed by config name.
# Requires `conhelps` from the config-helper cell: the recorded run of this
# cell failed with NameError because that cell had not been run in the kernel.
text_classification_datasets = {
    helper.config.name: helper.load_dataset()
    for helper in conhelps.filtered(_is_text_classification_config)
}

NameError: name 'conhelps' is not defined

In [5]:
def _is_text_generation_config(entry):
    """True when the entry's config name is listed in TEXT_GENERATION_TASKS."""
    return entry.config.name in TEXT_GENERATION_TASKS


# Load every listed text generation config, keyed by config name.
# Requires `conhelps` from the config-helper cell to exist in the kernel.
text_generation_datasets = {
    helper.config.name: helper.load_dataset()
    for helper in conhelps.filtered(_is_text_generation_config)
}

Downloading and preparing dataset bible_en_id/bible_en_id_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/bible_en_id/bible_en_id_nusantara_t2t/1.0.0/0914cfc955aeb37cf55222ac526adcdeea078a3d4fac89b7e24be78197f28584...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset bible_en_id downloaded and prepared to /home/samuel/.cache/huggingface/datasets/bible_en_id/bible_en_id_nusantara_t2t/1.0.0/0914cfc955aeb37cf55222ac526adcdeea078a3d4fac89b7e24be78197f28584. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset bible_en_id/bible_jv_id_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/bible_en_id/bible_jv_id_nusantara_t2t/1.0.0/94cb64872abc2af5dbc51e125560e3584ad3490a12276f6274811ffc690b6577...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset bible_en_id downloaded and prepared to /home/samuel/.cache/huggingface/datasets/bible_en_id/bible_jv_id_nusantara_t2t/1.0.0/94cb64872abc2af5dbc51e125560e3584ad3490a12276f6274811ffc690b6577. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset bible_su_id/bible_su_id_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/bible_su_id/bible_su_id_nusantara_t2t/1.0.0/afe00cbc1bbbc8c3fdcbfa5d75ef6d8eca68cbb42bfdfbefb1efe66777190b55...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset bible_su_id downloaded and prepared to /home/samuel/.cache/huggingface/datasets/bible_su_id/bible_su_id_nusantara_t2t/1.0.0/afe00cbc1bbbc8c3fdcbfa5d75ef6d8eca68cbb42bfdfbefb1efe66777190b55. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset id_panl_bppt/id_panl_bppt_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/id_panl_bppt/id_panl_bppt_nusantara_t2t/1.0.0/3e0338dbc7d5c3b44c5ca68d5374c363476db1453d10206c1c04f1bbc4b57d0b...


Downloading data:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset id_panl_bppt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/id_panl_bppt/id_panl_bppt_nusantara_t2t/1.0.0/3e0338dbc7d5c3b44c5ca68d5374c363476db1453d10206c1c04f1bbc4b57d0b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset id_quora_question_pairs/id_qqp_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/id_quora_question_pairs/id_qqp_nusantara_t2t/1.0.0/eed59379992ba1ede73b9720cadb64df39bbe8f05c831517b241e7920135147d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset id_quora_question_pairs downloaded and prepared to /home/samuel/.cache/huggingface/datasets/id_quora_question_pairs/id_qqp_nusantara_t2t/1.0.0/eed59379992ba1ede73b9720cadb64df39bbe8f05c831517b241e7920135147d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset indo_religious_mt_en_id/indo_religious_mt_en_id_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/indo_religious_mt_en_id/indo_religious_mt_en_id_nusantara_t2t/1.0.0/3436a2763dcd62dd8a14e3c8d12e1b99fad9e4d82b32d68131c87b5b77b581f7...


Downloading data:   0%|          | 0.00/164k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/177k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/169k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/186k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.85M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.42M [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset indo_religious_mt_en_id downloaded and prepared to /home/samuel/.cache/huggingface/datasets/indo_religious_mt_en_id/indo_religious_mt_en_id_nusantara_t2t/1.0.0/3436a2763dcd62dd8a14e3c8d12e1b99fad9e4d82b32d68131c87b5b77b581f7. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset indo_sum/indosum_fold0_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/indo_sum/indosum_fold0_nusantara_t2t/1.0.0/f779ca1f657fd0562f1534b73b03183ed41ad8f80f4b8003b0a19bd0e0888633...


Downloading data:   0%|          | 0.00/96.0M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset indo_sum downloaded and prepared to /home/samuel/.cache/huggingface/datasets/indo_sum/indosum_fold0_nusantara_t2t/1.0.0/f779ca1f657fd0562f1534b73b03183ed41ad8f80f4b8003b0a19bd0e0888633. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset minang_nl_pmt/minangnlp_mt_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/minang_nl_pmt/minangnlp_mt_nusantara_t2t/1.0.0/d95956ac1d04411db5056a7f3824a31458a4accf72975e82173cdb4686dd1c84...


Downloading data: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset minang_nl_pmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/minang_nl_pmt/minangnlp_mt_nusantara_t2t/1.0.0/d95956ac1d04411db5056a7f3824a31458a4accf72975e82173cdb4686dd1c84. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset news_en_id/news_en_id_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/news_en_id/news_en_id_nusantara_t2t/1.0.0/a5c9608bc6314f138f35d229f18bd181d0f95902da4efb90106a8a3cd02a9f74...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset news_en_id downloaded and prepared to /home/samuel/.cache/huggingface/datasets/news_en_id/news_en_id_nusantara_t2t/1.0.0/a5c9608bc6314f138f35d229f18bd181d0f95902da4efb90106a8a3cd02a9f74. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_ace_ind_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ace_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Downloading data:   0%|          | 0.00/354k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/70.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/285k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ace_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_ban_ind_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ban_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ban_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_bjn_ind_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_bjn_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_bjn_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_bug_ind_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_bug_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_bug_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_eng_ind_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_eng_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_eng_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_ind_ace_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_ace_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_ace_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_ind_ban_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_ban_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_ban_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_ind_bjn_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_bjn_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_bjn_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_ind_bug_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_bug_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_bug_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_ind_eng_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_eng_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_eng_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_ind_jav_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_jav_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_jav_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_ind_mad_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_mad_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_mad_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_ind_min_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_min_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_min_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_ind_nij_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_nij_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_nij_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_ind_sun_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_sun_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_sun_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_ind_bbc_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_bbc_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_ind_bbc_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_jav_ind_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_jav_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_jav_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_mad_ind_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_mad_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_mad_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_min_ind_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_min_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_min_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_nij_ind_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_nij_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_nij_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_sun_ind_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_sun_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_sun_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset nusa_xmt/nusax_mt_bbc_ind_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_bbc_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset nusa_xmt downloaded and prepared to /home/samuel/.cache/huggingface/datasets/nusa_xmt/nusax_mt_bbc_ind_nusantara_t2t/1.0.0/8e5cd18bf2771dbfe991f0c4db45678ec7aaecf94420cd375f211b46495e81cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset para_cotta/paracotta_id_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/para_cotta/paracotta_id_nusantara_t2t/1.0.0/4b63c156f44006c55c9c4a8eee3cea966db5c075d4afc914be994f3251ddd8e2...


Downloading data:   0%|          | 0.00/889M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset para_cotta downloaded and prepared to /home/samuel/.cache/huggingface/datasets/para_cotta/paracotta_id_nusantara_t2t/1.0.0/4b63c156f44006c55c9c4a8eee3cea966db5c075d4afc914be994f3251ddd8e2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset stif_indonesia/stif_indonesia_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/stif_indonesia/stif_indonesia_nusantara_t2t/1.0.0/e2a7ca266edc3e1906afc3d121b172498d5b6ae74bd39b7b1b46535c58d48255...


Downloading data:   0%|          | 0.00/6.89k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.11k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/55.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/58.6k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset stif_indonesia downloaded and prepared to /home/samuel/.cache/huggingface/datasets/stif_indonesia/stif_indonesia_nusantara_t2t/1.0.0/e2a7ca266edc3e1906afc3d121b172498d5b6ae74bd39b7b1b46535c58d48255. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset ted_en_id/ted_en_id_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/ted_en_id/ted_en_id_nusantara_t2t/1.0.0/238640abbb8d60a5c5e091a85d78eb1b939b81afab21f385b61834e9137b34b5...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset ted_en_id downloaded and prepared to /home/samuel/.cache/huggingface/datasets/ted_en_id/ted_en_id_nusantara_t2t/1.0.0/238640abbb8d60a5c5e091a85d78eb1b939b81afab21f385b61834e9137b34b5. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset ud_id_csui_dataset/ud_id_csui_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/ud_id_csui_dataset/ud_id_csui_nusantara_t2t/1.0.0/617f7a6ffc44867d5cb6ca1b90f92468f5d7eac873c290392071ea76abecd637...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset ud_id_csui_dataset downloaded and prepared to /home/samuel/.cache/huggingface/datasets/ud_id_csui_dataset/ud_id_csui_nusantara_t2t/1.0.0/617f7a6ffc44867d5cb6ca1b90f92468f5d7eac873c290392071ea76abecd637. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading and preparing dataset xl_sum/xl_sum_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/xl_sum/xl_sum_nusantara_t2t/1.0.0/76624c668cf6d1c08c111c12ef243a9d87b01fdff321ec667bd87016705c4a2e...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset xl_sum downloaded and prepared to /home/samuel/.cache/huggingface/datasets/xl_sum/xl_sum_nusantara_t2t/1.0.0/76624c668cf6d1c08c111c12ef243a9d87b01fdff321ec667bd87016705c4a2e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading and preparing dataset x_persona_id/xpersona_id_nusantara_t2t to /home/samuel/.cache/huggingface/datasets/x_persona_id/xpersona_id_nusantara_t2t/1.0.0/421f069a8f9e1c2b6fdfcd287bf16582ef6a4154ed2cfb6c724bf19e0369db8b...


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset x_persona_id downloaded and prepared to /home/samuel/.cache/huggingface/datasets/x_persona_id/xpersona_id_nusantara_t2t/1.0.0/421f069a8f9e1c2b6fdfcd287bf16582ef6a4154ed2cfb6c724bf19e0369db8b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

# Count TC (Text Classification) Datasets

In [16]:
# Tally the number of examples in every split of every text-classification config.
# The three standard splits are pre-seeded with 0 so that a config lacking one of
# them still reports an explicit zero; any extra split a config provides is added.
dset_count = {}
for dset_name, dset in text_classification_datasets.items():
    split_sizes = {'train': 0, 'validation': 0, 'test': 0}
    split_sizes.update({split: len(dset[split]) for split in dset.keys()})
    dset_count[dset_name] = split_sizes
# One column per config, one row per split name.
tc_df = pd.DataFrame(dset_count)

In [17]:
# Total number of examples per config, summed over its splits.
tc_df.sum(axis=0)

casa_nusantara_text_multi                        1080.0
emot_nusantara_text                              4401.0
emotcmt_nusantara_text                            582.0
emotion_id_opinion_nusantara_text                7080.0
hoasa_nusantara_text_multi                       2854.0
id_abusive_nusantara_text                        2016.0
id_clickbait_nusantara_text                     15000.0
id_google_play_review_nusantara_text            10040.0
id_google_play_review_posneg_nusantara_text     10040.0
id_hatespeech_nusantara_text                      713.0
id_multilabel_hs_nusantara_text_multi           13169.0
id_stance_nusantara_pairs                         337.0
id_sts_nusantara_pairs_score                    12901.0
imdb_jv_nusantara_text                         100000.0
indolem_ntp_nusantara_pairs                     33528.0
indolem_sentiment_nusantara_text                 5048.0
indonli_nusantara_pairs                         17709.0
jadi_ide_nusantara_text                         

In [18]:
# Total number of examples per split, summed over all configs.
tc_df.sum(axis=1)

train           239623.0
validation       16338.0
test            115831.0
unsupervised     50000.0
dtype: float64

In [52]:
# For every config, keep one (split name, example count) pair. Splits are tried
# in the order test -> validation -> train and the first one with a non-zero
# count wins; a config whose three counts are all zero gets no entry.
tc_metadata = {}
for row in tc_df.T.itertuples():
    picked = next(
        (name for name in ('test', 'validation', 'train') if getattr(row, name) != 0),
        None,
    )
    if picked is not None:
        tc_metadata[row.Index] = (picked, getattr(row, picked))

In [53]:


# One row per config: column 0 is the selected split, column 1 its example count.
pd.DataFrame(tc_metadata).transpose()

Unnamed: 0,0,1
casa_nusantara_text_multi,test,180.0
emot_nusantara_text,test,440.0
emotcmt_nusantara_text,test,582.0
emotion_id_opinion_nusantara_text,train,7080.0
hoasa_nusantara_text_multi,test,286.0
id_abusive_nusantara_text,train,2016.0
id_clickbait_nusantara_text,train,15000.0
id_google_play_review_nusantara_text,validation,3012.0
id_google_play_review_posneg_nusantara_text,validation,3012.0
id_hatespeech_nusantara_text,train,713.0


In [56]:
# Total number of examples over the selected split of every config.
pd.DataFrame(tc_metadata).transpose()[1].sum()

176668.0

# Count TG (Text Generation) Datasets

In [57]:
# Tally the number of examples in every split of every text-generation config.
# The three standard splits start at 0 so that a missing split is reported as an
# explicit zero; any extra split a config provides is added alongside them.
dset_count = {}
for dset_name, dset in text_generation_datasets.items():
    split_sizes = {'train': 0, 'validation': 0, 'test': 0}
    split_sizes.update({split: len(dset[split]) for split in dset.keys()})
    dset_count[dset_name] = split_sizes
# One column per config, one row per split name.
tg_df = pd.DataFrame(dset_count)

In [58]:
# Total number of examples per config, summed over its splits.
tg_df.sum(axis=0)

bible_en_id_nusantara_t2t                  31078
bible_jv_id_nusantara_t2t                   7958
bible_su_id_nusantara_t2t                   7957
id_panl_bppt_nusantara_t2t                 24021
id_qqp_nusantara_t2t                      149011
indo_general_mt_en_id_nusantara_t2t      1825716
indo_religious_mt_en_id_nusantara_t2t     589368
indosum_fold0_nusantara_t2t                18774
minangnlp_mt_nusantara_t2t                 16371
news_en_id_nusantara_t2t                   42369
nusax_mt_ace_ind_nusantara_t2t              1000
nusax_mt_ban_ind_nusantara_t2t              1000
nusax_mt_bjn_ind_nusantara_t2t              1000
nusax_mt_bug_ind_nusantara_t2t              1000
nusax_mt_eng_ind_nusantara_t2t              1000
nusax_mt_ind_ace_nusantara_t2t              1000
nusax_mt_ind_ban_nusantara_t2t              1000
nusax_mt_ind_bjn_nusantara_t2t              1000
nusax_mt_ind_bug_nusantara_t2t              1000
nusax_mt_ind_eng_nusantara_t2t              1000
nusax_mt_ind_jav_nus

In [59]:
# Total number of examples per split, summed over all configs.
tg_df.sum(axis=1)

train         8925867
validation      44585
test            44053
dtype: int64

In [62]:
# For every config, keep one (split name, example count) pair. Splits are tried
# in the order test -> validation -> train and the first one with a non-zero
# count wins; a config whose three counts are all zero gets no entry.
tg_metadata = {}
for row in tg_df.T.itertuples():
    picked = next(
        (name for name in ('test', 'validation', 'train') if getattr(row, name) != 0),
        None,
    )
    if picked is not None:
        tg_metadata[row.Index] = (picked, getattr(row, picked))

In [63]:
# One row per config: column 0 is the selected split, column 1 its example count.
pd.DataFrame(tg_metadata).transpose()

Unnamed: 0,0,1
bible_en_id_nusantara_t2t,test,4661
bible_jv_id_nusantara_t2t,test,1193
bible_su_id_nusantara_t2t,test,1193
id_panl_bppt_nusantara_t2t,train,24021
id_qqp_nusantara_t2t,validation,14927
indo_general_mt_en_id_nusantara_t2t,test,2000
indo_religious_mt_en_id_nusantara_t2t,test,4824
indosum_fold0_nusantara_t2t,test,3762
minangnlp_mt_nusantara_t2t,test,3200
news_en_id_nusantara_t2t,test,1954


In [72]:
# Total number of examples over the selected split of every config.
pd.DataFrame(tg_metadata).transpose()[1].sum()

6086617

# Save Count Data

In [68]:
# Write the selected split and its size for every text-generation config to CSV.
# reset_index() turns the config names into a regular column so they are kept
# even though the positional index itself is not written.
(
    pd.DataFrame(tg_metadata)
    .transpose()
    .rename(columns={0: 'split', 1: 'num_data'})
    .reset_index()
    .to_csv('tg_dataset.csv', index=False)
)

In [71]:
# Write the selected split and its size for every text-classification config to CSV.
# reset_index() turns the config names into a regular column so they are kept
# even though the positional index itself is not written.
(
    pd.DataFrame(tc_metadata)
    .transpose()
    .rename(columns={0: 'split', 1: 'num_data'})
    .reset_index()
    .to_csv('tc_dataset.csv', index=False)
)

# Test Experiment

In [124]:
# Map each text-classification config name to the split selected for it in
# tc_metadata (column 0 of the transposed frame holds the split names).
cfg_to_split_name = pd.DataFrame(tc_metadata).transpose().loc[:, 0].to_dict()

In [132]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [133]:
def get_logprobs(tokenizer, model, prompt):
    """Return the log-probability the model assigns to each next token of ``prompt``.

    Args:
        tokenizer: Callable invoked as ``tokenizer(prompt, return_tensors="pt")``
            whose result is a mapping containing ``"input_ids"`` and can be
            unpacked into the model call.
        model: Callable invoked as ``model(**inputs)`` whose result exposes
            ``.logits``. Assumed to be shaped (batch, seq_len, vocab) as for a
            Hugging Face causal LM -- TODO confirm for other model types.
        prompt: Text to score.

    Returns:
        Tensor shaped like ``input_ids[:, 1:].unsqueeze(2)``, i.e.
        (batch, seq_len - 1, 1). Entry ``[b, i, 0]`` is the log-probability of
        token ``i + 1`` taken from the prediction made at position ``i``. The
        first token has no preceding context and is therefore not scored.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    # Targets are the input ids shifted left by one position.
    output_ids = inputs["input_ids"][:, 1:]
    # Scoring only: disable autograd so no graph is kept alive for the (large)
    # logits tensor. `labels` is deliberately not passed because the loss it
    # would trigger is never used here.
    with torch.no_grad():
        logits = model(**inputs).logits
        # The index has seq_len - 1 rows, so the prediction made at the last
        # position (which has no target token) is ignored by gather.
        logprobs = torch.gather(F.log_softmax(logits, dim=2), 2, output_ids.unsqueeze(2))
    return logprobs

# Zero-shot prompt-based classification: score every candidate label with the
# language model and return the index of the most likely one.
def prompt_eval(tokenizer, model, prompt, alternatives):
    """Pick the alternative the model finds most likely as a continuation.

    Each alternative is appended to the same context
    (``prompt``, a newline, ``Choose between (<alt1>, <alt2>, ...)``, a newline)
    and the summed token log-probability of the full sequence is used as its
    score. The sums are not length-normalized.

    Args:
        tokenizer: Forwarded to ``get_logprobs``.
        model: Forwarded to ``get_logprobs``.
        prompt: Input text to classify.
        alternatives: Sequence of candidate answer strings.

    Returns:
        Index into ``alternatives`` of the highest-scoring candidate
        (the result of ``np.argmax``).

    Raises:
        ValueError: If ``alternatives`` is empty.
    """
    if len(alternatives) == 0:
        raise ValueError("`alternatives` must contain at least one candidate")

    # The context is identical for every candidate, so build it once.
    context = prompt + "\n" + f"Choose between ({', '.join(alternatives)})" + "\n"
    # Convert each score to a plain float so np.argmax never has to coerce
    # torch tensors (which fails for tensors on GPU or attached to a graph).
    scores = [
        float(get_logprobs(tokenizer, model, context + alt).sum())
        for alt in alternatives
    ]
    return np.argmax(scores)

In [None]:
# Load Model
# Checkpoints to choose from:
# facebook/xglm-564M, facebook/xglm-1.7B, facebook/xglm-2.9B, facebook/xglm-4.5B, facebook/xglm-7.5B
# The checkpoint id lives in one place so the tokenizer and the model can never
# be loaded from different checkpoints by accident.
MODEL_CHECKPOINT = "facebook/xglm-7.5B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Zero-shot evaluation of the text-classification configs with `prompt_eval`.
# `eval_results` maps a config name to its dict of accuracy / precision /
# recall / F1 scores.
eval_results = {}
for key, dset in text_classification_datasets.items():
    print(f'Evaluating `{key}`')

    # Pick the prompt layout from the config-name suffix. Branch order matters:
    # `_text_multi` and `_pairs_score` must be tested before the shorter
    # `_text` and `_pairs` substrings they contain.
    if '_text_multi' in key:
        # Skip for now
        continue
    elif '_text' in key:
        is_pair_task = False
    elif '_pairs_score' in key:
        # Skip for now
        continue
    elif '_pairs' in key:
        is_pair_task = True
    else:
        raise ValueError('Unknown Dataset Type')

    # Look the evaluation split up once instead of on every use.
    eval_split = dset[cfg_to_split_name[key]]
    labels = eval_split.info.features['label'].names
    # Index -> label name; not used below, kept for inspecting predictions.
    idx2label = dict(enumerate(labels))

    # Iterate Dataset
    preds = []
    golds = []
    for row in tqdm(eval_split):
        # Rows are accessed by key: iterating a Hugging Face dataset yields
        # dicts, so attribute access (`row.text`) would fail.
        if is_pair_task:
            prompt = row['text_1'] + "\n" + row['text_2']
        else:
            prompt = row['text']

        # `prompt_eval` returns a position in `labels`.
        # NOTE(review): assumes row['label'] is the matching class index - confirm.
        preds.append(prompt_eval(tokenizer, model, prompt, labels))
        golds.append(row['label'])

    # sklearn metrics take (y_true, y_pred). The previous code passed the
    # undefined names `list_label` / `list_hyp` here.
    metrics = {
        "ACC": accuracy_score(golds, preds),
        "F1-macro": f1_score(golds, preds, average='macro'),
        "REC-macro": recall_score(golds, preds, average='macro'),
        "PRE-macro": precision_score(golds, preds, average='macro'),
        "F1-micro": f1_score(golds, preds, average='micro'),
        "REC-micro": recall_score(golds, preds, average='micro'),
        "PRE-micro": precision_score(golds, preds, average='micro'),
    }

    eval_results[key] = metrics
    # NOTE(review): stops after the first evaluated config - looks like a
    # smoke-test limit; remove to evaluate every dataset.
    break

In [3]:
# Count the Nusantara-schema configs that are not flagged as resources.
conhelps = NusantaraConfigHelper()
nusantara_helpers = conhelps.filtered(
    lambda meta: meta.is_nusantara_schema and not meta.is_resource
)
print(len(nusantara_helpers))

396


In [4]:
# Count the non-resource Nusantara-schema configs whose config name ends in
# `_text` or `_pairs`.
conhelps = NusantaraConfigHelper()
nusantara_helpers = conhelps.filtered(
    lambda meta: (
        meta.is_nusantara_schema
        and not meta.is_resource
        and meta.config.name.endswith(('_text', '_pairs'))
    )
)
print(len(nusantara_helpers))

33


In [5]:
# Count the non-resource Nusantara-schema configs whose config name ends in
# `_t2t`.
conhelps = NusantaraConfigHelper()
nusantara_helpers = conhelps.filtered(
    lambda meta: (
        meta.is_nusantara_schema
        and not meta.is_resource
        and meta.config.name.endswith('_t2t')
    )
)
print(len(nusantara_helpers))

242


In [47]:
# Load the dataset catalogue sheet (its first row is skipped), rename the
# sheet headers to snake_case column names, and turn the 'Test Split' column
# into a boolean `is_splitted` flag that is True only for the value 'Yes'.
#
# Header order as it appears in the sheet:
#   'No.', 'Name', 'Subsets', 'Link', 'HF Link', 'License', 'Year',
#   'Language', 'Dialect', 'Domain', 'Form', 'Collection Style',
#   'Description', 'Volume', 'Unit', 'Ethical Risks', 'Provider',
#   'Paper Title', 'Paper Link', 'Access', 'Derived From', 'Tasks',
#   'Test Split', 'Notes', 'Dataloader', 'Implemented'
meta_df = (
    pd.read_csv(
        'https://docs.google.com/spreadsheets/d/17o83IvWxmtGLYridZis0nEprHhsZIMeFtHGtXV35h6M/export?format=csv&gid=879729812',
        skiprows=1,
    )
    .rename(columns={
        'No.': 'id',
        'Name': 'name',
        'Subsets': 'subsets',
        'Link': 'source_link',
        'Description': 'description',
        'HF Link': 'hf_link',
        'License': 'license',
        'Year': 'year',
        'Collection Style': 'collection_style',
        'Language': 'language',
        'Dialect': 'dialect',
        'Domain': 'domain',
        'Form': 'modality',
        'Tasks': 'tasks',
        'Volume': 'volume',
        'Unit': 'unit',
        'Ethical Risks': 'ethical_risk',
        'Provider': 'provider',
        'Paper Title': 'paper_title',
        'Paper Link': 'paper_link',
        'Access': 'access',
        'Derived From': 'derived_from',
        'Test Split': 'is_splitted',
        'Notes': 'notes',
        'Dataloader': 'dataloader',
        'Implemented': 'implemented',
    })
    .assign(is_splitted=lambda frame: frame['is_splitted'].eq('Yes'))
)
# Names of the datasets flagged as having a test split.
meta_df.loc[meta_df['is_splitted'], 'name']

0                AM2iCo
2                  CASA
8      COCO Captions ID
9                  CORD
10            CoVoST 2 
             ...       
160               WReTe
161              X-FACT
162               XCOPA
163              XL-Sum
164         XPersona Id
Name: name, Length: 72, dtype: object

In [52]:
# `dataloader` column values of the rows flagged `is_splitted`.
dataset_name = meta_df.loc[meta_df['is_splitted'], 'dataloader'].to_list()

In [55]:
# Config names with everything from '_nusantara_t2t' onwards removed. Configs
# whose name contains 'nusax_mt' or 'talpco' are kept only when the name also
# contains 'ind'; every other config is kept.
[
    cfg_name.partition('_nusantara_t2t')[0]
    for cfg_name in (helper.config.name for helper in nusantara_helpers)
    if 'ind' in cfg_name or not ('nusax_mt' in cfg_name or 'talpco' in cfg_name)
]

['bible_en_id',
 'bible_jv_id',
 'bible_su_id',
 'covost2_ind_eng',
 'covost2_eng_ind',
 'id_panl_bppt',
 'id_qqp',
 'id_wiki_parallel_jav_ind',
 'id_wiki_parallel_min_ind',
 'id_wiki_parallel_sun_ind',
 'indo_general_mt_en_id',
 'indo_religious_mt_en_id',
 'indosum_fold0',
 'indosum_fold1',
 'indosum_fold2',
 'indosum_fold3',
 'indosum_fold4',
 'korpus_nusantara_ind_jav',
 'korpus_nusantara_ind_day',
 'korpus_nusantara_ind_bug',
 'korpus_nusantara_ind_sun',
 'korpus_nusantara_ind_mad',
 'korpus_nusantara_ind_bin',
 'korpus_nusantara_ind_bbc',
 'korpus_nusantara_ind_khek',
 'korpus_nusantara_ind_msa',
 'korpus_nusantara_ind_min',
 'korpus_nusantara_ind_tiociu',
 'korpus_nusantara_jav_ind',
 'korpus_nusantara_day_ind',
 'korpus_nusantara_bug_ind',
 'korpus_nusantara_sun_ind',
 'korpus_nusantara_mad_ind',
 'korpus_nusantara_bin_ind',
 'korpus_nusantara_bbc_ind',
 'korpus_nusantara_khek_ind',
 'korpus_nusantara_msa_ind',
 'korpus_nusantara_min_ind',
 'korpus_nusantara_tiociu_ind',
 'minan