In [None]:
from dataclasses import dataclass
import pandas as pd
from nusantara import NusantaraMetadata, NusantaraConfigHelper
from nusantara.utils.constants import Tasks, TASK_TO_SCHEMA

In [None]:
# Helper that enumerates every dataset config exposed by nusantara.
conhelps = NusantaraConfigHelper()
print('All Configs')
print(conhelps)

# filter and load datasets
# ====================================================================
# Select the SMSA configs that use a nusantara schema. The filter is evaluated
# once and materialised as a list so the same helpers can be both listed and
# loaded without running the predicate over every config a second time.
print('Retrieve SMSA')
smsa_helpers = list(
    conhelps.filtered(
        lambda helper: "smsa" in helper.dataset_name and helper.is_nusantara_schema
    )
)
print(smsa_helpers)

# Load one dataset per selected helper, in the same order as printed above.
smsa_datasets = [helper.load_dataset() for helper in smsa_helpers]
print(smsa_datasets)

# examples of other filters
# ====================================================================

# Helpers whose config schema is exactly "source".
print('Source datasets')
source_helpers = conhelps.filtered(
    lambda helper: helper.config.schema == "source"
)
print(source_helpers)

# Helpers flagged as using a nusantara schema.
print('Nusantara datasets')
nusantara_helpers = conhelps.filtered(
    lambda helper: helper.is_nusantara_schema
)
print(nusantara_helpers)

# Nusantara-schema helpers that list the NER task and are not marked local.
print('Nusantara NER public datasets')
nc_ner_public_helpers = conhelps.filtered(
    lambda helper: (
        helper.is_nusantara_schema
        and Tasks.NAMED_ENTITY_RECOGNITION in helper.tasks
        and not helper.is_local
    )
)
print(nc_ner_public_helpers)

# Nusantara-schema helpers whose dataset name contains "indolem".
print('IndoLEM datasets')
nc_indolem_helpers = conhelps.filtered(
    lambda helper: "indolem" in helper.dataset_name and helper.is_nusantara_schema
)
print(nc_indolem_helpers)

In [None]:
# CSV export of the Google Sheet that catalogues the datasets.
META_SHEET_URL = (
    'https://docs.google.com/spreadsheets/d/'
    '17o83IvWxmtGLYridZis0nEprHhsZIMeFtHGtXV35h6M/export?format=csv&gid=879729812'
)

# Sheet header -> column name used by the rest of the notebook.
META_COLUMN_NAMES = {
    'No.': 'id', 'Name': 'name', 'Subsets': 'subsets', 'Link': 'source_link', 'Description': 'description',
    'HF Link': 'hf_link', 'License': 'license', 'Year': 'year', 'Collection Style': 'collection_style',
    'Language': 'language', 'Dialect': 'dialect', 'Domain': 'domain', 'Form': 'modality', 'Tasks': 'tasks',
    'Volume': 'volume', 'Unit': 'unit', 'Ethical Risks': 'ethical_risk', 'Provider': 'provider',
    'Paper Title': 'paper_title', 'Paper Link': 'paper_link', 'Access': 'access', 'Derived From': 'derived_from',
    'Test Split': 'is_splitted', 'Notes': 'notes', 'Dataloader': 'dataloader', 'Implemented': 'implemented'
}

# Build the catalogue frame in one chain:
#   1. read the sheet, skipping its first line (presumably a banner row above
#      the real header -- confirm against the sheet),
#   2. keep only rows with a positive 'Implemented' value,
#   3. rename the sheet headers to the names above,
#   4. turn 'is_splitted' into a real boolean: True only where the sheet says
#      'Yes' (anything else, including a missing value, becomes False).
meta_df = (
    pd.read_csv(META_SHEET_URL, skiprows=1)
    .loc[lambda df: df['Implemented'] > 0]
    .rename(columns=META_COLUMN_NAMES)
    .assign(is_splitted=lambda df: df['is_splitted'].eq('Yes'))
)
# Raw sheet headers, kept for reference:
# [
#  'No.', 'Name', 'Subsets', 'Link', 'HF Link', 'License', 'Year',
#  'Language', 'Dialect', 'Domain', 'Form', 'Collection Style',
#  'Description', 'Volume', 'Unit', 'Ethical Risks', 'Provider',
#  'Paper Title', 'Paper Link', 'Access', 'Derived From', 'Tasks',
#  'Test Split', 'Notes', 'Dataloader', 'Implemented'
# ]

In [4]:
@dataclass
class MetaDict:
    """Thin wrapper holding a dict in a single ``data`` attribute.

    The notebook stores instances of this class in a DataFrame cell instead of
    a bare dict (presumably so pandas treats the value as one opaque object
    rather than trying to unpack it -- confirm if this matters to you).
    """

    # Wrapped mapping; stays None when no dict is supplied.
    data: dict = None

In [5]:
# Columns of meta_df that are filled in from each config helper. 'license' and
# 'tasks' already exist in the sheet data and are overwritten here; the others
# are new columns.
config_columns = [
    'is_large', 'is_resource', 'is_default', 'is_broken',
    'is_local', 'citation', 'license', 'homepage', 'tasks'
]

# dataset name -> schema name -> list of config helpers for that schema
name_to_meta_map = {}
for cfg_meta in conhelps:
    # Rows of the catalogue that belong to this helper's dataset. Every config
    # of a dataset targets the same rows, so the values written by the last
    # config visited are the ones that remain.
    dataset_rows = meta_df.dataloader == cfg_meta.dataset_name
    joined_tasks = '|'.join(task.value for task in cfg_meta.tasks)
    meta_df.loc[dataset_rows, config_columns] = [
        cfg_meta.is_large,
        cfg_meta.is_resource,
        cfg_meta.is_default,
        cfg_meta.is_broken,
        cfg_meta.is_local,
        cfg_meta.citation,
        cfg_meta.license,
        cfg_meta.homepage,
        joined_tasks,
    ]

    # Group the helper under its dataset and schema, creating the nested
    # containers on first sight.
    helpers_by_schema = name_to_meta_map.setdefault(cfg_meta.dataset_name, {})
    helpers_by_schema.setdefault(cfg_meta.config.schema, []).append(cfg_meta)

# Attach each dataset's schema -> helpers mapping to its catalogue rows, wrapped
# in MetaDict so it is stored as a single object per cell.
for dset_name, schema_to_helpers in name_to_meta_map.items():
    meta_df.loc[meta_df.dataloader == dset_name, 'metadata'] = MetaDict(data=schema_to_helpers)

In [6]:
# Filter & load every Indonesian sentiment-analysis dataset that has a test split
lang = 'ind'
task = Tasks.SENTIMENT_ANALYSIS

# `task.value` and `lang` are literal text, not patterns, so they are matched
# as plain substrings (regex=False); na=False makes rows with missing text
# count as non-matching instead of propagating NaN into the mask.
# NOTE(review): this is still a substring match on the '|'-joined task string,
# so a task value that is a substring of another one would also match.
filtered_df = meta_df.loc[
    meta_df.tasks.str.contains(task.value, regex=False, na=False)
    & meta_df.language.str.contains(lang, regex=False, na=False)
    & meta_df.is_splitted
]

# Name of the nusantara schema that corresponds to the selected task.
schema = f'nusantara_{TASK_TO_SCHEMA[task].lower()}'

# config name -> loaded dataset
datasets = {}
# Catalogue rows whose dataloader has no registered config helper carry NaN in
# `metadata`; drop them so the loop only sees MetaDict objects.
for metas in filtered_df.metadata.dropna():
    if schema not in metas.data:
        continue
    for meta in metas.data[schema]:
        # Single-language helpers are always loaded. Helpers that list several
        # languages are loaded only when the config name mentions `lang`.
        if len(meta.languages) <= 1 or lang in meta.config.name:
            datasets[meta.config.name] = meta.load_dataset()

datasets

Unnamed: 0,id,name,subsets,source_link,hf_link,license,year,language,dialect,domain,...,provider,paper_title,paper_link,access,derived_from,tasks,is_splitted,notes,dataloader,implemented
2,3,CC100,,https://data.statmt.org/cc-100/,https://huggingface.co/datasets/cc100,Common Crawl's license,2020.0,"ind, sun, jav",other,multi domain,...,Multiple Institutions,Unsupervised Cross-lingual Representation Lear...,https://aclanthology.org/2020.acl-main.747/,Free,Common Crawl,Language Modeling,False,No dataset split,cc100,1.0
10,8,Customer Review (Natasha Skincare),,https://drive.google.com/file/d/1D1pHX7CxrI-eI...,,Unknown,2017.0,ind,,Social media,...,Tweet @NatashaSkinCare,CLASSIFICATION OF CUSTOMERS EMOTION USING NAÏV...,https://jurnal.uns.ac.id/itsmart/article/viewF...,Free,,Emotion Classification,True,"Dataset Split: 87120 train, 37143 validation, ...",sentiment_nathasa_review,1.0
12,10,EmoT (IndoNLU Split),,https://github.com/IndoNLP/indonlu/tree/master...,https://huggingface.co/datasets/indonlu,CC-BY-SA 4.0,2018.0,ind,other,social media,...,Universitas Indonesia,Emotion classification on indonesian\ntwitter ...,https://ieeexplore.ieee.org/document/8629262,Free,Twitter,Emotion Classification,True,"Dataset Split: 3521 train, 440 validation, 442...",emot,9.0
13,11,EmotCMT,"Indonesian, English",https://github.com/ir-nlp-csui/CodeMixedEmotion,,Unknown,2021.0,"ind, eng",,Social media,...,Universitas Indonesia,Normalisation of Indonesian-English Code-Mixed...,https://thesai.org/Downloads/Volume12No11/Pape...,Free,Normalization of indonesian-english code-mixed...,Emotion Classification,False,No dataset split,emotcmt,1.0
15,13,FacQA,,https://github.com/IndoNLP/indonlu/tree/master...,https://huggingface.co/datasets/indonlu,CC-BY-SA 4.0,2007.0,ind,other,news articles,...,Toyohashi University of Technology,A Machine Learning Approach for\nIndonesian Qu...,https://www.researchgate.net/publication/22117...,Free,Wikipedia,Question Answering (Extractive),True,"Dataset Split: 2495 train, 311 dev, 311 test",facqa,9.0
16,14,HoASA (IndoNLU Split),,https://github.com/IndoNLP/indonlu/tree/master...,https://huggingface.co/datasets/indonlu,CC-BY-SA 4.0,2019.0,ind,other,hotel reviews,...,Institut Teknologi Bandung,Multi-label Aspect Categorization with Convolu...,https://ieeexplore.ieee.org/stamp/stamp.jsp?tp...,Free,Online Platform,Aspect Based Sentiment Analysis,True,"Dataset Split: 7,560 train, 1890 test",hoasa,9.0
18,16,ID Abusive,,https://github.com/okkyibrohim/id-abusive-lang...,-,CC-BY-NC-SA 4.0,2018.0,ind,colloquial,social media,...,"IR-NLP Lab, Fasilkom UI",A Dataset and Preliminaries Study for Abusive ...,https://www.sciencedirect.com/science/article/...,Free,Twitter,Hate Speech Detection,False,No dataset split,id_abusive,1.0
20,18,ID Multilabel HS,,https://github.com/okkyibrohim/id-multi-label-...,-,CC-BY-NC-SA 4.0,2019.0,ind,colloquial,social media,...,"IR-NLP Lab, Fasilkom UI",Multi-label Hate Speech and Abusive Language D...,https://aclanthology.org/W19-3506/,Free,"ID Abusive, Twitter",Hate Speech Detection,False,No dataset split,id_multilabel_hs,1.0
23,21,ID-HSD-Riomulia,,https://github.com/ir-nlp-csui/id-hsd-riomulia,-,unknown (looks like CC-BY-SA 4.0),2017.0,ind,colloquial,social media,...,"IR-NLP Lab, Fasilkom UI",Hate Speech Detection in the Indonesian Langua...,https://ieeexplore.ieee.org/abstract/document/...,Free,Twitter,Hate Speech Detection,False,No dataset split,id_hatespeech,1.0
26,24,Idn-tagged-corpus-CSUI,,https://github.com/ir-nlp-csui/idn-tagged-corp...,https://huggingface.co/datasets/indonlu,CC-BY-SA 4.0,2014.0,ind,formal,news articles,...,"IR-NLP Lab, Fasilkom UI",Designing an Indonesian Part of speech Tagset ...,https://ieeexplore.ieee.org/stamp/stamp.jsp?tp...,Free,-,POS Tagging,True,"Dataset Split: 8000 train, 1000 validation, 10...",idn_tagged_corpus_csui,17.0


In [None]:
# Filter & load every English machine-translation dataset that has a test split
lang = 'eng'
task = Tasks.MACHINE_TRANSLATION

# `task.value` and `lang` are literal text, not patterns, so they are matched
# as plain substrings (regex=False); na=False makes rows with missing text
# count as non-matching instead of propagating NaN into the mask.
# NOTE(review): this is still a substring match on the '|'-joined task string,
# so a task value that is a substring of another one would also match.
filtered_df = meta_df.loc[
    meta_df.tasks.str.contains(task.value, regex=False, na=False)
    & meta_df.language.str.contains(lang, regex=False, na=False)
    & meta_df.is_splitted
]

# Name of the nusantara schema that corresponds to the selected task.
schema = f'nusantara_{TASK_TO_SCHEMA[task].lower()}'

# config name -> loaded dataset
datasets = {}
# Catalogue rows whose dataloader has no registered config helper carry NaN in
# `metadata`; drop them so the loop only sees MetaDict objects.
for metas in filtered_df.metadata.dropna():
    if schema not in metas.data:
        continue
    for meta in metas.data[schema]:
        # Single-language helpers are always loaded. Helpers that list several
        # languages are loaded only when the config name mentions `lang`.
        if len(meta.languages) <= 1 or lang in meta.config.name:
            datasets[meta.config.name] = meta.load_dataset()

datasets

In [None]:
 # Show the catalogue rows whose name contains 'CSUI'. na=False makes a missing
 # name count as "no match" so the boolean mask never contains NaN, which .loc
 # would reject.
 meta_df.loc[
     meta_df.name.str.contains('CSUI', na=False)
 ]


In [None]:
# Print the name of every config helper that declares more than one task.
for cfg_meta in conhelps:
    if len(cfg_meta.tasks) <= 1:
        continue
    # NOTE(review): other cells read `cfg_meta.dataset_name` or
    # `cfg_meta.config.name`; confirm the helper really exposes `.name`.
    print(cfg_meta.name)

In [None]:
# Filter & load every Indonesian sentiment-analysis dataset that has a test split
# (same query as the earlier Indonesian sentiment-analysis cell).
lang = 'ind'
task = Tasks.SENTIMENT_ANALYSIS

# `task.value` and `lang` are literal text, not patterns, so they are matched
# as plain substrings (regex=False); na=False makes rows with missing text
# count as non-matching instead of propagating NaN into the mask.
# NOTE(review): this is still a substring match on the '|'-joined task string,
# so a task value that is a substring of another one would also match.
filtered_df = meta_df.loc[
    meta_df.tasks.str.contains(task.value, regex=False, na=False)
    & meta_df.language.str.contains(lang, regex=False, na=False)
    & meta_df.is_splitted
]

# Name of the nusantara schema that corresponds to the selected task.
schema = f'nusantara_{TASK_TO_SCHEMA[task].lower()}'

# config name -> loaded dataset
datasets = {}
# Catalogue rows whose dataloader has no registered config helper carry NaN in
# `metadata`; drop them so the loop only sees MetaDict objects.
for metas in filtered_df.metadata.dropna():
    if schema not in metas.data:
        continue
    for meta in metas.data[schema]:
        # Single-language helpers are always loaded. Helpers that list several
        # languages are loaded only when the config name mentions `lang`.
        if len(meta.languages) <= 1 or lang in meta.config.name:
            datasets[meta.config.name] = meta.load_dataset()

datasets

In [None]:
If a dataset has multiple tasks, split it and filter the schema out of it, resulting in
config metas derived from the original dataset meta.