In [5]:
from tqdm import tqdm
import pandas as pd
from promptsource.templates import DatasetTemplates
!pip install nusacrowd
from nusacrowd import NusantaraConfigHelper
import logging
import argparse

# parser = argparse.ArgumentParser()
# parser.add_argument('--dataset-name', help='Dataset name')
# parser.add_argument('--subset-name', help='Subset name')
# args = parser.parse_args()

# Set up logging.
#
# Console output goes through the root logger configured by basicConfig().
# basicConfig() is a no-op when the root logger already has handlers, so it is
# safe to run repeatedly. Note that DEBUG on the root logger also surfaces
# debug messages emitted by third-party libraries.
logging.basicConfig(
    level=logging.DEBUG,  # Set the desired logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    format='%(asctime)s [%(levelname)s]: %(message)s',  # Customize the log message format
    datefmt='%Y-%m-%d %H:%M:%S'  # Customize the date/time format
)

logger = logging.getLogger("IndoP3 Dataset Generation")

# Formatter for the file handler (same layout as the console output).
file_formatter = logging.Formatter('%(asctime)s [%(levelname)s]: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

# logging.getLogger() returns the same logger object on every call, so running
# this code more than once (e.g. re-executing a notebook cell) would stack up
# one extra FileHandler per run and write every record to app.log several
# times. Reuse the handler that is already attached instead of adding another.
file_handler = next(
    (handler for handler in logger.handlers if isinstance(handler, logging.FileHandler)),
    None,
)
if file_handler is None:
    # First run: create the handler that writes this logger's records to a file.
    file_handler = logging.FileHandler('app.log')
    logger.addHandler(file_handler)

file_handler.setLevel(logging.DEBUG)  # Set the log level for the file handler
file_handler.setFormatter(file_formatter)

conhelps = NusantaraConfigHelper()

# Collects one row (dict) per rendered example; turned into a DataFrame at the end.
all_data = []
# Directory the resulting CSV is written to.
checkpoint_save_path = "generated_dataset"

# Search keys. They are matched as substrings against the registered configs
# below and are then replaced by the full names of the config that was found.
dataset_name = "indo_law" #args.dataset_name
subset_name = "nusa" #args.subset_name
logger.info(f"Input dataset_name: {dataset_name}")
logger.info(f"Input subset_name: {subset_name}")

# Load dataset
matching_configs = conhelps.filtered(
    lambda x: dataset_name in x.dataset_name and subset_name in x.config.name
)
if not matching_configs:
    # Fail with an actionable message instead of a bare IndexError from `[0]`.
    raise ValueError(
        f"No config matches dataset_name={dataset_name!r} and subset_name={subset_name!r}."
    )
if len(matching_configs) > 1:
    # Substring matching can hit several configs; make the implicit choice visible.
    logger.warning(
        f"{len(matching_configs)} configs match dataset_name={dataset_name!r} and "
        f"subset_name={subset_name!r}; using the first one."
    )
nusa_metadata = matching_configs[0]
dataset_name = nusa_metadata.dataset_name
subset_name = nusa_metadata.config.name
dset = nusa_metadata.load_dataset()
logger.info("============================================")
logger.info("## DATASET INFO ##")
logger.info(f"Real dataset_name: {dataset_name}")
logger.info(f"Real subset_name: {subset_name}")
logger.info(f"dset.shape: {dset.shape}")
# Log one sample record. Prefer the "train" split, but fall back to whichever
# split comes first so datasets without a "train" split do not crash here.
preview_split = "train" if "train" in dset else next(iter(dset))
example = dset[preview_split][0]
logger.info(f"Example dataset: {example}")
logger.info("============================================")


# Imported here so this section brings its own dependency into scope.
import os

# Load the prompt templates registered for this dataset/subset.
prompt = DatasetTemplates(dataset_name, subset_name=subset_name)

# Render every example of every split with every template.
# `prompt.templates` maps template id -> template object, so the template is
# taken straight from the mapping instead of being looked up again by name
# for each example.
for prompt_id, template in tqdm(prompt.templates.items()):
    template_name = template.name

    for dataset_key in dset.keys():
        # Per (template, split) counters. Only the first problem of each kind
        # is logged in detail; the rest are summarised after the split so the
        # log is not flooded with one line per example.
        failed_count = 0
        no_output_count = 0

        for example in dset[dataset_key]:
            data_details = {
                "dataset_name": dataset_name,
                "subset_name": subset_name,
                "prompt_id": prompt_id,
                "template_name": template_name,
                "dataset_key": dataset_key,
            }

            try:
                render = template.apply(example)
            except Exception as e:
                if failed_count == 0:
                    logger.error(f"Exception occurred on {data_details}. Please rectify: {e}")
                failed_count += 1
                # Skip only this example: one bad record must not discard the
                # rest of the split.
                continue

            # A usable rendering is exactly an (input, output) pair.
            if len(render) != 2:
                if no_output_count == 0:
                    logger.info(f"Output not available for {data_details}.")
                no_output_count += 1
                continue

            rendered_input, rendered_output = render
            data_details["input"] = rendered_input
            data_details["output"] = rendered_output
            all_data.append(data_details)

        if failed_count or no_output_count:
            logger.warning(
                f"Template {template_name!r} on split {dataset_key!r}: "
                f"{failed_count} example(s) raised an exception, "
                f"{no_output_count} example(s) had no output; all were skipped."
            )

# Make sure the target directory exists; to_csv() does not create it.
os.makedirs(checkpoint_save_path, exist_ok=True)
df_ = pd.DataFrame(all_data)
df_.to_csv(f"{checkpoint_save_path}/{dataset_name}-{subset_name}.csv")



2023-08-18 08:42:13 [INFO]: Input dataset_name: indo_law
2023-08-18 08:42:13 [INFO]: Input subset_name: nusa
2023-08-18 08:42:13 [DEBUG]: open file: /Users/ihza.mahendra/.cache/huggingface/datasets/indo_law/indo_law_nusantara_text/1.0.0/e901cc1faecaf0f01dd1c6647597be69bb401d1036603cc1b778e8e440944ac6/dataset_info.json
2023-08-18 08:42:13 [DEBUG]: open file: /Users/ihza.mahendra/.cache/huggingface/datasets/indo_law/indo_law_nusantara_text/1.0.0/e901cc1faecaf0f01dd1c6647597be69bb401d1036603cc1b778e8e440944ac6/dataset_info.json
2023-08-18 08:42:13 [INFO]: ## DATASET INFO ##
2023-08-18 08:42:13 [INFO]: Real dataset_name: indo_law
2023-08-18 08:42:13 [INFO]: Real subset_name: indo_law_nusantara_text
2023-08-18 08:42:13 [INFO]: dset.shape: {'train': (22630, 3)}
2023-08-18 08:42:13 [INFO]: Example dataset: {'id': '9f73f714b56ea3f527b70db6e50c29dc', 'text': '{"kepala_putusan": "\\nputusan\\nnomor 684 pid sus 2019 pn blb\\ndemi keadilan berdasarkan ketuhanan yang maha esa\\npengadilan negeri ba