In [2]:
import os
import torch
from pathlib import Path
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizerFast, BertForMaskedLM, pipeline
from transformers import BertConfig, AutoConfig
from datasets import load_dataset, load_from_disk, DatasetDict, Dataset
from transformers import DataCollatorForLanguageModeling
import math


In [3]:
os.environ['CUDA_VISIBLE_DEVICES']='0'
os.environ['TRANSFORMERS_CACHE'] = '/media/agus/DATA/huggingface/huggingface/'
torch.cuda.device(0)

<torch.cuda.device at 0x7fae240a64f0>

In [4]:
torch.cuda.is_available()

True

In [5]:
# Bert-Base
config = BertConfig(
    vocab_size=50_000,
    max_position_embeddings=512,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=12,
    type_vocab_size=2,
)

In [6]:
model = BertForMaskedLM(config=config)

In [7]:
model.num_parameters()

124492880

In [8]:
from transformers import BertTokenizerFast, DataCollatorForLanguageModeling
import random

In [9]:
# Custom function to apply keyword masking
def mask_keywords(input_ids, keyword_list, tokenizer, mask_prob=0.15, mask_token="[MASK]"):
    # Apply masking to each input ID
    masked_input_ids = input_ids.clone()
    for i in range(len(masked_input_ids)):
        token = tokenizer.decode(masked_input_ids[i])
        if token in keyword_list and random.random() < mask_prob:
            masked_input_ids[i] = tokenizer.convert_tokens_to_ids(mask_token)

    return masked_input_ids

class CustomDataCollator(DataCollatorForLanguageModeling):
    def __init__(self, tokenizer, mlm=True, mlm_probability=0.15, keyword_list=None):
        super().__init__(tokenizer=tokenizer, mlm=mlm, mlm_probability=mlm_probability)
        self.keyword_list = keyword_list

    def __call__(self, examples):
        batch = super().__call__(examples)

        # Apply keyword masking
        if self.keyword_list is not None:
            batch["input_ids"] = mask_keywords(batch["input_ids"], self.keyword_list, self.tokenizer)

        return batch

In [10]:
from transformers import Trainer, TrainingArguments
import multiprocessing


def run_scratch(data,tokenizer_file, batch_size, model_name, keyword_list=None):
    def group_texts_ds(examples):
        # Concatenate all texts.
        block_size = 512
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
        total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result
    
    #suffix
    dataset = load_dataset(path="/media/agus/DATA/DDALM/output/suffix_array/",
                        data_files = data,
                        cache_dir="/media/agus/DATA/huggingface/huggingface/",

                        ) #load_dataset(path=data_dir)
    dataset = dataset['train']
    dataset = dataset.train_test_split(test_size=0.2)
    
    tokenizer = BertTokenizerFast.from_pretrained(tokenizer_file)
    
    if(keyword_list == None):
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=True, mlm_probability=0.15
        )
    else:
        data_collator = CustomDataCollator(
            tokenizer=tokenizer, mlm=True, mlm_probability=0.15, 
            keyword_list=keyword_list)
    
    
    num_proc = multiprocessing.cpu_count()
    print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")

    def group_texts(examples):
        tokenized_inputs = tokenizer(
        examples["text"], return_special_tokens_mask=True, truncation=False, max_length=tokenizer.model_max_length
        )
        return tokenized_inputs

    # preprocess dataset
    tokenized_datasets = dataset.map(group_texts, batched=True, remove_columns=["text"], num_proc=num_proc)
    
     #Grouping
    lm_datasets = tokenized_datasets.map(
        group_texts_ds,
        batched=True,
        batch_size=1000,
        num_proc=28,
    )
    
    training_args = TrainingArguments(
        output_dir=f'models/{model_name}',
        overwrite_output_dir=True,
        num_train_epochs=25,
        per_device_eval_batch_size=batch_size,
        per_device_train_batch_size=batch_size,
        #warmup_steps=2000,
        #max_steps=100000,
        warmup_ratio=0.1,
        weight_decay=0.1,
        max_grad_norm=25,
        #lr_scheduler_type="cosine",
        learning_rate=2e-5,
        save_steps=1000,
        save_total_limit=2,
        seed=42,
        fp16=True,
        prediction_loss_only=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=lm_datasets["train"],
        eval_dataset=lm_datasets["test"],
        #prediction_loss_only=True,
    )
    
    trainer.train()
    
     #Evaluate
    eval_results = trainer.evaluate()
    print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
    
    #Save
    trainer.save_model(f"models/{model_name}")
    tokenizer.save_pretrained(f"models/{model_name}")

In [11]:
def create_keywords(file_path):
    unigrams = []
    with open(file_path, 'r') as file:
        text = file.read()
        words = text.lower().split()  # Split the text into individual words (tokens)
        unigrams.extend(words)  # Add the words to the list of unigrams

    return unigrams


# Task with Custom Collator

In [36]:
data = ["cofog-c1-fourth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/cofog-c1/"
batch_size = 18
model_name = "cofog-c1-scratch"
keywords = create_keywords('/media/agus/DATA/DDALM/script/IndoGovBERT-final/data/cofog_keywords.txt')

run_scratch(data,tokenizer_file, batch_size, model_name, keywords)

Found cached dataset text (/home/agus/.cache/huggingface/datasets/text/suffix_array-015eb444645a5062/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at /home/agus/.cache/huggingface/datasets/text/suffix_array-015eb444645a5062/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-f4867fc4851132b5.arrow and /home/agus/.cache/huggingface/datasets/text/suffix_array-015eb444645a5062/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-76cd7ad19a1143a5.arrow
Loading cached processed dataset at /home/agus/.cache/huggingface/datasets/text/suffix_array-015eb444645a5062/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-817f689728866bea_*_of_00028.arrow
Loading cached processed dataset at /home/agus/.cache/huggingface/datasets/text/suffix_array-015eb444645a5062/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-72c9626f16cd035d_*_of_00028.arrow
Loading cached processed dataset at /home/agus/.cache/huggingface/datasets/text/suffix_array-015eb444645a5062/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5

The max length for the tokenizer is: 1000000000000000019884624838656


Loading cached processed dataset at /home/agus/.cache/huggingface/datasets/text/suffix_array-015eb444645a5062/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-ed32666d7949b396_*_of_00028.arrow


  0%|          | 0/126125 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 10.554, 'learning_rate': 7.928327915642591e-07, 'epoch': 0.1}
{'loss': 10.0082, 'learning_rate': 1.5856655831285183e-06, 'epoch': 0.2}
{'loss': 9.6882, 'learning_rate': 2.3784983746927777e-06, 'epoch': 0.3}
{'loss': 9.3085, 'learning_rate': 3.1713311662570366e-06, 'epoch': 0.4}
{'loss': 8.9402, 'learning_rate': 3.9641639578212955e-06, 'epoch': 0.5}
{'loss': 8.6445, 'learning_rate': 4.756996749385555e-06, 'epoch': 0.59}
{'loss': 8.4463, 'learning_rate': 5.549829540949813e-06, 'epoch': 0.69}
{'loss': 8.2962, 'learning_rate': 6.342662332514073e-06, 'epoch': 0.79}
{'loss': 8.1761, 'learning_rate': 7.135495124078332e-06, 'epoch': 0.89}
{'loss': 8.0785, 'learning_rate': 7.928327915642591e-06, 'epoch': 0.99}
{'loss': 7.9887, 'learning_rate': 8.72116070720685e-06, 'epoch': 1.09}
{'loss': 7.8952, 'learning_rate': 9.51399349877111e-06, 'epoch': 1.19}
{'loss': 7.8287, 'learning_rate': 1.030682629033537e-05, 'epoch': 1.29}
{'loss': 7.7692, 'learning_rate': 1.1099659081899627e-05, 'epoch':

  0%|          | 0/1317 [00:00<?, ?it/s]

Perplexity: 812.44


# Fineal 25

In [12]:
data = ["persuratan-dataset-final-fourth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/persuratan"
batch_size = 18
model_name = "persuratan-final-25"

run_scratch(data,tokenizer_file, batch_size, model_name)

Found cached dataset text (/media/agus/DATA/huggingface/huggingface/text/suffix_array-0bb18389d41e00ae/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/17750 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/4438 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/17750 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/4438 [00:00<?, ? examples/s]



  0%|          | 0/167750 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 10.5972, 'learning_rate': 5.961251862891208e-07, 'epoch': 0.07}
{'loss': 10.0694, 'learning_rate': 1.1922503725782416e-06, 'epoch': 0.15}
{'loss': 9.8003, 'learning_rate': 1.7883755588673623e-06, 'epoch': 0.22}
{'loss': 9.4872, 'learning_rate': 2.3845007451564832e-06, 'epoch': 0.3}
{'loss': 9.1557, 'learning_rate': 2.9806259314456037e-06, 'epoch': 0.37}
{'loss': 8.8393, 'learning_rate': 3.5767511177347247e-06, 'epoch': 0.45}
{'loss': 8.6113, 'learning_rate': 4.172876304023845e-06, 'epoch': 0.52}
{'loss': 8.4434, 'learning_rate': 4.7690014903129665e-06, 'epoch': 0.6}
{'loss': 8.3049, 'learning_rate': 5.365126676602087e-06, 'epoch': 0.67}
{'loss': 8.204, 'learning_rate': 5.9612518628912075e-06, 'epoch': 0.75}
{'loss': 8.1175, 'learning_rate': 6.5573770491803276e-06, 'epoch': 0.82}
{'loss': 8.0379, 'learning_rate': 7.153502235469449e-06, 'epoch': 0.89}
{'loss': 7.9452, 'learning_rate': 7.74962742175857e-06, 'epoch': 0.97}
{'loss': 7.8933, 'learning_rate': 8.34575260804769e-06, 'e

  0%|          | 0/1655 [00:00<?, ?it/s]

Perplexity: 798.18


In [13]:
data = ["peraturan-dataset-final-fifth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/peraturan"
batch_size = 18
model_name = "peraturan-final-25"

run_scratch(data,tokenizer_file, batch_size, model_name)

Found cached dataset text (/media/agus/DATA/huggingface/huggingface/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at /media/agus/DATA/huggingface/huggingface/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-b5c9ef00b1ba87bb.arrow and /media/agus/DATA/huggingface/huggingface/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-a735864a71d26cc3.arrow
Loading cached processed dataset at /media/agus/DATA/huggingface/huggingface/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-910a2980a8d2c27a_*_of_00028.arrow


The max length for the tokenizer is: 1000000000000000019884624838656


Loading cached processed dataset at /media/agus/DATA/huggingface/huggingface/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-4339047be501bb9f_*_of_00028.arrow
Loading cached processed dataset at /media/agus/DATA/huggingface/huggingface/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-122dcb06538033b2_*_of_00028.arrow
Loading cached processed dataset at /media/agus/DATA/huggingface/huggingface/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-aab000935275ee29_*_of_00028.arrow


  0%|          | 0/537125 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 8.3441, 'learning_rate': 1.8617466907452572e-07, 'epoch': 0.02}
{'loss': 8.0889, 'learning_rate': 3.7234933814905144e-07, 'epoch': 0.05}
{'loss': 7.9041, 'learning_rate': 5.585240072235772e-07, 'epoch': 0.07}
{'loss': 7.7761, 'learning_rate': 7.446986762981029e-07, 'epoch': 0.09}
{'loss': 7.6374, 'learning_rate': 9.308733453726288e-07, 'epoch': 0.12}
{'loss': 7.5195, 'learning_rate': 1.1170480144471544e-06, 'epoch': 0.14}
{'loss': 7.3906, 'learning_rate': 1.3032226835216803e-06, 'epoch': 0.16}
{'loss': 7.2728, 'learning_rate': 1.4893973525962058e-06, 'epoch': 0.19}
{'loss': 7.1604, 'learning_rate': 1.6751996723325826e-06, 'epoch': 0.21}
{'loss': 7.0527, 'learning_rate': 1.8613743414071085e-06, 'epoch': 0.23}
{'loss': 6.9879, 'learning_rate': 2.047549010481634e-06, 'epoch': 0.26}
{'loss': 6.9182, 'learning_rate': 2.2337236795561596e-06, 'epoch': 0.28}
{'loss': 6.85, 'learning_rate': 2.4198983486306853e-06, 'epoch': 0.3}
{'loss': 6.8048, 'learning_rate': 2.6060730177052114e-06, 

  0%|          | 0/5250 [00:00<?, ?it/s]

Perplexity: 154.57


In [14]:
data = ["persuratan-dataset-final-fourth.txt","peraturan-dataset-final-fifth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/all_dataset"
batch_size = 18
model_name = "all-final-25"

run_scratch(data,tokenizer_file, batch_size, model_name)

Found cached dataset text (/media/agus/DATA/huggingface/huggingface/text/suffix_array-2d9aa0cc05fe1eab/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at /media/agus/DATA/huggingface/huggingface/text/suffix_array-2d9aa0cc05fe1eab/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-6395a96612784d70.arrow and /media/agus/DATA/huggingface/huggingface/text/suffix_array-2d9aa0cc05fe1eab/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-2b5590fd9ac74366.arrow
Loading cached processed dataset at /media/agus/DATA/huggingface/huggingface/text/suffix_array-2d9aa0cc05fe1eab/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-c761370daeaa0f48_*_of_00028.arrow


The max length for the tokenizer is: 1000000000000000019884624838656


Loading cached processed dataset at /media/agus/DATA/huggingface/huggingface/text/suffix_array-2d9aa0cc05fe1eab/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-1b5776c1dcd4326f_*_of_00028.arrow
Loading cached processed dataset at /media/agus/DATA/huggingface/huggingface/text/suffix_array-2d9aa0cc05fe1eab/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-7244ada2f6f4dd5e_*_of_00028.arrow
Loading cached processed dataset at /media/agus/DATA/huggingface/huggingface/text/suffix_array-2d9aa0cc05fe1eab/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-d96deb5045f0fc62_*_of_00028.arrow


  0%|          | 0/699975 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 7.9761, 'learning_rate': 1.4286122460641735e-07, 'epoch': 0.02}
{'loss': 7.7507, 'learning_rate': 2.857224492128347e-07, 'epoch': 0.04}
{'loss': 7.6315, 'learning_rate': 4.2858367381925204e-07, 'epoch': 0.05}
{'loss': 7.5296, 'learning_rate': 5.714448984256694e-07, 'epoch': 0.07}
{'loss': 7.4298, 'learning_rate': 7.143061230320866e-07, 'epoch': 0.09}
{'loss': 7.3351, 'learning_rate': 8.571673476385041e-07, 'epoch': 0.11}
{'loss': 7.2575, 'learning_rate': 1.0000285722449213e-06, 'epoch': 0.13}
{'loss': 7.18, 'learning_rate': 1.1428897968513388e-06, 'epoch': 0.14}
{'loss': 7.091, 'learning_rate': 1.285751021457756e-06, 'epoch': 0.16}
{'loss': 7.0136, 'learning_rate': 1.4283265236149604e-06, 'epoch': 0.18}
{'loss': 6.9701, 'learning_rate': 1.5711877482213777e-06, 'epoch': 0.2}
{'loss': 6.9055, 'learning_rate': 1.7140489728277951e-06, 'epoch': 0.21}
{'loss': 6.8651, 'learning_rate': 1.8569101974342126e-06, 'epoch': 0.23}
{'loss': 6.8254, 'learning_rate': 1.99977142204063e-06, 'epo

  0%|          | 0/7281 [00:00<?, ?it/s]

Perplexity: 15.91


# Final

In [22]:
data = ["persuratan-dataset-final-fourth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/persuratan"
batch_size = 18
model_name = "persuratan-final"

run_scratch(data,tokenizer_file, batch_size, model_name)

Downloading and preparing dataset text/suffix_array to /media/agus/DATA/huggingface/huggingface/text/suffix_array-0bb18389d41e00ae/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /media/agus/DATA/huggingface/huggingface/text/suffix_array-0bb18389d41e00ae/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/17750 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/4438 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/17750 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/4438 [00:00<?, ? examples/s]



  0%|          | 0/66740 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 7.7426, 'learning_rate': 1.4863649985016482e-06, 'epoch': 0.07}
{'loss': 7.1263, 'learning_rate': 2.9847168115073423e-06, 'epoch': 0.15}
{'loss': 6.8692, 'learning_rate': 4.483068624513036e-06, 'epoch': 0.22}
{'loss': 6.7041, 'learning_rate': 5.98142043751873e-06, 'epoch': 0.3}
{'loss': 6.5594, 'learning_rate': 7.479772250524424e-06, 'epoch': 0.37}
{'loss': 6.4037, 'learning_rate': 8.978124063530117e-06, 'epoch': 0.45}
{'loss': 6.24, 'learning_rate': 1.0476475876535812e-05, 'epoch': 0.52}
{'loss': 6.0731, 'learning_rate': 1.1974827689541506e-05, 'epoch': 0.6}
{'loss': 5.9324, 'learning_rate': 1.34731795025472e-05, 'epoch': 0.67}
{'loss': 5.7906, 'learning_rate': 1.4971531315552894e-05, 'epoch': 0.75}
{'loss': 5.7078, 'learning_rate': 1.6469883128558585e-05, 'epoch': 0.82}
{'loss': 5.5954, 'learning_rate': 1.796823494156428e-05, 'epoch': 0.9}
{'loss': 5.5152, 'learning_rate': 1.9466586754569973e-05, 'epoch': 0.97}
{'loss': 5.4263, 'learning_rate': 1.9892784603602705e-05, 'epoch

  0%|          | 0/1691 [00:00<?, ?it/s]

Perplexity: 71.66


In [23]:
data = ["peraturan-dataset-final-fifth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/peraturan"
batch_size = 18
model_name = "peraturan-final"

run_scratch(data,tokenizer_file, batch_size, model_name)

Downloading and preparing dataset text/suffix_array to /media/agus/DATA/huggingface/huggingface/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /media/agus/DATA/huggingface/huggingface/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/47783 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/11946 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/47783 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/11946 [00:00<?, ? examples/s]



  0%|          | 0/214850 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 6.3927, 'learning_rate': 4.6264835932045613e-07, 'epoch': 0.02}
{'loss': 5.8145, 'learning_rate': 9.280893646730278e-07, 'epoch': 0.05}
{'loss': 5.5124, 'learning_rate': 1.3935303700255996e-06, 'epoch': 0.07}
{'loss': 5.3138, 'learning_rate': 1.858971375378171e-06, 'epoch': 0.09}
{'loss': 5.145, 'learning_rate': 2.3244123807307427e-06, 'epoch': 0.12}
{'loss': 5.0518, 'learning_rate': 2.7898533860833144e-06, 'epoch': 0.14}
{'loss': 4.9641, 'learning_rate': 3.2552943914358858e-06, 'epoch': 0.16}
{'loss': 4.8555, 'learning_rate': 3.7207353967884575e-06, 'epoch': 0.19}
{'loss': 4.7727, 'learning_rate': 4.186176402141029e-06, 'epoch': 0.21}
{'loss': 4.6471, 'learning_rate': 4.651617407493601e-06, 'epoch': 0.23}
{'loss': 4.5872, 'learning_rate': 5.117058412846172e-06, 'epoch': 0.26}
{'loss': 4.5077, 'learning_rate': 5.582499418198743e-06, 'epoch': 0.28}
{'loss': 4.3983, 'learning_rate': 6.047009541540611e-06, 'epoch': 0.3}
{'loss': 4.3051, 'learning_rate': 6.512450546893182e-06, 'ep

  0%|          | 0/5250 [00:00<?, ?it/s]

Perplexity: 10.95


In [24]:
data = ["persuratan-dataset-final-fourth.txt","peraturan-dataset-final-fifth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/all_dataset"
batch_size = 18
model_name = "c3-final"

run_scratch(data,tokenizer_file, batch_size, model_name)

Downloading and preparing dataset text/suffix_array to /media/agus/DATA/huggingface/huggingface/text/suffix_array-2d9aa0cc05fe1eab/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /media/agus/DATA/huggingface/huggingface/text/suffix_array-2d9aa0cc05fe1eab/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/65533 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/16384 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/65533 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/16384 [00:00<?, ? examples/s]



  0%|          | 0/279990 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 6.7554, 'learning_rate': 3.550126790242509e-07, 'epoch': 0.02}
{'loss': 6.2067, 'learning_rate': 7.121682917247046e-07, 'epoch': 0.04}
{'loss': 5.991, 'learning_rate': 1.0693239044251582e-06, 'epoch': 0.05}
{'loss': 5.8146, 'learning_rate': 1.4264795171256118e-06, 'epoch': 0.07}
{'loss': 5.6567, 'learning_rate': 1.7836351298260655e-06, 'epoch': 0.09}
{'loss': 5.544, 'learning_rate': 2.140790742526519e-06, 'epoch': 0.11}
{'loss': 5.4644, 'learning_rate': 2.4979463552269726e-06, 'epoch': 0.13}
{'loss': 5.4089, 'learning_rate': 2.855101967927426e-06, 'epoch': 0.14}
{'loss': 5.2886, 'learning_rate': 3.21225758062788e-06, 'epoch': 0.16}
{'loss': 5.2031, 'learning_rate': 3.5694131933283332e-06, 'epoch': 0.18}
{'loss': 5.1636, 'learning_rate': 3.926568806028787e-06, 'epoch': 0.2}
{'loss': 5.066, 'learning_rate': 4.283724418729241e-06, 'epoch': 0.21}
{'loss': 4.9931, 'learning_rate': 4.6401657202042936e-06, 'epoch': 0.23}
{'loss': 4.9262, 'learning_rate': 4.997321332904747e-06, 'epoch

  0%|          | 0/7281 [00:00<?, ?it/s]

Perplexity: 14.39


# Task Driven

## COFOG Driven

In [17]:
data = ["cofog-c1-fourth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/cofog-c1"
batch_size = 18
model_name = "cofog-c1-final"

run_scratch(data,tokenizer_file, batch_size, model_name)

Downloading and preparing dataset text/suffix_array to /media/agus/DATA/huggingface/huggingface/text/suffix_array-015eb444645a5062/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /media/agus/DATA/huggingface/huggingface/text/suffix_array-015eb444645a5062/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/7652 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/1914 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/7652 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/1914 [00:00<?, ? examples/s]



  0%|          | 0/50450 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 7.7252, 'learning_rate': 1.974231912784936e-06, 'epoch': 0.1}
{'loss': 7.1384, 'learning_rate': 3.956392467789891e-06, 'epoch': 0.2}
{'loss': 6.8881, 'learning_rate': 5.9385530227948475e-06, 'epoch': 0.3}
{'loss': 6.7114, 'learning_rate': 7.920713577799802e-06, 'epoch': 0.4}
{'loss': 6.5784, 'learning_rate': 9.902874132804758e-06, 'epoch': 0.5}
{'loss': 6.4348, 'learning_rate': 1.1885034687809713e-05, 'epoch': 0.59}
{'loss': 6.2573, 'learning_rate': 1.3867195242814669e-05, 'epoch': 0.69}
{'loss': 6.1142, 'learning_rate': 1.5849355797819626e-05, 'epoch': 0.79}
{'loss': 5.9854, 'learning_rate': 1.7831516352824582e-05, 'epoch': 0.89}
{'loss': 5.8738, 'learning_rate': 1.9809712586719526e-05, 'epoch': 0.99}
{'loss': 5.7634, 'learning_rate': 1.9800902984252838e-05, 'epoch': 1.09}
{'loss': 5.6489, 'learning_rate': 1.958066292258562e-05, 'epoch': 1.19}
{'loss': 5.6008, 'learning_rate': 1.93604228609184e-05, 'epoch': 1.29}
{'loss': 5.508, 'learning_rate': 1.914062327937452e-05, 'epoch'

  0%|          | 0/1317 [00:00<?, ?it/s]

Perplexity: 100.94


In [18]:
data = ["cofog-c2-fifth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/cofog-c2"
batch_size = 18
model_name = "cofog-c2-final"

run_scratch(data,tokenizer_file, batch_size, model_name)

Downloading and preparing dataset text/suffix_array to /media/agus/DATA/huggingface/huggingface/text/suffix_array-56c56ea567bb41a6/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /media/agus/DATA/huggingface/huggingface/text/suffix_array-56c56ea567bb41a6/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/34240 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/8560 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/34240 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/8560 [00:00<?, ? examples/s]



  0%|          | 0/192990 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 6.3832, 'learning_rate': 5.160889165241723e-07, 'epoch': 0.03}
{'loss': 5.8463, 'learning_rate': 1.0342504792994457e-06, 'epoch': 0.05}
{'loss': 5.55, 'learning_rate': 1.5524120420747191e-06, 'epoch': 0.08}
{'loss': 5.404, 'learning_rate': 2.0705736048499926e-06, 'epoch': 0.1}
{'loss': 5.2562, 'learning_rate': 2.588735167625266e-06, 'epoch': 0.13}
{'loss': 5.1215, 'learning_rate': 3.1068967304005395e-06, 'epoch': 0.16}
{'loss': 5.0392, 'learning_rate': 3.6250582931758127e-06, 'epoch': 0.18}
{'loss': 4.9257, 'learning_rate': 4.143219855951086e-06, 'epoch': 0.21}
{'loss': 4.8116, 'learning_rate': 4.660345095600809e-06, 'epoch': 0.23}
{'loss': 4.7317, 'learning_rate': 5.1785066583760825e-06, 'epoch': 0.26}
{'loss': 4.6244, 'learning_rate': 5.696668221151356e-06, 'epoch': 0.28}
{'loss': 4.5689, 'learning_rate': 6.214829783926629e-06, 'epoch': 0.31}
{'loss': 4.4812, 'learning_rate': 6.731955023576352e-06, 'epoch': 0.34}
{'loss': 4.4027, 'learning_rate': 7.2501165863516255e-06, 'epo

  0%|          | 0/4679 [00:00<?, ?it/s]

Perplexity: 11.84


In [21]:
data = ["cofog-c1-fourth.txt","cofog-c2-fifth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/cofog-all"
batch_size = 18
model_name = "cofog-all-final"

run_scratch(data,tokenizer_file, batch_size, model_name)

Found cached dataset text (/media/agus/DATA/huggingface/huggingface/text/suffix_array-e1bc485b557ab25b/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at /media/agus/DATA/huggingface/huggingface/text/suffix_array-e1bc485b557ab25b/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-a127a7344a2915bc.arrow and /media/agus/DATA/huggingface/huggingface/text/suffix_array-e1bc485b557ab25b/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-a6e2f8ec7ab6738f.arrow
Loading cached processed dataset at /media/agus/DATA/huggingface/huggingface/text/suffix_array-e1bc485b557ab25b/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-5f8c675617f8e3a7_*_of_00028.arrow


The max length for the tokenizer is: 1000000000000000019884624838656


Loading cached processed dataset at /media/agus/DATA/huggingface/huggingface/text/suffix_array-e1bc485b557ab25b/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-f67d1b23c5c1a4cf_*_of_00028.arrow
Loading cached processed dataset at /media/agus/DATA/huggingface/huggingface/text/suffix_array-e1bc485b557ab25b/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-ac5ac5c2ff95650a_*_of_00028.arrow
Loading cached processed dataset at /media/agus/DATA/huggingface/huggingface/text/suffix_array-e1bc485b557ab25b/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-7d308a42ab1ff13f_*_of_00028.arrow


  0%|          | 0/245870 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 5.4696, 'learning_rate': 4.059055598487006e-07, 'epoch': 0.02}
{'loss': 5.4575, 'learning_rate': 8.1262455769309e-07, 'epoch': 0.04}
{'loss': 5.4271, 'learning_rate': 1.2193435555374793e-06, 'epoch': 0.06}
{'loss': 5.3716, 'learning_rate': 1.6260625533818685e-06, 'epoch': 0.08}
{'loss': 5.3076, 'learning_rate': 2.031968113230569e-06, 'epoch': 0.1}
{'loss': 5.2701, 'learning_rate': 2.4386871110749586e-06, 'epoch': 0.12}
{'loss': 5.2605, 'learning_rate': 2.845406108919348e-06, 'epoch': 0.14}
{'loss': 5.2206, 'learning_rate': 3.252125106763737e-06, 'epoch': 0.16}
{'loss': 5.1275, 'learning_rate': 3.6580306666124375e-06, 'epoch': 0.18}
{'loss': 5.0723, 'learning_rate': 4.064749664456827e-06, 'epoch': 0.2}
{'loss': 5.013, 'learning_rate': 4.471468662301216e-06, 'epoch': 0.22}
{'loss': 4.9686, 'learning_rate': 4.878187660145606e-06, 'epoch': 0.24}
{'loss': 4.8823, 'learning_rate': 5.284906657989995e-06, 'epoch': 0.26}
{'loss': 4.8382, 'learning_rate': 5.690812217838696e-06, 'epoch':

  0%|          | 0/5897 [00:00<?, ?it/s]

Perplexity: 15.03


## SDGS Driven

In [16]:
data = ["sdgs-c1-fourth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/sdgs-c1-token"
batch_size = 18
model_name = "sdgs-c1-final"

run_scratch(data,tokenizer_file, batch_size, model_name)

Found cached dataset text (/media/agus/DATA/huggingface/huggingface/text/suffix_array-e69061d20723c21e/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/10544 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/2636 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/10544 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/2636 [00:00<?, ? examples/s]



  0%|          | 0/54880 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 7.8318, 'learning_rate': 1.8148688046647232e-06, 'epoch': 0.09}
{'loss': 7.219, 'learning_rate': 3.637026239067056e-06, 'epoch': 0.18}
{'loss': 6.9375, 'learning_rate': 5.459183673469388e-06, 'epoch': 0.27}
{'loss': 6.7767, 'learning_rate': 7.281341107871721e-06, 'epoch': 0.36}
{'loss': 6.6527, 'learning_rate': 9.103498542274053e-06, 'epoch': 0.46}
{'loss': 6.4837, 'learning_rate': 1.0925655976676386e-05, 'epoch': 0.55}
{'loss': 6.3572, 'learning_rate': 1.2747813411078717e-05, 'epoch': 0.64}
{'loss': 6.1993, 'learning_rate': 1.4569970845481051e-05, 'epoch': 0.73}
{'loss': 6.0672, 'learning_rate': 1.638848396501458e-05, 'epoch': 0.82}
{'loss': 5.9352, 'learning_rate': 1.821064139941691e-05, 'epoch': 0.91}
{'loss': 5.8477, 'learning_rate': 1.9996355685131196e-05, 'epoch': 1.0}
{'loss': 5.7302, 'learning_rate': 1.9793893747975383e-05, 'epoch': 1.09}
{'loss': 5.6433, 'learning_rate': 1.9591431810819566e-05, 'epoch': 1.18}
{'loss': 5.5768, 'learning_rate': 1.9388969873663753e-05, '

  0%|          | 0/1380 [00:00<?, ?it/s]

Perplexity: 95.49


In [14]:
data = ["sdgs-c2-fifth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/sdgs-c2-token"
batch_size = 18
model_name = "sdgs-c2-final"

run_scratch(data,tokenizer_file, batch_size, model_name)

Found cached dataset text (/media/agus/DATA/huggingface/huggingface/text/suffix_array-f7dcb39a3af109aa/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/39983 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/9996 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/39983 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/9996 [00:00<?, ? examples/s]



  0%|          | 0/194130 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 6.5814, 'learning_rate': 5.140884973986504e-07, 'epoch': 0.03}
{'loss': 6.0491, 'learning_rate': 1.0292072322670377e-06, 'epoch': 0.05}
{'loss': 5.7666, 'learning_rate': 1.544325967135425e-06, 'epoch': 0.08}
{'loss': 5.6114, 'learning_rate': 2.059444702003812e-06, 'epoch': 0.1}
{'loss': 5.4454, 'learning_rate': 2.5745634368721994e-06, 'epoch': 0.13}
{'loss': 5.3619, 'learning_rate': 3.089682171740587e-06, 'epoch': 0.15}
{'loss': 5.2514, 'learning_rate': 3.6048009066089734e-06, 'epoch': 0.18}
{'loss': 5.1445, 'learning_rate': 4.11991964147736e-06, 'epoch': 0.21}
{'loss': 5.0748, 'learning_rate': 4.635038376345748e-06, 'epoch': 0.23}
{'loss': 5.003, 'learning_rate': 5.150157111214136e-06, 'epoch': 0.26}
{'loss': 4.9004, 'learning_rate': 5.665275846082523e-06, 'epoch': 0.28}
{'loss': 4.8193, 'learning_rate': 6.180394580950909e-06, 'epoch': 0.31}
{'loss': 4.7463, 'learning_rate': 6.694483078349559e-06, 'epoch': 0.33}
{'loss': 4.6974, 'learning_rate': 7.209601813217947e-06, 'epoch'

  0%|          | 0/5059 [00:00<?, ?it/s]

Perplexity: 12.45


In [13]:
data = ["sdgs-c1-fourth.txt","sdgs-c2-fifth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/sdgs-c3-token"
batch_size = 18
model_name = "sdgs-all-final"

run_scratch(data,tokenizer_file, batch_size, model_name)

Downloading and preparing dataset text/suffix_array to /media/agus/DATA/huggingface/huggingface/text/suffix_array-8070c0937e393d98/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /media/agus/DATA/huggingface/huggingface/text/suffix_array-8070c0937e393d98/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/50527 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/12632 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/50527 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/12632 [00:00<?, ? examples/s]



  0%|          | 0/252070 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 10.2826, 'learning_rate': 3.95921767762923e-07, 'epoch': 0.02}
{'loss': 8.5474, 'learning_rate': 7.926369659221645e-07, 'epoch': 0.04}
{'loss': 8.1904, 'learning_rate': 1.1893521640814061e-06, 'epoch': 0.06}
{'loss': 7.9289, 'learning_rate': 1.5860673622406475e-06, 'epoch': 0.08}
{'loss': 7.716, 'learning_rate': 1.982782560399889e-06, 'epoch': 0.1}
{'loss': 7.5228, 'learning_rate': 2.3794977585591306e-06, 'epoch': 0.12}
{'loss': 7.3431, 'learning_rate': 2.7762129567183722e-06, 'epoch': 0.14}
{'loss': 7.2231, 'learning_rate': 3.172928154877614e-06, 'epoch': 0.16}
{'loss': 7.0927, 'learning_rate': 3.569643353036855e-06, 'epoch': 0.18}
{'loss': 6.9951, 'learning_rate': 3.9663585511960965e-06, 'epoch': 0.2}
{'loss': 6.9284, 'learning_rate': 4.363073749355338e-06, 'epoch': 0.22}
{'loss': 6.8051, 'learning_rate': 4.75978894751458e-06, 'epoch': 0.24}
{'loss': 6.7617, 'learning_rate': 5.15650414567382e-06, 'epoch': 0.26}
{'loss': 6.6421, 'learning_rate': 5.553219343833063e-06, 'epoch'

  0%|          | 0/6280 [00:00<?, ?it/s]

Perplexity: 20.26


## Unclear

In [11]:
data = ["sdgs-c2-fifth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/sdgs-c2-tadp/"
batch_size = 18
model_name = "sdgs-c2-taskdriven"

run_scratch(data,tokenizer_file, batch_size, model_name)

Found cached dataset text (/media/agus/DATA/huggingface/huggingface/text/suffix_array-f7dcb39a3af109aa/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/39983 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/9996 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/39983 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/9996 [00:00<?, ? examples/s]



  0%|          | 0/447950 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 10.7522, 'learning_rate': 2.2323920080366114e-07, 'epoch': 0.01}
{'loss': 10.1543, 'learning_rate': 4.464784016073223e-07, 'epoch': 0.02}
{'loss': 9.6979, 'learning_rate': 6.697176024109835e-07, 'epoch': 0.03}
{'loss': 9.2281, 'learning_rate': 8.929568032146446e-07, 'epoch': 0.04}
{'loss': 8.675, 'learning_rate': 1.1161960040183058e-06, 'epoch': 0.06}
{'loss': 8.0632, 'learning_rate': 1.339435204821967e-06, 'epoch': 0.07}
{'loss': 7.4551, 'learning_rate': 1.562674405625628e-06, 'epoch': 0.08}
{'loss': 6.9431, 'learning_rate': 1.7859136064292891e-06, 'epoch': 0.09}
{'loss': 6.5397, 'learning_rate': 2.0091528072329503e-06, 'epoch': 0.1}
{'loss': 6.249, 'learning_rate': 2.2323920080366115e-06, 'epoch': 0.11}
{'loss': 6.0583, 'learning_rate': 2.4556312088402727e-06, 'epoch': 0.12}
{'loss': 5.9302, 'learning_rate': 2.678870409643934e-06, 'epoch': 0.13}
{'loss': 5.8408, 'learning_rate': 2.9016631320459877e-06, 'epoch': 0.15}
{'loss': 5.772, 'learning_rate': 3.1249023328496485e-06, '

  0%|          | 0/10967 [00:00<?, ?it/s]

Perplexity: 2.84


In [11]:
data = ["cofog-c2-fifth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/cofog-c2/"
batch_size = 18
model_name = "cofog-c2-scratch"

run_scratch(data,tokenizer_file, batch_size, model_name)

Downloading and preparing dataset text/suffix_array to /home/agus/.cache/huggingface/datasets/text/suffix_array-56c56ea567bb41a6/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /home/agus/.cache/huggingface/datasets/text/suffix_array-56c56ea567bb41a6/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/34240 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/8560 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/34240 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/8560 [00:00<?, ? examples/s]



  0%|          | 0/480100 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 10.7674, 'learning_rate': 2.0828993959591752e-07, 'epoch': 0.03}
{'loss': 10.2246, 'learning_rate': 4.1657987919183504e-07, 'epoch': 0.05}
{'loss': 9.9478, 'learning_rate': 6.248698187877526e-07, 'epoch': 0.08}
{'loss': 9.7196, 'learning_rate': 8.331597583836701e-07, 'epoch': 0.1}
{'loss': 9.4852, 'learning_rate': 1.0414496979795877e-06, 'epoch': 0.13}
{'loss': 9.2362, 'learning_rate': 1.2497396375755053e-06, 'epoch': 0.16}
{'loss': 8.9591, 'learning_rate': 1.4580295771714228e-06, 'epoch': 0.18}
{'loss': 8.651, 'learning_rate': 1.6663195167673402e-06, 'epoch': 0.21}
{'loss': 8.3804, 'learning_rate': 1.8746094563632577e-06, 'epoch': 0.23}
{'loss': 8.1406, 'learning_rate': 2.0828993959591755e-06, 'epoch': 0.26}
{'loss': 7.9373, 'learning_rate': 2.291189335555093e-06, 'epoch': 0.29}
{'loss': 7.7974, 'learning_rate': 2.4994792751510106e-06, 'epoch': 0.31}
{'loss': 7.6718, 'learning_rate': 2.707769214746928e-06, 'epoch': 0.34}
{'loss': 7.5803, 'learning_rate': 2.9160591543428457e-0

  0%|          | 0/4774 [00:00<?, ?it/s]

Perplexity: 273.57


In [8]:
data = ["cofog-c1-fourth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/cofog-c1/"
batch_size = 18
model_name = "cofog-c1-scratch"

run_scratch(data,tokenizer_file, batch_size, model_name)

Downloading and preparing dataset text/suffix_array to /home/agus/.cache/huggingface/datasets/text/suffix_array-015eb444645a5062/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /home/agus/.cache/huggingface/datasets/text/suffix_array-015eb444645a5062/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/7652 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/1914 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/7652 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/1914 [00:00<?, ? examples/s]



  0%|          | 0/128425 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 10.5212, 'learning_rate': 7.786342754808067e-07, 'epoch': 0.1}
{'loss': 10.0207, 'learning_rate': 1.5572685509616134e-06, 'epoch': 0.19}
{'loss': 9.7164, 'learning_rate': 2.3359028264424204e-06, 'epoch': 0.29}
{'loss': 9.3268, 'learning_rate': 3.1145371019232267e-06, 'epoch': 0.39}
{'loss': 8.9644, 'learning_rate': 3.893171377404034e-06, 'epoch': 0.49}
{'loss': 8.6638, 'learning_rate': 4.671805652884841e-06, 'epoch': 0.58}
{'loss': 8.4654, 'learning_rate': 5.4504399283656475e-06, 'epoch': 0.68}
{'loss': 8.3147, 'learning_rate': 6.2290742038464535e-06, 'epoch': 0.78}
{'loss': 8.1731, 'learning_rate': 7.007708479327261e-06, 'epoch': 0.88}
{'loss': 8.0862, 'learning_rate': 7.786342754808068e-06, 'epoch': 0.97}
{'loss': 7.9891, 'learning_rate': 8.564977030288874e-06, 'epoch': 1.07}
{'loss': 7.9154, 'learning_rate': 9.343611305769682e-06, 'epoch': 1.17}
{'loss': 7.8295, 'learning_rate': 1.0120688312699524e-05, 'epoch': 1.27}
{'loss': 7.7478, 'learning_rate': 1.0899322588180332e-05,

  0%|          | 0/1225 [00:00<?, ?it/s]

Perplexity: 813.68


In [9]:
data = ["sdgs-c1-fourth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/sdgs-c1/"
batch_size = 18
model_name = "sdgs-c1-scratch"

run_scratch(data,tokenizer_file, batch_size, model_name)

Downloading and preparing dataset text/suffix_array to /home/agus/.cache/huggingface/datasets/text/suffix_array-e69061d20723c21e/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /home/agus/.cache/huggingface/datasets/text/suffix_array-e69061d20723c21e/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/10544 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/2636 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/10544 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/2636 [00:00<?, ? examples/s]



  0%|          | 0/136875 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 8.5373, 'learning_rate': 7.305669199298657e-07, 'epoch': 0.09}
{'loss': 8.3385, 'learning_rate': 1.4611338398597314e-06, 'epoch': 0.18}
{'loss': 8.1774, 'learning_rate': 2.191700759789597e-06, 'epoch': 0.27}
{'loss': 8.0081, 'learning_rate': 2.9222676797194627e-06, 'epoch': 0.37}
{'loss': 7.8618, 'learning_rate': 3.652834599649328e-06, 'epoch': 0.46}
{'loss': 7.7179, 'learning_rate': 4.383401519579194e-06, 'epoch': 0.55}
{'loss': 7.632, 'learning_rate': 5.113968439509059e-06, 'epoch': 0.64}
{'loss': 7.555, 'learning_rate': 5.8445353594389254e-06, 'epoch': 0.73}
{'loss': 7.4928, 'learning_rate': 6.57510227936879e-06, 'epoch': 0.82}
{'loss': 7.4457, 'learning_rate': 7.305669199298656e-06, 'epoch': 0.91}
{'loss': 7.4017, 'learning_rate': 8.036236119228522e-06, 'epoch': 1.0}
{'loss': 7.3647, 'learning_rate': 8.766803039158388e-06, 'epoch': 1.1}
{'loss': 7.3119, 'learning_rate': 9.497369959088252e-06, 'epoch': 1.19}
{'loss': 7.2844, 'learning_rate': 1.0227936879018118e-05, 'epoch':

  0%|          | 0/1393 [00:00<?, ?it/s]

Perplexity: 774.22


# General Approach

In [9]:
data = ["persuratan-dataset-final-fourth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/persuratan/"
batch_size = 18
model_name = "persuratan-scratch"

run_scratch(data,tokenizer_file, batch_size, model_name)

Found cached dataset text (/home/agus/.cache/huggingface/datasets/text/suffix_array-0bb18389d41e00ae/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/17750 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/4438 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/17750 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/4438 [00:00<?, ? examples/s]

  0%|          | 0/166850 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 10.592, 'learning_rate': 5.993407252022776e-07, 'epoch': 0.07}
{'loss': 10.0789, 'learning_rate': 1.1986814504045551e-06, 'epoch': 0.15}
{'loss': 9.8036, 'learning_rate': 1.7980221756068325e-06, 'epoch': 0.22}
{'loss': 9.4805, 'learning_rate': 2.3973629008091102e-06, 'epoch': 0.3}
{'loss': 9.1454, 'learning_rate': 2.9967036260113878e-06, 'epoch': 0.37}
{'loss': 8.8435, 'learning_rate': 3.596044351213665e-06, 'epoch': 0.45}
{'loss': 8.6021, 'learning_rate': 4.1953850764159425e-06, 'epoch': 0.52}
{'loss': 8.4314, 'learning_rate': 4.7947258016182205e-06, 'epoch': 0.6}
{'loss': 8.3061, 'learning_rate': 5.3940665268204984e-06, 'epoch': 0.67}
{'loss': 8.1984, 'learning_rate': 5.9934072520227756e-06, 'epoch': 0.75}
{'loss': 8.1179, 'learning_rate': 6.592747977225053e-06, 'epoch': 0.82}
{'loss': 8.0254, 'learning_rate': 7.19208870242733e-06, 'epoch': 0.9}
{'loss': 7.9507, 'learning_rate': 7.791429427629607e-06, 'epoch': 0.97}
{'loss': 7.8766, 'learning_rate': 8.390770152831885e-06, 'e

  0%|          | 0/1691 [00:00<?, ?it/s]

Perplexity: 796.01


In [10]:
data = ["peraturan-dataset-final-fifth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/peraturan/"
batch_size = 18
model_name = "peraturan-scratch"

run_scratch(data,tokenizer_file, batch_size, model_name)

Downloading and preparing dataset text/suffix_array to /home/agus/.cache/huggingface/datasets/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /home/agus/.cache/huggingface/datasets/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/47783 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/11946 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/47783 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/11946 [00:00<?, ? examples/s]



  0%|          | 0/537125 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 8.3729, 'learning_rate': 1.8617466907452572e-07, 'epoch': 0.02}
{'loss': 8.0854, 'learning_rate': 3.7234933814905144e-07, 'epoch': 0.05}
{'loss': 7.9013, 'learning_rate': 5.585240072235772e-07, 'epoch': 0.07}
{'loss': 7.7661, 'learning_rate': 7.446986762981029e-07, 'epoch': 0.09}
{'loss': 7.627, 'learning_rate': 9.308733453726288e-07, 'epoch': 0.12}
{'loss': 7.5149, 'learning_rate': 1.1170480144471544e-06, 'epoch': 0.14}
{'loss': 7.3838, 'learning_rate': 1.3032226835216803e-06, 'epoch': 0.16}
{'loss': 7.2678, 'learning_rate': 1.4893973525962058e-06, 'epoch': 0.19}
{'loss': 7.1583, 'learning_rate': 1.6755720216707317e-06, 'epoch': 0.21}
{'loss': 7.055, 'learning_rate': 1.8617466907452576e-06, 'epoch': 0.23}
{'loss': 6.9898, 'learning_rate': 2.047921359819783e-06, 'epoch': 0.26}
{'loss': 6.9204, 'learning_rate': 2.2340960288943087e-06, 'epoch': 0.28}
{'loss': 6.8521, 'learning_rate': 2.4198983486306853e-06, 'epoch': 0.3}
{'loss': 6.8082, 'learning_rate': 2.6060730177052114e-06, 

  0%|          | 0/5250 [00:00<?, ?it/s]

Perplexity: 230.10


In [12]:
data = ["persuratan-dataset-final-fourth.txt",
        "peraturan-dataset-final-fifth.txt"]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/all_dataset/"
batch_size = 18
model_name = "all-dataset-scratch"

run_scratch(data,tokenizer_file, batch_size, model_name)

Downloading and preparing dataset text/suffix_array to /home/agus/.cache/huggingface/datasets/text/suffix_array-2d9aa0cc05fe1eab/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /home/agus/.cache/huggingface/datasets/text/suffix_array-2d9aa0cc05fe1eab/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/65533 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/16384 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/65533 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/16384 [00:00<?, ? examples/s]



  0%|          | 0/699975 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 7.8659, 'learning_rate': 1.4286122460641735e-07, 'epoch': 0.02}
{'loss': 7.6818, 'learning_rate': 2.857224492128347e-07, 'epoch': 0.04}
{'loss': 7.5876, 'learning_rate': 4.2858367381925204e-07, 'epoch': 0.05}
{'loss': 7.5026, 'learning_rate': 5.714448984256694e-07, 'epoch': 0.07}
{'loss': 7.4136, 'learning_rate': 7.13734678133661e-07, 'epoch': 0.09}
{'loss': 7.3274, 'learning_rate': 8.565959027400784e-07, 'epoch': 0.11}
{'loss': 7.2508, 'learning_rate': 9.994571273464957e-07, 'epoch': 0.13}
{'loss': 7.1795, 'learning_rate': 1.1423183519529132e-06, 'epoch': 0.14}
{'loss': 7.0955, 'learning_rate': 1.2851795765593304e-06, 'epoch': 0.16}
{'loss': 7.0205, 'learning_rate': 1.4280408011657476e-06, 'epoch': 0.18}
{'loss': 6.9838, 'learning_rate': 1.5709020257721649e-06, 'epoch': 0.2}
{'loss': 6.923, 'learning_rate': 1.7137632503785823e-06, 'epoch': 0.21}
{'loss': 6.885, 'learning_rate': 1.8566244749849996e-06, 'epoch': 0.23}
{'loss': 6.8533, 'learning_rate': 1.999199977142204e-06, 'ep

  0%|          | 0/7281 [00:00<?, ?it/s]

Perplexity: 16.45


In [9]:
data = ["cofog-persuratan.txt",
        ]
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/persuratan/"
batch_size = 18
model_name = "cofog-persuratan-scratch"

run_scratch(data,tokenizer_file, batch_size, model_name)

Found cached dataset text (/home/agus/.cache/huggingface/datasets/text/suffix_array-d7f0f78939d37770/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/1632 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/408 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/1632 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/408 [00:00<?, ? examples/s]



  0%|          | 0/40975 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 10.6402, 'learning_rate': 2.4402147388970234e-06, 'epoch': 0.31}
{'loss': 9.9991, 'learning_rate': 4.880429477794047e-06, 'epoch': 0.61}
{'loss': 9.3292, 'learning_rate': 7.32064421669107e-06, 'epoch': 0.92}
{'loss': 8.8654, 'learning_rate': 9.760858955588094e-06, 'epoch': 1.22}
{'loss': 8.606, 'learning_rate': 1.2201073694485116e-05, 'epoch': 1.53}
{'loss': 8.4476, 'learning_rate': 1.464128843338214e-05, 'epoch': 1.83}
{'loss': 8.3066, 'learning_rate': 1.708150317227916e-05, 'epoch': 2.14}
{'loss': 8.1844, 'learning_rate': 1.9521717911176187e-05, 'epoch': 2.44}
{'loss': 8.0739, 'learning_rate': 1.978197792662093e-05, 'epoch': 2.75}
{'loss': 7.9979, 'learning_rate': 1.9510806193562383e-05, 'epoch': 3.05}
{'loss': 7.9122, 'learning_rate': 1.923963446050384e-05, 'epoch': 3.36}
{'loss': 7.8514, 'learning_rate': 1.8968462727445294e-05, 'epoch': 3.66}
{'loss': 7.8008, 'learning_rate': 1.8697290994386747e-05, 'epoch': 3.97}
{'loss': 7.7413, 'learning_rate': 1.8426119261328202e-05, '

  0%|          | 0/400 [00:00<?, ?it/s]

Perplexity: 1544.94


end

In [14]:
#suffix
dataset = load_dataset(path="/media/agus/DATA/DDALM/output/suffix_array/",
                       data_files = ["persuratan-dataset-final.txt"]) #load_dataset(path=data_dir)

dataset = dataset['train']

dataset = dataset.train_test_split(test_size=0.2)

Downloading and preparing dataset text/suffix_array to /home/agus/.cache/huggingface/datasets/text/suffix_array-535a31e17bdc9b0f/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /home/agus/.cache/huggingface/datasets/text/suffix_array-535a31e17bdc9b0f/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [15]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 17755
    })
    test: Dataset({
        features: ['text'],
        num_rows: 4439
    })
})

In [28]:
tokenizer = BertTokenizerFast.from_pretrained("/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/persuratan/")

In [29]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [30]:

from transformers import AutoTokenizer
import multiprocessing


num_proc = multiprocessing.cpu_count()
print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")

def group_texts(examples):
    tokenized_inputs = tokenizer(
       examples["text"], return_special_tokens_mask=True, truncation=False, max_length=tokenizer.model_max_length
    )
    return tokenized_inputs

# preprocess dataset
tokenized_datasets = dataset.map(group_texts, batched=True, remove_columns=["text"], num_proc=num_proc)



The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/17755 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/4439 [00:00<?, ? examples/s]

In [31]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 17755
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 4439
    })
})

In [32]:
def group_texts_ds(examples):
    # Concatenate all texts.
    block_size = 512
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [33]:
 #Grouping
lm_datasets = tokenized_datasets.map(
    group_texts_ds,
    batched=True,
    batch_size=1000,
    num_proc=28,
)

Map (num_proc=28):   0%|          | 0/17755 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/4439 [00:00<?, ? examples/s]

In [34]:
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 120045
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 30574
    })
})

In [35]:
batch_size = 18
model_name = "persuratan-scratch"

In [36]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=f'models/{model_name}',
    overwrite_output_dir=True,
    num_train_epochs=25,
    per_device_eval_batch_size=batch_size,
    per_device_train_batch_size=batch_size,
    #warmup_steps=2000,
    #max_steps=100000,
    warmup_ratio=0.1,
    weight_decay=0.1,
    max_grad_norm=10,
    #lr_scheduler_type="cosine",
    learning_rate=2e-5,
    save_steps=1000,
    save_total_limit=2,
    seed=42,
    fp16=True,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    #prediction_loss_only=True,
)


In [38]:
trainer.train()

  0%|          | 0/166750 [00:00<?, ?it/s]

{'loss': 7.0118, 'learning_rate': 1.8824854239546895e-05, 'epoch': 0.07}
{'loss': 6.9979, 'learning_rate': 1.8758220889555222e-05, 'epoch': 0.15}
{'loss': 7.0024, 'learning_rate': 1.8691720806263534e-05, 'epoch': 0.22}
{'loss': 6.9813, 'learning_rate': 1.8625087456271868e-05, 'epoch': 0.3}
{'loss': 6.9964, 'learning_rate': 1.8558454106280195e-05, 'epoch': 0.37}
{'loss': 6.9894, 'learning_rate': 1.8491820756288525e-05, 'epoch': 0.45}
{'loss': 6.9722, 'learning_rate': 1.8425187406296855e-05, 'epoch': 0.52}
{'loss': 6.9619, 'learning_rate': 1.8358687323005168e-05, 'epoch': 0.6}
{'loss': 6.9702, 'learning_rate': 1.8292053973013494e-05, 'epoch': 0.67}
{'loss': 6.9534, 'learning_rate': 1.8225420623021825e-05, 'epoch': 0.75}
{'loss': 6.9445, 'learning_rate': 1.8158787273030155e-05, 'epoch': 0.82}
{'loss': 6.9399, 'learning_rate': 1.8092287189738467e-05, 'epoch': 0.9}
{'loss': 6.9324, 'learning_rate': 1.8025653839746794e-05, 'epoch': 0.97}
{'loss': 6.9289, 'learning_rate': 1.7959020489755124e-

TrainOutput(global_step=166750, training_loss=6.683666734503841, metrics={'train_runtime': 38676.3199, 'train_samples_per_second': 77.596, 'train_steps_per_second': 4.311, 'train_loss': 6.683666734503841, 'epoch': 25.0})

In [39]:
 #Evaluate
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  0%|          | 0/1699 [00:00<?, ?it/s]

Perplexity: 805.10


In [40]:
#Save
trainer.save_model(f"models/{model_name}")
tokenizer.save_pretrained(f"models/{model_name}")

('models/persuratan-scratch/tokenizer_config.json',
 'models/persuratan-scratch/special_tokens_map.json',
 'models/persuratan-scratch/vocab.txt',
 'models/persuratan-scratch/added_tokens.json',
 'models/persuratan-scratch/tokenizer.json')