In [5]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [6]:
import os
import torch
from pathlib import Path
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizerFast, BertForMaskedLM, pipeline, AutoTokenizer, AutoModel
from transformers import BertConfig, AutoConfig
from datasets import load_dataset, load_from_disk, DatasetDict, Dataset
from transformers import DataCollatorForLanguageModeling
import math
from transformers import Trainer, TrainingArguments
import multiprocessing


2023-08-24 05:58:05.301709: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForMaskedLM, DataCollatorForLanguageModeling

In [8]:
os.environ['CUDA_VISIBLE_DEVICES']='0'
torch.cuda.device(0)

<torch.cuda.device at 0x7f1a76bd5cd0>

In [9]:
import gc
gc.collect()

60

In [10]:
torch.cuda.is_available()

True

In [13]:
def run_finetune(data,tokenizer_file, batch_size, model_checkpoint, model_name):

    def group_texts_ds(examples):
        # Concatenate all texts.
        block_size = 512
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
        total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result
    
    #suffix
    
    dataset = load_dataset(path="/media/agus/DATA/DDALM/output/suffix_array/",
                        data_files = data) #load_dataset(path=data_dir)
    dataset = dataset['train']
    dataset = dataset.train_test_split(test_size=0.2)
    
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
    #BertTokenizerFast.from_pretrained(tokenizer_file)
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
    
    print('original',dataset)
    
    num_proc = multiprocessing.cpu_count()
    print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")

    def group_texts(examples):
        tokenized_inputs = tokenizer(
            examples["text"], return_special_tokens_mask=True, truncation=False, max_length=tokenizer.model_max_length
        )
        return tokenized_inputs

    # preprocess dataset
    tokenized_datasets = dataset.map(group_texts, batched=True, remove_columns=["text"], num_proc=num_proc)
    
     #Grouping
    lm_datasets = tokenized_datasets.map(
        group_texts_ds,
        batched=True,
        batch_size=1000,
        num_proc=28,
    )
    print('lm_dataset',lm_datasets)

    #Set Model and Training Args
    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
    
    #model_name = model_checkpoint.split("/")[-1]
    #data_name = data.split(".")[-1]
    #model_name = f'{model_name}finetuned-all-{data_name}'
    
    training_args = TrainingArguments(
        output_dir=f'models/{model_name}',
        overwrite_output_dir=True,
        num_train_epochs=25,
        per_device_eval_batch_size=batch_size,
        per_device_train_batch_size=batch_size,
        #warmup_steps=2000,
        #max_steps=100000,
        warmup_ratio=0.1,
        weight_decay=0.1,
        max_grad_norm=10,
        #lr_scheduler_type="cosine",
        learning_rate=2e-5,
        save_steps=1000,
        save_total_limit=2,
        seed=42,
        fp16=True,
        prediction_loss_only=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=lm_datasets["train"],
        eval_dataset=lm_datasets["test"],
        #prediction_loss_only=True,
    )
    
    trainer.train()
    
     #Evaluate
    eval_results = trainer.evaluate()
    print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
    
    #Save
    trainer.save_model(f"models/{model_name}")
    tokenizer.save_pretrained(f"models/{model_name}")


In [14]:

data = ["persuratan-dataset-final-fourth.txt","peraturan-dataset-final-fifth.txt"]
tokenizer_file = "cahya/bert-base-indonesian-522M"
batch_size = 18
model_checkpoint = "cahya/bert-base-indonesian-522M"
model_name = "wirawan-finetuned-all-25"

run_finetune(data,tokenizer_file, batch_size, model_checkpoint, model_name)

Downloading and preparing dataset text/suffix_array to /home/agus/.cache/huggingface/datasets/text/suffix_array-2d9aa0cc05fe1eab/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /home/agus/.cache/huggingface/datasets/text/suffix_array-2d9aa0cc05fe1eab/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

original DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 65533
    })
    test: Dataset({
        features: ['text'],
        num_rows: 16384
    })
})
The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/65533 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/16384 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/65533 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/16384 [00:00<?, ? examples/s]

lm_dataset DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 577431
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 149883
    })
})


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


  0%|          | 0/802000 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 6.6509, 'learning_rate': 1.2468827930174566e-07, 'epoch': 0.02}
{'loss': 6.2946, 'learning_rate': 2.493765586034913e-07, 'epoch': 0.03}
{'loss': 6.1042, 'learning_rate': 3.7406483790523695e-07, 'epoch': 0.05}
{'loss': 6.0205, 'learning_rate': 4.987531172069826e-07, 'epoch': 0.06}
{'loss': 5.9544, 'learning_rate': 6.234413965087283e-07, 'epoch': 0.08}
{'loss': 5.8965, 'learning_rate': 7.481296758104739e-07, 'epoch': 0.09}
{'loss': 5.8865, 'learning_rate': 8.728179551122195e-07, 'epoch': 0.11}
{'loss': 5.8511, 'learning_rate': 9.975062344139653e-07, 'epoch': 0.12}
{'loss': 5.8159, 'learning_rate': 1.1219451371571074e-06, 'epoch': 0.14}
{'loss': 5.7906, 'learning_rate': 1.246633416458853e-06, 'epoch': 0.16}
{'loss': 5.7563, 'learning_rate': 1.3713216957605985e-06, 'epoch': 0.17}
{'loss': 5.7553, 'learning_rate': 1.4960099750623442e-06, 'epoch': 0.19}
{'loss': 5.6998, 'learning_rate': 1.6206982543640897e-06, 'epoch': 0.2}
{'loss': 5.7052, 'learning_rate': 1.745137157107232e-06, 'e

  0%|          | 0/8327 [00:00<?, ?it/s]

Perplexity: 8.35


: 

In [7]:

data = ["persuratan-dataset-final-fourth.txt"]
tokenizer_file = "cahya/bert-base-indonesian-522M"
batch_size = 18
model_checkpoint = "cahya/bert-base-indonesian-522M"
model_name = "wirawan-finetuned-persuratan-25"

run_finetune(data,tokenizer_file, batch_size, model_checkpoint, model_name)

Found cached dataset text (/home/agus/.cache/huggingface/datasets/text/suffix_array-0bb18389d41e00ae/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

original DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 17750
    })
    test: Dataset({
        features: ['text'],
        num_rows: 4438
    })
})
The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/17750 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/4438 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/17750 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/4438 [00:00<?, ? examples/s]

lm_dataset DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 149929
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 38737
    })
})


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjust-108[0m ([33mcofog[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/208250 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 6.9353, 'learning_rate': 4.801920768307324e-07, 'epoch': 0.06}
{'loss': 6.54, 'learning_rate': 9.603841536614647e-07, 'epoch': 0.12}
{'loss': 6.3994, 'learning_rate': 1.4405762304921969e-06, 'epoch': 0.18}
{'loss': 6.3324, 'learning_rate': 1.9207683073229294e-06, 'epoch': 0.24}
{'loss': 6.2373, 'learning_rate': 2.4009603841536618e-06, 'epoch': 0.3}
{'loss': 6.1926, 'learning_rate': 2.8811524609843937e-06, 'epoch': 0.36}
{'loss': 6.1436, 'learning_rate': 3.3613445378151265e-06, 'epoch': 0.42}
{'loss': 6.1248, 'learning_rate': 3.841536614645859e-06, 'epoch': 0.48}
{'loss': 6.0684, 'learning_rate': 4.320768307322929e-06, 'epoch': 0.54}
{'loss': 6.051, 'learning_rate': 4.800960384153662e-06, 'epoch': 0.6}
{'loss': 6.0158, 'learning_rate': 5.281152460984395e-06, 'epoch': 0.66}
{'loss': 6.0091, 'learning_rate': 5.761344537815126e-06, 'epoch': 0.72}
{'loss': 5.9856, 'learning_rate': 6.240576230492197e-06, 'epoch': 0.78}
{'loss': 5.9647, 'learning_rate': 6.72076830732293e-06, 'epoch':

  0%|          | 0/2153 [00:00<?, ?it/s]

Perplexity: 20.25


In [12]:

data = ["peraturan-dataset-final-fifth.txt"]
tokenizer_file = "cahya/bert-base-indonesian-522M"
batch_size = 18
model_checkpoint = "cahya/bert-base-indonesian-522M"
model_name = "wirawan-finetuned-peraturan-25"

run_finetune(data,tokenizer_file, batch_size, model_checkpoint, model_name)

Found cached dataset text (/home/agus/.cache/huggingface/datasets/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

original DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 47783
    })
    test: Dataset({
        features: ['text'],
        num_rows: 11946
    })
})
The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/47783 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/11946 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/47783 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/11946 [00:00<?, ? examples/s]

lm_dataset DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 430194
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 108435
    })
})


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


  0%|          | 0/597500 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 6.4241, 'learning_rate': 1.6736401673640168e-07, 'epoch': 0.02}
{'loss': 6.0498, 'learning_rate': 3.3472803347280335e-07, 'epoch': 0.04}
{'loss': 5.8664, 'learning_rate': 5.020920502092051e-07, 'epoch': 0.06}
{'loss': 5.7809, 'learning_rate': 6.694560669456067e-07, 'epoch': 0.08}
{'loss': 5.7279, 'learning_rate': 8.368200836820084e-07, 'epoch': 0.1}
{'loss': 5.6836, 'learning_rate': 1.0041841004184101e-06, 'epoch': 0.13}
{'loss': 5.6462, 'learning_rate': 1.1715481171548119e-06, 'epoch': 0.15}
{'loss': 5.6026, 'learning_rate': 1.3389121338912134e-06, 'epoch': 0.17}
{'loss': 5.5663, 'learning_rate': 1.5059414225941423e-06, 'epoch': 0.19}
{'loss': 5.5561, 'learning_rate': 1.6733054393305439e-06, 'epoch': 0.21}
{'loss': 5.5237, 'learning_rate': 1.8406694560669458e-06, 'epoch': 0.23}
{'loss': 5.5171, 'learning_rate': 2.008033472803347e-06, 'epoch': 0.25}
{'loss': 5.4901, 'learning_rate': 2.1750627615062763e-06, 'epoch': 0.27}
{'loss': 5.4725, 'learning_rate': 2.342426778242678e-06,

  0%|          | 0/6025 [00:00<?, ?it/s]

Perplexity: 6.49


In [7]:

data = "persuratan-dataset-final-fourth.txt"
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/peraturan/"
batch_size = 18
model_checkpoint = "../../script/IndoGovBERT-final/models/peraturan-final-25/"
model_name = "peraturan-finetuned-persuratan-25"

run_finetune(data,tokenizer_file, batch_size, model_checkpoint, model_name)

Found cached dataset text (/home/agus/.cache/huggingface/datasets/text/suffix_array-0bb18389d41e00ae/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

original DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 17750
    })
    test: Dataset({
        features: ['text'],
        num_rows: 4438
    })
})
The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/17750 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/4438 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/17750 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/4438 [00:00<?, ? examples/s]

lm_dataset DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 127554
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 30658
    })
})




  0%|          | 0/177175 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 6.5344, 'learning_rate': 5.643977875606728e-07, 'epoch': 0.07}
{'loss': 6.515, 'learning_rate': 1.1287955751213456e-06, 'epoch': 0.14}
{'loss': 6.4878, 'learning_rate': 1.6931933626820184e-06, 'epoch': 0.21}
{'loss': 6.4527, 'learning_rate': 2.257591150242691e-06, 'epoch': 0.28}
{'loss': 6.4404, 'learning_rate': 2.821988937803364e-06, 'epoch': 0.35}
{'loss': 6.4256, 'learning_rate': 3.386386725364037e-06, 'epoch': 0.42}
{'loss': 6.4042, 'learning_rate': 3.950784512924709e-06, 'epoch': 0.49}
{'loss': 6.3797, 'learning_rate': 4.515182300485382e-06, 'epoch': 0.56}
{'loss': 6.3739, 'learning_rate': 5.079580088046055e-06, 'epoch': 0.63}
{'loss': 6.3282, 'learning_rate': 5.643977875606728e-06, 'epoch': 0.71}
{'loss': 6.327, 'learning_rate': 6.208375663167401e-06, 'epoch': 0.78}
{'loss': 6.3279, 'learning_rate': 6.7716446551529525e-06, 'epoch': 0.85}
{'loss': 6.2785, 'learning_rate': 7.336042442713625e-06, 'epoch': 0.92}
{'loss': 6.2908, 'learning_rate': 7.900440230274298e-06, 'epoch

  0%|          | 0/1704 [00:00<?, ?it/s]

Perplexity: 62.34


In [9]:

data = "peraturan-dataset-final-fifth.txt"
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/persuratan/"
batch_size = 18
model_checkpoint = "../../script/IndoGovBERT-final/models/persuratan-final-25/"
model_name = "persuratan-finetuned-peraturan-25"

run_finetune(data,tokenizer_file, batch_size, model_checkpoint, model_name)

Found cached dataset text (/home/agus/.cache/huggingface/datasets/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at /home/agus/.cache/huggingface/datasets/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-b5c9ef00b1ba87bb.arrow and /home/agus/.cache/huggingface/datasets/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-a735864a71d26cc3.arrow
Loading cached processed dataset at /home/agus/.cache/huggingface/datasets/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-0d366a34d4272e3a_*_of_00028.arrow
Loading cached processed dataset at /home/agus/.cache/huggingface/datasets/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-6489cf1e19a8ec00_*_of_00028.arrow
Loading cached processed dataset at /home/agus/.cache/huggingface/datasets/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5

original DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 47783
    })
    test: Dataset({
        features: ['text'],
        num_rows: 11946
    })
})
The max length for the tokenizer is: 1000000000000000019884624838656


Loading cached processed dataset at /home/agus/.cache/huggingface/datasets/text/suffix_array-b1e8db2c7c338967/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-93eadb566dd7eadc_*_of_00028.arrow


lm_dataset DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 400154
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 97735
    })
})




  0%|          | 0/555775 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 6.3803, 'learning_rate': 1.7992730936701573e-07, 'epoch': 0.02}
{'loss': 6.3473, 'learning_rate': 3.5985461873403145e-07, 'epoch': 0.04}
{'loss': 6.35, 'learning_rate': 5.397819281010472e-07, 'epoch': 0.07}
{'loss': 6.323, 'learning_rate': 7.197092374680629e-07, 'epoch': 0.09}
{'loss': 6.3279, 'learning_rate': 8.996365468350787e-07, 'epoch': 0.11}
{'loss': 6.3154, 'learning_rate': 1.0795638562020944e-06, 'epoch': 0.13}
{'loss': 6.3205, 'learning_rate': 1.2594911655691102e-06, 'epoch': 0.16}
{'loss': 6.2958, 'learning_rate': 1.4394184749361258e-06, 'epoch': 0.18}
{'loss': 6.2937, 'learning_rate': 1.6193457843031416e-06, 'epoch': 0.2}
{'loss': 6.2876, 'learning_rate': 1.7992730936701574e-06, 'epoch': 0.22}
{'loss': 6.2763, 'learning_rate': 1.9792004030371732e-06, 'epoch': 0.25}
{'loss': 6.2674, 'learning_rate': 2.159127712404189e-06, 'epoch': 0.27}
{'loss': 6.2492, 'learning_rate': 2.3386951671524703e-06, 'epoch': 0.29}
{'loss': 6.2478, 'learning_rate': 2.5186224765194863e-06, '

In [10]:
data = "persuratan-dataset-final-fourth.txt"
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/persuratan/"
batch_size = 18
model_name = "indobenchmark/indobert-base-p2"

run_finetune(data,tokenizer_file, batch_size, model_name)

Found cached dataset text (/home/agus/.cache/huggingface/datasets/text/suffix_array-0bb18389d41e00ae/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

original DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 17750
    })
    test: Dataset({
        features: ['text'],
        num_rows: 4438
    })
})
The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/17750 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/4438 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/17750 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/4438 [00:00<?, ? examples/s]

lm_dataset DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 147032
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 37197
    })
})


Some weights of BertForMaskedLM were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/204225 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


: 

In [7]:
data = "cofog-persuratan.txt"
tokenizer_file = "/media/agus/DATA/DDALM/script/IndoGovBERT-final/vocab/persuratan/"
batch_size = 24
model_name = "indobenchmark/indobert-base-p2"

run_finetune(data,tokenizer_file, batch_size, model_name)

Found cached dataset text (/home/agus/.cache/huggingface/datasets/text/suffix_array-d7f0f78939d37770/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

original DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1632
    })
    test: Dataset({
        features: ['text'],
        num_rows: 408
    })
})
The max length for the tokenizer is: 1000000000000000019884624838656


Map (num_proc=28):   0%|          | 0/1632 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/408 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/1632 [00:00<?, ? examples/s]

Map (num_proc=28):   0%|          | 0/408 [00:00<?, ? examples/s]

lm_dataset DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 35490
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
        num_rows: 7001
    })
})


Some weights of BertForMaskedLM were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/36975 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 9.1524, 'learning_rate': 2.69875608436993e-06, 'epoch': 0.34}
{'loss': 7.9448, 'learning_rate': 5.402920497566252e-06, 'epoch': 0.68}
{'loss': 7.4467, 'learning_rate': 8.107084910762575e-06, 'epoch': 1.01}
{'loss': 6.9616, 'learning_rate': 1.0811249323958897e-05, 'epoch': 1.35}
{'loss': 6.3787, 'learning_rate': 1.351541373715522e-05, 'epoch': 1.69}
{'loss': 5.6307, 'learning_rate': 1.6219578150351543e-05, 'epoch': 2.03}
{'loss': 5.1842, 'learning_rate': 1.8923742563547864e-05, 'epoch': 2.37}
{'loss': 4.9742, 'learning_rate': 1.9819094269315144e-05, 'epoch': 2.7}
{'loss': 4.8128, 'learning_rate': 1.951858641103465e-05, 'epoch': 3.04}
{'loss': 4.682, 'learning_rate': 1.9218078552754157e-05, 'epoch': 3.38}
{'loss': 4.6125, 'learning_rate': 1.891757069447366e-05, 'epoch': 3.72}
{'loss': 4.5158, 'learning_rate': 1.8617062836193166e-05, 'epoch': 4.06}
{'loss': 4.4613, 'learning_rate': 1.8316554977912674e-05, 'epoch': 4.39}
{'loss': 4.4027, 'learning_rate': 1.801604711963218e-05, 'ep

  0%|          | 0/292 [00:00<?, ?it/s]

Perplexity: 39.39
