__How to train a language model__	Notebook to Highlight all the steps to effectively train Transformer model on custom data
https://github.com/huggingface/transformers/tree/master/notebooks

https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb

__Language Modeling__
https://github.com/huggingface/transformers/tree/master/examples/language-modeling
https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_language_modeling.py

In [1]:
import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"]="1"

from pathlib import Path
import torch

from torch.utils.data import Dataset, DataLoader
from tokenizers import CharBPETokenizer
from tokenizers.processors import BertProcessing
from tokenizers.normalizers import BertNormalizer

import random
from transformers import PreTrainedTokenizer
from transformers import AutoTokenizer
from transformers import RobertaTokenizerFast, RobertaTokenizer
from filelock import FileLock
import logging
import time
import pickle

In [2]:
# device = torch.device("cuda:1")
# device

In [3]:
!nvidia-smi

Sun Jun 28 07:13:40 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:00:0E.0 Off |                  Off |
| N/A   37C    P0    37W / 250W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [4]:
# Check that PyTorch sees it
print(torch.cuda.is_available())
print(torch.cuda.device_count())

True
1


In [5]:
DATA_PATH = Path("/datadisk/data")

# DATA_RAW_PATH = DATA_PATH/"raw"
DATA_RAW_EXTRACTED_PATH = DATA_PATH/"raw_data_extraction_v2"

# 1. The data from thwiki
THWIKI_FOLDER = Path("thwiki-20200601-extracted")
WIKI_FILES = list((DATA_RAW_EXTRACTED_PATH/THWIKI_FOLDER).glob("Wiki*"))
list(map(print , WIKI_FILES[:5]))


# 2. The classification data from jung and ninja
CLASSIFICATION_JUNG_NINJA_FOLDER = Path("classification_dataset")
CLASSIFICATION_FILES = list((DATA_RAW_EXTRACTED_PATH/CLASSIFICATION_JUNG_NINJA_FOLDER).glob("*"))
list(map(print , CLASSIFICATION_FILES[:5]))

# 3. The Data from p'Moo Crawlers
ANOTHER_WEBSITE_MOO_FOLDER = Path("another_website")
ANOTHER_WEBSITE_FILES = list((DATA_RAW_EXTRACTED_PATH/ANOTHER_WEBSITE_MOO_FOLDER).glob("*"))
list(map(print , ANOTHER_WEBSITE_FILES[:5]))


# 4. Senior Project Files
SENIOR_PROJ_FOLDER = Path("data_lm")
SENIOR_PROJ_FILES = list((DATA_RAW_EXTRACTED_PATH/SENIOR_PROJ_FOLDER).glob("*"))
list(map(print , SENIOR_PROJ_FILES[:5]))

# 5. Guru Crawler Files
GURU_CRAWLER_FOLDER = Path("social_listening")
GURU_CRAWLER_FILES = list((DATA_RAW_EXTRACTED_PATH/GURU_CRAWLER_FOLDER).glob("*"))
list(map(print , GURU_CRAWLER_FILES[:5]))

ALL_FILES = WIKI_FILES + CLASSIFICATION_FILES + ANOTHER_WEBSITE_FILES + SENIOR_PROJ_FILES + GURU_CRAWLER_FILES
print(f"\nI have a total of {len(ALL_FILES)} files!")


# Output is in bytes - helper from Pathlib Path https://stackoverflow.com/questions/2104080/how-can-i-check-file-size-in-python
def getStat(prev_value, cur_value):
    if isinstance(prev_value, int):
        return prev_value + cur_value.stat().st_size
    return prev_value.stat().st_size + cur_value.stat().st_size

from functools import reduce
print(f"Amounts to a total of {reduce(getStat, ALL_FILES)/1e6:.2f} MB")

/datadisk/data/raw_data_extraction_v2/thwiki-20200601-extracted/WikiAA_0.txt
/datadisk/data/raw_data_extraction_v2/thwiki-20200601-extracted/WikiAB_2.txt
/datadisk/data/raw_data_extraction_v2/thwiki-20200601-extracted/WikiAD_2.txt
/datadisk/data/raw_data_extraction_v2/thwiki-20200601-extracted/WikiAD_0.txt
/datadisk/data/raw_data_extraction_v2/thwiki-20200601-extracted/WikiAE_0.txt
/datadisk/data/raw_data_extraction_v2/classification_dataset/thaipbs_0.txt
/datadisk/data/raw_data_extraction_v2/classification_dataset/naewna_0.txt
/datadisk/data/raw_data_extraction_v2/classification_dataset/dailynews_0.txt
/datadisk/data/raw_data_extraction_v2/classification_dataset/prbangkok_0.txt
/datadisk/data/raw_data_extraction_v2/classification_dataset/pptv36_0.txt
/datadisk/data/raw_data_extraction_v2/another_website/pantip_275.txt
/datadisk/data/raw_data_extraction_v2/another_website/praew_2.txt
/datadisk/data/raw_data_extraction_v2/another_website/facebook_3.txt
/datadisk/data/raw_data_extraction

# Making Electra Model

However, currently pretraining Electra is still inside PR stage  
- Issue : When will ELECTRA pretraining from scratch will be available? #3878 https://github.com/huggingface/transformers/issues/3878  
- Issue : BERT and other models pretraining from scratch example #4425 https://github.com/huggingface/transformers/issues/4425
- PR : Electra training from scratch #4656 https://github.com/huggingface/transformers/pull/4656

Combined model
- Combines ElectraForMaskedLM and ElectraForPreTraining with embedding sharing + custom masking/replaced token detection

Commit before Merge: https://github.com/huggingface/transformers/pull/4656/commits/30b2dbbba6918ac6540b6a1758b7ee19f0ac969c#diff-8a95e2bfb7da25648b5d12ffa69fd7a3

In [6]:
# from transformers import ElectraModel, ElectraConfig

# # Initializing a ELECTRA electra-base-uncased style configuration
# configuration = ElectraConfig()
# configuration.vocab_size = 20000

# # Initializing a model from the electra-base-uncased style configuration
# model = ElectraModel(configuration)

# # Accessing the model configuration
# configuration = model.config
# configuration

In [7]:
# model

In [8]:
# model.num_parameters()
# # => 12 million parameters

In [9]:
# model.get_input_embeddings()

# Trying out Roberta per Notebook 

From __HuggingFace Notebooks__ https://huggingface.co/transformers/notebooks.html: 

How to train a language model	Highlight all the steps to effectively train Transformer model on custom data
- Colab (ipynb) version : https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb
- MD version: https://github.com/huggingface/blog/blob/master/how-to-train.md

Pretrain Longformer	How to build a "long" version of existing pretrained models	Iz Beltagy  
https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb

In [10]:
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM

configuration = RobertaConfig(
    vocab_size=30522,
    max_position_embeddings=514, # 512 + 2 more special tokens
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
# configuration.vocab_size = 20000

model = RobertaForMaskedLM(config=configuration)

# Accessing the model configuration
model.config

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 30522
}

In [11]:
model.num_parameters()
# => 102 million parameters

67578426

In [12]:
model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

# Initializing Tokenizer

In [13]:
# from tokenizers import Tokenizer
# tokenizer = Tokenizer.from_file("./thwiki-sentencepiecebpe.tokenizer.json")
# encoded =  tokenizer.encode(u"สวัสดีครับ ผมชื่อไนท์ ตอนนี้ก็เป็นเวลาที่ผมต้องไปโรงเรียนแล้ว  นี่คือการเว้นวรรคสองทีครับ  จะได้ออกเป็นสอง Spaces")
# print(encoded.ids)
# print(encoded.tokens)

In [14]:
# tokenizer.enable_truncation(max_length=128)

In [15]:
# encoded =  tokenizer.encode(u"สวัสดีครับ ผมชื่อไนท์ ตอนนี้ก็เป็นเวลาที่ผมต้องไปโรงเรียนแล้ว  นี่คือการเว้นวรรคสองทีครับ  จะได้ออกเป็นสอง SpacesWhat is great is that our tokenizer is optimized for Esperanto. Compared to a generic tokenizer trained for English, more native words are represented by a single, unsplit token. Diacritics, i.e. accented characters used in Esperanto – ĉ, ĝ, ĥ, ĵ, ŝ, and ŭ – are encoded natively. We also represent sequences in a more efficient manner. Here on this corpus, the average length of encoded sequences is ~30% smaller as when using the pretrained GPT-2 tokenizer.")
# print("This will not be over 128: ", len(encoded.ids), encoded.tokens)
# print(encoded.overflowing[0].tokens)

wrap tokenizers inside a PreTrainedTokenizerFast from transformers 

https://github.com/huggingface/tokenizers/issues/259

In [16]:
# from tokenizers import SentencePieceBPETokenizer
# from transformers import PreTrainedTokenizerFast


# class SentencePieceBPETokenizerFast(PreTrainedTokenizerFast):
#     def __init__(
#         self,
#         vocab_file,
#         merges_file,
#         bos_token="<s>",
#         eos_token="</s>",
#         sep_token="</s>",
#         cls_token="<s>",
#         unk_token="<unk>",
#         pad_token="<pad>",
#         mask_token="<mask>",
#         **kwargs
#     ):
#         super().__init__(
#             SentencePieceBPETokenizer(
#                 vocab_file=vocab_file,
#                 merges_file=merges_file,
#             ),
#              bos_token=bos_token,
#             eos_token=eos_token,
#             unk_token=unk_token,
#             sep_token=sep_token,
#             cls_token=cls_token,
#             pad_token=pad_token,
#             mask_token=mask_token,
#             **kwargs,
#         )

In [17]:
# import json

# # with open("./thwiki-sentencepiecebpe.tokenizer.json", 'r' ) as json_data:
# with open("./thwiki-charbpe-30522.tokenizer.json", 'r' ) as json_data:
#      data = json.load(json_data)
# vocab = data['model']['vocab']
# merges = data['model']['merges']


# with open('vocab.json', 'w', encoding='utf-8') as json_file:
#     json.dump(vocab, json_file, ensure_ascii=False)
# with open('merges.txt', 'w', encoding='utf-8') as f:
#     for merge_string in merges:
#         f.write(f'{merge_string}\n')

In [18]:
# pretrain_tokenizer = SentencePieceBPETokenizerFast(vocab_file='vocab.json',merges_file ='merges.txt' )

In [19]:
# from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
# from tokenizers import Tokenizer
# from tokenizers.implementations import BaseTokenizer

# tokenizer = Tokenizer.from_file("./thwiki-sentencepiecebpe.tokenizer.json")
# base_tokenizer = BaseTokenizer(tokenizer) # Wrapper!! to PretrainTokenizerFast Tokenizer should be an instance of a Tokenizer provided by HuggingFace tokenizers library.
# base_tokenizer = SentencePieceBPETokenizer()
# pretrain_tokenizer = PreTrainedTokenizerFast(tokenizer=base_tokenizer)
# pretrain_tokenizer

In [20]:
# from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
# from tokenizers import Tokenizer, CharBPETokenizer
# from tokenizers.implementations import BaseTokenizer

# # tokenizer = Tokenizer.from_file("./thwiki-charbpe-30522.tokenizer.json")
# # base_tokenizer = BaseTokenizer(tokenizer) # Wrapper!! to PretrainTokenizerFast Tokenizer should be an instance of a Tokenizer provided by HuggingFace tokenizers library.
# base_tokenizer = CharBPETokenizer(vocab_file='vocab.json',merges_file ='merges.txt')
# pretrain_tokenizer = PreTrainedTokenizerFast(tokenizer=base_tokenizer)
# pretrain_tokenizer

In [21]:
# from transformers import RobertaTokenizerFast

# tokenizer = RobertaTokenizerFast.from_pretrained("./thwiki-seniorproj-bytebpe-30522", max_len=512)

In [22]:
# from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("./thwiki-seniorproj-bytebpe-30522", max_len=512)

# Building our dataset

Build it with `from torch.utils.data.dataset import Dataset` just like [TextDataset](https://github.com/huggingface/transformers/blob/448c467256332e4be8c122a159b482c1ef039b98/src/transformers/data/datasets/language_modeling.py) and [LineByLineTextDataset](https://github.com/huggingface/transformers/blob/448c467256332e4be8c122a159b482c1ef039b98/src/transformers/data/datasets/language_modeling.py#L78)

Note: Training with multiple files is currently not supported [issue/3445](https://github.com/huggingface/transformers/issues/3445)

padding documentation [link](https://github.com/huggingface/tokenizers/blob/master/bindings/python/tokenizers/implementations/base_tokenizer.py#L52)

Potential Improvements
- การทำให้ Dataset นั้น dynamically tokenize + dynamically open file : ตอนนี้เวลาทำ Dataset จาก torch.utils.data.dataset จะทำการ tokenize เลยตอนอยู่ใน constructor  , กำลังคิดว่าถ้าเกิดว่า Data ใหญ่มากๆ อาจจะไม่เหมาะสมกับการทำแบบนี้  เพราะว่า Ram จะต้องมีขนาดเท่าๆกับ data ที่เราใส่เข้าไป  ซึ่งเป็นไปได้ยากหาก Data มีขนาดใหญ่มากๆ   ผมได้ทำการ Search ดูแล้วก็พบว่าจาก Discussion Forum ของ Pytorch: https://discuss.pytorch.org/t/how-to-use-a-huge-line-corpus-text-with-dataset-dataloader/30872 
Option1: ใช้ pd.Dataframe ในการเปิด File แบบ small chunks of data https://discuss.pytorch.org/t/data-processing-as-a-batch-way/14154/4?u=ptrblck
Option2: ใช้ byte Offsets จากไฟล์ใหญ่ๆเพื่อที่จะ lookup .seek(): https://github.com/pytorch/text/issues/130#issuecomment-510412877
More Examples: https://github.com/pytorch/text/blob/master/torchtext/datasets/unsupervised_learning.py , https://github.com/pytorch/text/blob/a5880a3da7928dd7dd529507eec943a307204de7/examples/text_classification/iterable_train.py#L169-L214

In [23]:
logger = logging.getLogger(__name__)
class TextDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach
    soon.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, root_path: str, block_size: int, overwrite_cache=False,):
        # assert os.path.isfile(file_path)
        # For Loop MultiFile
        self.examples = []
        print("root_path", root_path)
        for root, dirs, files in os.walk(root_path):
            for file in files:
                print(">>",file)
                file_path = os.path.join(root, file)
                print("root_path", file_path)
                block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)

                directory, filename = os.path.split(file_path)
                cached_features_file = os.path.join(
                    directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,),
                )

                # Make sure only the first process in distributed training processes the dataset,
                # and the others will use the cache.
                lock_path = cached_features_file + ".lock"
                
                with FileLock(lock_path):
                    if os.path.exists(cached_features_file) and not overwrite_cache:
                        start = time.time()
                        with open(cached_features_file, "rb") as handle:
                            self.examples = pickle.load(handle)
                        logger.info(
                            f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
                        )

                    else:
                        print("file_path", file_path)
                        with open(file_path, encoding="utf-8") as f:
                            text = f.read()

                        tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
#                         print("tokenized_text", tokenized_text)
                        for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
#                             print("i", i)
                            self.examples.append(
                                tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
                            )
#                         print(">", self.examples)
                        # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
                        # If your dataset is small, first you should loook for a bigger one :-) and second you
                        # can change this behavior by adding (model specific) padding.

                        start = time.time()
                        with open(cached_features_file, "wb") as handle:
                            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                        logger.info(
                            "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                        )


    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
#         print("self.examples", self.examples)
        return torch.tensor(self.examples[i], dtype=torch.long)


Tokenizer from Pretrained copied from SimpleTransformers [link](https://github.com/ThilinaRajapakse/simpletransformers/blob/master/simpletransformers/language_modeling/language_modeling_model.py)

In [24]:
# tokenizer = RobertaTokenizer.from_pretrained("./thwiki-seniorproj-bytebpe-30522", max_len=512)
dataset = TextDataset(tokenizer, root_path=DATA_RAW_EXTRACTED_PATH, block_size=512, overwrite_cache=True)
# dataloader = DataLoader(dataset, batch_size=1, collate_fn=data_collator,
#                         shuffle=True, num_workers=4) # Still cant make more batch size!! Need collate function!

root_path /datadisk/data/raw_data_extraction_v2
>> SocialListeningpantip_post_data.csv_4.txt
root_path /datadisk/data/raw_data_extraction_v2/social_listening/SocialListeningpantip_post_data.csv_4.txt
file_path /datadisk/data/raw_data_extraction_v2/social_listening/SocialListeningpantip_post_data.csv_4.txt
> 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> SocialListeningpantip_post_data.csv_3.txt
root_path /datadisk/data/raw_data_extraction_v2/social_listening/SocialListeningpantip_post_data.csv_3.txt
file_path /datadisk/data/raw_data_extraction_v2/social_listening/SocialListeningpantip_post_data.csv_3.txt


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> SocialListeningpantip_post_data.csv_5.txt
root_path /datadisk/data/raw_data_extraction_v2/social_listening/SocialListeningpantip_post_data.csv_5.txt
file_path /datadisk/data/raw_data_extraction_v2/social_listening/SocialListeningpantip_post_data.csv_5.txt


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> SocialListeningpantip_post_data.csv_1.txt
root_path /datadisk/data/raw_data_extraction_v2/social_listening/SocialListeningpantip_post_data.csv_1.txt
file_path /datadisk/data/raw_data_extraction_v2/social_listening/SocialListeningpantip_post_data.csv_1.txt


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> SocialListeningpantip_post_data.csv_2.txt
root_path /datadisk/data/raw_data_extraction_v2/social_listening/SocialListeningpantip_post_data.csv_2.txt
file_path /datadisk/data/raw_data_extraction_v2/social_listening/SocialListeningpantip_post_data.csv_2.txt
> 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> pantip_275.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_275.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_275.txt
> 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> praew_2.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/praew_2.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/praew_2.txt


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> facebook_3.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/facebook_3.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/facebook_3.txt


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> facebook_27.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/facebook_27.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/facebook_27.txt


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> ch7hd_0.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/ch7hd_0.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/ch7hd_0.txt
> 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> pantip_179.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_179.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_179.txt


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> pantip_202.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_202.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_202.txt


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> ryt9_2.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/ryt9_2.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/ryt9_2.txt
> 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> posttoday_2.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/posttoday_2.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/posttoday_2.txt
> 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> pantip_478.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_478.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_478.txt


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> twitter_4.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/twitter_4.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/twitter_4.txt
> 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> pantip_188.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_188.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_188.txt
> 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> pantip_545.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_545.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_545.txt


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> facebook_33.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/facebook_33.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/facebook_33.txt
> 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> pantip_374.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_374.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_374.txt
> 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> thairath_9.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/thairath_9.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/thairath_9.txt


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> twitter_20.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/twitter_20.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/twitter_20.txt
> 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> pantip_228.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_228.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_228.txt
> 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> pantip_390.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_390.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_390.txt
> 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> pantip_438.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_438.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_438.txt
> 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> pantip_507.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_507.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_507.txt


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> pantip_262.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_262.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_262.txt
> 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> mgronline_20.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/mgronline_20.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/mgronline_20.txt


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>> pantip_37.txt
root_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_37.txt
file_path /datadisk/data/raw_data_extraction_v2/another_website/pantip_37.txt


KeyboardInterrupt: 

In [None]:
dataset

In [27]:
# for i_batch, sample_batched in enumerate(dataloader):
#     print(i_batch, sample_batched)
#     oumodel()

In [28]:
# tokenizer = CharBPETokenizer(vocab_file='vocab.json',merges_file ='merges.txt' )
# no_accent_strip = BertNormalizer(strip_accents=False)
# tokenizer._tokenizer.normalizer = no_accent_strip
# tokenizer._tokenizer.post_processor = BertProcessing(
#     ("</s>", tokenizer.token_to_id("</s>")),
#     ("<s>", tokenizer.token_to_id("<s>")),
# )

# input_ids = torch.tensor(tokenizer.encode(u"สวัสดีครับ ผมชื่อไนท์ ตอนนี้ก็เป็นเวลาที่ผมต้องไปโรงเรียนแล้ว  นี่คือการเว้นวรรคสองทีครับ  จะได้ออกเป็นสอง Spaces").ids).unsqueeze(0)
# print(input_ids)
# outputs = model(input_ids, labels=input_ids)
# print(outputs)
# loss, prediction_scores = outputs[:2]
# print(loss, prediction_scores.shape)

In [29]:
# dataset.__getitem__(1).unsqueeze(0)

In [30]:
# input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1

In [31]:
# %%time
# from transformers import TextDataset, LineByLineTextDataset

# # dataset = LineByLineTextDataset(
# #     tokenizer=pretrain_tokenizer,
# #     file_path="../data/text/AA/wiki_01",
# #     block_size=128,
# # )

# dataset = TextDataset(
#     tokenizer=pretrain_tokenizer,
#     file_path="../data/text/AA/wiki_01",
#     block_size=128,
# )


In [32]:
# one_doc = list(Path("../data/text/AA/").glob("wiki*"))[0].read_text(encoding="utf-8").splitlines()
# tokenizer = Tokenizer.from_file("./thwiki-sentencepiecebpe.tokenizer.json")
# tokenizer.encode_batch(one_doc[:8])

In [33]:
# one_doc = list(Path("../data/text/AA/").glob("wiki*"))[0].read_text(encoding="utf-8").splitlines()
# tokenizer = RobertaTokenizerFast(vocab_file='vocab.json',merges_file ='merges.txt', max_len=512)
# tokenizer.batch_encode_plus(one_doc[:8])

In [34]:
# print(tokenizer.encode_batch(one_doc[:8])[5].tokens)

In [35]:
# one_doc[:8]

In [36]:
from transformers import DataCollatorForLanguageModeling
# tokenizer = RobertaTokenizer(vocab_file='vocab.json',merges_file ='merges.txt', max_len=512)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Transfomers Trainer [link](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py#L133)

```python
class Trainer:
    """
    Trainer is a simple but feature-complete training and eval loop for PyTorch,
    optimized for Transformers.
    Args:
        prediction_loss_only:
            (Optional) in evaluation and prediction, only return the loss
    """
    def __init__(
        self,
        model: PreTrainedModel,
        args: TrainingArguments,
        data_collator: Optional[DataCollator] = None,
        train_dataset: Optional[Dataset] = None,
        eval_dataset: Optional[Dataset] = None,
        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
        prediction_loss_only=False,
        tb_writer: Optional["SummaryWriter"] = None,
        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None,
```

[TrainingArguments](https://github.com/huggingface/transformers/blob/master/src/transformers/training_args.py#L33) is referenced here. 

In [37]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./test_RoBERTa2",
    overwrite_output_dir=True,  #"Use this to continue training if output_dir points to a checkpoint directory."
    
    
    do_train=True, #Whether to run training.
    do_eval=True, #Whether to run eval on the dev set.
#     do_predict=True, # Whether to run predictions on the test set.
    
    num_train_epochs=20, # Total number of training epochs to perform.
    
    
    per_device_train_batch_size=8, # Batch size per GPU/TPU core/CPU for training.
    per_device_eval_batch_size=8, # Batch size per GPU/TPU core/CPU for evaluation.
    
    learning_rate=5e-5,  #The initial learning rate for Adam.
    adam_epsilon=1e-8, #Epsilon for Adam optimizer.
    
    save_steps=10_000,  #Save checkpoint every X updates steps.
    save_total_limit=2, #"Limit the total amount of checkpoints. Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [None]:
%%time
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=20.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=323.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=323.0, style=ProgressStyle(description_wi…

In [None]:
trainer.save_model("./EsperBERTo")

In [None]:
encoded = pretrain_tokenizer.encode(u"สวัสดีครับ ผมชื่อไนท์ ตอนนี้ก็เป็นเวลาที่ผมต้องไปโรงเรียนแล้ว  นี่คือการเว้นวรรคสองทีครับ  จะได้ออกเป็นสอง Spaces")
encoded