__How to train a language model__	Notebook to Highlight all the steps to effectively train Transformer model on custom data
https://github.com/huggingface/transformers/tree/master/notebooks

https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb

__Language Modeling__
https://github.com/huggingface/transformers/tree/master/examples/language-modeling
https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_language_modeling.py

In [1]:
import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3,4,5"
# os.environ["CUDA_LAUNCH_BLOCKING"]="1"

from pathlib import Path
import torch

from torch.utils.data import Dataset, DataLoader
from tokenizers import CharBPETokenizer, Tokenizer, ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from tokenizers.normalizers import BertNormalizer
# from tokenizers import SentencePieceBPETokenizer

import random
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast , AutoTokenizer,RobertaTokenizerFast, RobertaTokenizer
from filelock import FileLock
import logging
import time
import tqdm
import pickle
from multiprocessing import Pool
# from concurrent.futures import ProcessPoolExecutor as Pool
from functools import reduce

In [2]:
# device = torch.device("cuda:1")
# device

In [3]:
!nvidia-smi

Tue Aug 25 09:12:16 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:21:01.0 Off |                    0 |
| N/A   31C    P0    52W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  Off  | 00000000:21:02.0 Off |                    0 |
| N/A   33C    P0    54W / 300W |      0MiB / 16160MiB |      0%      Default |
|       

In [4]:
# Check that PyTorch sees it
print(torch.cuda.is_available())
print(torch.cuda.device_count())

True
8


In [5]:
DATA_PATH = Path("/datadisk/data")

# DATA_RAW_PATH = DATA_PATH/"raw"
DATA_RAW_EXTRACTED_PATH = DATA_PATH/"raw_data_extraction_v2"

# Output is in bytes - helper from Pathlib Path https://stackoverflow.com/questions/2104080/how-can-i-check-file-size-in-python
def getStat(prev_value, cur_value):
    if isinstance(prev_value, int):
        return prev_value + cur_value.stat().st_size
    return prev_value.stat().st_size + cur_value.stat().st_size

# 1. The data from thwiki
THWIKI_FOLDER = Path("thwiki-20200601-extracted")
WIKI_FILES = list((DATA_RAW_EXTRACTED_PATH/THWIKI_FOLDER).glob("*.txt"))
list(map(print , WIKI_FILES[:5]))
print(f"thwiki-20200601-extracted Amounts to a total of {reduce(getStat, WIKI_FILES)/1e6:.2f} MB")

# 2. The classification data from jung and ninja
CLASSIFICATION_JUNG_NINJA_FOLDER = Path("classification_dataset")
CLASSIFICATION_FILES = list((DATA_RAW_EXTRACTED_PATH/CLASSIFICATION_JUNG_NINJA_FOLDER).glob("*.txt"))
list(map(print , CLASSIFICATION_FILES[:5]))
print(f"classification_dataset Amounts to a total of {reduce(getStat, CLASSIFICATION_FILES)/1e6:.2f} MB")

# 3. The Data from p'Moo Crawlers
ANOTHER_WEBSITE_MOO_FOLDER = Path("another_website")
ANOTHER_WEBSITE_FILES = list((DATA_RAW_EXTRACTED_PATH/ANOTHER_WEBSITE_MOO_FOLDER).glob("*.txt"))
list(map(print , ANOTHER_WEBSITE_FILES[:5]))
print(f"another_website Amounts to a total of {reduce(getStat, ANOTHER_WEBSITE_FILES)/1e6:.2f} MB")

# 4. Senior Project Files
SENIOR_PROJ_FOLDER = Path("data_lm")
SENIOR_PROJ_FILES = list((DATA_RAW_EXTRACTED_PATH/SENIOR_PROJ_FOLDER).glob("*.txt"))
list(map(print , SENIOR_PROJ_FILES[:5]))
print(f"Senior Project Amounts to a total of {reduce(getStat, SENIOR_PROJ_FILES)/1e6:.2f} MB")

# 5. Guru Crawler Files
GURU_CRAWLER_FOLDER = Path("social_listening")
GURU_CRAWLER_FILES = list((DATA_RAW_EXTRACTED_PATH/GURU_CRAWLER_FOLDER).glob("*.txt"))
list(map(print , GURU_CRAWLER_FILES[:5]))
print(f"GuruCrawler Amounts to a total of {reduce(getStat, GURU_CRAWLER_FILES)/1e6:.2f} MB")

ALL_FILES = WIKI_FILES + CLASSIFICATION_FILES + ANOTHER_WEBSITE_FILES + SENIOR_PROJ_FILES + GURU_CRAWLER_FILES
print(f"\nI have a total of {len(ALL_FILES)} files!")





print(f"Amounts to a total of {reduce(getStat, ALL_FILES)/1e6:.2f} MB")

/datadisk/data/raw_data_extraction_v2/thwiki-20200601-extracted/WikiAA_0.txt
/datadisk/data/raw_data_extraction_v2/thwiki-20200601-extracted/WikiAB_2.txt
/datadisk/data/raw_data_extraction_v2/thwiki-20200601-extracted/WikiAD_2.txt
/datadisk/data/raw_data_extraction_v2/thwiki-20200601-extracted/WikiAD_0.txt
/datadisk/data/raw_data_extraction_v2/thwiki-20200601-extracted/WikiAE_0.txt
thwiki-20200601-extracted Amounts to a total of 566.79 MB
/datadisk/data/raw_data_extraction_v2/classification_dataset/thaipbs_0.txt
/datadisk/data/raw_data_extraction_v2/classification_dataset/naewna_0.txt
/datadisk/data/raw_data_extraction_v2/classification_dataset/dailynews_0.txt
/datadisk/data/raw_data_extraction_v2/classification_dataset/prbangkok_0.txt
/datadisk/data/raw_data_extraction_v2/classification_dataset/pptv36_0.txt
classification_dataset Amounts to a total of 50.79 MB
/datadisk/data/raw_data_extraction_v2/another_website/pantip_275.txt
/datadisk/data/raw_data_extraction_v2/another_website/pra

# Trying out BERT per Notebook 

From __HuggingFace Notebooks__ https://huggingface.co/transformers/notebooks.html: 

How to train a language model	Highlight all the steps to effectively train Transformer model on custom data
- Colab (ipynb) version : https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb
- MD version: https://github.com/huggingface/blog/blob/master/how-to-train.md

Pretrain Longformer	How to build a "long" version of existing pretrained models	Iz Beltagy  
https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb

In [6]:
from transformers import BertForMaskedLM, BertConfig

configuration = BertConfig(
    vocab_size=80000,
#     max_position_embeddings=512, # 512 + 2 more special tokens
#     num_attention_heads=12,
#     num_hidden_layers=12,
#     type_vocab_size=1,
)
# configuration.vocab_size = 20000

model = BertForMaskedLM(config=configuration)
# model = RobertaForMaskedLM.from_pretrained('./Roberta/checkpoint-200000')

# Accessing the model configuration
model.config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 80000
}

In [7]:
model.num_parameters()
# => 102 million parameters

148153472

In [8]:
model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(80000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

# Initializing Tokenizer

## Rewrite Tokenizer of bert_itos_80k with special tokens in front

In [9]:
from senior_project_util import ThaiTokenizer, pre_rules_th, post_rules_th
from fastai.text.transform import BaseTokenizer, Tokenizer, Vocab
from fastai.text.data import TokenizeProcessor, NumericalizeProcessor

TOK_PATH = Path('./senior_proj_itos')

max_vocab = 80000

BOS,EOS,FLD,UNK,PAD = 'xxbos','xxeos','xxfld','xxunk','xxpad'
TK_REP,TK_WREP, TK_NUM, TK_LAUGH = 'xxrep','xxwrep', 'xxnum', 'xxlaugh'
text_spec_tok = [UNK,PAD,BOS,EOS,FLD,TK_REP,TK_WREP, TK_NUM, TK_LAUGH]

In [10]:
import fastai
print(fastai.__version__)

1.0.61


In [11]:
# with open(TOK_PATH/"bert_itos_80k.pkl", 'rb') as f:
#     itos = pickle.load(f)
# len(itos)

In [12]:
# for o in reversed(text_spec_tok):
#     if o in itos: itos.remove(o)
#     itos.insert(0, o)

In [13]:
# itos = itos[:max_vocab]

In [14]:
# with open(TOK_PATH/"bert_itos_80k_cleaned.pkl", 'wb') as f:
#     pickle.dump(itos, f, pickle.HIGHEST_PROTOCOL)
# print(f"Successfully written vocabulary itos in {TOK_PATH/'bert_itos_80k_cleaned.pkl'}")

In [15]:
with open(TOK_PATH/"bert_itos_80k_cleaned.pkl", 'rb') as f:
    itos = pickle.load(f)
len(itos)

80000

In [16]:
vocab = Vocab(itos)
vocab.__getstate__()

{'itos': ['xxunk',
  'xxpad',
  'xxbos',
  'xxeos',
  'xxfld',
  'xxrep',
  'xxwrep',
  'xxnum',
  'xxlaugh',
  ' ',
  'ที่',
  'ก็',
  'เรา',
  'จะ',
  'ไป',
  'ครับ',
  'มี',
  'มา',
  'ได้',
  'แต่',
  'ว่า',
  'เป็น',
  'เลย',
  'ค่ะ',
  'ไม่',
  'ผม',
  'แล้ว',
  'และ',
  'ให้',
  'ๆ',
  'ใน',
  'ของ',
  'คน',
  'กับ',
  '(',
  ')',
  'หรือ',
  'มัน',
  'นี้',
  'กัน',
  'มาก',
  'อยาก',
  'คือ',
  'ต้อง',
  'ด้วย',
  'อยู่',
  'ทำ',
  '.',
  'เขา',
  '-',
  'จาก',
  'ถ้า',
  'เพราะ',
  'อะไร',
  '3',
  'ไม่ได้',
  'เค้า',
  'คะ',
  'แบบ',
  'ยัง',
  '"',
  'ดี',
  'เรื่อง',
  'ดู',
  'กว่า',
  'ใช้',
  'นะ',
  'บ้าง',
  'อีก',
  'พอ',
  'บาท',
  'ไหม',
  'เพื่อน',
  'ขอ',
  'ใคร',
  'ไหน',
  'นะคะ',
  'บอก',
  'เอา',
  'ซื้อ',
  'แฟน',
  ':',
  'ปี',
  'ถึง',
  'การ',
  '/',
  'ชอบ',
  '?',
  'ตัว',
  'ตอนนี้',
  'ช่วย',
  'นั้น',
  'หน่อย',
  'ยังไง',
  'วัน',
  'แค่',
  'ราคา',
  'ซึ่ง',
  'พี่',
  'ขึ้น',
  '5',
  'ขอบคุณ',
  'เคย',
  'คุณ',
  'ไม่มี',
  ',',
  'เวลา',
  'เห็น

In [17]:
text='ม่ายเอาเปงไงบ้างน่ารักจุงเบย'
pyThai_tt = ThaiTokenizer()
a = pyThai_tt.tokenizer(text)
a

['ม่าย', 'เอา', 'เปง', 'ไง', 'บ้าง', 'น่ารัก', 'จุงเบย']

In [18]:
tt = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', pre_rules = pre_rules_th, post_rules=post_rules_th, n_cpus=1)
test_sample = tt._process_all_1([text[:1000]])
print(test_sample)
test_sample = [vocab.numericalize(seq) for seq in test_sample]
print(test_sample)

[['ไม่', 'เอา', 'เปง', 'ไง', 'บ้าง', 'น่ารัก', 'จังเลย']]
[[24, 78, 12028, 241, 67, 464, 3080]]


In [19]:
vocab.numericalize(["asdw9eiqpwoied"]) #UNK

[0]

In [20]:
# from transformers import RobertaTokenizer

# tokenizer = RobertaTokenizer.from_pretrained("./all-data-bytebpe-20000", max_len=512)
# tokenizer.__class__.__name__

In [21]:
class CustomSeniorProjectTokenizer(object):
    def __init__(self, TOK_PATH = Path('./senior_proj_itos'), BOS='xxbos', EOS='xxeos', FLD = 'xxfld', UNK='xxunk', PAD='xxpad',
                 TK_REP='xxrep', TK_WREP='xxwrep', TK_NUM='xxnum', TK_LAUGH='xxlaugh', n_cpus=1,
                ):
        from senior_project_util import ThaiTokenizer, pre_rules_th, post_rules_th
        from fastai.text.transform import BaseTokenizer, Tokenizer, Vocab
        from fastai.text.data import TokenizeProcessor, NumericalizeProcessor

        with open(TOK_PATH/"bert_itos_80k_cleaned.pkl", 'rb') as f:
            itos = pickle.load(f)
            
        self.vocab = Vocab(itos)
        self.tokenizer = Tokenizer(tok_func = ThaiTokenizer, lang = 'th', 
                                   pre_rules = pre_rules_th, post_rules=post_rules_th, n_cpus=n_cpus)
        
        self.cls_token_id = self.vocab.stoi[BOS]
        self.sep_token_id = self.vocab.stoi[EOS]
        self.pad_token_id = self.vocab.stoi[PAD]
        
        self.mask_token = FLD  #SINCE THIS ONE IS NOT USED, and INSIDE SPECIAL TOKEN....
        self._pad_token = PAD
        
#         tokenizer_processor = TokenizeProcessor(tokenizer=tt, chunksize=300000, mark_fields=False)
#         numbericalize_processor = NumericalizeProcessor(vocab=vocab)
        
    def num_special_tokens_to_add(self, pair=False):
        return 2
    def tokenize(self, text):
        return self.tokenizer._process_all_1([text])[0]
#         return self.tokenizer.process_all([text])[0]
    
    def convert_tokens_to_ids(self, token_list):
        #From https://huggingface.co/transformers/_modules/transformers/tokenization_utils_fast.html#PreTrainedTokenizerFast.convert_tokens_to_ids
        if token_list is None:
            return None

        if isinstance(token_list, str):
            return self.vocab.numericalize([token_list])[0]
        
        return self.vocab.numericalize(token_list)
    
    def build_inputs_with_special_tokens(self, token_list):
        # From https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_bert.py#L235
        return [self.cls_token_id] + token_list + [self.sep_token_id]
    
    def get_special_tokens_mask(
        self, token_ids_0, token_ids_1 = None, already_has_special_tokens = False
    ):
        # From https://huggingface.co/transformers/_modules/transformers/tokenization_utils.html#PreTrainedTokenizer.get_special_tokens_mask
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formated with
                special tokens for the model

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
    
    def __len__(self):
        #https://huggingface.co/transformers/_modules/transformers/tokenization_utils_fast.html#PreTrainedTokenizerFast.__len__
        return len(self.vocab.itos)

In [22]:
text = "ใครเคยมีแฟนที่กินอาหารไม่ถูกปากกันแล้วรู้สึกเสียความสุขไปอย่างนึงบ้างมั้ยครับ  ก่อนอื่นผมต้องบอกก่อนเลยว่าคนเราจะเลือกกินอาหารแบบไหนชอบแบบไหนเป็นเรื่องของความชอบส่วนตัวนะครับทุกคนมีสิทธิในการเลือกของที่ชอบและไม่ชอบอยู่แล้ว แต่ผมรู้สึกว่าตอนนี้ผมกำลังประสบปัญหาที่ดูเหมือนจะเล็กแต่กลายเป็นว่ามันค่อนข้างใหญ่ ผมคบกับแฟนมา6ปีแล้วครับ ผมเป็นคนชอบกินอาหารญี่ปุ่นและปลาดิบแต่แฟนผมไม่กินปลาดิบเลย ผมอยากกินบุฟเฟ่เนื้อแต่แฟนผมก็ไม่กินเนื้อ เราเลยไม่ได้เข้าทานร้านบุฟเฟ่เนื้อและบุฟเฟ่อาหารญี่ปุ่นกันเพราะรู้สึกลัวแฟนผมทานไม่คุ้ม และเรื่องใหญ่เลยคือผมเป็นคนชอบทานอาหารรสจัดและรสเผ็ดมาก แต่แฟนผมทานเผ็ดไม่ได้เลยเวลาเราไปกินส้มตำกันก็จะสั่ง ส้มตำไม่ใส่พริก ต้มแซ่บไม่ใส่พริก ลาบไม่ใส่พริก ร้านกับข้าวอื่นๆก็เช่นกันแฟนผมจะไม่ชอบกินผักไม่ค่อยสั่งกับข้าวที่เป็นผักแล้วผมชอบผักบุ้งทอดกรอบ เห็ดหอมสดทอดมาก แต่ก็ไม่ได้สั่งเพราะว่าเธอไม่กินถึงเค้าจะบอกให้สั่งเลยๆก็เถอะแต่ผมก็ยังเกรงใจเธออยู่ดีอ่ะครับ ผมรู้สึกกินอาหารไม่มีความสุขเลยชีวิตผมขาดรสเผ็ดไปเหมือนจะขาดใจเหมือนมันทำให้ขาดความสุขไปอย่างนึงเลยอ่ะครับ ยิ่งถ้าเราแต่งงานกันแล้วผมก็อาจจะต้องมีปัญหาเรื่องนี้มากขึ้น พอผมเห็นคู่ที่ชอบทานอาหารเหมือนๆกันเห็นเค้ากินอาหารกันอย่างมีความสุขแล้วผมรู้สึกอิจฉามากๆเลย มีใครเคยมีปัญหาแบบผมมั้ยครับแล้วจะแก้ปัญหานี้ยังไงดีครับ"
tokenizer = CustomSeniorProjectTokenizer()
print(tokenizer.num_special_tokens_to_add(pair=False))
print(tokenizer.__class__.__name__)
value = tokenizer.tokenize(text)
print(value)
value = tokenizer.convert_tokens_to_ids(value)
print(value)
value = tokenizer.build_inputs_with_special_tokens(value)
print(value)

2
CustomSeniorProjectTokenizer
['ใคร', 'เคย', 'มี', 'แฟน', 'ที่', 'กิน', 'อาหาร', 'ไม่ถูกปาก', 'กัน', 'แล้ว', 'รู้สึก', 'เสีย', 'ความสุข', 'ไป', 'อย่าง', 'นึง', 'บ้าง', 'มั้ย', 'ครับ', ' ', 'ก่อนอื่น', 'ผม', 'ต้อง', 'บอก', 'ก่อน', 'เลย', 'ว่า', 'คนเรา', 'จะ', 'เลือก', 'กิน', 'อาหาร', 'แบบ', 'ไหน', 'ชอบ', 'แบบ', 'ไหน', 'เป็นเรื่อง', 'ของ', 'ความชอบ', 'ส่วนตัว', 'นะ', 'ครับ', 'ทุกคน', 'มี', 'สิทธิ', 'ใน', 'การเลือก', 'ของที่ชอบ', 'และ', 'ไม่ชอบ', 'อยู่แล้ว', ' ', 'แต่', 'ผม', 'รู้สึก', 'ว่า', 'ตอนนี้', 'ผม', 'กำลัง', 'ประสบปัญหา', 'ที่', 'ดูเหมือน', 'จะ', 'เล็ก', 'แต่', 'กลายเป็น', 'ว่า', 'มัน', 'ค่อนข้าง', 'ใหญ่', ' ', 'ผม', 'คบ', 'กับ', 'แฟน', 'มา', 'xxnum', ' ', 'ปี', 'แล้ว', 'ครับ', ' ', 'ผม', 'เป็น', 'คน', 'ชอบ', 'กิน', 'อาหาร', 'ญี่ปุ่น', 'และ', 'ปลาดิบ', 'แต่', 'แฟน', 'ผม', 'ไม่', 'กิน', 'ปลาดิบ', 'เลย', ' ', 'ผม', 'อยากกิน', 'บุ', 'ฟเฟ่', 'เนื้อ', 'แต่', 'แฟน', 'ผม', 'ก็', 'ไม่', 'กิน', 'เนื้อ', ' ', 'เรา', 'เลย', 'ไม่ได้', 'เข้า', 'ทาน', 'ร้าน', 'บุ', 'ฟเฟ่', 'เนื้อ', 'และ', 'บุ

Constructing tokenizer wrapper based on [@theblackcat102 #259](https://github.com/huggingface/tokenizers/issues/259#issuecomment-625905930)

# Building our dataset

Build it with `from torch.utils.data.dataset import Dataset` just like [TextDataset](https://github.com/huggingface/transformers/blob/448c467256332e4be8c122a159b482c1ef039b98/src/transformers/data/datasets/language_modeling.py) and [LineByLineTextDataset](https://github.com/huggingface/transformers/blob/448c467256332e4be8c122a159b482c1ef039b98/src/transformers/data/datasets/language_modeling.py#L78)

Note: Training with multiple files is currently not supported [issue/3445](https://github.com/huggingface/transformers/issues/3445)

padding documentation [link](https://github.com/huggingface/tokenizers/blob/master/bindings/python/tokenizers/implementations/base_tokenizer.py#L52)

Potential Improvements
- การทำให้ Dataset นั้น dynamically tokenize + dynamically open file : ตอนนี้เวลาทำ Dataset จาก torch.utils.data.dataset จะทำการ tokenize เลยตอนอยู่ใน constructor  , กำลังคิดว่าถ้าเกิดว่า Data ใหญ่มากๆ อาจจะไม่เหมาะสมกับการทำแบบนี้  เพราะว่า Ram จะต้องมีขนาดเท่าๆกับ data ที่เราใส่เข้าไป  ซึ่งเป็นไปได้ยากหาก Data มีขนาดใหญ่มากๆ   ผมได้ทำการ Search ดูแล้วก็พบว่าจาก Discussion Forum ของ Pytorch: https://discuss.pytorch.org/t/how-to-use-a-huge-line-corpus-text-with-dataset-dataloader/30872 
Option1: ใช้ pd.Dataframe ในการเปิด File แบบ small chunks of data https://discuss.pytorch.org/t/data-processing-as-a-batch-way/14154/4?u=ptrblck
Option2: ใช้ byte Offsets จากไฟล์ใหญ่ๆเพื่อที่จะ lookup .seek(): https://github.com/pytorch/text/issues/130#issuecomment-510412877
More Examples: https://github.com/pytorch/text/blob/master/torchtext/datasets/unsupervised_learning.py , https://github.com/pytorch/text/blob/a5880a3da7928dd7dd529507eec943a307204de7/examples/text_classification/iterable_train.py#L169-L214

In [23]:
logger = logging.getLogger(__name__)
class TextDatasetParallel(Dataset):
    """
    This will be superseded by a framework-agnostic approach
    soon.
    """         
    def __init__(self, tokenizer: PreTrainedTokenizer, sample_path: [], block_size: int, overwrite_cache=False,
                num_processes=8, cached_directory = "/workdir/Code/bma_transformer_model/data/cached_data"):
        # assert os.path.isfile(file_path)
        # For Loop MultiFile
        self.examples = []
        self.sample_path = sample_path
#         print(f"THIS IS SAMPLE PATH {sample_path}")
        self.tokenizer = tokenizer
        
        # Set block size to be the blocksize-special tokens
        self.block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)
        
        self.overwrite_cache = overwrite_cache
        self.cached_directory = cached_directory
        if not os.path.exists(cached_directory):
            os.makedirs(cached_directory)
        
        # Multiprocess for getting examples
        with Pool(processes=num_processes) as p:
            self.examples = list(tqdm.tqdm(p.imap(self.load_data_tokenized, self.sample_path), total=len(self.sample_path)))
#         with Pool(max_workers=num_processes) as p:
#             self.examples = list(tqdm.tqdm(p.map(self.load_data_tokenized, self.sample_path), total=len(self.sample_path)))
#         for path in tqdm.tqdm(self.sample_path):
#             self.examples.append(self.load_data_tokenized(path))
        
        
        # Convert from 3d list to 2d 
        # self.examples from [[[3], [4]], [[5], [6]], [[7], [8]]] => [[3], [4], [5], [6], [7], [8]]
        self.examples = [each_batch for each_file in self.examples for each_batch in each_file]
        

    def load_data_tokenized(self, file_path):
#         print(f"I AM DOING {file_path}")
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            self.cached_directory, f"cached_lm_{tokenizer.__class__.__name__}_{str(self.block_size)}_{filename}",
        )

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):
            if os.path.exists(cached_features_file) and not self.overwrite_cache:
                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    temp_examples = pickle.load(handle)
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
                )
#                 print(f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start)
            else:
                temp_examples = []
                with open(file_path, encoding="utf-8") as f:
                    text = f.read()
#                 print("I finished reading ", file_path)
                tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
#                 print("I finished tokenizing ", file_path)
                for i in range(0, len(tokenized_text) - self.block_size + 1, self.block_size):  # Truncate in block of block_size
                    temp_examples.append(
                        tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + self.block_size])
                    )
#                     if i%20 == 0:
#                         print("I finished special tok ", file_path)
#                 print("I finished every tokenizing ", file_path)
                # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
                # If your dataset is small, first you should loook for a bigger one :-) and second you
                # can change this behavior by adding (model specific) padding.

                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(temp_examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )
        return temp_examples
        
    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
        return torch.tensor(self.examples[i], dtype=torch.long)


In [24]:
# with open(ALL_FILES[0], encoding="utf-8") as f:
#     text = f.read()
# len(text)/6

Tokenizer from Pretrained copied from SimpleTransformers [link](https://github.com/ThilinaRajapakse/simpletransformers/blob/master/simpletransformers/language_modeling/language_modeling_model.py)

In [25]:
%%time
# logging.basicConfig(level=logging.WARN)
tokenizer = CustomSeniorProjectTokenizer()
dataset = TextDatasetParallel(tokenizer, 
                              sample_path=list(map(str, ALL_FILES)), 
#                               sample_path=list(map(str, GURU_CRAWLER_FILES)), 
                              block_size=512, 
                              cached_directory= "/workdir/cached_data_senior",
                              overwrite_cache=False, # make sure this is false when you have cache!!
                              num_processes=60,
                             )

100%|██████████| 1409/1409 [01:29<00:00, 15.68it/s]


CPU times: user 1min 12s, sys: 20.8 s, total: 1min 32s
Wall time: 1min 31s


In [26]:
# cached_directory= "/workdir/cached_data"
# def load_data_tokenized(file_path):
#     directory, filename = os.path.split(file_path)
#     cached_features_file = os.path.join(
#         cached_directory, f"cached_lm_something_{str(123)}_{filename}",
#     )
#     temp_examples = []
#     with open(file_path, encoding="utf-8") as f:
#         text = f.read()
#     print("I finished reading ", file_path)
#     tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[:1000]))
#     print("I finished tokenizing ", tokenized_text[:100])
#     return f"{file_path}+123"
# # Multiprocess for getting examples
# with Pool(processes=2) as p:
#     examples = list(tqdm.tqdm(p.imap(load_data_tokenized, list(map(str, GURU_CRAWLER_FILES))), total=len(list(map(str, GURU_CRAWLER_FILES)))))
# print(examples)


In [27]:
dataset.__len__()

1537686

In [28]:
len(dataset.__getitem__(0))

512

In [29]:
dataset.__getitem__(0)

tensor([    2, 28125,     9, 14966,     9,  5799,     9,    36,   247,  4240,
           20,     0,     9,    21,     0,    10,    21, 10444,    30,  4957,
        15773,     9,   347,   310, 37854,  5532,     9,  2720,   714,     7,
            9,   714,    97,   327,   310,  1621, 17698,   270,     9,     7,
            9,  1621,    50,  1566,  7302,  1003,     9,  2504,     9,  1894,
         9041,     9,  1127,  1041, 48540,     9,    27,     0,  5708,    31,
          409,    42, 22118,   177,   455,   249,  4636,  1175,    42,  3385,
         1296,  9041,     9,   950,  4875,    31, 21850, 22118,  5799,    16,
        17646,   366,    33,  1210,   642,  1003, 26270,     9,  1210, 20640,
        18700, 26200,     9,    27,  1210,  1561,   280,  1160,  2031,   119,
        50601,     9,  1720, 16622, 14964,  2166,  2885, 27984,     9,     0,
        26200,     9,  3200,    27,  5886, 28756,     9,    27,  1972, 26270,
            9, 14966,     9,  5799,   347,   310,  2311,  9955, 

In [30]:
list(map(lambda x: itos[x], dataset.__getitem__(0).tolist()))

['xxbos',
 'ประเทศฟิลิปปินส์',
 ' ',
 '\n',
 ' ',
 'ฟิลิปปินส์',
 ' ',
 'หรือ',
 'ชื่อ',
 'ทางการ',
 'ว่า',
 'xxunk',
 ' ',
 'เป็น',
 'xxunk',
 'ที่',
 'เป็น',
 'หมู่เกาะ',
 'ใน',
 'ภูมิภาค',
 'เอเชียตะวันออกเฉียงใต้',
 ' ',
 'ตั้ง',
 'อยู่ใน',
 'มหาสมุทรแปซิฟิก',
 'ตะวันตก',
 ' ',
 'ประกอบด้วย',
 'เกาะ',
 'xxnum',
 ' ',
 'เกาะ',
 'ซึ่ง',
 'จัด',
 'อยู่ใน',
 'เขต',
 'ภูมิศาสตร์',
 'ใหญ่',
 ' ',
 'xxnum',
 ' ',
 'เขต',
 'จาก',
 'เหนือ',
 'จรด',
 'ไต้',
 ' ',
 'ได้แก่',
 ' ',
 'ลู',
 'ซอน',
 ' ',
 'วิ',
 'ซา',
 'ยัส',
 ' ',
 'และ',
 'xxunk',
 'เมืองหลวง',
 'ของ',
 'ประเทศ',
 'คือ',
 'มะนิลา',
 'ส่วน',
 'เมือง',
 'ที่มี',
 'ประชากร',
 'มากที่สุด',
 'คือ',
 'นคร',
 'เก',
 'ซอน',
 ' ',
 'ทั้งสอง',
 'เป็นส่วนหนึ่ง',
 'ของ',
 'เมโทร',
 'มะนิลา',
 'ฟิลิปปินส์',
 'มี',
 'อาณาเขต',
 'ติดต่อ',
 'กับ',
 'ทะเล',
 'จีน',
 'ไต้',
 'ทางทิศตะวันตก',
 ' ',
 'ทะเล',
 'ฟิลิป',
 'ปิน',
 'ทางทิศตะวันออก',
 ' ',
 'และ',
 'ทะเล',
 'เซ',
 'เล',
 'บี',
 'ส',
 'ทาง',
 'ทิศตะวันตกเฉียงใต้',
 ' ',
 'โดยมี',
 'พรมแ

In [31]:
# %%time
# print(text[:1000])
# print(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[:1000])))

__For text[:100000]__  

Rust implementation
>

Python
>CPU times: user 900 ms, sys: 4 ms, total: 904 ms
Wall time: 903 ms

In [32]:
# %%time
# tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[:100000]))
# None

__For text[:1000000]__  

Rust implementation
>

Python
>CPU times: user 7.27 s, sys: 40 ms, total: 7.31 s
Wall time: 7.31 s

In [33]:
# %%time
# tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[:1000000]))
# None

__For text[:3000000]__  

Rust implementation
>CPU times: user 6.38 s, sys: 328 ms, total: 6.7 s  
Wall time: 5.18 s

Python
>CPU times: user 15.5 s, sys: 72 ms, total: 15.6 s  
Wall time: 15.6 s

In [34]:
# %%time
# tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[:3000000]))
# None

__For text[:8000000]__  

Rust implementation
>

Python
>CPU times: user 36.1 s, sys: 340 ms, total: 36.4 s  
Wall time: 36.4 s

In [35]:
# %%time
# tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[:8000000]))
# None

In [36]:
# for i_batch, sample_batched in enumerate(dataloader):
#     print(i_batch, sample_batched)
#     oumodel()

In [37]:
# tokenizer = CharBPETokenizer(vocab_file='vocab.json',merges_file ='merges.txt' )
# no_accent_strip = BertNormalizer(strip_accents=False)
# tokenizer._tokenizer.normalizer = no_accent_strip
# tokenizer._tokenizer.post_processor = BertProcessing(
#     ("</s>", tokenizer.token_to_id("</s>")),
#     ("<s>", tokenizer.token_to_id("<s>")),
# )

# input_ids = torch.tensor(tokenizer.encode(u"สวัสดีครับ ผมชื่อไนท์ ตอนนี้ก็เป็นเวลาที่ผมต้องไปโรงเรียนแล้ว  นี่คือการเว้นวรรคสองทีครับ  จะได้ออกเป็นสอง Spaces").ids).unsqueeze(0)
# print(input_ids)
# outputs = model(input_ids, labels=input_ids)
# print(outputs)
# loss, prediction_scores = outputs[:2]
# print(loss, prediction_scores.shape)

In [38]:
# dataset.__getitem__(1).unsqueeze(0)

In [39]:
# input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1

In [40]:
# %%time
# from transformers import TextDataset, LineByLineTextDataset

# # dataset = LineByLineTextDataset(
# #     tokenizer=pretrain_tokenizer,
# #     file_path="../data/text/AA/wiki_01",
# #     block_size=128,
# # )

# dataset = TextDataset(
#     tokenizer=pretrain_tokenizer,
#     file_path="../data/text/AA/wiki_01",
#     block_size=128,
# )


In [41]:
# one_doc = list(Path("../data/text/AA/").glob("wiki*"))[0].read_text(encoding="utf-8").splitlines()
# tokenizer = Tokenizer.from_file("./thwiki-sentencepiecebpe.tokenizer.json")
# tokenizer.encode_batch(one_doc[:8])

In [42]:
# one_doc = list(Path("../data/text/AA/").glob("wiki*"))[0].read_text(encoding="utf-8").splitlines()
# tokenizer = RobertaTokenizerFast(vocab_file='vocab.json',merges_file ='merges.txt', max_len=512)
# tokenizer.batch_encode_plus(one_doc[:8])

In [43]:
# print(tokenizer.encode_batch(one_doc[:8])[5].tokens)

In [44]:
# one_doc[:8]

In [45]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Transfomers Trainer [link](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py#L133)

```python
class Trainer:
    """
    Trainer is a simple but feature-complete training and eval loop for PyTorch,
    optimized for Transformers.
    Args:
        prediction_loss_only:
            (Optional) in evaluation and prediction, only return the loss
    """
    def __init__(
        self,
        model: PreTrainedModel,
        args: TrainingArguments,
        data_collator: Optional[DataCollator] = None,
        train_dataset: Optional[Dataset] = None,
        eval_dataset: Optional[Dataset] = None,
        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
        prediction_loss_only=False,
        tb_writer: Optional["SummaryWriter"] = None,
        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None,
```

[TrainingArguments](https://github.com/huggingface/transformers/blob/master/src/transformers/training_args.py#L33) is referenced here. 

In [46]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./OriginalBert",
    overwrite_output_dir=False,  #"Use this to continue training if output_dir points to a checkpoint directory."
    
    fp16=True,
    fp16_opt_level='O1',
    
    
    do_train=True, #Whether to run training.
#     do_eval=True, #Whether to run eval on the dev set.
#     do_predict=True, # Whether to run predictions on the test set.
    
    num_train_epochs=200, # Total number of training epochs to perform.
    
    
    per_device_train_batch_size=6, # Batch size per GPU/TPU core/CPU for training.
#     per_device_eval_batch_size=256, # Batch size per GPU/TPU core/CPU for evaluation.
    
    learning_rate=5e-5,  #The initial learning rate for Adam.
    weight_decay=0.0,
    max_grad_norm=1.0,
    adam_epsilon=1e-8, #Epsilon for Adam optimizer.
    
    #Logging
#     logging_dir='', default_logdir -> return os.path.join("runs", current_time + "_" + socket.gethostname())
    logging_first_step= True,
    logging_steps = 500,
    
    save_steps=10_000,  #Save checkpoint every X updates steps.
    save_total_limit=2, #"Limit the total amount of checkpoints. Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints
    
    seed=42,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
#     eval_dataset=val_dataset
)

Note : This is why the GPU 1 is having all the load, and this is how it can be mitigated, and how to migrate to distributed parallel https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255

In [None]:
%%time
trainer.train()

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=200.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=32036.0, style=ProgressStyle(description_…



In [None]:
trainer.save_model("./OriginalBert_Final")

In [None]:
tokenizer.convert_tokens_to_ids([tokenizer.mask_token])

In [None]:
tokenizer.convert_tokens_to_ids(tokenizer.mask_token)