In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="1"

from pathlib import Path
import torch

from torch.utils.data import Dataset, DataLoader
from tokenizers import CharBPETokenizer
from tokenizers.processors import BertProcessing
from tokenizers.normalizers import BertNormalizer

from transformers import RobertaTokenizerFast, RobertaTokenizer

import random
from transformers import PreTrainedTokenizer
from transformers import AutoTokenizer
from filelock import FileLock
import logging
import time
import pickle
import multiprocessing

## Load Path file

In [2]:
# ROOT_DIR = "/datadisk/data/raw_data_extraction_v2"
ROOT_DIR = "/workdir/Code/bma_transformer_model/data/raw_data_extraction"
SELECTOR_DIR = ["classification_dataset"]
# SELECTOR_DIR = ["another_website"]

PATH_FILES_SAMPLE = []
for selector_dir in SELECTOR_DIR:
    for root, dirs, files in os.walk(os.path.join(ROOT_DIR, selector_dir)):
        for file in files:
            PATH_FILES_SAMPLE.append(os.path.join(root, file))

In [3]:
PATH_FILES_SAMPLE

['/workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/siamrath_0.txt',
 '/workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/dailynews_0.txt',
 '/workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/prachachat_0.txt',
 '/workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/naewna_0.txt',
 '/workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/springnews_0.txt',
 '/workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/pptv36_0.txt',
 '/workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/prbangkok_0.txt',
 '/workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/thaipbs_0.txt']

## DataLoader (Single Core)
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/siamrath_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/dailynews_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/prachachat_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/naewna_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/springnews_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/pptv36_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/prbangkok_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/thaipbs_0.txt

#### Processing Time: 71.71228837966919

In [8]:
logger = logging.getLogger(__name__)
class TextDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach
    soon.
    """
    def __init__(self, tokenizer: PreTrainedTokenizer, sample_path: [], block_size: int, overwrite_cache=False,):
        # assert os.path.isfile(file_path)
        # For Loop MultiFile
        self.examples = []
        #cached_directory = "/datadisk/cached_data"
        cached_directory = "/workdir/Code/bma_transformer_model/data/cached_data"
        for file_path in sample_path:
            print("file_path", file_path)
            block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)

            directory, filename = os.path.split(file_path)
            cached_features_file = os.path.join(
                cached_directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,),
            )

            # Make sure only the first process in distributed training processes the dataset,
            # and the others will use the cache.
            lock_path = cached_features_file + ".lock"

            with FileLock(lock_path):
                if os.path.exists(cached_features_file) and not overwrite_cache:
                    start = time.time()
                    with open(cached_features_file, "rb") as handle:
                        self.examples = pickle.load(handle)
                    logger.info(
                        f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
                    )

                else:
                    with open(file_path, encoding="utf-8") as f:
                        text = f.read()

                    tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
                    for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
                        self.examples.append(
                            tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
                        )
                    # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
                    # If your dataset is small, first you should loook for a bigger one :-) and second you
                    # can change this behavior by adding (model specific) padding.

                    start = time.time()
                    with open(cached_features_file, "wb") as handle:
                        pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                    logger.info(
                        "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                    )

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
        print("self.examples", self.examples)
        return torch.tensor(self.examples[i], dtype=torch.long)


In [9]:
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
tic = time.time()
dataset = TextDataset(tokenizer, sample_path=PATH_FILES_SAMPLE, block_size=512, overwrite_cache=True)
print(time.time() - tic)

file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/siamrath_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/dailynews_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/prachachat_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/naewna_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/springnews_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/pptv36_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/prbangkok_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/thaipbs_0.txt
71.71228837966919


## Idea (Multi Core)

In [3]:
def f(x):
    return x**2

In [4]:
import multiprocessing
import time
pool = multiprocessing.Pool()
tic = time.time()
num_loop = 10**4
pool.map(f, range(1000))
# print(pool.map(f, range(1000)))
print("TIME MAP", time.time() - tic)

tic = time.time()
for i in range(0, num_loop):
    f(i)
print("TIME LOOP", time.time() - tic)

TIME MAP 0.0020973682403564453
TIME LOOP 0.002908468246459961


## DataLoader (Multi Core)
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/siamrath_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/dailynews_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/prachachat_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/naewna_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/springnews_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/pptv36_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/prbangkok_0.txt
file_path /workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/thaipbs_0.txt

#### Processing Time: 41.26610994338989 >> pool = multiprocessing.Pool(processes=8)

In [13]:
logger = logging.getLogger(__name__)
class TextDatasetParallel(Dataset):
    """
    This will be superseded by a framework-agnostic approach
    soon.
    """         
    def __init__(self, tokenizer: PreTrainedTokenizer, sample_path: [], block_size: int, overwrite_cache=False,):
        # assert os.path.isfile(file_path)
        # For Loop MultiFile
        self.examples = []
        self.sample_path = sample_path
        self.block_size = block_size
        self.overwrite_cache = overwrite_cache
        #print("CPU", multiprocessing.cpu_count())
        pool = multiprocessing.Pool(processes=8)
        self.examples = pool.map(self.load_data_tokenized, self.sample_path)

    def load_data_tokenized(self, file_path):
        # print("TEST_", self.block_size)
        #cached_directory = "/datadisk/cached_data"
        cached_directory = "/workdir/Code/bma_transformer_model/data/cached_data"
        self.block_size = self.block_size - tokenizer.num_special_tokens_to_add(pair=False)

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            cached_directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(self.block_size), filename,),
        )

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):
            if os.path.exists(cached_features_file) and not self.overwrite_cache:
                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
                )
            else:
                with open(file_path, encoding="utf-8") as f:
                    text = f.read()

                tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
                for i in range(0, len(tokenized_text) - self.block_size + 1, self.block_size):  # Truncate in block of block_size
                    #print(">>> APPEND <<<")
                    self.examples.append(
                        tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + self.block_size])
                    )
                # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
                # If your dataset is small, first you should loook for a bigger one :-) and second you
                # can change this behavior by adding (model specific) padding.

                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )
        return self.examples
        
    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
        #print("self.examples", self.examples)
        return torch.tensor(self.examples[i], dtype=torch.long)


## Load Path File

In [18]:
# ROOT_DIR = "/datadisk/data/raw_data_extraction_v2"
ROOT_DIR = "/workdir/Code/bma_transformer_model/data/raw_data_extraction"
SELECTOR_DIR = ["classification_dataset", "another_website"]
# SELECTOR_DIR = ["another_website"]

PATH_FILES_SAMPLE = []
for selector_dir in SELECTOR_DIR:
    for root, dirs, files in os.walk(os.path.join(ROOT_DIR, selector_dir)):
        for file in files:
            PATH_FILES_SAMPLE.append(os.path.join(root, file))

In [19]:
PATH_FILES_SAMPLE

['/workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/siamrath_0.txt',
 '/workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/dailynews_0.txt',
 '/workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/prachachat_0.txt',
 '/workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/naewna_0.txt',
 '/workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/springnews_0.txt',
 '/workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/pptv36_0.txt',
 '/workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/prbangkok_0.txt',
 '/workdir/Code/bma_transformer_model/data/raw_data_extraction/classification_dataset/thaipbs_0.txt']

In [14]:
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
tic = time.time()
dataset = TextDatasetParallel(tokenizer, sample_path=PATH_FILES_SAMPLE, block_size=512, overwrite_cache=True)
print(time.time() - tic)

CPU 8
TEST_TEST_  512512

TEST_ 512
TEST_ 512
TEST_ 512
TEST_ 512
TEST_ 512
TEST_ 512
50.652223348617554


In [15]:
print(dataset.__len__())

8


In [17]:
dataset.__getitem__(2)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



tensor([[    0, 48759, 24107,  ..., 24107,  2469,     2],
        [    0,    17,    46,  ...,  6800,  1437,     2],
        [    0, 24107, 23133,  ...,  4034,     6,     2],
        ...,
        [    0, 24107,  4958,  ..., 24107,  6382,     2],
        [    0, 42348, 10172,  ..., 23133, 24107,     2],
        [    0, 12410, 42348,  ..., 42348,  8384,     2]])

In [9]:
dataset.__getitem__(0)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



tensor([[    0, 24107, 15722,  ..., 18537, 24107,     2],
        [    0,    27, 42348,  ..., 24107,  3726,     2],
        [    0, 42348, 10172,  ..., 42348, 23171,     2],
        ...,
        [    0, 42348,  7471,  ..., 24107, 16948,     2],
        [    0, 24107, 14292,  ...,  3726, 24107,     2],
        [    0,  2469, 24107,  ..., 24107, 15389,     2]])