In [5]:
from datasets import load_dataset, load_from_disk
from pathlib import Path
from transformers import DistilBertTokenizerFast
from typing import List
import numpy as np

In [2]:
# Project root, taken as the parent of the current working directory
# (assumes the kernel was started in a direct subdirectory of the project — TODO confirm).
project_base_dir = Path.cwd().parent

In [3]:
# Name of the pretrained checkpoint whose tokenizer is loaded below.
pretrained_model_name = "distilbert-base-cased"
# Fast DistilBERT tokenizer for that checkpoint; reused by the tokenization step later on.
tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_model_name)

In [9]:
# Raw HDFS1 log file, located at <project>/data/raw/HDFS1/HDFS.log.
hdfs1_path = project_base_dir / 'data' / 'raw' / 'HDFS1' / 'HDFS.log'
# Load the file with the 'text' dataset builder and take its 'train' split
# (presumably one example per log line, stored in a 'text' column — TODO confirm).
hdfs1_dataset = load_dataset('text', data_files=str(hdfs1_path), split='train')

Using custom data configuration default-f7d20bad4b8d075b
Reusing dataset text (/home/cernypro/.cache/huggingface/datasets/text/default-f7d20bad4b8d075b/0.0.0/44d63bd03e7e554f16131765a251f2d8333a5fe8a73f6ea3de012dbc49443691)


In [10]:
def remove_timestamp(example):
    """Strip the first three space-separated fields from ``example['text']``.

    Fields are delimited by single space characters; everything after the
    third space is kept verbatim. Going by the function name these leading
    fields are the timestamp prefix of the log line (presumably date, time
    and a numeric id — TODO confirm against the raw file).

    Lines containing fewer than three spaces are left untouched. The earlier
    nested ``str.find`` implementation was inconsistent here: it dropped the
    first field of a line with exactly one space, yet kept lines with zero or
    two spaces intact.

    Args:
        example: mapping with a string stored under ``'text'``.

    Returns:
        The same ``example`` object, with ``'text'`` updated in place.
    """
    text = example['text']
    # maxsplit=3 yields at most four pieces; the fourth one is exactly the
    # remainder of the line after the third space.
    pieces = text.split(' ', 3)
    if len(pieces) == 4:
        example['text'] = pieces[3]
    return example

# Apply remove_timestamp to every example of the raw dataset.
cleaned_dataset = hdfs1_dataset.map(remove_timestamp)

Loading cached processed dataset at /home/cernypro/.cache/huggingface/datasets/text/default-f7d20bad4b8d075b/0.0.0/44d63bd03e7e554f16131765a251f2d8333a5fe8a73f6ea3de012dbc49443691/cache-3f531662155381a4.arrow


In [11]:
# Persist the timestamp-free dataset under <project>/data/interim/HDFS1_no_timestamp.
cleaned_hdfs1_path = project_base_dir / 'data' / 'interim' / 'HDFS1_no_timestamp'
cleaned_dataset.save_to_disk(cleaned_hdfs1_path)

In [12]:
def tokenize_no_special_tokens(examples, tokenizer):
    """Tokenize ``examples['text']`` into token ids without special tokens.

    The tokenizer is invoked with truncation enabled and attention masks
    switched off; only the ``'input_ids'`` entry of its result is kept.

    Args:
        examples: batch mapping providing the texts under ``'text'``.
        tokenizer: callable tokenizer returning a mapping with ``'input_ids'``.

    Returns:
        A dict with a single ``'tokens'`` entry holding the input ids.
    """
    encoded = tokenizer(
        examples['text'],
        add_special_tokens=False,
        truncation=True,
        return_attention_mask=False,
    )
    return {'tokens': encoded['input_ids']}
# Tokenize the cleaned log lines in batches of 10000, handing the shared tokenizer
# to the mapped function through fn_kwargs.
purely_tokenized = cleaned_dataset.map(
    tokenize_no_special_tokens,
    fn_kwargs={'tokenizer': tokenizer},
    batched=True,
    batch_size=10000,
)

Loading cached processed dataset at /home/cernypro/.cache/huggingface/datasets/text/default-f7d20bad4b8d075b/0.0.0/44d63bd03e7e554f16131765a251f2d8333a5fe8a73f6ea3de012dbc49443691/cache-c9d842d71543c5b3.arrow


In [13]:
# Persist the tokenized dataset under <project>/data/interim/HDFS1_tokenized_no_special_tokens.
pure_tokenized_hdfs1_path = project_base_dir / 'data' / 'interim' / 'HDFS1_tokenized_no_special_tokens'
purely_tokenized.save_to_disk(pure_tokenized_hdfs1_path)

In [14]:
def chunkify(examples):
    """Collapse a whole batch into a single row.

    Args:
        examples: batch mapping whose ``'tokens'`` entry holds one item per
            row of the batch.

    Returns:
        A dict with one ``'chunk'`` column containing exactly one row: the
        complete ``'tokens'`` list of the batch.
    """
    batch_tokens = examples['tokens']
    single_row = [batch_tokens]
    return {"chunk": single_row}
# Group the tokenized lines into chunks: each batch of 10 rows is turned into one
# 'chunk' row by chunkify, the original columns are removed, and an incomplete
# final batch is discarded (drop_last_batch=True).
# NOTE(review): with num_proc=4 the batching presumably happens per worker shard, so
# chunk boundaries and the dropped remainders may depend on the shard split — confirm.
chunked_size_10 = purely_tokenized.map(chunkify,
                                       batched=True,
                                       batch_size=10,
                                       drop_last_batch=True,
                                       remove_columns=purely_tokenized.column_names,
                                       num_proc=4)







In [15]:
# Persist the chunked dataset under <project>/data/interim/HDFS1_tokenized_chunked_size_10.
chunked_10_hdfs1_path = project_base_dir / 'data' / 'interim' / 'HDFS1_tokenized_chunked_size_10'
chunked_size_10.save_to_disk(chunked_10_hdfs1_path)

In [6]:
# Reload the chunked dataset from the same interim directory it was saved to,
# rebinding chunked_size_10 to the on-disk copy.
chunked_size_10 = load_from_disk(str(project_base_dir / 'data' / 'interim' / 'HDFS1_tokenized_chunked_size_10'))

In [8]:
# NOTE(review): hard-coded absolute, user-specific path; the value is only displayed,
# never assigned — consider deriving it from project_base_dir or removing this cell.
Path('/home/cernypro/dev/source/ml4logs/data/processed')

PosixPath('/home/cernypro/dev/source/ml4logs/data/processed')

In [7]:
# Display the dataset summary (feature names and row count).
chunked_size_10

Dataset({
    features: ['chunk'],
    num_rows: 1117560
})

In [9]:
def create_target_and_flat_context(context: List[List[int]], rnd: np.random.Generator, remove_target_prob:float):
    """Sample a target sentence from ``context`` and flatten the remaining context.

    One sentence is drawn uniformly at random as the target. With probability
    ``remove_target_prob`` that sentence is left out of the context before it
    is flattened; otherwise the full context is flattened.

    The previous version built the reduced context but then flattened the
    original ``context``, so the target was never removed and
    ``remove_target_prob`` had no effect on the result.

    Args:
        context: non-empty list of sentences, each a list of token ids.
        rnd: NumPy random generator used for both random draws.
        remove_target_prob: probability of excluding the target sentence
            from the flattened context.

    Returns:
        A ``(target_sentence, flattened_context)`` tuple, where
        ``target_sentence`` is the selected element of ``context`` and
        ``flattened_context`` is a flat list of token ids.

    Raises:
        ValueError: if ``context`` is empty.
    """
    if len(context) == 0:
        raise ValueError("context must contain at least one sentence")
    # Keep the draw order (index first, then the removal decision) so a seeded
    # generator is consumed exactly as before.
    target_idx = int(rnd.integers(low=0, high=len(context)))
    remove_target = rnd.random() < remove_target_prob
    target_sentence = context[target_idx]
    if remove_target:
        # Slicing builds a new list, so the caller's context is not modified.
        processed_context = context[:target_idx] + context[target_idx + 1:]
    else:
        processed_context = context
    flattened_context = [token for sentence in processed_context for token in sentence]
    return target_sentence, flattened_context

def prepare_ict(examples, epochs, rnd: np.random.Generator, remove_target_prob:float):
    """Build (target, flat context) pairs for every chunk of a batch.

    Each entry of ``examples['chunk']`` is passed to
    ``create_target_and_flat_context`` ``epochs`` times in a row, so the
    output holds ``epochs`` consecutive rows per input chunk.

    Args:
        examples: batch mapping with the chunks under ``'chunk'``.
        epochs: number of pairs to sample per chunk.
        rnd: NumPy random generator forwarded to the sampling helper.
        remove_target_prob: forwarded unchanged to the sampling helper.

    Returns:
        A dict with two parallel lists, ``'target'`` and ``'flat_context'``.
    """
    sampled_pairs = [
        create_target_and_flat_context(chunk, rnd, remove_target_prob)
        for chunk in examples['chunk']
        for _ in range(epochs)
    ]
    return {
        'target': [target for target, _ in sampled_pairs],
        'flat_context': [flat for _, flat in sampled_pairs],
    }

In [10]:
# Generator seeded with 0; it is shared by all prepare_ict calls of the map below.
rnd = np.random.default_rng(0)
# Build ICT pairs for the first 10000 chunks (4 samples per chunk), processing
# 500 chunks per batch and bypassing the datasets cache.
one = chunked_size_10.select(range(10000)).map(
    prepare_ict,
    fn_kwargs={'epochs':4, 'rnd':rnd, 'remove_target_prob':0.9},
    batched=True,
    batch_size=500,
    remove_columns=chunked_size_10.column_names,
    load_from_cache_file=False,
)

    

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




In [139]:
# Display the summary of the ICT dataset built from the first 10000 chunks.
one

Dataset({
    features: ['flat_context', 'target'],
    num_rows: 40000
})

In [67]:
# NOTE(review): `two` is not defined in any visible cell; this cell presumably relies on
# leftover kernel state and would fail on Restart & Run All — define it or drop the cell.
two

Dataset({
    features: ['flat_context', 'target'],
    num_rows: 60000
})