In [1]:
import gc
from datasets import load_dataset, load_metric
from transformers import DistilBertTokenizerFast, DistilBertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import numpy as np
import torch
from torch.utils.data import DataLoader
import inspect

In [2]:
SEED = 0

In [3]:
hdfs1_dataset = load_dataset('text', data_files='../data/raw/HDFS1/HDFS.log', split='train')


Using custom data configuration default-3571b23bb2152210
Reusing dataset text (/home/cernypro/.cache/huggingface/datasets/text/default-3571b23bb2152210/0.0.0/44d63bd03e7e554f16131765a251f2d8333a5fe8a73f6ea3de012dbc49443691)


In [4]:
hdfs1_dataset[0]

{'text': '081109 203518 143 INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862906 src: /10.250.19.102:54106 dest: /10.250.19.102:50010'}

In [5]:
def remove_timestamp(example):
    # need to find third occurence of a space and slice the string after it
    # using a very non robust silly solution
    s = example['text']
    example['text'] = s[s.find(' ', s.find(' ', s.find(' ')+1)+1)+1:]
    return example

cleaned_dataset = hdfs1_dataset.map(remove_timestamp)

Loading cached processed dataset at /home/cernypro/.cache/huggingface/datasets/text/default-3571b23bb2152210/0.0.0/44d63bd03e7e554f16131765a251f2d8333a5fe8a73f6ea3de012dbc49443691/cache-15d3de4e024d5591.arrow


In [2]:
pretrained_model_name = "distilbert-base-cased"
tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_model_name)

In [7]:
def tokenize_dataset(examples):
    return tokenizer(examples['text'], padding="max_length",
                     truncation=True, max_length=256, return_special_tokens_mask=True)
# tokenized_dataset = cleaned_dataset.map(tokenize_dataset, batched=True, batch_size=1000)

In [7]:
def tokenize_dontpad_dataset(examples, tokenizer):
    return tokenizer(examples['text'], truncation=True, return_special_tokens_mask=True)
tokenized_unpadded_dataset = cleaned_dataset.map(tokenize_dontpad_dataset, fn_kwargs={'tokenizer': tokenizer}, batched=True, batch_size=1000, remove_columns=['text'])

Loading cached processed dataset at /home/cernypro/.cache/huggingface/datasets/text/default-3571b23bb2152210/0.0.0/44d63bd03e7e554f16131765a251f2d8333a5fe8a73f6ea3de012dbc49443691/cache-b9facbd72d2d63dd.arrow


In [8]:
idx = 0
print(hdfs1_dataset[idx])
print(cleaned_dataset[idx])
print(tokenized_unpadded_dataset[idx])

{'text': '081109 203518 143 INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862906 src: /10.250.19.102:54106 dest: /10.250.19.102:50010'}
{'text': 'INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862906 src: /10.250.19.102:54106 dest: /10.250.19.102:50010'}
{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [101, 15969, 2271, 2346, 173, 22816, 119, 7154, 2249, 13040, 109, 7154, 3190, 23566, 1197, 131, 11336, 24271, 3510, 171, 10493, 168, 118, 7690, 1604, 1580, 1580, 1580, 1545, 1604, 1559, 1580, 16382, 22392, 26752, 1568, 1545, 188, 19878, 131, 120, 1275, 119, 4805, 119, 1627, 119, 9081, 131, 4335, 10424, 1545, 3532, 1204, 131, 120, 1275, 119, 4805, 119, 1627, 119, 9081, 131, 2260, 10424, 102], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [9]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [10]:
small_dataset = tokenized_unpadded_dataset.select(range(1000))

In [11]:
train_test_dataset = tokenized_unpadded_dataset.train_test_split(train_size=100000, test_size=30000, shuffle=True, seed=SEED)

In [25]:
train_test_dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'special_tokens_mask'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'special_tokens_mask'],
        num_rows: 15000
    })
})

In [12]:
def _remove_unused_columns(model, dataset: "datasets.Dataset"):
        # Inspect model forward signature to keep only the arguments it accepts.
        signature = inspect.signature(model)
        signature_columns = list(signature.parameters.keys())
        # Labels may be named label or label_ids, the default data collator handles that.
        signature_columns += ["label", "label_ids"]
        columns = [k for k in signature_columns if k in dataset.column_names]
        dataset.set_format(type=dataset.format["type"], columns=columns)

def eval_loop(model, dataloader: DataLoader, device):
    total = 0
    correct = 0
    model.eval()
    for idx, inputs in enumerate(dataloader):
        t = torch.cuda.get_device_properties(0).total_memory
        c = torch.cuda.memory_reserved(0)
        a = torch.cuda.memory_allocated(0)
        f = c-a  # free inside cache
        print(f't={t} c={c} a={a} f={f}')
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                inputs[k] = v.to(device)
        masked_idxs = inputs['labels'] != -100
        with torch.no_grad():
            outputs = model(**inputs)
        prediction_correct = torch.argmax(outputs[1], dim=2)[masked_idxs] == inputs['labels'][masked_idxs]
        total += prediction_correct.shape[0]
        correct += torch.sum(prediction_correct).item()
    return correct/total, correct, total

In [3]:
model = DistilBertForMaskedLM.from_pretrained(pretrained_model_name)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [15]:
# _remove_unused_columns(model, small_dataset)
torch.manual_seed(SEED)
dataloader = DataLoader(small_dataset, batch_size=128, collate_fn=data_collator)
print(eval_loop(model, dataloader, device))

t=34089730048 c=13331595264 a=1060333568 f=12271261696
t=34089730048 c=16481517568 a=2634020352 f=13847497216
t=34089730048 c=16481517568 a=2322252800 f=14159264768
t=34089730048 c=16481517568 a=2337098752 f=14144418816
t=34089730048 c=16481517568 a=2352192512 f=14129325056
t=34089730048 c=16481517568 a=2366872576 f=14114644992
t=34089730048 c=16481517568 a=2352192512 f=14129325056
t=34089730048 c=16481517568 a=2084715520 f=14396802048
(0.8088865656037637, 7737, 9565)


In [59]:
del dataloader
del model
del trainer
gc.collect()

674

In [14]:
training_args = TrainingArguments(output_dir="./second_train_experiment",
                                  num_train_epochs=5,
                                  per_device_eval_batch_size=256, 
                                  per_device_train_batch_size=128,
                                  warmup_steps=500,                # number of warmup steps for learning rate scheduler
                                  weight_decay=0.01,               # strength of weight decay
                                  logging_dir='./logs',            # directory for storing logs
                                  logging_steps=100,
                                  save_steps=1000,
                                  run_name="Second-Training")
trainer = Trainer(model=model,
                  args=training_args,
                  data_collator=data_collator,
                  train_dataset=train_test_dataset['train'],
                  eval_dataset=train_test_dataset['test']
                  )

trainer.train()
trainer.save_model()

[34m[1mwandb[0m: Currently logged in as: [33mprokopcerny[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.20 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Step,Training Loss
100,2.9807
200,0.9755
300,0.8166
400,0.7819
500,0.7708
600,0.7529
700,0.7353
800,0.7052
900,0.6983
1000,0.6919


In [31]:
trainer.save_model()

In [62]:
# torch.cuda.empty_cache()
t = torch.cuda.get_device_properties(0).total_memory
c = torch.cuda.memory_reserved(0)
a = torch.cuda.memory_allocated(0)
f = c-a  # free inside cache
print(f't={t} c={c} a={a} f={f}')

t=34089730048 c=622854144 a=527545344 f=95308800


In [60]:
torch.cuda.empty_cache()

In [5]:
model.config.dim

768

In [53]:
def pretty_size(size):
    """Pretty prints a torch.Size object"""
    assert(isinstance(size, torch.Size))
    return " × ".join(map(str, size))

def dump_tensors(gpu_only=True):
	"""Prints a list of the Tensors being tracked by the garbage collector."""
	import gc
	total_size = 0
	for obj in gc.get_objects():
		try:
			if torch.is_tensor(obj):
				if not gpu_only or obj.is_cuda:
					print("%s:%s%s %s" % (type(obj).__name__, 
										  " GPU" if obj.is_cuda else "",
										  " pinned" if obj.is_pinned else "",
										  pretty_size(obj.size())))
					total_size += obj.numel()
			elif hasattr(obj, "data") and torch.is_tensor(obj.data):
				if not gpu_only or obj.is_cuda:
					print("%s → %s:%s%s%s%s %s" % (type(obj).__name__, 
												   type(obj.data).__name__, 
												   " GPU" if obj.is_cuda else "",
												   " pinned" if obj.data.is_pinned else "",
												   " grad" if obj.requires_grad else "", 
												   " volatile" if obj.volatile else "",
												   pretty_size(obj.data.size())))
					total_size += obj.data.numel()
		except Exception as e:
			pass        
	print("Total size:", total_size)

In [54]:
dump_tensors()



Parameter: GPU pinned 28996 × 768
Parameter: GPU pinned 512 × 768
Parameter: GPU pinned 768
Parameter: GPU pinned 768
Parameter: GPU pinned 768 × 768
Parameter: GPU pinned 768
Parameter: GPU pinned 768
Parameter: GPU pinned 768
Parameter: GPU pinned 28996
Parameter: GPU pinned 768
Parameter: GPU pinned 768
Parameter: GPU pinned 768
Parameter: GPU pinned 768
Parameter: GPU pinned 768 × 768
Parameter: GPU pinned 768
Parameter: GPU pinned 768 × 768
Parameter: GPU pinned 768
Parameter: GPU pinned 768 × 768
Parameter: GPU pinned 768
Parameter: GPU pinned 768 × 768
Parameter: GPU pinned 768
Parameter: GPU pinned 3072 × 768
Parameter: GPU pinned 3072
Parameter: GPU pinned 768 × 3072
Parameter: GPU pinned 768
Parameter: GPU pinned 28996 × 768
Parameter: GPU pinned 512 × 768
Parameter: GPU pinned 768
Parameter: GPU pinned 768
Parameter: GPU pinned 768 × 768
Parameter: GPU pinned 768
Parameter: GPU pinned 768
Parameter: GPU pinned 768
Parameter: GPU pinned 28996
Parameter: GPU pinned 768
Paramet