In [1]:
import os
from engine.data import DataModule
from engine.tasks.pretraining import MlmHeadlessPretraining
from engine.lit.lightning_module import TaskTrainer
from transformers import AutoTokenizer, AutoConfig
from pytorch_lightning.callbacks import ModelCheckpoint
import psutil
import argparse
import torch

# Report the number of CPUs psutil detects on this machine.
print("CPU count: ", psutil.cpu_count()) 


# --- Run configuration -------------------------------------------------------
# JSON config file handed to MlmHeadlessPretraining.
config = 'configs/mlm_headless_baby_lm.json'
# Number of nodes; forwarded to trainer.fit and used when deriving the
# gradient-accumulation factor.
num_nodes = 1 
# Target batch size from which the gradient-accumulation factor is derived.
global_bs = 16
# Per-device batch size; used as both train and inference batch size of the
# DataModule.
gpu_bs = 16  
# Dataset location; loaded through DataModule.from_datasets(..., from_disk=True).
dataset = 'dataset_storage/baby-lm-small.hf'
# Identifier passed to AutoTokenizer.from_pretrained.
hf_tokenizer = 'bert-base-uncased'
# Identifier passed to AutoConfig.from_pretrained. Only the config is loaded
# from it; the model is built from that config object and vocab_size is
# overwritten with the tokenizer's vocabulary size.
# NOTE(review): this is the *cased* checkpoint while the tokenizer is
# *uncased* - confirm the mismatch is intentional.
hf_path = 'google-bert/bert-base-cased' 
# Written to lm_config.max_position_embeddings.
model_max_seq_len = 128
# Used as the logger version and as the checkpoint sub-directory name.
run_name = 'checkpoints_test'
# Base directory under which ModelCheckpoint writes its files.
saved_ckpt_path = 'checkpoints'
# Checkpoint period, in training steps (ModelCheckpoint.every_n_train_steps).
ckpt_every = 2000

# Selects which BertForMaskedLM implementation gets imported:
# 'hf', 'flash_attention' or 'xformers'.
accelerator = 'hf'
# Precision setting forwarded to trainer.fit.
precision = '16-mixed'

# Forwarded to trainer.fit as ckpt_path; None here.
# Presumably a checkpoint to resume from - verify against TaskTrainer.
ckpt_path = None 

# Bind BertForMaskedLM to the implementation that matches `accelerator`.
# Unsupported values are rejected before any backend import is attempted.
if accelerator not in ("hf", "flash_attention", "xformers"):
    raise NotImplementedError(f"Unknown accelerator {accelerator}. Please pick between 'hf', 'flash_attention', 'xformers'.")

if accelerator == "hf":
    # Stock Hugging Face implementation.
    from transformers import BertForMaskedLM
elif accelerator == "flash_attention":
    from engine.models.flash_attention.efficient_bert import BertForMaskedLM
    # This backend also lowers the float32 matmul precision setting.
    torch.set_float32_matmul_precision('medium')
else:
    # Only "xformers" can reach this branch.
    from engine.models.xformers.efficient_bert import BertForMaskedLM

# Derive how many micro-batches must be accumulated per optimizer step so that
# the effective batch size equals `global_bs`:
#     global_bs = devices_per_node * num_nodes * gpu_bs * accu_grad_batches
gpus_by_node = torch.cuda.device_count()

# torch.cuda.device_count() is 0 on a machine without CUDA; count that as one
# device so the arithmetic below cannot divide by zero.
# NOTE(review): assumes a CPU-only run uses a single process per node - confirm
# against TaskTrainer.
devices_by_node = max(gpus_by_node, 1)

# Number of samples consumed by one forward/backward pass across all devices.
step_bs = devices_by_node * num_nodes * gpu_bs

# The global batch must be a whole number of such passes. This also rejects
# global_bs < step_bs, which would otherwise give an accumulation factor of 0.
if global_bs % step_bs != 0:
    # ArgumentError takes (argument, message); None means the error is not
    # tied to a specific parser argument.
    raise argparse.ArgumentError(None, f"Requested a batch size of {global_bs} on {gpu_bs}x{gpus_by_node} GPUs : not a multiple!")
accu_grad_batches = global_bs // step_bs
print(f"Grad. accumulating factor: {accu_grad_batches}")


# Data: both splits are carved out of the on-disk "train" split.
# Presumably the first 99.99% is used for training and the remainder for
# validation - verify against DataModule's split syntax.
datamodule = DataModule.from_datasets(
    dataset,
    train_batch_size=gpu_bs,
    infer_batch_size=gpu_bs,
    split_names=["train(:0.9999)", "train(0.9999:)"],
    from_disk=True,
    num_workers=0,
)

# Tokenizer and architecture config come from two separate identifiers.
tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer)
lm_config = AutoConfig.from_pretrained(hf_path)

torch.set_float32_matmul_precision('medium')

# Adapt the loaded config to this run before building the model from it.
lm_config.max_position_embeddings = model_max_seq_len
lm_config.vocab_size = len(tokenizer.vocab)

# NOTE(review): overrides the tokenizer's mask token id with 1 - confirm this
# is the id the pretraining task is expected to mask with.
tokenizer.mask_token_id = 1

# The model is instantiated from the config object.
lm_model = BertForMaskedLM(lm_config)


# Wrap tokenizer + model into the pretraining task, configured from `config`.
task = MlmHeadlessPretraining(tokenizer, lm_model, config=config)

# The run name doubles as the logger version.
version_name = run_name
trainer = TaskTrainer(task, logger_args=dict(version=version_name))

# Periodic checkpointing every `ckpt_every` training steps; save_top_k=-1
# keeps every checkpoint instead of pruning older ones.
# (A second rolling "<version>_last" checkpoint - every 1000 steps,
# save_top_k=1 - is currently disabled.)
checkpoints = [
    ModelCheckpoint(
        dirpath=f'{saved_ckpt_path}/{version_name}',
        every_n_train_steps=ckpt_every,
        save_top_k=-1,
    ),
]

# Keyword options for the training run.
# Presumably forwarded by TaskTrainer.fit to the Lightning Trainer - verify.
# max_epochs is intentionally not set here.
fit_options = {
    "num_nodes": num_nodes,
    "precision": precision,
    "accumulate_grad_batches": accu_grad_batches,
    "callbacks": checkpoints,
    "limit_val_batches": 10,
    "val_check_interval": 0.1,
    "gradient_clip_val": 1.0,
    "benchmark": True,
    # NOTE(review): run_name and saved_ckpt_path are joined without a
    # separator (gives 'checkpoints_testcheckpoints/checkpoints_test' with the
    # current settings) - confirm this is the intended directory.
    "default_root_dir": f'{run_name}{saved_ckpt_path}/{version_name}',
    "ckpt_path": ckpt_path,
}

# Launch training on the datamodule.
trainer.fit(datamodule, **fit_options)


  from .autonotebook import tqdm as notebook_tqdm


CPU count:  12
Grad. accumulating factor: 1


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(128, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi


  | Name | Type                   | Params | Mode 
--------------------------------------------------------
0 | task | MlmHeadlessPretraining | 108 M  | train
--------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
434.387   Total estimated model params size (MB)
229       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\Ino\anaconda3\envs\gpu_test\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Sanity Checking DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]

masked input: tensor([[ 1996,  2237,  4580,  ...,     1, 11463,  1010],
        [ 2017,  2064,     1,  ...,     1,  2113,  2054],
        [ 2000,     1,  1012,  ...,     1,  1037,  3371],
        ...,
        [    1,  2064,  3093,  ...,  3628,  2560,  1045],
        [ 2001,  1012,   102,  ...,  1010,  2852,  1012],
        [17639,     1,  2017,  ...,  2053,  1010,  1045]], device='cuda:0')

mlm labels: tensor([[-100, -100, -100,  ..., 1012, -100, -100],
        [-100, -100, 1005,  ..., 1056, -100, -100],
        [-100, 5233, -100,  ..., 4787, -100, -100],
        ...,
        [2017, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, 7608, -100,  ..., -100, -100, -100]], device='cuda:0')

mask: tensor([[False, False, False,  ...,  True, False, False],
        [False, False,  True,  ...,  True, False, False],
        [False,  True, False,  ...,  True, False, False],
 

c:\Users\Ino\anaconda3\envs\gpu_test\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Epoch 0:   0%|          | 0/411 [00:00<?, ?it/s] 

masked input: tensor([[ 8266,  2006,  1996,  ...,     1,  2128,  1999],
        [ 1996, 18672,     1,  ...,     1,  6231,  1010],
        [ 8131,     1,  1005,  ...,     1,  6526,  6974],
        ...,
        [ 2016,  2987,  1005,  ...,  2006,  1011,  1037],
        [ 2450,     1,  3061,  ...,  2477,  1029,  1045],
        [    1,  2008,  1012,  ...,  1005,  1049,  2145]], device='cuda:0')

mlm labels: tensor([[ -100,  -100,  -100,  ...,  1005,  -100,  -100],
        [ -100,  -100, 18426,  ...,  1996,  -100,  -100],
        [ -100,  5233,  -100,  ...,  4787,  -100,  -100],
        ...,
        [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
        [ -100,  2001,  -100,  ...,  -100,  -100,  -100],
        [ 2106,  -100,  -100,  ...,  -100,  -100,  -100]], device='cuda:0')

mask: tensor([[False, False, False,  ...,  True, False, False],
        [False, False,  True,  ...,  True, False, False],
        [False,  True, False,  ...,  True


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined