In [1]:
import torch
import transformers
import os
from transformers import AutoTokenizer
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.is_available()

True

In [3]:

gc.collect()
torch.cuda.empty_cache()

In [4]:
# describe cuda memory
print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------

In [6]:
# check if the model is available locally first
name = 'mpt-1b-redpajama-200b-dolly'

if not os.path.exists(name):
    print(f'Downloading {name} from the hub')
    model = transformers.AutoModelForCausalLM.from_pretrained(
        f'mosaicml/{name}',
        trust_remote_code=True, 
        torch_dtype=torch.float16
        )
    # save the model to disk locally
    model.save_pretrained(name)




You are using config.init_device='cuda:0', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


MosaicGPT(
  (transformer): ModuleDict(
    (wte): Embedding(50432, 2048)
    (emb_drop): Dropout(p=0, inplace=False)
    (blocks): ModuleList(
      (0): GPTBlock(
        (ln_1): LPLayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (Wqkv): Linear(in_features=2048, out_features=6144, bias=False)
          (q_ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
          (k_ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
          (out_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (ln_2): LPLayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTMLP(
          (mlp_up): Linear(in_features=2048, out_features=8192, bias=False)
          (mlp_act): GELU(approximate='none')
          (mlp_down): Linear(in_features=8192, out_features=2048, bias=False)
        )
        (resid_attn_dropout): Dropout(p=0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0, inplace=Fals

In [12]:

config = transformers.AutoConfig.from_pretrained(name, trust_remote_code=True)
config.init_device = 'cuda:0' # For fast initialization directly on GPU! 
# load the model from disk to cuda
# convert the model to fp16
model = transformers.AutoModelForCausalLM.from_pretrained(
    name,
    config=config,
    trust_remote_code=True,
    torch_dtype=torch.float16
    ).half()

model.to(device='cuda:0', dtype=torch.bfloat16)



You are using config.init_device='cuda:0', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


MosaicGPT(
  (transformer): ModuleDict(
    (wte): Embedding(50432, 2048)
    (emb_drop): Dropout(p=0, inplace=False)
    (blocks): ModuleList(
      (0): GPTBlock(
        (ln_1): LPLayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (Wqkv): Linear(in_features=2048, out_features=6144, bias=False)
          (q_ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
          (k_ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
          (out_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (ln_2): LPLayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTMLP(
          (mlp_up): Linear(in_features=2048, out_features=8192, bias=False)
          (mlp_act): GELU(approximate='none')
          (mlp_down): Linear(in_features=8192, out_features=2048, bias=False)
        )
        (resid_attn_dropout): Dropout(p=0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0, inplace=Fals

In [20]:
# check if triton is enabled in model
print(model.config.attn_impl)

# change the attention implementation to default
model.config.attn_impl = 'default'
print(model.config.attn_impl)

AttributeError: 'Linear' object has no attribute 'config'

In [15]:
print(model.device)

cuda:0


In [16]:
# use the model
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")

inputs = tokenizer(["Tell me about Obama"], return_tensors="pt")
inputs=inputs.to(device='cuda:0')

In [9]:
# list the objects in cuda memory
print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |    2502 MB |    2700 MB |    5005 MB |    2503 MB |
|       from large pool |    2502 MB |    2700 MB |    5004 MB |    2502 MB |
|       from small pool |       0 MB |       1 MB |       1 MB |       1 MB |
|---------------------------------------------------------------------------|
| Active memory         |    2502 MB |    2700 MB |    5005 MB |    2503 MB |
|       from large pool |    2502 MB |    2700 MB |    5004 MB |    2502 MB |
|       from small pool |       0 MB |       1 MB |       1 MB |       1 MB |
|---------------------------------------------------------------

In [10]:
inputs

{'input_ids': tensor([[17570,   479,   670,  6729]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1]], device='cuda:0')}

In [19]:
# initialize random tensor on cuda
x = torch.randn(1, 1, 1, device='cuda:0')

# create a 1 layer model nn.Linear
model1 = torch.nn.Linear(1, 1).to(device='cuda:0')

# run the model on the random tensor
model1(x)

tensor([[[1.2559]]], device='cuda:0', grad_fn=<ViewBackward0>)

In [17]:
# generate text using the model on cuda
outputs = model.generate(
    **inputs, 
    max_length=100, 
    do_sample=True, 
    top_p=0.95, 
    top_k=60)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

RuntimeError: CUDA: Error- no device

In [3]:
# name = 'mosaicml/mpt-7b-storywriter'

# config = transformers.AutoConfig.from_pretrained(name, trust_remote_code=True)
# config.attn_config['attn_impl'] = 'triton'
# config.init_device = 'cuda:0' # For fast initialization directly on GPU!

# # batch size of 16
# model = transformers.AutoModelForCausalLM.from_pretrained(
#   name,
#   config=config,
#   torch_dtype=torch.bfloat16, # Load model weights in bfloat16
#   trust_remote_code=True
# )


Instantiating an MPTForCausalLM model from /home/nathan/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-7b-storywriter/a5e85ae1941e31bb705adbcafce9b0dfd6f3a48b/modeling_mpt.py


OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB (GPU 0; 11.00 GiB total capacity; 7.26 GiB already allocated; 2.33 GiB free; 7.26 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF