In [1]:
# Notebook dependencies. The transformers pin is left commented out, so whichever
# transformers version is already present in the environment is the one imported below.
#!pip install transformers==4.14.1
# NOTE(review): bitsandbytes is installed without a version pin, unlike datasets below.
!pip install bitsandbytes
!pip install datasets==1.16.1 



In [2]:
import numpy as np
import transformers
import torch
import torch.nn.functional as functional
import matplotlib.pyplot as plt

from torch import nn
#from torch.cuda.amp import custom_fwd, custom_bwd
from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise
from bitsandbytes.optim import Adam8bit
from tqdm.auto import tqdm
from IPython.display import clear_output


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: Required library version not found: libsbitsandbytes_cpu.so. Maybe you need to compile it from source?
CUDA SETUP: Defaulting to libbitsandbytes_cpu.so...
dlopen(/Users/tylerdurden/miniconda3/envs/torch/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so, 0x0006): tried: '/Users/tylerdurden/miniconda3/envs/torch/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so' (not a mach-o file)


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [12]:
class FrozenBNBLinear(nn.Module):
    """Linear layer whose weight is kept as frozen, blockwise-quantized 8-bit data.

    The quantized ``weight`` and its companion ``absmax`` and ``code`` tensors
    are registered as buffers with gradients disabled. ``bias`` must be an
    ``nn.Parameter`` or ``None``. An ``adapter`` module may be attached after
    construction; when it is truthy, its output for the same input is added
    to the layer output.
    """

    def __init__(self, weight, absmax, code, bias=None):
        assert bias is None or isinstance(bias, nn.Parameter)
        super().__init__()
        self.out_features, self.in_features = weight.shape
        # Register the three quantization tensors in a fixed order: weight, absmax, code.
        for buffer_name, tensor in (("weight", weight), ("absmax", absmax), ("code", code)):
            self.register_buffer(buffer_name, tensor.requires_grad_(False))
        self.adapter = None
        self.bias = bias

    def forward(self, input):
        base = DequantizeAndLinear.apply(input, self.weight, self.absmax, self.code, self.bias)
        # Work on a copy so the in-place adapter addition never touches the
        # tensor returned by the custom autograd function.
        output = base.clone()
        if self.adapter:
            output += self.adapter(input)
        return output

    @classmethod
    def from_linear(cls, linear: nn.Linear) -> "FrozenBNBLinear":
        """Quantize the weight of ``linear`` and wrap it, reusing its bias."""
        weights_int8, (absmax, code) = quantize_blockise_lowmemory(linear.weight)
        return cls(weights_int8, absmax, code, linear.bias)

    def __repr__(self):
        return f"{self.__class__.__name__}({self.in_features}, {self.out_features})"
 
 
class DequantizeAndLinear(torch.autograd.Function):
    """Autograd function that dequantizes blockwise-quantized weights and applies a linear map.

    Only the quantized weight, ``absmax`` and ``code`` are saved for the
    backward pass; the dequantized matrix is rebuilt there instead of being
    stored. Gradients are returned for ``input`` and ``bias`` only.
    """

    @staticmethod
    # for CUDA devices
    ## @custom_fwd
    def forward(ctx, input: torch.Tensor, weights_quantized: torch.ByteTensor,
                absmax: torch.FloatTensor, code: torch.FloatTensor, bias: torch.FloatTensor):
        ctx._has_bias = bias is not None
        ctx.save_for_backward(input, weights_quantized, absmax, code)
        dense_weight = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
        return functional.linear(input, dense_weight, bias)

    @staticmethod
    # for CUDA devices
    ## @custom_bwd
    def backward(ctx, grad_output: torch.Tensor):
        # Argument positions 1-3 (quantized weight, absmax, code) must never need a gradient.
        assert not any(ctx.needs_input_grad[1:4])
        _, weights_quantized, absmax, code = ctx.saved_tensors
        dense_weight = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
        # grad_output: [*batch, out_features]
        if ctx._has_bias:
            # Collapse every leading batch dimension, then sum over it.
            grad_bias = grad_output.flatten(0, -2).sum(dim=0)
        else:
            grad_bias = None
        grad_input = grad_output @ dense_weight
        return grad_input, None, None, None, grad_bias
 
 
class FrozenBNBEmbedding(nn.Module):
    """Embedding table kept as frozen, blockwise-quantized 8-bit data.

    The quantized ``weight`` and its ``absmax`` and ``code`` tensors are
    registered as buffers with gradients disabled. An ``adapter`` module may
    be attached after construction; when it is truthy, its output for the
    same indices is added to the looked-up embeddings.
    """

    def __init__(self, weight, absmax, code):
        super().__init__()
        self.num_embeddings, self.embedding_dim = weight.shape
        # Register the three quantization tensors in a fixed order: weight, absmax, code.
        for buffer_name, tensor in (("weight", weight), ("absmax", absmax), ("code", code)):
            self.register_buffer(buffer_name, tensor.requires_grad_(False))
        self.adapter = None

    def forward(self, input, **kwargs):
        # note: both quantized weights and input indices are *not* differentiable
        with torch.no_grad():
            dense_table = dequantize_blockwise(self.weight, absmax=self.absmax, code=self.code)
            output = functional.embedding(input, dense_table, **kwargs)
        if self.adapter:
            output += self.adapter(input)
        return output

    @classmethod
    def from_embedding(cls, embedding: nn.Embedding) -> "FrozenBNBEmbedding":
        """Quantize the weight of ``embedding`` and wrap it."""
        weights_int8, (absmax, code) = quantize_blockise_lowmemory(embedding.weight)
        return cls(weights_int8, absmax, code)

    def __repr__(self):
        return f"{self.__class__.__name__}({self.num_embeddings}, {self.embedding_dim})"
 
 
def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2 ** 20):
    """Blockwise-quantize ``matrix`` one chunk at a time.

    The tensor is flattened and processed in slices of ``chunk_size`` elements;
    each slice is cloned and passed to ``quantize_blockwise``. The ``code``
    returned for one slice is passed on to the next call.

    Args:
        matrix: tensor to quantize; it is flattened with ``view(-1)``.
        chunk_size: elements per slice; must be a multiple of 4096
            (presumably the quantization block size — TODO confirm).

    Returns:
        ``(matrix_i8, (absmax, code))``: the quantized values reshaped like
        ``matrix``, the concatenated per-slice ``absmax`` tensors, and the
        ``code`` from the last ``quantize_blockwise`` call.
    """
    assert chunk_size % 4096 == 0
    flat_tensor = matrix.view(-1)
    n_chunks = (matrix.numel() - 1) // chunk_size + 1  # ceiling division

    code = None
    quantized_parts, absmax_parts = [], []
    for start in range(0, n_chunks * chunk_size, chunk_size):
        piece = flat_tensor[start: start + chunk_size].clone()
        quantized_piece, (absmax_piece, code) = quantize_blockwise(piece, code=code)
        quantized_parts.append(quantized_piece)
        absmax_parts.append(absmax_piece)

    return torch.cat(quantized_parts).reshape_as(matrix), (torch.cat(absmax_parts), code)
 
 
def convert_to_int8(model):
    """Replace every ``nn.Linear`` and ``nn.Embedding`` child in ``model`` with a frozen 8-bit layer.

    The replacements are built from zero-filled placeholder tensors of the
    right shapes (uint8 weight, one ``absmax`` entry per 4096 weight elements
    rounded up, a 256-entry ``code``); no values are copied from the original
    weight. A replaced linear layer keeps the original ``bias`` object.
    Adapters are not added here. Each linear layer is printed as it is replaced.
    """
    for parent in list(model.modules()):
        for child_name, child in parent.named_children():
            if isinstance(child, nn.Linear):
                print(child_name, child)
                replacement = FrozenBNBLinear(
                    weight=torch.zeros(child.out_features, child.in_features, dtype=torch.uint8),
                    absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),
                    code=torch.zeros(256),
                    bias=child.bias,
                )
            elif isinstance(child, nn.Embedding):
                replacement = FrozenBNBEmbedding(
                    weight=torch.zeros(child.num_embeddings, child.embedding_dim, dtype=torch.uint8),
                    absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),
                    code=torch.zeros(256),
                )
            else:
                # Any other kind of child is left in place.
                continue
            setattr(parent, child_name, replacement)

In [13]:
class GPTJBlock(transformers.models.gptj.modeling_gptj.GPTJBlock):
    """GPT-J block whose attention and MLP submodules are passed through ``convert_to_int8``."""

    def __init__(self, config):
        super().__init__(config)

        # Attention first, then the MLP, each converted in place.
        for submodule in (self.attn, self.mlp):
            convert_to_int8(submodule)


class GPTJModel(transformers.models.gptj.modeling_gptj.GPTJModel):
    """GPT-J base model that passes itself through ``convert_to_int8`` once the parent constructor has run."""
    def __init__(self, config):
        super().__init__(config)
        # Replace the Linear/Embedding children found anywhere under this module.
        convert_to_int8(self)
        

class GPTJForCausalLM(transformers.models.gptj.modeling_gptj.GPTJForCausalLM):
    """GPT-J causal-LM model that passes itself through ``convert_to_int8`` once the parent constructor has run."""
    def __init__(self, config):
        super().__init__(config)
        # Replace the Linear/Embedding children found anywhere under this module.
        convert_to_int8(self)


# Rebind the library module's GPTJBlock name to the subclass defined in this notebook.
# NOTE(review): presumably so that library code looking up GPTJBlock at construction time
# builds the 8-bit block — confirm against the installed transformers version.
transformers.models.gptj.modeling_gptj.GPTJBlock = GPTJBlock  # monkey-patch GPT-J

In [14]:
def add_adapters(model, adapter_dim=16):
    """Attach small trainable adapters to the frozen 8-bit layers under attention modules.

    Only modules whose qualified name contains ``"attn"`` are considered.
    A ``FrozenBNBLinear`` gets a two-layer bias-free linear adapter and a
    ``FrozenBNBEmbedding`` gets an embedding followed by a bias-free linear
    layer, both with inner width ``adapter_dim``. The second layer's weight is
    zero-initialised in every case.

    Args:
        model: module tree to modify in place.
        adapter_dim: inner width of each adapter; must be positive.
    """
    assert adapter_dim > 0

    for name, module in model.named_modules():
        if "attn" not in name:
            continue

        if isinstance(module, FrozenBNBLinear):
            down = nn.Linear(module.in_features, adapter_dim, bias=False)
            up = nn.Linear(adapter_dim, module.out_features, bias=False)
            module.adapter = nn.Sequential(down, up)
            nn.init.zeros_(up.weight)
        elif isinstance(module, FrozenBNBEmbedding):
            lookup = nn.Embedding(module.num_embeddings, adapter_dim)
            up = nn.Linear(adapter_dim, module.embedding_dim, bias=False)
            module.adapter = nn.Sequential(lookup, up)
            nn.init.zeros_(up.weight)
        elif hasattr(module, "adapter"):
            # Any other module that already carries an adapter only has it re-zeroed.
            print("Initializing", name)
            nn.init.zeros_(module.adapter[1].weight)

In [6]:
# The device is fixed to CPU; the automatic CUDA selection is left commented out.
#device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'

# Configuration and tokenizer are fetched under the "EleutherAI/gpt-j-6B" name.
# NOTE(review): `config` is not referenced again in the visible cells — confirm it is still needed.
config = transformers.GPTJConfig.from_pretrained("EleutherAI/gpt-j-6B")
tokenizer = transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

# Load the "hivemind/gpt-j-6B-8bit" checkpoint through the GPTJForCausalLM subclass
# defined in this notebook, then attach adapters with add_adapters.
gpt = GPTJForCausalLM.from_pretrained("hivemind/gpt-j-6B-8bit", low_cpu_mem_usage=True)
add_adapters(gpt)
# Last expression of the cell, so the notebook displays the resulting module structure.
gpt.to(device)

Downloading:   0%|          | 0.00/930 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.94k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/357 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.75G [00:00<?, ?B/s]

k_proj Linear(in_features=4096, out_features=4096, bias=False)
v_proj Linear(in_features=4096, out_features=4096, bias=False)
q_proj Linear(in_features=4096, out_features=4096, bias=False)
out_proj Linear(in_features=4096, out_features=4096, bias=False)
fc_in Linear(in_features=4096, out_features=16384, bias=True)
fc_out Linear(in_features=16384, out_features=4096, bias=True)
k_proj Linear(in_features=4096, out_features=4096, bias=False)
v_proj Linear(in_features=4096, out_features=4096, bias=False)
q_proj Linear(in_features=4096, out_features=4096, bias=False)
out_proj Linear(in_features=4096, out_features=4096, bias=False)
fc_in Linear(in_features=4096, out_features=16384, bias=True)
fc_out Linear(in_features=16384, out_features=4096, bias=True)
k_proj Linear(in_features=4096, out_features=4096, bias=False)
v_proj Linear(in_features=4096, out_features=4096, bias=False)
q_proj Linear(in_features=4096, out_features=4096, bias=False)
out_proj Linear(in_features=4096, out_features=4096, 

GPTJForCausalLM(
  (transformer): GPTJModel(
    (wte): FrozenBNBEmbedding(50400, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-27): 28 x GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): FrozenBNBLinear(4096, 4096)
          (v_proj): FrozenBNBLinear(4096, 4096)
          (q_proj): FrozenBNBLinear(4096, 4096)
          (out_proj): FrozenBNBLinear(4096, 4096)
        )
        (mlp): GPTJMLP(
          (fc_in): FrozenBNBLinear(4096, 16384)
          (fc_out): FrozenBNBLinear(16384, 4096)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): FrozenBNBLinear(4096, 50400)
)

In [7]:
def _read_first_tenth(path):
    """Read every line of the text file at ``path`` and keep only the leading tenth."""
    with open(path, 'r') as f:
        lines = f.readlines()
    return lines[:int(len(lines)/10)]


# Each split is cut down to its first 10% of lines.
train_data = _read_first_tenth('data/small_train.txt')
test_data = _read_first_tenth('data/small_test.txt')
valid_data = _read_first_tenth('data/small_valid.txt')

In [15]:
def _causal_lm_loss(row, seq_length):
    """Tokenize one text row and return its mean next-token cross-entropy loss.

    Uses the notebook-level ``tokenizer`` and ``gpt`` objects. Returns ``None``
    when the row tokenizes to fewer than two tokens: there is then no
    (input, next-token) pair to score and the mean loss would be NaN.
    """
    batch = tokenizer(row,
                      truncation=True,
                      max_length=seq_length,
                      return_tensors='pt')
    # For CUDA devices
    #batch = {k: v.cuda() for k, v in batch.items()}

    input_ids = batch['input_ids']
    if input_ids.shape[1] < 2:
        return None

    out = gpt(**batch)
    # Position t predicts token t + 1: drop the last logit and the first label.
    return functional.cross_entropy(out.logits[:, :-1, :].flatten(0, -2),
                                    input_ids[:, 1:].flatten(),
                                    reduction='mean')


def _snapshot_trainable_state(model):
    """Return a state dict of ``model`` that later optimizer steps cannot alter.

    Tensors that require gradients are cloned. Tensors that do not (such as the
    frozen quantized buffers) are shared by reference, which avoids duplicating
    the bulk of the model in memory.
    """
    snapshot = model.state_dict(keep_vars=True)
    for key, tensor in list(snapshot.items()):
        snapshot[key] = tensor.detach().clone() if tensor.requires_grad else tensor.detach()
    return snapshot


def train(train_data,
          valid_data,
          n_epochs,
          lr,
          lr_update=False,
          seq_length=512,
          weight_decay=0.1,
          verbose=False):
    """Fine-tune the notebook-level ``gpt`` model, one text row per optimizer step.

    Each epoch runs a training pass over ``train_data`` followed by a
    validation pass over ``valid_data``. Rows of length <= 1 (e.g. bare
    newlines) and rows that tokenize to fewer than two tokens are skipped.

    Args:
        train_data: iterable of training text rows.
        valid_data: iterable of validation text rows.
        n_epochs: number of passes over ``train_data``.
        lr: peak learning rate for ``Adam8bit``.
        lr_update: when true, use a cosine schedule with 5% warm-up that is
            advanced after every optimizer step.
        seq_length: maximum number of tokens per row (longer rows are truncated).
        weight_decay: weight decay passed to the optimizer.
        verbose: print the loss and perplexity of every processed row.

    Returns:
        ``(model_best_state, train_losses, train_perplexities, valid_losses,
        valid_perplexities)``. ``model_best_state`` is a snapshot of the state
        dict taken after the epoch with the lowest mean validation loss, or
        ``{}`` if no validation row was ever scored. The four lists hold one
        float per processed row, across all epochs.
    """
    optimizer = Adam8bit(gpt.parameters(),
                         lr=lr,
                         betas=(0.9, 0.95),
                         weight_decay=weight_decay)

    scheduler = None
    if lr_update:
        # The schedule is expressed in optimizer steps, so count the rows that
        # can actually produce a step (rows skipped for having fewer than two
        # tokens make this a slight over-estimate).
        steps_per_epoch = sum(1 for row in train_data if len(row) > 1)
        total_steps = steps_per_epoch * n_epochs
        scheduler = transformers.get_cosine_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=int(0.05 * total_steps),
            num_training_steps=total_steps)

    gpt.gradient_checkpointing_enable()

    model_best_state = {}
    train_losses = []
    train_perplexities = []
    valid_losses = []
    valid_perplexities = []
    min_valid_loss = np.inf

    for epoch in range(n_epochs):
        # CPU autocast covers forward AND backward, as in the original loop.
        # NOTE(review): DequantizeAndLinear.backward has no custom_bwd decorator, so
        # moving backward() out of this context may cause dtype mismatches — confirm
        # before changing. A CUDA run would presumably need a CUDA autocast context.
        with torch.cpu.amp.autocast():
            # Training loop
            if verbose:
                print("~~~~~~~~ Train ~~~~~~~~")
            gpt.train()
            for row in tqdm(train_data):
                if len(row) <= 1:
                    continue

                train_loss = _causal_lm_loss(row, seq_length)
                if train_loss is None:
                    continue
                train_perplexity = torch.exp(train_loss)

                if verbose:
                    print("Loss: ", train_loss.item())
                    print("Perplexity: ", train_perplexity.item())

                train_losses.append(train_loss.item())
                train_perplexities.append(train_perplexity.item())

                train_loss.backward()
                optimizer.step()
                if scheduler is not None:
                    # Advance once per optimizer step; the schedule length above
                    # is expressed in steps, not epochs.
                    scheduler.step()
                optimizer.zero_grad()

            # Validation loop: inference mode and no autograd graph.
            if verbose:
                print("~~~~~~~~ Valid ~~~~~~~~")
            gpt.eval()
            epoch_valid_losses = []
            with torch.no_grad():
                for row in tqdm(valid_data):
                    if len(row) <= 1:
                        continue

                    valid_loss = _causal_lm_loss(row, seq_length)
                    if valid_loss is None:
                        continue
                    valid_perplexity = torch.exp(valid_loss)

                    if verbose:
                        print("Loss: ", valid_loss.item())
                        print("Perplexity: ", valid_perplexity.item())

                    valid_losses.append(valid_loss.item())
                    valid_perplexities.append(valid_perplexity.item())
                    epoch_valid_losses.append(valid_loss.item())
            # Leave the model in training mode, as the original function did.
            gpt.train()

        # The weights do not change during validation, so the epoch mean is the
        # meaningful score for this set of weights.
        if epoch_valid_losses:
            epoch_valid_loss = float(np.mean(epoch_valid_losses))
            if epoch_valid_loss < min_valid_loss:
                min_valid_loss = epoch_valid_loss
                # state_dict() alone returns references to the live tensors, which
                # later optimizer steps would overwrite; keep a real snapshot.
                model_best_state = _snapshot_trainable_state(gpt)

    return model_best_state, train_losses, train_perplexities, valid_losses, valid_perplexities

In [16]:
# Run a single training epoch with lr=5e-5, the learning-rate schedule enabled and
# per-row loss/perplexity printing; the five returned values are unpacked in order.
(best_state,
 train_losses,
 train_perplexities,
 valid_losses,
 valid_perplexities) = train(train_data=train_data,
                             valid_data=valid_data,
                             n_epochs=1,
                             lr=5e-5,
                             lr_update=True,
                             verbose=True)

~~~~~~~~ Train ~~~~~~~~


  0%|          | 0/597 [00:00<?, ?it/s]

AttributeError: 'NoneType' object has no attribute 'cdequantize_blockwise_cpu_fp32'

In [19]:
def predict(test_data, seq_length=256):
    """Score ``test_data`` with the notebook-level ``gpt`` model.

    The model is switched to eval mode and every row is evaluated without
    building an autograd graph. Rows of length <= 1 (e.g. bare newlines) and
    rows that tokenize to fewer than two tokens are skipped; the latter would
    otherwise yield a NaN loss and poison any mean taken over the results.

    Args:
        test_data: iterable of text rows.
        seq_length: maximum number of tokens per row (longer rows are truncated).

    Returns:
        ``(test_losses, test_perplexities)``: two lists with one float per
        scored row (mean next-token cross-entropy and its exponential).
    """
    test_losses = []
    test_perplexities = []

    gpt.eval()

    # Evaluation only: no gradients are needed, so skip graph construction.
    with torch.no_grad():
        for row in tqdm(test_data):
            if len(row) <= 1:
                continue

            batch = tokenizer(row,
                              truncation=True,
                              max_length=seq_length,
                              return_tensors='pt')
            # For CUDA devices
            #batch = {k: v.cuda() for k, v in batch.items()}

            input_ids = batch['input_ids']
            if input_ids.shape[1] < 2:
                # No (input, next-token) pair to score.
                continue

            out = gpt(**batch)
            # Position t predicts token t + 1: drop the last logit and the first label.
            test_loss = functional.cross_entropy(out.logits[:, :-1, :].flatten(0, -2),
                                                 input_ids[:, 1:].flatten(),
                                                 reduction='mean')
            test_perplexity = torch.exp(test_loss)
            test_losses.append(test_loss.item())
            test_perplexities.append(test_perplexity.item())
    return test_losses, test_perplexities

In [None]:
# Score the test split with predict's default seq_length.
# NOTE(review): this cell carries no execution count in the saved notebook, yet the next
# cell shows results — re-run the notebook in order to confirm they belong together.
test_losses, test_perplexities = predict(test_data)

In [21]:
# Report the arithmetic mean of the per-row test perplexities.
print("Mean test perplexity: ")
print(np.mean(test_perplexities))

Mean test perplexity: 
100.88329913941297


In [23]:
# Prompt in an "Instruction: ... Answer: " layout; the model continues after "Answer: ".
prompt = """Instruction: Generate a Python function that check if number is palindrome.

Answer: """
# Tokenize the prompt and move the token ids to `device`.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

# Sample a continuation (do_sample=True, temperature 0.9).
# NOTE(review): max_length=128 presumably bounds prompt and generated tokens together —
# confirm against the transformers generate documentation.
generated_ids = gpt.generate(input_ids,
                               do_sample=True,
                               temperature=0.9,
                               max_length=128)
# Decode the first returned sequence and print it.
generated_text = tokenizer.decode(generated_ids[0])
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Instruction: Generate a Python function that check if number is palindrome.

Answer: 
def isPalindrome(x):
    x = int(x)
    a = x + "^2"
    b = x/2

    if x == b or a == b:
        return True
    else:
        return False

<|endoftext|>
