In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
import torch
from datasets import load_from_disk
from trl import SFTTrainer
from peft import PeftModel, LoraConfig, prepare_model_for_kbit_training, get_peft_model

In [2]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

In [3]:
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right"

In [4]:
bnb_4b_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16"
)

In [30]:
bnb_8b_config = BitsAndBytesConfig(
    load_in_8bit=True,
    # bnb_4bit_quant_type="nf4",
    bnb_8bit_compute_dtype=torch.bfloat16,
)

In [6]:
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=bnb_8b_config,
    device_map={"": 0},
    trust_remote_code=True,
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
# #Create a new token and add it to the tokenizer
# tokenizer.add_special_tokens({"pad_token":"<pad>"})

# #Resize the embeddings
# base_model.resize_token_embeddings(len(tokenizer))

# #Configure the pad token in the model
# base_model.config.pad_token_id = tokenizer.pad_token_id

In [8]:
base_model.config.vocab_size, base_model.config.max_position_embeddings

(32000, 4096)

In [None]:
peft_config = LoraConfig(
      lora_alpha=16,
      lora_dropout=0.1,
      r=64,
      bias="none",
      task_type="CAUSAL_LM",
)

In [None]:
peft_model = get_peft_model(base_model, peft_config)

In [8]:
trainingArgs = TrainingArguments(
    output_dir='llama2-sft-debug',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_steps=5,
    save_strategy="epoch",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    group_by_length=False,
    lr_scheduler_type="cosine",
    disable_tqdm=True,
    report_to="wandb",
    seed=42
)

In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=2048,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=prompt_instruction_format,
    args=trainingArgs,
)

In [None]:
trainer.train()

In [None]:
# Merge LoRA with the base model and save the merged model
merged = trainer.model.merge_and_unload()
merged.save_pretrained("merged",safe_serialization=True)
tokenizer.save_pretrained("merged")

#push merged model to the hub
merged.push_to_hub("codegen-350M-mono-python-18k-alpaca")
tokenizer.push_to_hub("codegen-350M-mono-python-18k-alpaca")

In [14]:
def tokenize(tokenizer, prompt, max_length=4096, add_eos_token=False):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None)

    result["labels"] = result["input_ids"].copy()
    return result

# Data Preparation

In [70]:
from datasets import load_from_disk
import pdb
import numpy as np

In [8]:
dataset = load_from_disk('goemotion_subset/')

In [9]:
dataset['train'][0]

{'text': 'WHY THE FUCK IS BAYLESS ISOING', 'labels': [0], 'id': 'eezlygj'}

In [10]:
# SYSTEM_MESSAGE = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. \n\n If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."

In [11]:
SYSTEM_MESSAGE = "Find the emotions from the sentence given below. The options are 'anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise'. The sentence can be one or more emotions from this list"

In [12]:
label2id = {'anger': 0, 'disgust': 1, 'fear': 2, 'joy': 3, 'sadness': 4, 'surprise': 5}
id2label = {v:k for k,v in label2id.items()}

In [13]:
tokenize = lambda text,special_tokens : tokenizer.encode(text,truncation=True,padding=False,max_length=512,add_special_tokens=special_tokens)

In [14]:
tokenize('how are you doing',True)

[1, 920, 526, 366, 2599]

In [15]:
### generate prompt based on template ###
prompt_template = {
    "with_label": \
    "<s>[INST] <<SYS>>\n{instr}\n<</SYS>>\n\n {text} [/INST] The emotions in this sentence are {labels} </s>",

    "without_label": \
    "<s>[INST] <<SYS>>\n{instr}\n<</SYS>>\n\n {text} [/INST] </s>"
}

def generate_prompt(example, prompt_template=prompt_template):
    text = example['text']
    labels_int = example['labels']    
    labels_str = ", ".join([id2label[x] for x in labels_int])
    
    with_label = prompt_template["with_label"].format(
            text=text,labels=labels_str,instr=SYSTEM_MESSAGE.strip())
    
    without_label = prompt_template["without_label"].format(
            text=text,instr=SYSTEM_MESSAGE.strip())
    # pdb.set_trace()
    tokenized_with_label = tokenize(with_label,False)
    tokenized_wo_label = tokenize(without_label,False)
    prompt_len = len(tokenized_wo_label)-2 # For [/INST] </s>
    mask = [-100]*prompt_len
    labels = tokenized_with_label.copy()
    labels[:prompt_len]=mask
    
    enc = {}
    enc['input_ids']=tokenized_with_label
    enc['label']=labels

    return enc

In [55]:
prompted = dataset.map(generate_prompt,remove_columns=dataset['train'].column_names)

In [56]:
prompted = prompted.rename_columns({'label':'labels'})
# prompted.set_format('pt')

In [45]:
tokenizer.decode([x for x in generate_prompt(dataset['train'][39])['label'] if x!=-100])

'The emotions in this sentence are surprise </s>'

In [46]:
tokenizer.decode([x for x in prompted['train'][10]['labels'] if x!=-100])

'The emotions in this sentence are joy </s>'

In [47]:
tokenizer.decode(prompted['train'][10]['input_ids'][prompted['train'][10]['labels']==-100])

"<s> [INST] <<SYS>>\nFind the emotions from the sentence given below. The options are 'anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise'. The sentence can be one or more emotions from this list\n<</SYS>>\n\n It's because you play against 1000 ms ping EU players that have no idea what's going on. Happy hunting! [/INST]"

In [21]:
idx = 10
input_ids = prompted['train'][idx]['input_ids'][prompted['train'][idx]['label']==-100].unsqueeze(0).cuda()
input_ids

tensor([[    1,   518, 25580, 29962,  3532, 14816, 29903,  6778,    13, 12542,
           278, 23023,  1080,   515,   278, 10541,  2183,  2400, 29889,   450,
          3987,   526,   525,  4600,   742,   525,  2218, 29887,   504,   742,
           525, 29888,   799,   742,   525,  2212, 29891,   742,   525, 29879,
           328,  2264,   742,   525,  7610,  7734,  4286,   450, 10541,   508,
           367,   697,   470,   901, 23023,  1080,   515,   445,  1051,    13,
         29966,   829, 14816, 29903,  6778,    13,    13,   739, 29915, 29879,
          1363,   366,  1708,  2750, 29871, 29896, 29900, 29900, 29900, 10887,
         24543, 19007, 10769,   393,   505,   694,  2969,   825, 29915, 29879,
          2675,   373, 29889, 28569, 29826, 29991,   518, 29914, 25580, 29962]],
       device='cuda:0')

In [None]:
# output = base_model.generate(input_ids)

In [61]:
prompted['train'][idx]

{'input_ids': [1,
  518,
  25580,
  29962,
  3532,
  14816,
  29903,
  6778,
  13,
  12542,
  278,
  23023,
  1080,
  515,
  278,
  10541,
  2183,
  2400,
  29889,
  450,
  3987,
  526,
  525,
  4600,
  742,
  525,
  2218,
  29887,
  504,
  742,
  525,
  29888,
  799,
  742,
  525,
  2212,
  29891,
  742,
  525,
  29879,
  328,
  2264,
  742,
  525,
  7610,
  7734,
  4286,
  450,
  10541,
  508,
  367,
  697,
  470,
  901,
  23023,
  1080,
  515,
  445,
  1051,
  13,
  29966,
  829,
  14816,
  29903,
  6778,
  13,
  13,
  739,
  29915,
  29879,
  1363,
  366,
  1708,
  2750,
  29871,
  29896,
  29900,
  29900,
  29900,
  10887,
  24543,
  19007,
  10769,
  393,
  505,
  694,
  2969,
  825,
  29915,
  29879,
  2675,
  373,
  29889,
  28569,
  29826,
  29991,
  518,
  29914,
  25580,
  29962,
  450,
  23023,
  1080,
  297,
  445,
  10541,
  526,
  15331,
  29871,
  2],
 'labels': [-100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100

In [32]:
%%time 
output = base_model(input_ids=prompted['train'][idx]['input_ids'].unsqueeze(0).cuda(),
                    labels=prompted['train'][idx]['label'].unsqueeze(0).cuda())
output.loss.backward()

CPU times: user 738 ms, sys: 311 ms, total: 1.05 s
Wall time: 1.11 s


In [33]:
output.loss

tensor(3.3494, device='cuda:0', grad_fn=<NllLossBackward0>)

In [52]:
from transformers import DataCollatorForLanguageModeling, default_data_collator
from torch.utils.data import DataLoader

In [71]:
class DataCollator:
    
    def __init__(self,tokenizer,pad_to_multiple_of=None):
        self.tokenizer = tokenizer
        self.pad_to_multiple_of = pad_to_multiple_of
        
    def __call__(self,batch):
        
        batch = {k:[example[k] for example in batch] for k in batch[0].keys()}
        for k,v in batch.items():
            batch[k]=self.collate(v)
            
        return batch
        
    def collate(self,examples):
        """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
        import torch

        # Tensorize if necessary.
        if isinstance(examples[0], (list, tuple, np.ndarray)):
            examples = [torch.tensor(e, dtype=torch.long) for e in examples]

        length_of_first = examples[0].size(0)

        # Check if padding is necessary.

        are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
        if are_tensors_same_length and (self.pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
            return torch.stack(examples, dim=0)

        # If yes, check if we have a `pad_token`.
        if self.tokenizer._pad_token is None:
            raise ValueError(
                "You are attempting to pad samples but the tokenizer you are using"
                f" ({tokenizer.__class__.__name__}) does not have a pad token."
            )

        # Creating the full tensor and filling it with our data.
        max_length = max(x.size(0) for x in examples)
        if self.pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // self.pad_to_multiple_of) + 1) * self.pad_to_multiple_of
        result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
        for i, example in enumerate(examples):
            if self.tokenizer.padding_side == "right":
                result[i, : example.shape[0]] = example
            else:
                result[i, -example.shape[0] :] = example
        return result

In [72]:
collator = DataCollator(tokenizer)

In [73]:
loader = DataLoader(prompted['train'],batch_size=4,collate_fn=collator)

In [74]:
batch = next(iter(loader))

In [79]:
batch = {k:v.cuda() for k,v in batch.items()}

In [77]:
batch['input_ids'][0]

tensor([    1,   518, 25580, 29962,  3532, 14816, 29903,  6778,    13, 12542,
          278, 23023,  1080,   515,   278, 10541,  2183,  2400, 29889,   450,
         3987,   526,   525,  4600,   742,   525,  2218, 29887,   504,   742,
          525, 29888,   799,   742,   525,  2212, 29891,   742,   525, 29879,
          328,  2264,   742,   525,  7610,  7734,  4286,   450, 10541,   508,
          367,   697,   470,   901, 23023,  1080,   515,   445,  1051,    13,
        29966,   829, 14816, 29903,  6778,    13,    13, 12317, 29979,  6093,
          383, 29965,  7077,  8519,   350, 29909, 29979,  1307,  1799, 17723,
         4214,   518, 29914, 25580, 29962,   450, 23023,  1080,   297,   445,
        10541,   526, 27343, 29871,     2,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [78]:
batch['labels'][0]

tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,   450, 23023,  1080,   297,   445,
        10541,   526, 27343, 29871,     2,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [80]:
%%time 
output = base_model(**batch)
output.loss.backward()

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 14.61 GiB total capacity; 13.35 GiB already allocated; 17.19 MiB free; 13.90 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF