In [1]:
import os
import sys
from typing import List


import torch
import transformers
from datasets import load_dataset

"""
Unused imports:
import torch.nn as nn
import bitsandbytes as bnb
"""

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import LlamaForCausalLM, LlamaTokenizer

from utils.prompter import Prompter

# ---------------------------------------------------------------------------
# Run configuration. Everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------

# Paths
base_model = "/root/llama-7b-hf"  # the only required argument
data_path = "train_data_3_class_clean.jsonl"
output_dir = "/root/autodl-tmp/output2s"

# Training hyperparameters
batch_size = 128
micro_batch_size = 4
num_epochs = 100
learning_rate = 3e-4
cutoff_len = 256  # max token length passed to the tokenizer
val_set_size = 0  # placeholder; reassigned once the dataset has been split

# LoRA hyperparameters
lora_r = 8
lora_alpha = 16
lora_dropout = 0.05
lora_target_modules = ["q_proj", "v_proj"]

# LLM / prompt options
train_on_inputs = True  # if False, masks out inputs in loss
add_eos_token = False
group_by_length = False  # faster, but produces an odd training loss curve

# Weights & Biases options
wandb_project = ""
wandb_run_name = ""
wandb_watch = ""  # options: false | gradients | all
wandb_log_model = ""  # options: false | true

# resume_from_checkpoint = '/root/autodl-tmp/output/checkpoint-100'  # either training checkpoint or final adapter
resume_from_checkpoint = None
prompt_template_name = "alpaca"  # The prompt template to use, will default to alpaca.
device_map = "auto"

# Derived values
gradient_accumulation_steps = batch_size // micro_batch_size

world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1

# wandb is considered enabled when a project name is given here or through
# a non-empty WANDB_PROJECT environment variable.
use_wandb = bool(wandb_project or os.environ.get("WANDB_PROJECT"))


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /root/miniconda3/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /root/miniconda3/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [2]:
# Instantiate the Prompter for the configured template; its generate_prompt()
# is what the tokenization and evaluation cells below use to build prompts.
prompter = Prompter(prompt_template_name)

In [3]:
# Load the base LLaMA checkpoint in half precision, without 8-bit
# quantisation, placing it according to `device_map`.
model_load_kwargs = {
    "load_in_8bit": False,
    "torch_dtype": torch.float16,
    "device_map": device_map,
}
model = LlamaForCausalLM.from_pretrained(base_model, **model_load_kwargs)
# model = LlamaForCausalLM.from_pretrained(
#         base_model,
#         # load_in_8bit=True,
#         # torch_dtype=torch.float16,
#         # device_map=device_map,
#     ).half().cuda()

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

In [4]:
# Tokenizer that ships with the base checkpoint.
tokenizer = LlamaTokenizer.from_pretrained(base_model)

# unk. we want this to be different from the eos token
tokenizer.pad_token_id = 0
# Allow batched inference
tokenizer.padding_side = "left"

In [5]:
# model = prepare_model_for_int8_training(model)

# Wrap the base model with LoRA adapters built from the config-cell values.
lora_settings = {
    "r": lora_r,
    "lora_alpha": lora_alpha,
    "target_modules": lora_target_modules,
    "lora_dropout": lora_dropout,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}
config = LoraConfig(**lora_settings)
model = get_peft_model(model, config)

if resume_from_checkpoint:
    # Prefer a full checkpoint; otherwise fall back to adapter-only weights.
    # The two files have a different name depending on how they were saved,
    # but are actually the same.
    checkpoint_name = os.path.join(resume_from_checkpoint, "pytorch_model.bin")
    if not os.path.exists(checkpoint_name):
        # only LoRA model - LoRA config above has to fit
        checkpoint_name = os.path.join(resume_from_checkpoint, "adapter_model.bin")
        # So the trainer won't try loading its state
        resume_from_checkpoint = False

    if not os.path.exists(checkpoint_name):
        print(f"Checkpoint {checkpoint_name} not found")
    else:
        print(f"Restarting from {checkpoint_name}")
        adapters_weights = torch.load(checkpoint_name)
        set_peft_model_state_dict(model, adapters_weights)


model.print_trainable_parameters()

trainable params: 4194304 || all params: 6742609920 || trainable%: 0.06220594176090199


In [6]:
def tokenize(prompt, add_eos_token=True):
    """Tokenize `prompt` and attach causal-LM labels.

    The prompt is truncated to `cutoff_len` tokens and is not padded. When
    `add_eos_token` is true, the sequence is shorter than `cutoff_len` and it
    does not already end in EOS, an EOS token (attention mask 1) is appended.

    Returns the tokenizer output with an extra "labels" entry that is a copy
    of "input_ids".
    """
    # there's probably a way to do this with the tokenizer settings
    # but again, gotta move fast
    encoded = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )

    token_ids = encoded["input_ids"]
    ends_with_eos = token_ids[-1] == tokenizer.eos_token_id
    has_room = len(token_ids) < cutoff_len
    if not ends_with_eos and has_room and add_eos_token:
        token_ids.append(tokenizer.eos_token_id)
        encoded["attention_mask"].append(1)

    encoded["labels"] = encoded["input_ids"].copy()
    return encoded
def generate_and_tokenize_prompt(data_point):
    """Turn one raw record into a tokenized training example.

    `data_point` is rewritten in place: its 'text' and 'label' entries are
    moved to "input" and "output", and a fixed sentiment "instruction" is
    added. The full prompt (instruction + input + output) is tokenized; when
    `train_on_inputs` is false, the label positions covering the prompt
    without the output are replaced by -100.

    Returns the tokenized full prompt (input_ids, attention_mask, labels).
    """
    data_point["instruction"] = 'What is the sentiment toward Bitcoin in the input sentence? [positive, negative, neutral]'
    data_point["input"] = data_point['text']
    data_point["output"] = data_point['label']
    for raw_key in ('text', 'label'):
        del data_point[raw_key]

    instruction = data_point["instruction"]
    model_input = data_point["input"]
    answer = data_point["output"]

    tokenized_full_prompt = tokenize(
        prompter.generate_prompt(instruction, model_input, answer)
    )
    if train_on_inputs:
        return tokenized_full_prompt

    # Measure how many leading tokens belong to the prompt without the answer.
    prompt_only = prompter.generate_prompt(instruction, model_input)
    user_prompt_len = len(
        tokenize(prompt_only, add_eos_token=add_eos_token)["input_ids"]
    )
    if add_eos_token:
        user_prompt_len -= 1

    # could be sped up, probably
    full_labels = tokenized_full_prompt["labels"]
    tokenized_full_prompt["labels"] = (
        [-100] * user_prompt_len + full_labels[user_prompt_len:]
    )
    return tokenized_full_prompt





# Load the JSONL file; the first 300 rows become the training split and all
# remaining rows become the held-out ("test") split.
data = load_dataset("json", data_files=data_path)

all_rows = data["train"]
data["test"] = all_rows.select(range(300, len(all_rows)))
data["train"] = all_rows.select(range(300))
# print(data["train"][0])

# Build prompts and tokenize every row of both splits.
train_data = data["train"].map(generate_and_tokenize_prompt)
val_data = data["test"].map(generate_and_tokenize_prompt)
val_set_size = len(val_data)
# val_set_size = 0

print(train_data)
print(val_set_size, val_data)
# train_data


Found cached dataset json (/root/.cache/huggingface/datasets/json/default-c18f34d3d9ca0ab9/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-c18f34d3d9ca0ab9/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-32aabb3f5ee4accb.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-c18f34d3d9ca0ab9/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-8d99697f064efea7.arrow


Dataset({
    features: ['instruction', 'input', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 300
})
815 Dataset({
    features: ['instruction', 'input', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 815
})


In [7]:
import json
from torch.utils.data import DataLoader
import time
def my_evaluate(self, ignore_keys=None, eval_dataset=None, metric_key_prefix="eval"):
    """Generation-based accuracy evaluation used in place of Trainer.evaluate.

    Re-reads `data_path`, builds an answer-less prompt for every record and
    scores the rows from index 300 onward (the same rows the data-loading
    cell puts into the held-out split). For each prompt a single token is
    generated greedily; a row counts as correct when the decoded text after
    "Response:\\n" is a non-empty substring of the gold label.

    Args:
        ignore_keys: accepted for call compatibility; not used.
        eval_dataset: accepted for call compatibility; not used, because the
            evaluation data is always re-read from `data_path`.
        metric_key_prefix: prefix of the reported metric name; the default
            yields "eval_acc".

    Returns:
        A dict with one entry, "<metric_key_prefix>_acc", the fraction of
        correct predictions (0.0 when there are no held-out rows).

    Side effects: logs the metric via self.log, fires the on_evaluate
    callback, and leaves self.model in train mode.
    """
    instruction = 'What is the sentiment toward Bitcoin in the input sentence? [positive, negative, neutral]'

    # Read everything up front so the file is not held open during generation.
    instructions = []
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines in the JSONL file
            record = json.loads(line)
            full_prompt = prompter.generate_prompt(instruction, record['text'])
            instructions.append({'context': full_prompt, 'target': record['label']})

    held_out = instructions[300:]
    input_texts = [item["context"] for item in held_out]
    targets = [item["target"] for item in held_out]

    eval_batch_size = 32
    right = 0
    total = len(targets)

    self.model.eval()
    try:
        with torch.no_grad():
            test_loader = DataLoader(input_texts, batch_size=eval_batch_size)
            for batch_idx, batch in enumerate(test_loader):
                inputs = tokenizer(batch, padding=True, return_tensors='pt').to('cuda')
                # Greedy decoding of exactly one new token.
                out = self.model.generate(
                    **inputs,
                    do_sample=False,
                    return_dict_in_generate=True,
                    max_new_tokens=1,
                )
                results = tokenizer.batch_decode(out['sequences'])
                for offset, res in enumerate(results):
                    # Skip past "Response:\n" (10 characters) to the generated text.
                    pred = res[res.find('Response') + 10:]
                    target = targets[batch_idx * eval_batch_size + offset]
                    # An empty string is a substring of every label, so it
                    # must be rejected explicitly.
                    if pred and pred in target:
                        right += 1
    finally:
        # Always hand the model back in train mode, even if generation failed.
        self.model.train()

    accuracy = right / total if total else 0.0
    metrics = {f"{metric_key_prefix}_acc": accuracy}
    self.log(metrics)
    self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics)
    return metrics



# Replace Trainer's evaluation with the generation-based accuracy check
# defined above. NOTE: this patches the class, so every Trainer created in
# this kernel afterwards is affected.
transformers.Trainer.evaluate = my_evaluate

has_val_set = val_set_size > 0

training_args = transformers.TrainingArguments(
    per_device_train_batch_size=micro_batch_size,
    # NOTE(review): the config cell computes `gradient_accumulation_steps`
    # (batch_size // micro_batch_size) but it is not used here; the effective
    # batch size is therefore `micro_batch_size` per device. Confirm intent.
    gradient_accumulation_steps=1,
    # warmup_steps=100,
    num_train_epochs=num_epochs,
    learning_rate=learning_rate,
    fp16=True,
    # logging_strategy  = "steps",
    logging_steps=10,
    optim="adamw_torch",
    metric_for_best_model="acc",
    evaluation_strategy="steps" if has_val_set else "no",
    save_strategy="steps",
    eval_steps=100 if has_val_set else None,
    save_steps=100,
    output_dir=output_dir,
    save_total_limit=50,
    load_best_model_at_end=has_val_set,
    ddp_find_unused_parameters=False if ddp else None,
    group_by_length=group_by_length,
    # `None` is treated by transformers as "all installed integrations";
    # "none" is the value that actually disables reporting.
    report_to="wandb" if use_wandb else "none",
    run_name=wandb_run_name if use_wandb else None,
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_args,
    data_collator=transformers.DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)
model.config.use_cache = False

# old_state_dict = model.state_dict
# model.state_dict = (
#     lambda self, *_, **__: get_peft_model_state_dict(
#         self, old_state_dict()
#     )
# ).__get__(model, type(model))
# if torch.__version__ >= "2" and sys.platform != "win32":
#     model = torch.compile(model)

In [8]:
# Run fine-tuning. `resume_from_checkpoint` is None unless set in the config
# cell, and the adapter-loading cell resets it to False when it falls back
# to an adapter-only checkpoint file.
trainer.train(resume_from_checkpoint=resume_from_checkpoint)

Step,Training Loss,Validation Loss,Acc
100,0.9915,No log,0.790184
200,0.7887,No log,0.814724
300,0.7898,No log,0.807362
400,0.4387,No log,0.826994
500,0.3917,No log,0.828221
600,0.3023,No log,0.818405
700,0.1818,No log,0.828221
800,0.1812,No log,0.817178
900,0.167,No log,0.834356
1000,0.1458,No log,0.833129




TrainOutput(global_step=7500, training_loss=0.1646334870815277, metrics={'train_runtime': 4946.4504, 'train_samples_per_second': 6.065, 'train_steps_per_second': 1.516, 'total_flos': 1.7496858579173376e+17, 'train_loss': 0.1646334870815277, 'epoch': 100.0})

In [9]:
# Persist the model returned by get_peft_model to `output_dir`
# (presumably only the LoRA adapter weights; confirm for the PEFT version in use).
model.save_pretrained(output_dir)

In [10]:
import json
from torch.utils.data import DataLoader
import time

# instructions = json.load(open("data/alpaca_data.json"))
# Stand-alone accuracy check on the held-out rows (index 300 onward) using
# one greedily generated token per prompt.
instruction_text = 'What is the sentiment toward Bitcoin in the input sentence? [positive, negative, neutral]'

instructions = []
with open(data_path, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines in the JSONL file
        # Named `record` so the `data` DatasetDict from the loading cell survives.
        record = json.loads(line)
        full_prompt = prompter.generate_prompt(instruction_text, record['text'])
        instructions.append({'context': full_prompt, 'target': record['label']})

print(instructions[0])
start_time = time.time()

# Local names chosen so the builtin `all` and the config-cell `batch_size`
# are not overwritten by running this cell.
eval_batch_size = 8
held_out = instructions[300:]
input_texts = [item["context"] for item in held_out]
targets = [item["target"] for item in held_out]
right = 0
total = len(targets)

# Generation must run in eval mode, otherwise LoRA dropout stays active;
# the previous mode is restored afterwards.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        test_loader = DataLoader(input_texts, batch_size=eval_batch_size)
        for batch_idx, batch in enumerate(test_loader):
            input_ids = tokenizer(batch, padding=True, return_tensors='pt').to('cuda')
            # out = model.generate(**input_ids, max_length=250, temperature=0)
            out = model.generate(
                **input_ids,
                do_sample=False,
                return_dict_in_generate=True,
                max_new_tokens=1,
            )
            results = tokenizer.batch_decode(out['sequences'])
            for offset, res in enumerate(results):
                # Skip past "Response:\n" (10 characters) to the generated text.
                pred = res[res.find('Response') + 10:]
                target = targets[batch_idx * eval_batch_size + offset]
                # An empty string is a substring of every label, so reject it.
                if pred and pred in target:
                    right = right + 1
            # Running tally: correct so far, total rows, correct / total.
            print(right, total, right / total)
finally:
    model.train(was_training)

end_time = time.time()
elapsed_time = end_time - start_time
print("Finished in {:.2f} seconds.".format(elapsed_time))





{'context': "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat is the sentiment toward Bitcoin in the input sentence? [positive, negative, neutral]\n\n### Input:\nBut... Bitcoin's halving IS priced in.\n\nFirst reminder of 2020\n\n### Response:\n", 'target': 'negative'}
6 815 0.007361963190184049
14 815 0.01717791411042945
21 815 0.025766871165644172
25 815 0.03067484662576687
31 815 0.03803680981595092
35 815 0.04294478527607362
42 815 0.051533742331288344
49 815 0.06012269938650307
56 815 0.0687116564417178
64 815 0.0785276073619632
72 815 0.08834355828220859
78 815 0.09570552147239264
86 815 0.10552147239263804
94 815 0.11533742331288344
101 815 0.12392638036809817
108 815 0.1325153374233129
114 815 0.13987730061349693
122 815 0.14969325153374233
130 815 0.15950920245398773
134 815 0.16441717791411042
142 815 0.17423312883435582
150 815 0.184049079754