In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset
from bitnet_158 import BitLinear, BitLinear158, inject
from transformers import TrainerCallback
import jsonlines
import json
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# necessary for chosing the tokenizer
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

In [4]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    use_cache=True
)


In [5]:
num_params = sum(p.numel() for p in model.parameters())
print(f"number of parameters: {num_params}")

number of parameters: 1100048384


In [6]:
# inject BitLinear158 layers
inject(model, copy_weights=True, module_class=BitLinear158)


In [7]:
class PrintFirstLayerGradientsCallback(TrainerCallback):
    """
    A custom callback that prints the gradient of the first layer of the model at each training step.
    """
    def on_step_end(self, args, state, control, **kwargs):
        # Assuming 'model' is your model instance and it's a PyTorch model
        # You may need to adjust the layer name depending on your model architecture
        num_params = 0
        print("First layer gradients")
        for k, v in model.named_parameters():
            if v.grad is not None:
                print(k, v.grad)
                num_params += v.numel()
            else:
                print(k, "None")
        print("Number of parameters: ", num_params)

In [8]:
def get_training_pairs(x_arr: list, y_arr: list):
    x = []
    y = []
    for i in range(len(y_arr)):
        y.append(y_arr[i])
        x.append(x_arr + y_arr[:i])
    return x, y

In [9]:
# Start training

X = []
y = []
n_tokens = 0

x_lens = set()
y_lens = set()

with jsonlines.open("./data/conversations.jsonl") as reader:
    for obj in tqdm(reader):
        json_obj = json.loads(obj)
        
        sys_prompt = ""
        x_text = ""
        y_text = ""
        for elem in json_obj:
            if elem["role"] == "user":
                x_text = elem["content"]
            elif elem["role"] == "system":
                sys_prompt = elem["content"]
            else:
                y_text = elem["content"]
        
        x_text = sys_prompt + " " + x_text
        
        x_tokens = tokenizer(x_text, return_tensors="pt", padding="max_length", max_length=512, truncation=True)
        y_tokens = tokenizer(y_text, return_tensors="pt", padding="max_length", max_length=512, truncation=True)


        x_lens.add(len(x_tokens["input_ids"][0]))
        y_lens.add(len(y_tokens["input_ids"][0]))

        

        n_tokens += len(x_tokens["input_ids"][0]) + len(y_tokens["input_ids"][0])
        if n_tokens > 100_000:
            break

        x_combinations, y_combinations = get_training_pairs(x_tokens["input_ids"][0].tolist(), y_tokens["input_ids"][0].tolist())
        X += x_combinations
        y += y_combinations

print("x_lens", x_lens)
print("y_lens", y_lens)

X = [x if len(x) == 512 else x[:512] for x in X]
#y = [y if len(y) == 512 else y[:512] for y in y]

# convert to tensors
x_tensor = torch.tensor(X)
y_tensor = torch.tensor(y)

97it [00:00, 135.99it/s]


x_lens {512}
y_lens {512}


In [17]:
#convert to torch.float16
x_tensor = x_tensor.to(torch.float16)
y_tensor = y_tensor.to(torch.float16)

In [18]:
train_dataset = Dataset.from_dict({"input_ids": x_tensor, "labels": y_tensor})

In [19]:
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        "test-clm",
        evaluation_strategy="steps",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,
        weight_decay=0.01,
        logging_dir="logs",
        logging_steps=10,
        save_steps=10,
        save_total_limit=2,
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    train_dataset=train_dataset,
    callbacks=[PrintFirstLayerGradientsCallback]
)
trainer.train()

  0%|          | 0/6208 [01:33<?, ?it/s]
  0%|          | 0/6208 [00:52<?, ?it/s]
  0%|          | 0/6208 [00:00<?, ?it/s]

ValueError: type of 1.0 unknown: <class 'numpy.float16'>. Should be one of a python, numpy, pytorch or tensorflow object.

In [None]:
def inference(
        model,
        tokenizer,
        sys_prompt: str,
        user_input: str,
        max_new_tokens: int = 100
):

    x_tokens = tokenizer(sys_prompt + " " + user_input, return_tensors="pt", padding="max_length", max_length=512, truncation=True)

    # generate
    output = model.generate(
        x_tokens["input_ids"],
        do_sample=True,
        max_length=x_tokens["input_ids"].shape[1] + max_new_tokens,
        pad_token_id=tokenizer.eos_token_id
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)