In [1]:
!pip install torch datasets bitsandbytes peft transformers

You should consider upgrading via the '/work/projects/mhahsler/course_recomm/allocation001/AI_Club/projects/ft_llm/.venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [None]:
from datasets import load_dataset, load_from_disk
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)



max_length = 128


# Model loading params
load_in_4bit = True

# LoRA Params
lora_alpha = 16             # How much to weigh LoRA params over pretrained params
lora_dropout = 0.1          # Dropout for LoRA weights to avoid overfitting
lora_r = 32                 # Bottleneck size between A and B matrix for LoRA params
lora_bias = "all"           # "all" or "none" for LoRA bias
model_type = ""      # falcon or llama or wizard7 or wizard13
dataset_type = "squad"      # "squad" or "reddit" or "reddit_negative"
lora_target_modules = [     # Which modules to apply LoRA to (names of the modules in state_dict)
    "query_key_value",
    "dense",
    "dense_h_to_4h",
    "dense_4h_to_h",
] if model_type == "falcon" else [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj"
]

# Trainer params
output_dir = "outputs"                              # Directory to save the model
optim_type = "adafactor"                            # Optimizer type to train with
learning_rate = 0.0005                              # Model learning rate
weight_decay = 0.002                                # Model weight decay
per_device_train_batch_size = 4                     # Train batch size on each GPU
per_device_eval_batch_size = 2                      # Eval batch size on each GPU
gradient_accumulation_steps = 2                     # Number of steps before updating model
warmup_steps = 5                                    # Number of warmup steps for learning rate
save_steps = 25                                     # Number of steps before saving model
logging_steps = 25                                  # Number of steps before logging








# Load in the model as a 4-bit or 8-bit model
if load_in_4bit == True:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_use_double_quant=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        "WizardLM/WizardLM-13B-V1.2" if model_type == "wizard13" \
            else "TheBloke/wizardLM-7B-HF" if model_type == "wizard7" \
            else "tiiuae/falcon-7b" if model_type == "falcon" \
            else "gmongaras/Meta-Llama-3.1-8B",
        trust_remote_code=True,
        device_map="auto",
        quantization_config=bnb_config,
        cache_dir="./models",
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        "WizardLM/WizardLM-13B-V1.2" if model_type == "wizard13" \
            else "TheBloke/wizardLM-7B-HF" if model_type == "wizard7" \
            else "tiiuae/falcon-7b" if model_type == "falcon" \
            else "gmongaras/Meta-Llama-3.1-8B",
        trust_remote_code=True,
        device_map="auto",
        load_in_8bit=True,
        cache_dir="./models",
    )



# Load in the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "WizardLM/WizardLM-13B-V1.2" if model_type == "wizard13" \
            else "TheBloke/wizardLM-7B-HF" if model_type == "wizard7" \
            else "tiiuae/falcon-7b" if model_type == "falcon" \
            else "meta-llama/Llama-3.1-8B",
    trust_remote_code=True,
    cache_dir="./models",
)
tokenizer.pad_token = tokenizer.eos_token



def load_data():
    dataset = load_dataset(
        "gmongaras/Elon_Tweets",
        cache_dir="./datasets",
    )

    # Load in the dataset and map using the tokenizer
    def map_function(example):
        text = example["text"]
        favorites = example["favorites"]

        # Make the text look like
        """
        Favorites: {favorites}
        {text}
        """
        # if the reply is empty, just use the text
        text = f"Favorites: {favorites}\n {text}"

        # Encode the question and output
        text_encoded = tokenizer(text, max_length=max_length-1, truncation=True, padding="max_length")

        # Add on a pad token to the end of the input_ids
        text_encoded["input_ids"] = text_encoded["input_ids"] + [tokenizer.pad_token_id]

        # Attention mask is the length of the input_ids without the padding + 1
        # because we want the model to stop itself
        attention_mask = [1 for i in range(0, sum(text_encoded["attention_mask"]) + 1)] + [0 for i in range(sum(text_encoded["attention_mask"])+1, max_length)]
        assert len(attention_mask) == max_length and len(text_encoded["input_ids"]) == max_length, \
            "Attention mask or input_ids is not the correct length"
        # attention_mask = text_encoded["attention_mask"]

        # The labels are the input ids, but we want to mask the loss for the context and padding
        labels = [text_encoded["input_ids"][i] if attention_mask[i] == 1 else -100 for i in range(len(attention_mask))]
        assert len(labels) == len(attention_mask) and len(attention_mask) == len(text_encoded["input_ids"]), "Labels is not the correct length"

        return {
            "input_ids": text_encoded["input_ids"],
            "labels": labels,
            "attention_mask": attention_mask
        }
    dataset = dataset.map(map_function, batched=False, remove_columns=["id", "user_name", "user_location", "user_description", "user_created", "user_followers", "user_friends", "user_favourites", "user_verified", "date", "text", "hashtags", "source", "retweets", "favorites", "is_retweet"])
    return dataset

dataset = load_data()

# Randomize data
dataset = dataset.shuffle()

# Test/train split
train_size = len(dataset)
test_size = len(dataset)
data_train = dataset["train"]
data_test = data_train


# Adapt the model with LoRA weights
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias=lora_bias,
    task_type="CAUSAL_LM",
    inference_mode=False,
    target_modules=lora_target_modules
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


training_args = TrainingArguments(
    num_train_epochs = 50,
    output_dir=output_dir,
    eval_strategy="epoch",
    optim=optim_type,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    do_train=True,
    warmup_steps=warmup_steps,
    save_steps=save_steps,
    logging_steps=logging_steps,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_test,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()


Loading checkpoint shards: 100%|██████████████████████████████████████████████| 7/7 [00:16<00:00,  2.31s/it]
  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 83,886,080 || all params: 8,114,147,328 || trainable%: 1.0338


Epoch,Training Loss,Validation Loss
1,2.7282,2.350325
2,2.2788,1.815321
3,1.82,1.519399
4,1.5684,1.322011
5,1.4041,1.13242
6,1.1932,0.9628
7,1.0083,0.846884
8,0.8793,0.714054
9,0.786,0.631959
10,0.7061,0.562283


In [None]:
import os
os.kill(os.getpid(), 9)

In [7]:
import peft
import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
import shutil



lora_path = "outputs/checkpoint-26875" # Path to the LoRA weights
output_path = "outputs/merged_model"   # Path to output the merged weights
model_type = "llama"                      # falcon or llama or wizard7 or wizard13



peft_model_id = lora_path
peft_config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(
    peft_config.base_model_name_or_path,
    return_dict=True,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="cuda",
    cache_dir="./models"
)
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path, cache_dir="./models")

# Copy local model to output path
import os
if model_type == "wizard7":
    path = os.listdir("models/models--TheBloke--wizardLM-7B-HF/snapshots")[0]
    path = os.path.join("models/models--TheBloke--wizardLM-7B-HF/snapshots", path)
elif model_type == "wizard13":
    path = os.listdir("models/models--WizardLM--WizardLM-13B-V1.2/snapshots")[0]
    path = os.path.join("models/models--WizardLM--WizardLM-13B-V1.2/snapshots", path)
elif model_type == "llama":
    path = os.listdir("models/models--meta-llama--Llama-3.1-8B/snapshots/")[0]
    path = os.path.join("models/models--meta-llama--Llama-3.1-8B/snapshots/", path)
else:
    raise ValueError("Invalid model type")
shutil.copytree(path, output_path, dirs_exist_ok=True, ignore=shutil.ignore_patterns('*.pt', "*.pth", "*.bin"))
if model_type == "wizard13":
    # Remove the "added_tokens.json" file
    os.remove(os.path.join(output_path, "added_tokens.json"))

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)
model.eval()

key_list = [key for key, _ in model.named_modules() if "lora" not in key]
for key in key_list:
    try:
        sub_mod = model.get_submodule(key)
        parent = model.get_submodule(".".join(key.split(".")[:-1]))
    except AttributeError:
        continue
    target_name = key.split(".")[-1]
    if isinstance(sub_mod, peft.tuners.lora.Linear):
        sub_mod.merge()
        try:
            bias = sub_mod.bias is not None
        except:
            bias = None
        new_module = torch.nn.Linear(sub_mod.in_features, sub_mod.out_features, bias=bias)
        new_module.weight.data = sub_mod.weight
        if bias:
            new_module.bias.data = sub_mod.bias
        model.base_model._replace_module(parent, target_name, new_module, sub_mod)

model = model.base_model.model

# Save the model
model.save_pretrained(output_path)

Loading checkpoint shards: 100%|██████████████████████████████████████████████| 7/7 [00:09<00:00,  1.34s/it]


In [None]:
import os
os.kill(os.getpid(), 9)

In [8]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
)
import torch






device = "auto"
model_path = "outputs/merged_model"             # Path to the combined weights





bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_use_double_quant=True,
    )
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    # trust_remote_code=True,
    device_map=device,
    torch_dtype=torch.bfloat16,
    # load_in_8bit=True,
    quantization_config=bnb_config if device == "auto" else None,
).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path)

Loading checkpoint shards: 100%|██████████████████████████████████████████████| 4/4 [00:17<00:00,  4.43s/it]


In [12]:
# Prompt should be in this style due to how the data was created
favorites = "0"
init = ""
prompt = f"favorites: {favorites}\n"
if init != None and init != "":
    prompt += f"{init}"

limit = 128


inputs = tokenizer(prompt, return_tensors="pt")
if device != "cpu":
    inputs = inputs.to('cuda')
output = model.generate(**inputs, temperature=0.75, do_sample=True, top_p=0.95, top_k=60, max_new_tokens=limit-len(inputs["input_ids"]), pad_token_id=tokenizer.pad_token_id)
output = tokenizer.decode(output[0], skip_special_tokens=True)

print('output:', output)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


output: favorites: 0
 @cb_doge @stillgray @ezralevant @latimes That one was real too


In [72]:
# Prompt should be in this style due to how the data was created
favorites = "10"
init = ""
prompt = f"favorites: {favorites}\n"
if init != None and init != "":
    prompt += f"{init}"

limit = 128


inputs = tokenizer(prompt, return_tensors="pt")
if device != "cpu":
    inputs = inputs.to('cuda')
output = model.generate(**inputs, temperature=0.99, do_sample=True, top_p=0.95, top_k=60, max_new_tokens=limit-len(inputs["input_ids"]), pad_token_id=tokenizer.pad_token_id)
output = tokenizer.decode(output[0], skip_special_tokens=True)

print('output:', output)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


output: favorites: 10
I love   https://t.co/koWp9nq1pE


In [172]:
from transformers import StoppingCriteriaList
import torch

favorites = "9"
init = "@miakhalifa Mia Khalifa has "
prompt = f"favorites: {favorites}\n{init}" if init else f"favorites: {favorites}\n"
limit = 128

inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# If you intend "limit" to be total tokens (prompt + gen), do:
# input_len = inputs["input_ids"].shape[1]
# max_new = max(1, limit - input_len)
# Otherwise, if "limit" means *new* tokens, do:
max_new = limit

gen_out = model.generate(
    **inputs,
    do_sample=True,
    temperature=4.0,
    top_p=0.95,
    top_k=60,
    max_new_tokens=max_new,
    min_new_tokens=8,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

# Show only the completion (not the prompt)
input_len = inputs["input_ids"].shape[1]
new_tokens = gen_out[0, input_len:]
completion = tokenizer.decode(new_tokens, skip_special_tokens=True)

print("prompt:", prompt)
print("completion:", completion)

prompt: favorites: 9
@miakhalifa Mia Khalifa has 
completion: 11084 likes | https://t.co/yxUIQEOaqC
