In [None]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
# Both the model and the optimizer state dicts are configured as "full" state
# dicts with offload_to_cpu=True and rank0_only=False.
full_model_state_cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=False)
full_optim_state_cfg = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False)

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=full_model_state_cfg,
    optim_state_dict_config=full_optim_state_cfg,
)

# The accelerator built here is used further down to prepare the LoRA-wrapped model.
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [None]:
from datasets import load_dataset
import numpy as np

# Raw records come from a local JSON file; later cells read the 'train' split of this object.
data_file = "../data/dataset.json"
dataset = load_dataset("json", data_files=data_file)

In [None]:
# Template placeholder for a first example text. Not read by any cell visible in this notebook.
example1 = """{example1}"""

In [None]:
# Template placeholder for the answer paired with the first example. Not read by any visible cell.
# NOTE(review): the placeholder text says "answers1" while the variable is `answer1` — confirm which is intended.
answer1 = """{answers1}"""

In [None]:
# Template placeholder for a second example text. Not read by any cell visible in this notebook.
example2 = """{example2}"""

In [None]:
# Template placeholder for the answer paired with the second example. Not read by any visible cell.
answer2 = """{answer2}"""

In [None]:
def format_dataset(data_point):
    """Render one dataset record into the full training prompt.

    Args:
        data_point: Mapping-like record. Must contain ``"INPUT"`` and either
            ``"RESPONSE "`` (legacy column name with a trailing space) or
            ``"RESPONSE"``.

    Returns:
        dict: ``{"text": <prompt string>}``.

    Raises:
        KeyError: If ``"INPUT"`` is missing, or if neither response key is present.
    """
    # These are plain (non f-) strings: the braces are literal template
    # placeholders meant to be edited by hand.
    system_prompt = """{Add question}"""
    addon_prompt =  """You are a {task}. {Add addon prompt}. 
                If the answer is not available, please explicitly state that the information is not known or accessible."""
    user_text = data_point["INPUT"]
    # The original lookup used "RESPONSE " (trailing space). Keep honoring that
    # spelling, but fall back to the clean column name so a dataset without the
    # stray space no longer fails with KeyError.
    try:
        assistant_text = data_point["RESPONSE "]
    except KeyError:
        assistant_text = data_point["RESPONSE"]
    input_prompt =f"""{addon_prompt}\n{system_prompt}\n
    ### USER: {user_text}
    ### ASSISTANT: {assistant_text}
    """
    return {"text": input_prompt}

### Load Base Model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "mistralai/Mistral-7B-v0.1"

# Quantization settings: 4-bit loading, "nf4" quant type, double quantization
# enabled, bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the base causal-LM with the quantization config above.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
)

### Tokenization

In [None]:
# Training tokenizer: left padding, EOS appended to every sequence, model_max_length of 1024.
tokenizer_options = {
    "model_max_length": 1024,
    "padding_side": "left",
    "add_eos_token": True,
}
tokenizer = AutoTokenizer.from_pretrained(base_model_id, **tokenizer_options)

# Reuse EOS as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize(prompt, max_length=512):
    """Tokenize a prompt to a fixed length and attach causal-LM labels.

    Uses the notebook-level ``tokenizer``.

    Args:
        prompt: Text to tokenize.
        max_length: Length every sequence is truncated/padded to. Defaults to
            512, the value that was previously hard-coded.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry that is a copy of
        ``"input_ids"``.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    # Labels start out as a copy of the inputs.
    # NOTE(review): the Trainer cell passes DataCollatorForLanguageModeling(mlm=False),
    # which presumably rebuilds labels from input_ids — confirm whether this copy is still needed.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
def generate_and_tokenize_prompt(data_point):
    """Render one dataset record into the training prompt and tokenize it.

    The prompt text is produced by ``format_dataset`` rather than by a second
    copy of the same template, so the template only has to be edited in one
    place.

    Args:
        data_point: Dataset record accepted by ``format_dataset``.

    Returns:
        The result of ``tokenize`` applied to the rendered prompt text.
    """
    full_prompt = format_dataset(data_point)["text"]
    return tokenize(full_prompt)

In [None]:
# Hold out 10% of the records for evaluation. Shuffling is enabled so that the
# seed actually takes effect: with shuffle=False the seed is ignored and the
# evaluation set is simply the last 10% of the file in its original order.
# NOTE(review): if an ordered (unshuffled) tail split is intentional, revert to
# shuffle=False and drop the seed argument.
dataset_prepared = dataset['train'].train_test_split(test_size=0.1, shuffle=True, seed=42)

In [None]:
# Tokenize both splits with the same prompt builder; the held-out 'test' split
# becomes the validation set.
train_split = dataset_prepared["train"]
val_split = dataset_prepared["test"]

tokenized_train_dataset = train_split.map(generate_and_tokenize_prompt)
tokenized_val_dataset = val_split.map(generate_and_tokenize_prompt)

In [None]:
# Display the training split (before tokenization).
dataset_prepared['train']

In [None]:
# Display the held-out split (before tokenization).
dataset_prepared['test']

In [None]:
# Print the token ids of one tokenized training record (index 4; requires at least 5 training rows).
print(tokenized_train_dataset[4]['input_ids'])

In [None]:
# Display the full tokenized record at index 4, including every field added by the tokenization step.
tokenized_train_dataset[4]

In [None]:
# Length check: tokenize() truncates and pads to max_length=512, so this is expected to print 512.
print(len(tokenized_train_dataset[4]['input_ids']))

In [None]:
# Display the first tokenized training record.
tokenized_train_dataset[0]

### Evaluation on base model

In [None]:
# Fixed evaluation prompt: a system line, a user turn carrying a text passage
# (a Lua question), a canned assistant acknowledgement, then the user question.
# It ends with an open "### ASSISTANT:" turn for the model to complete.
# This is a regular (non-raw) string, so the \n sequences become real newlines.
eval_prompt = """You are a question-answering system. Given the context answer the user queries.

### USER: Text: Q:\n\nPosition character based on enemy coordinates in lua\n\nI have written a function here which should turn my character based on enemy coordinates but it\'s not perfect because it does not always turn where I want it to and perhaps there is a better way of writing it\nlocal myPosition = {x = 350, y = 355}\nlocal enemyPosition = {x = 352, y = 354}\nlocal xValue, yValue, xDir, yDir, dir\n\nif myPosition.x > enemyPosition.x then\n    xValue = myPosition.x - enemyPosition.x\nelseif myPosition.x < enemyPosition.x then\n    xValue = myPosition.x - enemyPosition.x\nelse\n    xValue = 0\nend\n\nif myPosition.y > enemyPosition.y then\n    yValue = myPosition.y - enemyPosition.y\nelseif myPosition.y < enemyPosition.y then\n    yValue = myPosition.y - enemyPosition.y\nelse\n    yValue = 0\nend\n\nif xValue < 0 then\n    xDir = "TURN RIGHT"\nelseif xValue > 0 then\n    xDir = "TURN LEFT"\nend\n\nif yValue < 0 then\n    yDir = "TURN DOWN"\nelseif yValue > 0 then\n    yDir = "TURN UP"\nend\n\nif xValue > yValue then\n    dir = xDir\nelseif xValue \n    dir = yDir\nend\n\nprint("Turn: " .. dir)\n\nAnd here you have some pictures to further illustrate what I have in mind:\n\nAs you can see on the pictures, direction depends on the higher number.

### ASSISTANT: I've read this text.

### USER: What describes programming concept in the text?

### ASSISTANT: 
"""

# Baseline: generate from the base model (before any LoRA training) for later comparison.
# Move the inputs to whatever device the model is on instead of assuming "cuda".
model_input = tokenizer(eval_prompt, return_tensors="pt").to(model.device)
model.eval()
with torch.no_grad():
    generated = model.generate(
        **model_input,
        max_new_tokens=256,
        # Pad with the tokenizer's EOS id instead of the hard-coded literal 2.
        pad_token_id=tokenizer.eos_token_id,
    )
    # generate() returns a batch; decode the single sequence in it.
    print(tokenizer.decode(generated[0], skip_special_tokens=True))

In [None]:
from peft import prepare_model_for_kbit_training
# Enable gradient checkpointing, then let PEFT prepare the quantized model for training.
# `model` is rebound here, so re-running this cell prepares an already-prepared model again.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` has ``numel()`` and
            ``requires_grad``.

    Prints one line with the trainable count, the total count and the
    trainable percentage. A model with no parameters reports 0.0% instead of
    raising ZeroDivisionError.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the division: an empty model has all_param == 0.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

### Set up LoRA

In [None]:
from peft import LoraConfig, get_peft_model
# Names of the submodules that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
    "lm_head",
]

config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,  # Conventional
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the model with the adapters and report how much of it is trainable.
# `model` is rebound, so re-running this cell wraps an already-wrapped model.
model = get_peft_model(model, config)
print_trainable_parameters(model)

# Apply the accelerator. You can comment this out to remove the accelerator.
model = accelerator.prepare_model(model)

### Run Training

In [None]:
# With two or more visible GPUs, mark the model as parallelizable / model-parallel.
# NOTE(review): presumably this keeps the Trainer from wrapping the model in DataParallel — confirm.
if torch.cuda.device_count() >= 2:
    for flag_name in ("is_parallelizable", "model_parallel"):
        setattr(model, flag_name, True)

In [None]:
import transformers
from datetime import datetime


# Run naming: checkpoints are written under ./<base_model_name>-<project>.
project = "viggo-finetune1"
base_model_name = "mistral"
run_name = f"{base_model_name}-{project}"
output_dir = f"./{run_name}"

tokenizer.pad_token = tokenizer.eos_token

training_args = transformers.TrainingArguments(
    output_dir=output_dir,
    # Schedule / optimisation
    warmup_steps=5,
    per_device_train_batch_size=16,
    max_steps=1000,
    learning_rate=2.5e-5,  # Want about 10x smaller than the Mistral learning rate
    bf16=True,
    optim="paged_adamw_8bit",
    # Logging
    logging_steps=50,
    logging_dir="./logs",         # Directory for storing logs
    # Checkpointing: save every 50 steps
    save_strategy="steps",
    save_steps=50,
    # Evaluation: run on the validation set every 50 steps
    evaluation_strategy="steps",
    eval_steps=50,
    do_eval=True,
    # Experiment tracking
    report_to="wandb",            # Comment this out if you don't want to use Weights & Biases
    run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",  # Name of the W&B run (optional)
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

In [None]:
# Reload the base model with the same model id and quantization config used
# for training, this time with device_map="auto".
base_load_options = {
    "quantization_config": bnb_config,  # Same quantization config as before
    "device_map": "auto",
    "trust_remote_code": True,
    "use_auth_token": True,
}
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Mistral, same as before
    **base_load_options,
)

# Inference tokenizer: loaded without the add_eos_token / left-padding options
# that the training tokenizer was given. This rebinds the notebook-level `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

### Try the trained model

In [None]:
from peft import PeftModel
# Load the LoRA adapter weights on top of the reloaded base model.
# The directory must match the training run's output_dir ("./mistral-viggo-finetune1");
# the previous path was missing the trailing "1" and pointed at a directory the
# training cell never writes. "checkpoint-1000" corresponds to max_steps=1000.
ft_model = PeftModel.from_pretrained(base_model, "mistral-viggo-finetune1/checkpoint-1000")

In [None]:
# Same evaluation prompt as the one used for the base-model run, redefined so
# this cell can be run without that earlier cell.
# Keep the two copies identical so the before/after outputs stay comparable.
eval_prompt = """You are a question-answering system. Given the context answer the user queries.

### USER: Text: Q:\n\nPosition character based on enemy coordinates in lua\n\nI have written a function here which should turn my character based on enemy coordinates but it\'s not perfect because it does not always turn where I want it to and perhaps there is a better way of writing it\nlocal myPosition = {x = 350, y = 355}\nlocal enemyPosition = {x = 352, y = 354}\nlocal xValue, yValue, xDir, yDir, dir\n\nif myPosition.x > enemyPosition.x then\n    xValue = myPosition.x - enemyPosition.x\nelseif myPosition.x < enemyPosition.x then\n    xValue = myPosition.x - enemyPosition.x\nelse\n    xValue = 0\nend\n\nif myPosition.y > enemyPosition.y then\n    yValue = myPosition.y - enemyPosition.y\nelseif myPosition.y < enemyPosition.y then\n    yValue = myPosition.y - enemyPosition.y\nelse\n    yValue = 0\nend\n\nif xValue < 0 then\n    xDir = "TURN RIGHT"\nelseif xValue > 0 then\n    xDir = "TURN LEFT"\nend\n\nif yValue < 0 then\n    yDir = "TURN DOWN"\nelseif yValue > 0 then\n    yDir = "TURN UP"\nend\n\nif xValue > yValue then\n    dir = xDir\nelseif xValue \n    dir = yDir\nend\n\nprint("Turn: " .. dir)\n\nAnd here you have some pictures to further illustrate what I have in mind:\n\nAs you can see on the pictures, direction depends on the higher number.

### ASSISTANT: I've read this text.

### USER: What describes programming concept in the text?

### ASSISTANT: 
"""

# Generate from the fine-tuned (adapter-loaded) model using the evaluation prompt.
# Move the inputs to whatever device the model is on instead of assuming "cuda".
model_input = tokenizer(eval_prompt, return_tensors="pt").to(ft_model.device)

ft_model.eval()
with torch.no_grad():
    generated = ft_model.generate(
        **model_input,
        max_new_tokens=100,
        # Pad with the tokenizer's EOS id instead of the hard-coded literal 2.
        pad_token_id=tokenizer.eos_token_id,
    )
    # generate() returns a batch; decode the single sequence in it.
    print(tokenizer.decode(generated[0], skip_special_tokens=True))