# Training Mistra use QLora on Zephyr medical dataset

References: https://bdtechtalks.com/2023/11/03/gpt-llm-trainer/ & https://youtu.be/9bl1mJImj10?si=v9lazUoCWqc4d4Hq

In [1]:
from datasets import load_dataset
from datasets import Dataset
from sklearn.model_selection import train_test_split
import json
import pandas as pd

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer
from accelerate import infer_auto_device_map
from tqdm import tqdm

tqdm.pandas()

In [2]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [3]:
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
# model_name = "lmsys/vicuna-7b-v1.5"

device = "cuda:0"

In [4]:
# del model

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True
)
# this should be set as False for finetuning
model.config.use_cache = False

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# base_model=model.half()

In [5]:
max_seq_length = 1024

In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=max_seq_length,
    padding_side="left",
    # add_eos_token=True,
    trust_remote_code=True
)

tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# model.base_model.model.model.embed_tokens.weight.data = model.base_model.model.model.embed_tokens.weight.data.float()
# model.base_model.model.lm_head.weight.data = model.base_model.model.lm_head.weight.data.float()

# Load data

In [5]:
dataset = load_dataset("squad_v2")
df = dataset['train'].to_pandas()

In [11]:
train_full, test_full = train_test_split(df, test_size=0.2, random_state=11)

In [12]:
def get_answer_from_json(data):
    # data = json.loads(json_data)
    texts = data['text']
    if len(texts) == 0:
        return 'Unanswerable.'
    text = texts[0]
    if text[-1] != ".":
        text = text + "."
    return text

In [13]:
train_full['answer'] = train_full['answers'].apply(get_answer_from_json)
test_full['answer'] = test_full['answers'].apply(get_answer_from_json)
train = train_full.iloc[0:2000]
test = test_full.iloc[0:200]
eval_df = test_full.iloc[200:300]

In [14]:
def make_template(context, question, answer=""):
    s_part = f'Extract the answer to the question from the context. Answer with "Unanswerable" if the answer cannot be found.'
    c_part = f"### Context: {context}"
    q_part = f"### Question: {question}"
    parts = [s_part, c_part, q_part]
    a_part = f"### Answer:"
    if answer:
        a_part = a_part + " " + answer + "\n### End"

    parts.append(a_part)
    return "\n".join(parts)

In [15]:
train_ds = Dataset.from_pandas(train)
eval_ds = Dataset.from_pandas(eval_df)

In [16]:
eos_tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=max_seq_length,
    padding_side="left",
    add_eos_token=True,
    trust_remote_code=True
)
eos_tokenizer.pad_token = eos_tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [17]:
# def generate_and_tokenize_prompt(data_point):
#     full_prompt = make_template(data_point["context"], data_point["question"], data_point["answer"])
#     result = eos_tokenizer(
#         full_prompt,
#         truncation=True,
#         max_length=512,
#         padding="max_length",
#     )
#     result["labels"] = result["input_ids"].copy()
#     return result

In [18]:
# tokenized_train_dataset = train_ds.map(generate_and_tokenize_prompt)
# tokenized_eval_dataset = eval_ds.map(generate_and_tokenize_prompt)

## Evaluation functions

In [17]:
from transformers import StoppingCriteria, StoppingCriteriaList

class KeywordsStoppingCriteria(StoppingCriteria):
    def __init__(self, keywords_ids:list):
        self.keywords = keywords_ids

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        if input_ids[0][-1] in self.keywords:
            return True
        return False

# display(tokenizer.decode([13]))
# '\n'
# display(tokenizer.decode([13, 27332]))
# '\n###'
stop_ids = [13, tokenizer.eos_token_id]
stop_criteria = KeywordsStoppingCriteria(stop_ids)

In [18]:
import re
import time

tokens_generated_count = 0
token_generation_time = 0

def get_clean_word_list(sentence):
    words = sentence.split(" ")
    words = [re.sub('[^a-zA-Z0-9\s]', '', x).lower() for x in words]
    return words

def score_answer(llm_answer, answer):
    a_words = get_clean_word_list(answer)
    a_words = set(a_words)
    word_count = len(a_words)
    l_words = get_clean_word_list(llm_answer)
    count = 0
    for word in l_words:
        if word in a_words:
            a_words.remove(word)
            count += 1
    score = float(count) / word_count
    # arbitrary threshold
    return int(score > 0.5)

def get_redundancy(llm_answer, answer):
    return len(llm_answer) / len(answer)

def generate_answer(context, question, is_print=False):
    global token_generation_time, tokens_generated_count
    template = make_template(context, question)
    if is_print:
        print(template)
    inputs = tokenizer(template, return_tensors="pt").to(device)
    start = time.time()
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=False,
        stopping_criteria=StoppingCriteriaList([stop_criteria]),
    )
    token_generation_time += time.time() - start
    tokens_generated_count += len(outputs[0]) - len(inputs["input_ids"][0])
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text.removeprefix(template)

## Evaluate the base model

fast check

In [21]:
row = test.iloc[4]
text = make_template(row.context, row.question)

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=512, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))



Extract the answer to the question from the context. Answer with "Unanswerable" if the answer cannot be found.
### Context: Albania has often been called the 51st state for its perceived strongly pro-American positions, mainly because of the United States' policies towards it. In reference to President George W. Bush's 2007 European tour, Edi Rama, Tirana's mayor and leader of the opposition Socialists, said: "Albania is for sure the most pro-American country in Europe, maybe even in the world ... Nowhere else can you find such respect and hospitality for the President of the United States. Even in Michigan, he wouldn't be as welcome." At the time of ex-Secretary of State James Baker's visit in 1992, there was even a move to hold a referendum declaring the country as the 51st American state. In addition to Albania, Kosovo which is predominately Albanian is seen as a 51st state due to the heavily presence and influence of the United States. The US has had troops and the largest base out

In [22]:
print(make_template("{context}", "{question}", "{answer}"))

Extract the answer to the question from the context. Answer with "Unanswerable" if the answer cannot be found.
### Context: {context}
### Question: {question}
### Answer: {answer}
### End


Full check

In [23]:
test['base_llm_answer'] = test.progress_apply(lambda x: generate_answer(x.context, x.question).strip(), axis=1)
test['base_score'] = test.apply(lambda x: score_answer(x.base_llm_answer, x.answer), axis=1)
test['base_redundancy'] = test.apply(lambda x: get_redundancy(x.base_llm_answer, x.answer), axis=1)

100%|██████████| 200/200 [05:40<00:00,  1.70s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['base_llm_answer'] = test.progress_apply(lambda x: generate_answer(x.context, x.question).strip(), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['base_score'] = test.apply(lambda x: score_answer(x.base_llm_answer, x.answer), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gui

In [27]:
test['base_score'].value_counts()

base_score
1    143
0     57
Name: count, dtype: int64

In [28]:
test.loc[test.base_score == 1]['base_redundancy'].describe()

count    143.000000
mean       3.497930
std        4.316378
min        0.666667
25%        0.923077
50%        1.541667
75%        4.641026
max       21.142857
Name: base_redundancy, dtype: float64

In [26]:
# filtered_df = test[test['score'] == 0]
with pd.option_context('display.max_colwidth', 60):
    # display(filtered_df.head(10))
    df1 = test[['context', 'question', "answer", "base_llm_answer", "base_score", "base_redundancy"]]
    display(df1.head(15))

Unnamed: 0,context,question,answer,base_llm_answer,base_score,base_redundancy
99996,Secondary education in the United States did not emerge ...,Who didn't benefit from secondary schools?,Unanswerable.,Unanswerable,1,0.923077
57815,A few special additions enhance the language-learning ex...,What is the largest and longest-running university-run f...,BYU's International Cinema.,BYU's International Cinema,1,0.962963
72886,"When Eisenhower was elected President in 1952, he believ...",What was the main purpose of the Government Contract Com...,conducted surveys of the racial composition of federal e...,The main purpose of the Government Contract Committee wa...,1,1.621053
46410,"Nonetheless, within a few years of his death, Gregory of...",What do the Protestants call him?,Father of the Canon.,"""Father of the Canon""",1,1.05
107618,Albania has often been called the 51st state for its per...,Who is the leader of the Socialists?,Unanswerable.,Edi Rama,0,0.615385
69710,"Ibn Sīnā wrote at least one treatise on alchemy, but sev...","According to some, what was Ibn Sina trying to do regard...","""re-Aristotelianise"" Muslim philosophy.","According to some, Ibn Sina was trying to ""re-Aristoteli...",1,2.487179
93911,The city was founded in 734 BC by the Phoenicians as Ziz...,Who founded Palermo in 734 AD?,Unanswerable.,"The Phoenicians founded Palermo in 734 BC, not in 734 AD.",0,4.384615
40196,"In 1988, Apple sued Microsoft and Hewlett-Packard on the...",How long did the FSF boycott GUN software for the Macint...,Unanswerable.,The FSF boycotted GNU software for the Macintosh platfor...,0,5.692308
84186,"During Mubarak's presidency, Nasserist political parties...",What party came in third in the 2013 election?,Sabahi.,Unanswerable,0,1.714286
99763,"The consensus among linguists is that modern, standard C...",How much has Slovak changed from the past until now?,Unanswerable.,Unanswerable,1,0.923077


In [26]:
# Save to csv

from pathlib import Path  
filepath = Path('base_model.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)
df1 = test[['context', 'question', "answer", "base_llm_answer", "base_score", "base_redundancy"]]
df1.to_csv(filepath)  

## Set Up LoRA

In [27]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [28]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

Let's print the model to examine its layers, as we will apply QLoRA to all the linear layers of the model. Those layers are `q_proj`, `k_proj`, `v_proj`, `o_proj`, `gate_proj`, `up_proj`, `down_proj`, and `lm_head`.

In [29]:
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )

Here we define the LoRA config.

`r` is the rank of the low-rank matrix used in the adapters, which thus controls the number of parameters trained. A higher rank will allow for more expressivity, but there is a compute tradeoff.

`alpha` is the scaling factor for the learned weights. The weight matrix is scaled by `alpha/r`, and thus a higher value for `alpha` assigns more weight to the LoRA activations.

The values used in the QLoRA paper were `r=64` and `lora_alpha=16`, and these are said to generalize well, but we will use `r=16` and `lora_alpha=16` so that we have more emphasis on the new fine-tuned data while also reducing computational complexity.

In [30]:
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

In [31]:
model = get_peft_model(model, peft_config)
print_trainable_parameters(model)

# Apply the accelerator. You can comment this out to remove the accelerator.
model = accelerator.prepare_model(model)

trainable params: 21260288 || all params: 3773331456 || trainable%: 0.5634354746703705


See how the model looks different now, with the LoRA adapters added:

In [32]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(
                in_features=4096, out_features=1024, bias=False
       

# Train

Here we will use the SFTTrainer from TRL library that gives a wrapper around transformers Trainer to easily fine-tune models on instruction based datasets using PEFT adapters. Let's first load the training arguments below.

In [33]:
import wandb, os
# wandb.login()

wandb_project = "squad-finetune"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project

In [34]:
import transformers
from datetime import datetime

if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True

In [35]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['context'])):
        text = make_template(example['context'][i], example['question'][i], example['answer'][i])
        output_texts.append(text)
    return output_texts

In [36]:
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM


response_template_with_context = "\n### Answer:"  # We added context here: "\n". This is enough for this tokenizer
response_template_ids = tokenizer.encode(response_template_with_context, add_special_tokens=False)[2:]  # Now we have it like in the dataset texts: `[2277, 29937, 4007, 22137, 29901]`

collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)

In [37]:
project = "squad-finetune-2"
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

# tokenizer.pad_token = tokenizer.eos_token

trainer = SFTTrainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    # peft_config=peft_config,
    # train_dataset=tokenized_train_dataset,
    # eval_dataset=tokenized_eval_dataset,
    max_seq_length=max_seq_length,
    tokenizer=eos_tokenizer,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=5,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        max_steps=1000,
        learning_rate=2.5e-4, # Mistral learning rate
        logging_steps=50,
        bf16=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save the model checkpoint every logging step
        save_steps=50,                # Save checkpoints every 50 steps
        evaluation_strategy="steps", # Evaluate the model every logging step
        eval_steps=50,               # Evaluate and save checkpoints every 50 steps
        do_eval=True,                # Perform evaluation at the end of training
        report_to="wandb",           # Comment this out if you don't want to use weights & baises
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run (optional)
    ),
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    packing=False
)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]



In [39]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgerhean1[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
50,0.5007,0.235642
100,0.3754,0.251745
150,0.3749,0.283176
200,0.3327,0.262872
250,0.3134,0.237007
300,0.1746,0.292642
350,0.1668,0.218049
400,0.1553,0.209315
450,0.1913,0.281108
500,0.143,0.194502


TrainOutput(global_step=1000, training_loss=0.15756725454330445, metrics={'train_runtime': 2644.6431, 'train_samples_per_second': 3.025, 'train_steps_per_second': 0.378, 'total_flos': 9.495139258601472e+16, 'train_loss': 0.15756725454330445, 'epoch': 4.0})

# Model Evaluation

In [30]:
model.unload()
model = AutoModelForCausalLM.from_pretrained(
    model_name,  # Mistral, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    trust_remote_code=True
)

# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Now load the QLoRA adapter from the appropriate checkpoint directory, i.e. the best performing model checkpoint:

In [19]:
from peft import PeftModel
model = PeftModel.from_pretrained(model, "mistral-squad-finetune-2/checkpoint-700")

### Short evaluation

In [22]:
row = test.iloc[2]
text = make_template(row.context, row.question)

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id, stopping_criteria=StoppingCriteriaList([stop_criteria]))
print(tokenizer.decode(outputs[0], skip_special_tokens=True))



Extract the answer to the question from the context. Answer with "Unanswerable" if the answer cannot be found.
### Context: When Eisenhower was elected President in 1952, he believed hiring practices and anti-discrimination laws should be decided by the states, although the administration gradually continued to desegregate the Armed Forces and the federal government.:50 The President also established the Government Contract Committee in 1953, which "conducted surveys of the racial composition of federal employees and tax-supported contractors".:50–51 The committee, chaired by Vice President Richard Nixon, had minimal outcomes in that they imposed the contractors with the primary responsibility of desegregation within their own companies and corporations.:51
### Question: What was the main purpose of the Government Contract Committee?
### Answer: to imposed the contractors with the primary responsibility of desegregation.



In [23]:
len(outputs[0]) - len(inputs["input_ids"][0])

15

### Full evaluation

In [24]:
model.eval()
test['llm_answer'] = test.progress_apply(lambda x: generate_answer(x.context, x.question).strip(), axis=1)

100%|██████████| 200/200 [02:24<00:00,  1.38it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['llm_answer'] = test.progress_apply(lambda x: generate_answer(x.context, x.question).strip(), axis=1)


In [30]:
test['score'] = test.apply(lambda x: score_answer(x.llm_answer, x.answer), axis=1)
test['redundancy'] = test.apply(lambda x: get_redundancy(x.llm_answer, x.answer), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['score'] = test.apply(lambda x: score_answer(x.llm_answer, x.answer), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['redundancy'] = test.apply(lambda x: get_redundancy(x.llm_answer, x.answer), axis=1)


In [31]:
# filtered_df = test[test['score'] == 0]
with pd.option_context('display.max_colwidth', 60):
    # display(filtered_df.head(10))
    df1 = test[['context', 'question', "answer", "llm_answer", "score", "redundancy"]]
    display(df1.head(15))

Unnamed: 0,context,question,answer,llm_answer,score,redundancy
99996,Secondary education in the United States did not emerge ...,Who didn't benefit from secondary schools?,Unanswerable.,Unanswerable.,1,1.0
57815,A few special additions enhance the language-learning ex...,What is the largest and longest-running university-run f...,BYU's International Cinema.,BYU's International Cinema.,1,1.0
72886,"When Eisenhower was elected President in 1952, he believ...",What was the main purpose of the Government Contract Com...,conducted surveys of the racial composition of federal e...,to imposed the contractors with the primary responsibili...,0,0.8
46410,"Nonetheless, within a few years of his death, Gregory of...",What do the Protestants call him?,Father of the Canon.,Father of the Canon.,1,1.0
107618,Albania has often been called the 51st state for its per...,Who is the leader of the Socialists?,Unanswerable.,Edi Rama.,0,0.692308
69710,"Ibn Sīnā wrote at least one treatise on alchemy, but sev...","According to some, what was Ibn Sina trying to do regard...","""re-Aristotelianise"" Muslim philosophy.","""under the name of the ancient Greek philosopher"".",0,1.282051
93911,The city was founded in 734 BC by the Phoenicians as Ziz...,Who founded Palermo in 734 AD?,Unanswerable.,Phoenicans.,0,0.846154
40196,"In 1988, Apple sued Microsoft and Hewlett-Packard on the...",How long did the FSF boycott GUN software for the Macint...,Unanswerable.,Unanswerable.,1,1.0
84186,"During Mubarak's presidency, Nasserist political parties...",What party came in third in the 2013 election?,Sabahi.,Unanswerable.,0,1.857143
99763,"The consensus among linguists is that modern, standard C...",How much has Slovak changed from the past until now?,Unanswerable.,Unanswerable.,1,1.0


In [32]:
test['score'].value_counts()

score
1    143
0     57
Name: count, dtype: int64

In [33]:
test.loc[test.score == 1]['redundancy'].describe()

count    143.000000
mean       1.057457
std        0.338980
min        0.552632
25%        1.000000
50%        1.000000
75%        1.000000
max        3.600000
Name: redundancy, dtype: float64

In [34]:
print(tokens_generated_count / token_generation_time)

8.995458112085034


In [29]:
tokens_generated_count

47284

In [30]:
token_generation_time

163.44584608078003

In [50]:
# Save to csv

from pathlib import Path  
filepath = Path('qlora_train1.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)
df1 = test[['context', 'question', "answer", "llm_answer", "score", "redundancy"]]
df1.to_csv(filepath)  

# Difference

In [39]:
test['is_different'] = False

# Iterate over rows and compare the values in StringColumn1 and StringColumn2
for index, row in test.iterrows():
    if row['llm_answer'] != row['base_llm_answer']:
        test.at[index, 'is_different'] = True
    else:
        test.at[index, 'is_different'] = False

In [41]:
with pd.option_context('display.max_colwidth', 60):
    # display(filtered_df.head(10))
    df1 = test.loc[test.is_different]
    df1 = df1[['question', "answer", "base_llm_answer", "llm_answer"]]
    display(df1.head(15))

Unnamed: 0,question,answer,base_llm_answer,llm_answer
40114,When was the Microsoft branded as Mac?,Unanswerable,"Unanswerable. The question is about the Macintosh brand,...",Unanswerable. The question is about the Macintosh comput...


## Save Model

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "mistralai/Mistral-7B-Instruct-v0.1"
# model_name = "lmsys/vicuna-7b-v1.5"

device = "cuda:0"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

: 

In [None]:
from peft import PeftModel
model = PeftModel.from_pretrained(model, "mistral-squad-finetune-2/checkpoint-700")

In [20]:
merged_model = model.merge_and_unload()
merged_model.save_pretrained("./mistral-squad-model")



NotImplementedError: You are calling `save_pretrained` on a 4-bit converted model. This is currently not supported