# Fine-tune the StableLM-Zephyr-3B model using the Axolotl framework

How to install dependencies (in an HPC environment):

- load Python and cuDNN modules
- create a Python venv and activate it
- install dependencies from requirements.txt (e.g. torch)
- install Axolotl from a git clone (a plain pip install won't work; see [this issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues/945)):

```
git clone git@github.com:OpenAccess-AI-Collective/axolotl.git
cd axolotl
pip install -e '.[flash-attn,deepspeed]'
```


In [1]:
# Check if a GPU is available and whether it supports bfloat16
# (the Axolotl configuration in this notebook trains with `bf16: true`).
import torch

gpu_available = torch.cuda.is_available()
# Only query BF16 support when CUDA is present: on a CPU-only node,
# is_bf16_supported() may raise in older PyTorch releases instead of
# returning False.
bf16_supported = gpu_available and torch.cuda.is_bf16_supported()
print('GPU available?', gpu_available)
print('BF16 is supported?', bf16_supported)

GPU available? True
BF16 is supported? True


In [5]:
# Set the model name and derive the short name (the part after the last '/')
# that is used in file and directory names.

MODEL_NAME = "stabilityai/stablelm-zephyr-3b"
MODEL_SHORT_NAME = MODEL_NAME.split('/')[-1]
# Show both names so that the cell's displayed output is reproduced on a re-run.
print(MODEL_NAME, MODEL_SHORT_NAME)

stabilityai/stablelm-zephyr-3b stablelm-zephyr-3b


In [2]:
# Load and prepare fine-tuning dataset

import json
import glob
import random

random.seed(42)  # for deterministic shuffling of the train set and sampling of the eval set

# glob.glob() returns paths in arbitrary (filesystem-dependent) order, so sort
# them; otherwise the seeded shuffle/sample further down would not give the
# same result on every machine.
train_files = sorted(glob.glob("../../llm-dataset/*-train.jsonl"))
test_files = sorted(glob.glob("../../llm-dataset/*-test.jsonl"))

# Ground-truth metadata fields that are kept in the expected output
KEEP_FIELDS = {
    'dc.contributor.author',
    'dc.date.issued',
    'dc.identifier.isbn',
    'dc.language.iso',
    'dc.publisher',
    'dc.relation.eissn',
    'dc.title',
}
MAX_TEXT_LENGTH = 3072  # the input text of each sample is sliced to this length
EVAL_SIZE = 32  # how many documents to evaluate (i.e. calculate loss) on during fine-tuning
INSTRUCTION = "Extract metadata. Return as JSON."

def preprocess_sample(sample):
    """Turn one raw dataset sample into an Alpaca-format record.

    The sample's "ground_truth" mapping is restricted to the keys listed in
    KEEP_FIELDS and JSON-encoded to form the expected output. The sample's
    "text" is sliced to MAX_TEXT_LENGTH to form the input.

    Returns a dict with the keys "instruction", "input" and "output".
    """
    kept = {}
    for field, value in sample["ground_truth"].items():
        if field in KEEP_FIELDS:
            kept[field] = value
    target = json.dumps(kept)
    source_text = sample["text"][:MAX_TEXT_LENGTH]
    # Alpaca format
    return {
        "instruction": INSTRUCTION,
        "input": source_text,
        "output": target,
    }

def dataset_to_records(files):
    """Read JSON Lines dataset files and convert their samples to records.

    Args:
        files: iterable of paths to JSON Lines files holding one JSON-encoded
            sample per line.

    Returns:
        A list with one preprocess_sample() result per non-blank line, in the
        order of the given files and of the lines within each file.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    records = []
    for filename in files:
        # Read explicitly as UTF-8 instead of the platform's locale encoding,
        # so non-ASCII text is decoded the same way everywhere
        # (assumes the dataset files are UTF-8 -- TODO confirm).
        with open(filename, encoding="utf-8") as infile:
            for line in infile:
                # Skip blank lines (e.g. an empty last line) rather than
                # failing in json.loads().
                if not line.strip():
                    continue
                sample = json.loads(line)
                records.append(preprocess_sample(sample))
    return records

def write_jsonl(records, filename):
    """Write records to filename in JSON Lines format.

    Each record is JSON-encoded on its own line, terminated by a newline.
    An existing file is overwritten.
    """
    with open(filename, "w") as sink:
        sink.writelines(json.dumps(entry) + "\n" for entry in records)

# Training set: all train records, shuffled (seeded above) before writing
train_recs = dataset_to_records(train_files)
random.shuffle(train_recs)
write_jsonl(train_recs, "train.jsonl")
print(f"Wrote {len(train_recs)} train records")

# Test set: all test records, in file order
test_recs = dataset_to_records(test_files)
write_jsonl(test_recs, "test.jsonl")
print(f"Wrote {len(test_recs)} test records")

# Eval set: a random sample of the test records, used for computing loss
# during fine-tuning. NOTE: it is a subset of the test set, so these documents
# are also part of the final test run.
# random.sample() raises ValueError when asked for more items than exist, so
# cap the sample size at the number of available test records.
eval_recs = random.sample(test_recs, min(EVAL_SIZE, len(test_recs)))
write_jsonl(eval_recs, "eval.jsonl")
print(f"Wrote {len(eval_recs)} eval records")

Wrote 556 train records
Wrote 167 test records
Wrote 32 eval records


In [3]:
# Create Axolotl configuration file
# The file name is derived from MODEL_SHORT_NAME.

CONFIG_FILE = f"config-{MODEL_SHORT_NAME}.yml"


# YAML configuration text. MODEL_NAME and MODEL_SHORT_NAME are interpolated by
# the f-string. It configures LoRA fine-tuning with the base model loaded in
# 8-bit, using train.jsonl for training and eval.jsonl for evaluation (both in
# Alpaca format), and writes results to ./out-<MODEL_SHORT_NAME>.
CONFIG = f"""
base_model: {MODEL_NAME}
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
trust_remote_code: true

load_in_8bit: true
load_in_4bit: false
strict: false

datasets:
  - path: train.jsonl
    ds_type: json
    split: train
    type: alpaca

test_datasets:
  - path: eval.jsonl
    ds_type: json
    split: train
    type: alpaca

output_dir: ./out-{MODEL_SHORT_NAME}

adapter: lora
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules:
  - q_proj
  - v_proj
  - k_proj
  - o_proj
  - gate_proj
  - down_proj
  - up_proj
sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 4
eval_batch_size: 4
num_epochs: 5
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
tf32: false

gradient_checkpointing: true  # true: saves VRAM but is slower to train
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: false  # true gives RuntimeError: FlashAttention only support fp16 and bf16 data type

warmup_steps: 10
evals_per_epoch: 2
eval_table_size:
eval_table_max_new_tokens: 128
saves_per_epoch: 1
debug:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  bos_token: "<|endoftext|>"
  eos_token: "<|endoftext|>"
  unk_token: "<|endoftext|>"
  pad_token: "<|endoftext|>"
""".strip()

# Write the configuration to CONFIG_FILE; print() adds back a final newline
# after the text was stripped above.
with open(CONFIG_FILE, 'w') as outfile:
    print(CONFIG, file=outfile)

In [None]:
%%time

# Run the fine-tuning: launch Axolotl's training CLI through the accelerate
# executable of the local venv, passing the configuration file named by CONFIG_FILE.
!venv/bin/accelerate launch -m axolotl.cli.train {CONFIG_FILE}

# Use the fine-tuned model

In [5]:
import torch
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Directory holding the fine-tuned adapter (same path as output_dir in the
# Axolotl configuration).
qlora_model = f"./out-{MODEL_SHORT_NAME}"
base_model = MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(base_model)
# FlashAttention-2 only supports fp16/bf16, so request bfloat16 explicitly
# (the dtype used for training, `bf16: true`) instead of relying on "auto"
# to pick a half-precision dtype from the checkpoint metadata.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map=0,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Attach the fine-tuned adapter to the base model, then merge its weights in
# and drop the adapter wrapper.
model = PeftModel.from_pretrained(model, qlora_model)
model = model.merge_and_unload()



  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Text-generation pipeline around the loaded model and tokenizer; with
# return_full_text=False only the newly generated text is returned.
pl = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    max_length=4096,
    return_full_text=False,
    repetition_penalty=1.1,
)

def generate(prompt):
    """Run prompt through the pipeline and return the generated text.

    The text of the first result is returned with surrounding whitespace
    stripped. The tokenizer's EOS token id is passed as the padding token id.
    """
    outputs = pl(prompt, pad_token_id=pl.tokenizer.eos_token_id)
    first_result = outputs[0]
    return first_result['generated_text'].strip()

In [7]:
%%time

import json

# Prompt layout with Instruction / Input / Response sections; the placeholders
# are filled in for each test record.
PROMPT_TEMPLATE = """
### Instruction:
{instruction}

### Input:
{input}

### Response:
""".strip()

# Generate a prediction for every test record and store it next to the ground
# truth, one JSON object per line.
with open(f'test-records-{MODEL_SHORT_NAME}.jsonl', 'w') as outfile:
    for rec in test_recs:
        prompt = PROMPT_TEMPLATE.format(instruction=INSTRUCTION, input=rec['input']) + "\n"
        response = generate(prompt)

        print(100 * "-")
        print("Ground Truth:")
        print(rec['output'])
        print("Prediction:")
        print(response)

        ground_truth = json.loads(rec['output'])
        try:
            prediction = json.loads(response)
        except json.JSONDecodeError:
            # The response is not valid JSON: record an empty prediction
            prediction = {}
        # The response may also be valid JSON that is not an object (a bare
        # string, number or list); treat that as a failed extraction too, so
        # that "prediction" is always a dict like "ground_truth".
        if not isinstance(prediction, dict):
            prediction = {}

        # rowid is set to unknown as we've lost it somewhere along the way...
        record = {"ground_truth": ground_truth, "prediction": prediction, "rowid": "unknown"}
        json.dump(record, outfile)
        outfile.write("\n")

----------------------------------------------------------------------------------------------------
Ground Truth:
{"dc.title": "Poliisikoulutuksen vaikuttavuusarviointi 2021 : vuosina 2018-2019 valmistuneiden poliisien ty\u00f6llisyys ja arviot koulutuksen ty\u00f6el\u00e4m\u00e4vastaavuudesta", "dc.contributor.author": ["Vuorensyrj\u00e4, Matti"], "dc.date.issued": "2021", "dc.identifier.isbn": ["978-951-815-386-6"], "dc.language.iso": "fin", "dc.publisher": ["Poliisiammattikorkeakoulu"]}
Prediction:
{"dc.title": "Poliisikoulutuksen vaikuttavuusarviointi 2021", "dc.contributor.author": ["Vuorensyr\u00e4, Matti"], "dc.date.issued": "2021-06-01", "dc.language.iso": "fin"}
----------------------------------------------------------------------------------------------------
Ground Truth:
{"dc.title": "FAQ : Taiteen digitaaliset toimintaymp\u00e4rist\u00f6t", "dc.date.issued": "2018", "dc.identifier.isbn": ["978-952-7266-07-6"], "dc.language.iso": "fin", "dc.publisher": ["Tampereen ammatti



----------------------------------------------------------------------------------------------------
Ground Truth:
{"dc.title": "Ymp\u00e4rist\u00f6vaikutusten arviointi 80-luvun liikerakennuksen kiinteist\u00f6kehitt\u00e4misess\u00e4 : Elinkaariarvioinnin (LCA), elinkaarikustannusten (LCC) ja energiasimuloinnin arviointiraportti", "dc.contributor.author": ["Keskisalo, Mika", "Kuusisto, Jari", "Matveinen, Mikko"], "dc.date.issued": "2022", "dc.relation.eissn": "2323-6914", "dc.identifier.isbn": ["978-952-275-365-6"], "dc.language.iso": "fin", "dc.publisher": ["Karelia-ammattikorkeakoulu"]}
Prediction:
{"dc.title": "Mika Keskisalo, Jari Kuusisto, Mikko Matveinen", "dc.date.issued": "2020-09-23", "dc.language.iso": "fin", "dc.publisher": ["Karelia-ammattikorkeakoulu"]}
----------------------------------------------------------------------------------------------------
Ground Truth:
{"dc.title": "IMF ennustaa talouskasvua euroalueelle ja Suomelle", "dc.contributor.author": ["Toivanen, Me

Token indices sequence length is longer than the specified maximum sequence length for this model (2226 > 2048). Running this sequence through the model will result in indexing errors


----------------------------------------------------------------------------------------------------
Ground Truth:
{"dc.title": "Monitorering av biofilmbildningen med kvartskristallmikrobalans (QCM) och ytplasmonresonans (SPR)", "dc.contributor.author": ["Holmstr\u00f6m, Ellinoora"], "dc.publisher": ["\u00c5bo Akademi"], "dc.date.issued": "2023", "dc.language.iso": "swe"}
Prediction:
{"dc.title": "Monitorering av biofilmbildning med kquartskristallmikrobalans och ytplasmonresonans", "dc.contributor.author": ["Holmstr\u00f6m, Ellinoora"], "dc.date.issued": "2023-01-01", "dc.identifier.isbn": ["978-952-244-942-2"], "dc.language.iso": "eng", "dc.publisher": ["\u00c5bo Akademi"]}
----------------------------------------------------------------------------------------------------
Ground Truth:
{"dc.title": "Resultatmanipulering : En studie om f\u00f6ruts\u00e4ttningarna p\u00e5 den finska marknaden", "dc.contributor.author": ["Lindqvist, Robert"], "dc.publisher": ["\u00c5bo Akademi"], "dc.d

In [8]:
# Analyze the statistics of the extracted metadata and save to file

import sys
# Make modules in the parent directory importable (presumably where the eval
# module lives -- verify).
sys.path.append('..')

from eval import MetadataEvaluator

# Use only the fields that Meteor extracts
fields = [
    "dc.contributor.author",
    "dc.date.issued",
    "dc.identifier.isbn",
    "dc.language.iso",
    "dc.publisher",
    "dc.relation.eissn",
    "dc.title",
]

# Evaluate the prediction records written for this model
evaluator = MetadataEvaluator(f'test-records-{MODEL_SHORT_NAME}.jsonl')
results = evaluator.evaluate_records()

# Save the results as a Markdown file in the parent directory
statistics_filename = f'../results-axolotl-fine-tune-{MODEL_SHORT_NAME}.md'
evaluator.save_md(results, statistics_filename, fields)