# Fine-tune a Mistral-7B-based model (zephyr-7b-beta) using the Ludwig framework

See requirements.txt for dependencies. The main dependency is `ludwig[llm]`.

Based on these sources:
* https://ludwig.ai/latest/getting_started/llm_finetuning/
* https://ludwig.ai/latest/user_guide/llms/finetuning/
* https://colab.research.google.com/drive/1Ly01S--kUwkKQalE-75skalp-ftwl0fE?usp=sharing
* https://predibase.com/blog/fine-tuning-mistral-7b-on-a-single-gpu-with-ludwig
* https://levelup.gitconnected.com/no-more-hard-coding-use-declarative-configuration-to-build-and-fine-tune-custom-llms-on-your-data-6418b243fad7

In [1]:
# Load and prepare fine-tuning dataset

import json
import glob
import pandas as pd

# glob.glob returns matches in arbitrary, filesystem-dependent order. Sorting
# makes the row order of the resulting DataFrames (and of everything derived
# from them) reproducible across machines and runs.
train_files = sorted(glob.glob("../../llm-dataset/*-eng-train.jsonl"))
test_files = sorted(glob.glob("../../llm-dataset/*-eng-test.jsonl"))

# Ground-truth metadata fields kept as the fine-tuning target; every other
# ground-truth field is discarded.
KEEP_FIELDS = {
    'dc.contributor.author',
    'dc.date.issued',
    'dc.identifier.isbn',
    'dc.language.iso',
    'dc.publisher',
    'dc.relation.eissn',
    'dc.title',
}
# Bookkeeping keys stripped from every sample.
DROP_FIELDS = ('metadata', 'id', 'url')
# Maximum number of characters of document text kept per sample
# (1024 is a smaller alternative).
MAX_TEXT_LENGTH = 2048


def preprocess_sample(sample):
    """Return a cleaned copy of one dataset sample.

    The returned dict has:
      * "ground_truth" reduced to the keys listed in KEEP_FIELDS,
      * "ground_truth_json" holding that reduced dict encoded as a JSON string,
      * "text" truncated to the first MAX_TEXT_LENGTH characters,
      * the DROP_FIELDS keys removed (keys that are absent are ignored).

    The input dict is not modified.

    Args:
        sample: dict with at least a "text" string and a "ground_truth" dict.

    Returns:
        A new dict with the fields described above.

    Raises:
        KeyError: if "text" or "ground_truth" is missing.
    """
    subset = {key: val
              for key, val in sample["ground_truth"].items()
              if key in KEEP_FIELDS}

    # Shallow copy so the caller's dict is left untouched.
    processed = dict(sample)
    for key in DROP_FIELDS:
        # pop with a default: samples lacking a bookkeeping key are still valid.
        processed.pop(key, None)

    processed["ground_truth"] = subset
    processed["ground_truth_json"] = json.dumps(subset)
    processed["text"] = sample["text"][:MAX_TEXT_LENGTH]
    return processed

def dataset_to_df(files):
    """Load JSON Lines files into a single DataFrame of preprocessed samples.

    Each non-blank line of each file is parsed as one JSON object, passed
    through preprocess_sample, and becomes one row of the result. Rows appear
    in the order the files are given and, within a file, in line order.

    Args:
        files: iterable of paths to JSON Lines files.

    Returns:
        pandas.DataFrame with one row per sample (empty if no samples were read).

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    records = []
    for filename in files:
        # JSON text is UTF-8 by specification; do not rely on the platform's
        # default encoding.
        with open(filename, encoding="utf-8") as infile:
            for line in infile:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                sample = json.loads(line)
                records.append(preprocess_sample(sample))
    return pd.DataFrame.from_records(records)

# Build the training and test frames from their respective file lists.
train_df, test_df = map(dataset_to_df, (train_files, test_files))

# Show (rows, columns) of both frames, then the column labels of the training frame.
print(train_df.shape, test_df.shape)
print(train_df.keys())

# No model yet: the training cell assigns one, and the inference cell loads a
# saved one when this is still None.
model = None

(212, 3) (59, 3)
Index(['text', 'ground_truth', 'ground_truth_json'], dtype='object')


In [2]:
# workaround for bitsandbytes bug https://github.com/TimDettmers/bitsandbytes/issues/675

import os

# Remove SLURM_SUBMIT_DIR from this process's environment; nothing to do when
# the variable is not set.
try:
    del os.environ["SLURM_SUBMIT_DIR"]
except KeyError:
    pass

# Finetuning

Fine-tuning-specific code starts here. Skip to the Inference section if you already have a fine-tuned model and want to use it.

In [3]:
import yaml

# Declarative Ludwig configuration, written as YAML and parsed below.
# As written, it selects:
#   * an LLM model type on top of the HuggingFaceH4/zephyr-7b-beta base model,
#     with 4-bit quantization and a LoRA adapter;
#   * an instruction-style prompt template containing a {text} placeholder
#     (presumably filled from the `text` input feature -- confirm against Ludwig docs);
#   * `text` as the input feature and `ground_truth_json` as the output feature,
#     with max_sequence_length 1280 and 768 respectively;
#   * a `finetune` trainer: 3 epochs, batch size 1 with 16 gradient accumulation
#     steps, gradient checkpointing, cosine learning-rate decay with 3% warmup;
#   * generation at temperature 0.1 with at most 768 new tokens.
config_str = """
model_type: llm
base_model: HuggingFaceH4/zephyr-7b-beta

quantization:
  bits: 4

adapter:
  type: lora

prompt:
  template: |
    ### Instruction:
    Extract metadata from the following document. Return as JSON.

    ### Input:
    {text}

    ### Response:

input_features:
  - name: text
    type: text
    preprocessing:
      max_sequence_length: 1280

output_features:
  - name: ground_truth_json
    type: text
    preprocessing:
      max_sequence_length: 768

trainer:
  type: finetune
  learning_rate: 0.0002
  batch_size: 1
  gradient_accumulation_steps: 16
  enable_gradient_checkpointing: true
  epochs: 3
  learning_rate_scheduler:
    decay: cosine
    warmup_fraction: 0.03
    reduce_on_plateau: 0

preprocessing:
  sample_ratio: 1.0

generation:
  temperature: 0.1
  max_new_tokens: 768
"""

# Parse the YAML text into the plain Python object that is handed to
# LudwigModel(config=...) in the training cell. safe_load only builds
# standard Python types.
config = yaml.safe_load(config_str)

In [None]:
import logging
from ludwig.api import LudwigModel

# Instantiate the Ludwig model from the parsed configuration, logging at INFO level.
model = LudwigModel(
    config=config,
    logging_level=logging.INFO,
)

# Fine-tune on the training frame; the test frame is supplied as the test set.
results = model.train(
    training_set=train_df,
    test_set=test_df,
)

In [5]:
# Save the model
# Writes the fine-tuned model to the relative path "finetuned-model", the same
# path the Inference section passes to LudwigModel.load.
model.save("finetuned-model")

# Inference

You can start running the notebook from here if you have already fine-tuned a model in a previous session!

**Note:** You still need to load the datasets above and run the bitsandbytes workaround.

In [3]:
# If no model exists, load the previously fine-tuned model

import logging
from ludwig.api import LudwigModel

# Keep a model that is already in memory; otherwise restore the one saved
# under "finetuned-model".
model = (
    LudwigModel.load("finetuned-model", logging_level=logging.INFO)
    if model is None
    else model
)

  from .autonotebook import tqdm as notebook_tqdm


PyTorch version 2.0.1 available.
Loading large language model...
We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards: 100%|██████████| 8/8 [02:35<00:00, 19.41s/it]


Done.
Loaded HuggingFace implementation of HuggingFaceH4/zephyr-7b-beta tokenizer
Loading metadata from: finetuned-model/training_set_metadata.json


In [4]:
%%time
# Inference using the trained model

# Predict in two separate calls (the first 32 rows, then the remainder);
# running the whole test set in one call gets the kernel killed (GPU OOM?).
test_preds1, _ = model.predict(test_df.iloc[:32], batch_size=16)
test_preds2, _ = model.predict(test_df.iloc[32:], batch_size=16)


Loaded HuggingFace implementation of HuggingFaceH4/zephyr-7b-beta tokenizer


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Prediction: 100%|██████████| 2/2 [03:23<00:00, 101.91s/it]
Loaded HuggingFace implementation of HuggingFaceH4/zephyr-7b-beta tokenizer
Finished predicting in: 206.03s.


  return np.sum(np.log(sequence_probabilities))


Loaded HuggingFace implementation of HuggingFaceH4/zephyr-7b-beta tokenizer


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Prediction: 100%|██████████| 2/2 [02:17<00:00, 68.69s/it]
Loaded HuggingFace implementation of HuggingFaceH4/zephyr-7b-beta tokenizer
Finished predicting in: 138.56s.
CPU times: user 5min 39s, sys: 8.84 s, total: 5min 48s
Wall time: 5min 44s


  return np.sum(np.log(sequence_probabilities))


In [26]:
# Merge the two prediction batches into a single dataframe (batch 1 rows first).
test_preds = pd.concat([test_preds1, test_preds2])

# zip() below silently stops at the shorter input, which would hide a missing
# or extra prediction; fail loudly instead.
assert len(test_preds) == len(test_df), (
    f"got {len(test_preds)} predictions for {len(test_df)} test samples"
)

# Write one JSON record per test sample, pairing the ground truth with the
# model's prediction.
with open('test-records.jsonl', 'w') as outfile:
    for ground_truth, prediction in zip(test_df['ground_truth_json'], test_preds['ground_truth_json_response']):
        print(f"Ground Truth:\n{ground_truth}")
        print(f"Prediction:\n{prediction[0]}\n")
        ground_truth = json.loads(ground_truth)
        try:
            prediction = json.loads(prediction[0])
        except json.JSONDecodeError as err:
            # The model may emit malformed or truncated JSON. Record an empty
            # prediction rather than aborting the export halfway through.
            # NOTE(review): assumes the evaluator treats absent fields as
            # misses -- confirm.
            print(f"WARNING: prediction is not valid JSON ({err}); recording an empty prediction\n")
            prediction = {}
        # rowid is set to unknown as we've lost it somewhere along the way...
        record = {"ground_truth": ground_truth, "prediction": prediction, "rowid": "unknown"}
        json.dump(record, outfile)
        outfile.write("\n")

Ground Truth:
{"dc.title": "Essays on economic productivity", "dc.contributor.author": ["L\u00e4hdem\u00e4ki, Sakari"], "dc.date.issued": "2021-03-05", "dc.identifier.isbn": ["978-951-29-8349-0"], "dc.language.iso": "eng", "dc.publisher": ["University of Turku, Turku School of Economics"], "dc.relation.eissn": "2343-3167"}
Prediction:
{"dc.title": "Essays on Economic Productivity", "dc.contributor.author": ["L\u00e4hdem\u00e4ki, Sakari"], "dc.date.issued": "2021-03-25", "dc.language.iso": "eng", "dc.publisher": ["University of Turku"], "dc.relation.eissn": "2343-3167"}

Ground Truth:
{"dc.title": "Educating global citizens : a study of interaction between NGOs and schools in Finland", "dc.contributor.author": ["Henriksson, Heidi"], "dc.date.issued": "2022-10-07", "dc.identifier.isbn": ["978-952-389-029-9"], "dc.language.iso": "eng", "dc.publisher": ["\u00c5bo Akademis f\u00f6rlag - \u00c5bo Akademi University Press"]}
Prediction:
{"dc.title": "Educating global citizens : a study of int

In [1]:
# Evaluate the records written to test-records.jsonl and save the statistics
# as a Markdown file named after the model.
model_name = 'zephyr-7b'

import sys
# Put the parent directory on the import path; the `eval` module imported
# below presumably lives there.
sys.path.append('..')

from eval import MetadataEvaluator

evaluator = MetadataEvaluator('test-records.jsonl')
results = evaluator.evaluate_records()

# Use only the fields that Meteor extracts
fields = [
    "dc.contributor.author", "dc.date.issued", "dc.identifier.isbn",
    "dc.language.iso", "dc.publisher", "dc.relation.eissn", "dc.title",
]

statistics_filename = f'../results-ludwig-fine-tune-{model_name}.md'
evaluator.save_md(results, statistics_filename, fields)