## Install and Import Dependencies

In [1]:
%%capture output
!pip install --user -r requirements.txt

In [2]:
# Optionally show the above output
# output.show()

In [3]:
# Restart the kernel once the install has finished so the new packages become importable
import IPython

IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [1]:
from __future__ import annotations
import string, re, collections
from datasets import load_dataset, Dataset
from evaluate import load
from google.auth import default
import pandas as pd
import pinecone
from sentence_transformers import SentenceTransformer
import subprocess
import torch
from tqdm import tqdm
import vertexai
from vertexai.language_models import TextGenerationModel
import yaml

# Read the notebook settings from the YAML config file
with open("./config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Pinecone settings
api_key, environment, search_index_name = (
    config["pinecone"][key] for key in ("api_key", "environment", "search_index_name")
)

# Vertex AI settings
model_name, project_id, location, bucket = (
    config["vertex_ai"][key] for key in ("model_name", "project_id", "location", "bucket")
)

## Prepare Dataset

The data is from the `databricks/databricks-dolly-15k` dataset imported from Hugging Face. Among other tasks, the dataset contains examples for closed question answering, information extraction, and summarization. These tasks are useful for training an LLM to extract information for the FAQ service. From [Databricks](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm):

>[databricks-dolly-15k](https://github.com/databrickslabs/dolly/tree/master/data) contains 15,000 high-quality human-generated prompt / response pairs specifically designed for instruction tuning large language models. Under the licensing terms for databricks-dolly-15k ([Creative Commons Attribution-ShareAlike 3.0 Unported License](https://creativecommons.org/licenses/by-sa/3.0/)), anyone can use, modify, or extend this dataset for any purpose, including commercial applications.
>
>To the best of our knowledge, this dataset is the first open source, human-generated instruction dataset specifically designed to make large language models exhibit the magical interactivity of ChatGPT. databricks-dolly-15k was authored by more than 5,000 Databricks employees during March and April of 2023. These training records are natural, expressive and designed to represent a wide range of the behaviors, from brainstorming and content generation to information extraction and summarization.

I'll split this dataset into train and test datasets. Also, I'll use 300 training examples in line with Google's recommendation of 100+ for extractive QA and 100-500+ for summarization.

In [None]:
# Load dataset
dataset = load_dataset("databricks/databricks-dolly-15k")

# Convert dataset to a DataFrame
df_full = pd.DataFrame({
    "instruction": dataset["train"]["instruction"],
    "context": dataset["train"]["context"],
    "output_text": dataset["train"]["response"],
    "category": dataset["train"]["category"],
})

# Keep only rows with context
df_full = df_full[df_full["category"].isin(["closed_qa", "information_extraction", "summarization"])]
df_full = df_full.reset_index(drop=True)

# Train / Test split (the test set is every row not sampled into the training set)
df_train = df_full.sample(frac=0.8, random_state=2023)
df_test = df_full.drop(df_train.index)
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Add semantically similar alternative phrasing of the queries ("instruction")
df_test = df_test.truncate(after=249) # I got tired of rephrasing questions
alt_questions = pd.read_csv("./alt_questions.csv")
# NOTE(review): assumes alt_questions.csv holds a single column aligned row-for-row with df_test - confirm
df_test["alt_questions"] = alt_questions

df_train["input_text"] = "question: " + df_train["instruction"] + " context: " + df_train["context"]
df_test["input_text"] = "question: " + df_test["instruction"] + " context: " + df_test["context"]

# Set number of training examples to 300
num_train_examples = 300
df_train = df_train[:num_train_examples]

# It needs to be in a JSONL format for batch prediction
test_json = df_test["input_text"].to_frame().rename(columns={"input_text": "prompt"}).to_json(orient="records", lines=True)
with open("./test.jsonl", "w") as f:
    f.write(test_json)

# Upload the prompts to the bucket. Passing an argument list avoids shell interpolation of the
# bucket name, and check=True raises CalledProcessError instead of silently ignoring a failed upload.
subprocess.run(["gsutil", "cp", "./test.jsonl", f"gs://{bucket}/test.jsonl"], check=True)

## Evaluate the Base Model

I'll use the F1 score (0–1, higher is better) to evaluate the models on the test set. Some definitions:

- **True Positive:** Number of shared tokens between the prediction and the correct answer.
- **False Positive:** Number of tokens in the predicted sequence, excluding the shared tokens.
- **False Negative:** Number of tokens in the correct answer, excluding the shared tokens.
- **Precision (P):** True Positive / (True Positive + False Positive)
- **Recall (R):** True Positive / (True Positive + False Negative)
- **F1 Score:** 2PR / (P + R)

Also, `temperature` will be `0` and `topK` will be `1` as these tend to work well with the foundational model according to Google. I could set up a hyperparameter search if optimization is required.

In [3]:
def batch_prediction(model_name):
    """Run a batch prediction job over the uploaded test prompts and load the results.

    The prompts are read from ``gs://{bucket}/test.jsonl`` and the job output is
    written under ``gs://{bucket}``. The output file is then copied to
    ``./response.jsonl`` and parsed.

    Args:
      model_name: Name of the text generation model passed to
        ``TextGenerationModel.from_pretrained``.

    Returns:
      A pandas DataFrame built from the response file with the "status" column
      dropped, where "instance" holds the prompt text and "predictions" holds
      the content of the first prediction.

    Raises:
      subprocess.CalledProcessError: If copying the output file with gsutil fails.
    """
    model = TextGenerationModel.from_pretrained(model_name)
    batch_prediction_job = model.batch_predict(
        dataset=[f"gs://{bucket}/test.jsonl"],
        destination_uri_prefix=f"gs://{bucket}",
        model_parameters={
            "maxOutputTokens": "512",
            "temperature": "0",
            "topP": "0.95",
            "topK": "1",
        },
    )

    # NOTE(review): assumes the job writes every prediction to this single output file - confirm
    output_file = batch_prediction_job.output_info.gcs_output_directory + "/000000000000.jsonl"
    # Argument list avoids shell interpolation; check=True stops a failed copy from
    # leaving a stale or missing ./response.jsonl to be parsed below.
    subprocess.run(["gsutil", "cp", output_file, "./response.jsonl"], check=True)

    response = pd.read_json("./response.jsonl", lines=True).drop(columns=["status"])
    # Flatten the nested records down to the prompt text and the first prediction's text
    response["instance"] = response["instance"].apply(lambda x: x["prompt"])
    response["predictions"] = response["predictions"].apply(lambda x: x[0]["content"])

    return response

In [None]:
# Score the test prompts with the untuned base model
base_model_response = batch_prediction(model_name="text-bison@001")

In [5]:
# Attach the base-model predictions to the test rows by matching on the prompt text
df_test = pd.merge(df_test, base_model_response, left_on="input_text", right_on="instance")

In [6]:
# Derived from Official evaluation script for SQuAD version 2.0.
# https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/
    
def normalize_text(text):
    """Lower-case the text, strip punctuation and articles, and collapse whitespace."""
    lowered = text.lower()

    # Drop every ASCII punctuation character
    without_punctuation = lowered.translate(str.maketrans("", "", string.punctuation))

    # Replace the standalone articles "a", "an" and "the" with a space
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation, flags=re.UNICODE)

    # Collapse runs of whitespace into single spaces and trim both ends
    return " ".join(without_articles.split())

def tokenize(text):
    """Split text into normalized tokens; falsy input (e.g. "" or None) gives an empty list."""
    return normalize_text(text).split() if text else []

def f1_score(predicted, ground_truth):
    """Token-level F1 between a predicted answer and the reference answer.

    Both strings are tokenized with ``tokenize``; shared tokens are counted
    with multiplicity. Returns a value between 0 and 1.
    """
    pred_tokens = tokenize(predicted)
    truth_tokens = tokenize(ground_truth)

    # If either side has no tokens, the score is 1 when both are empty and 0 otherwise
    if not pred_tokens or not truth_tokens:
        return int(truth_tokens == pred_tokens)

    # Multiset intersection: each token counts as often as it appears on both sides
    overlap = collections.Counter(truth_tokens) & collections.Counter(pred_tokens)
    shared = sum(overlap.values())
    if shared == 0:
        return 0.0

    precision = shared / len(pred_tokens)
    recall = shared / len(truth_tokens)
    return (2 * precision * recall) / (precision + recall)

In [7]:
# F1 of every base-model prediction against its reference answer
f1_scores_base = pd.Series([
    f1_score(prediction, answer)
    for prediction, answer in zip(df_test["predictions"], df_test["output_text"])
])
f1_scores_base.describe()

count    250.000000
mean       0.425794
std        0.246602
min        0.000000
25%        0.250000
50%        0.375273
75%        0.594595
max        1.000000
dtype: float64

## Create a model tuning job

The fine tuning recommendations from Google are 100-500 train steps for extractive QA and 200-1000 train steps for summarization. I'll use 300 train steps.

In [None]:
# Default Google credentials, scoped for Cloud Platform access
credentials, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"])


def tuning(
    project_id: str,
    training_data: pd.DataFrame | str,
    train_steps: int = 300,
    location: str = "us-central1"
) -> None:
    """Start tuning "text-bison@001" on prompt/response examples and print the job status.

    "training_data" is either the GCS URI of a JSONL file
    (for example: training_data=f'gs://{bucket}/{filename}.jsonl') or a pandas
    DataFrame. Each JSONL record should have two keys, for example:
      {
        "input_text": <input prompt>,
        "output_text": <associated output>
      },
    and a pandas DataFrame should contain the two columns
      ['input_text', 'output_text']
    with one row per training example.

    Args:
      project_id: GCP Project ID, used to initialize vertexai
      training_data: GCS URI of jsonl file or pandas dataframe of training data
      train_steps: Number of training steps to use when tuning the model.
      location: GCP Region, used to initialize vertexai and passed as both the
        tuning job location and the tuned model location.
    """
    vertexai.init(project=project_id, location=location, credentials=credentials)
    base_model = TextGenerationModel.from_pretrained("text-bison@001")

    tuning_options = {
        "training_data": training_data,
        "train_steps": train_steps,
        "tuning_job_location": location,
        "tuned_model_location": location,
    }
    base_model.tune_model(**tuning_options)

    # NOTE(review): _job is a private attribute, presumably set by tune_model - confirm it exists in the SDK version in use
    print(base_model._job.status)

tuning(project_id=project_id, training_data=df_train, train_steps=300, location=location)

## Evaluate the Fine-Tuned Model

In [None]:
# NOTE(review): "###" looks like a redacted placeholder for the tuned model's name - replace before running
tuned_model_response = batch_prediction("###")

# Rename the base-model columns so they do not clash with the tuned-model columns merged in next
df_test = df_test.rename(columns={"predictions": "predictions_base", "instance": "instance_base"})
df_test = df_test.merge(tuned_model_response, left_on="input_text", right_on="instance")

# F1 of every tuned-model prediction against its reference answer
f1_scores_tuned = pd.Series([
    f1_score(prediction, answer)
    for prediction, answer in zip(df_test["predictions"], df_test["output_text"])
])
f1_scores_tuned.describe()

## Connect to prediction endpoint of fine-tuned model

In [8]:
def single_prediction(
    input_text: str,
    model_name: str
) -> str:
    """Generate text for a single prompt with the named text generation model.

    Args:
      input_text: Prompt sent to the model.
      model_name: Name passed to ``TextGenerationModel.from_pretrained``.

    Returns:
      The text of the model's response.
    """
    vertexai.init(project=project_id, location=location)

    model = TextGenerationModel.from_pretrained(model_name)
    response = model.predict(
        input_text,
        temperature=0,  # Degree of randomness in token selection.
        max_output_tokens=512,  # Upper bound on the amount of generated text.
        top_p=0.95,  # Tokens are taken from most to least probable until their probabilities sum to top_p.
        top_k=1,  # With top_k of 1, the single most probable token is selected.
    )
    return response.text

## Convert the questions to embeddings and store in Pinecone vector database

In [9]:
def create_search_index(df, model, api_key, environment, search_index_name, batch_size = 100):
    """Embed the questions in ``df`` and upsert them into a Pinecone index.

    The index is created with the cosine metric if it does not exist yet. Each
    vector's id is the row's position in ``df`` (as a string) and its metadata
    carries the row's "instruction" and "context".

    Args:
      df: DataFrame with "instruction" and "context" columns.
      model: Embedding model providing ``get_sentence_embedding_dimension()``
        and ``encode()``.
      api_key: Pinecone API key (from app.pinecone.io).
      environment: Pinecone environment belonging to that API key.
      search_index_name: Name of the index to create or reuse.
      batch_size: Number of rows embedded and upserted per request.

    Returns:
      The ``pinecone.Index`` the vectors were upserted into.
    """
    pinecone.init(api_key=api_key, environment=environment)

    # If the search index doesn't exist, create one
    if search_index_name not in pinecone.list_indexes():
        pinecone.create_index(
            name=search_index_name,
            dimension=model.get_sentence_embedding_dimension(),
            metric="cosine",
        )
    search_index = pinecone.Index(search_index_name)

    for start in tqdm(range(0, len(df), batch_size), desc="Corpus Progress"):
        stop = min(start + batch_size, len(df))

        # Select rows by position so ids, embeddings and metadata stay aligned
        # even when df does not carry a default 0..n-1 index.
        batch_df = df.iloc[start:stop]
        questions = batch_df["instruction"].tolist()
        contexts = batch_df["context"].tolist()

        ids = [str(position) for position in range(start, stop)]
        embeddings = model.encode(questions).tolist()
        metadata = [
            {"instruction": instruction, "context": context}
            for instruction, context in zip(questions, contexts)
        ]

        # Upsert the batch to the search index
        search_index.upsert(vectors=list(zip(ids, embeddings, metadata)))

    return search_index

In [10]:
# Embed on a CUDA GPU when one is available; otherwise fall back to the CPU and warn about the speed
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
    print(f"You are using {device}. Embedding will take a long time unless using a CUDA-enabled GPU.")

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)
search_index = create_search_index(
    df=df_full,
    model=model,
    api_key=api_key,
    environment=environment,
    search_index_name=search_index_name,
)

# How many embeddings do we have?
search_index.describe_index_stats()

Corpus Progress: 100%|██████████| 45/45 [00:20<00:00,  2.19it/s]


{'dimension': 384,
 'index_fullness': 0.04467,
 'namespaces': {'': {'vector_count': 4467}},
 'total_vector_count': 4467}

In [11]:
# Look up the stored question closest to the first rephrased test question
query = df_test["alt_questions"][0]
embedded_query = model.encode(query).tolist()
query_results = search_index.query(embedded_query, top_k=1, include_metadata=True)
for match in query_results["matches"]:
    print(f"{round(match['score'], 2)}: {match['metadata']['instruction']}")

0.95: When was Tomoaki Komorida born?


## Question answering

In [12]:
def rtr_qa(query):
    """Answer a query from the context of the closest stored question.

    The query is embedded, the single nearest entry is fetched from the search
    index, and that entry's instruction and context form the prompt sent to
    ``single_prediction``. Returns the model's response.
    """
    query_vector = model.encode(query).tolist()
    matches = search_index.query(query_vector, top_k=1, include_metadata=True)["matches"]
    best_metadata = matches[0]["metadata"]
    input_text = "question: " + best_metadata["instruction"] + " context: " + best_metadata["context"]
    return single_prediction(input_text, model_name)

print(rtr_qa(query))

The answer is July 10, 1981.

The question is "When was Tomoaki Komorida born?". The context is "Komorida was born in Kumamoto Prefecture on July 10, 1981". So the answer is July 10, 1981.


## Appendix

Before moving on to PaLM 2, I first tried fine-tuning a Llama-2 model using the `trl` library from Hugging Face on an Nvidia L4 GPU with QLoRA 4-bit quantization. It took a while and it became obvious that I would need to use distributed training. Since I was already on GCP's Vertex AI Workbench, I decided to switch away from Hugging Face so as not to interrupt my development speed.

I'll return to this model later, and use the Hugging Face [Accelerate](https://huggingface.co/docs/accelerate/v0.21.0/en/basic_tutorials/notebook) library.

I keep the code below for reference.

In [None]:
# %%capture output
#!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('###')"

In [None]:
# Optionally show the above output
# output.show()

In [None]:
# # Automatically restart kernel after installs so that the environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

In [None]:
# from datasets import load_dataset, Dataset
# import pandas as pd
# from huggingface_hub import notebook_login
# from peft import LoraConfig
# from transformers import AutoModelForCausalLM, BitsAndBytesConfig, HfArgumentParser, TrainingArguments
# from trl import SFTTrainer

In [2]:
# # Load dataset
# dataset = load_dataset("databricks/databricks-dolly-15k")

# # Convert dataset to a DataFrame
# df_full = pd.DataFrame({
#     "instruction": dataset["train"]["instruction"],
#     "context": dataset["train"]["context"],
#     "response": dataset["train"]["response"],
#     "category": dataset["train"]["category"],
# })

# # Keep only rows with context
# df_full = df_full[df_full["category"].isin(["closed_qa", "information_extraction", "summarization"])]

# # Train / Test split
# df_train = df_full.sample(frac=0.8, random_state=2023)
# df_test = df_full.drop(df_train.index)
# df_train.reset_index(drop=True, inplace=True)
# df_test.reset_index(drop=True, inplace=True)

# # Add semantically similar alterative phrasing of the queries ("instruction")
# df_test = df_test.truncate(after=249) # I got tired of rephrasing questions
# alt_questions = pd.read_csv("./alt_questions.csv")
# df_test["alt_questions"] = alt_questions

# df_train["text"] = "### Human: " + df_train["context"] + "Based on above context, answer the following question: " + df_train["instruction"] + "### Assistant: " + df_train["response"]
# df_train = df_train[:64] # Full training would take 8 hrs on Nvidia L4, this takes about 11 minutes

# ds_train = Dataset.from_pandas(df_train)

Found cached dataset json (/home/jupyter/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-1a24287182230a5f/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
# notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
# from transformers import pipeline
# testing="### Human: " + df_train["context"][0] + "Based on above context, answer the following question: " + df_train["instruction"][0] + "### Assistant: " + df_train["response"][0]
# print(testing)

### Human: On the team's first full day at the Olympics in Atlanta, the media announced that O'Neal would join the Los Angeles Lakers on a seven-year, $121 million contract. O'Neal insisted he did not choose Los Angeles for the money; discussing the signing he referred to a couple of his product endorsements, saying: "I'm tired of hearing about money, money, money, money, money. I just want to play the game, drink Pepsi, wear Reebok." The Lakers won 56 games during the 1996–97 season. O'Neal averaged 26.2 points and 12.5 rebounds in his first season with Los Angeles; however, he again missed over 30 games due to injury. The Lakers made the playoffs, but were eliminated in the second round by the Utah Jazz in five games. In his first playoff game for the Lakers, O'Neal scored 46 points against the Portland Trail Blazers, the most for the Lakers in a playoff game since Jerry West had 53 in 1969. On December 17, 1996, O'Neal shoved Dennis Rodman of the Chicago Bulls; Rodman's teammates Sc

In [26]:
# pipe = pipeline(model=model_name, device_map=device_map, model_kwargs={"load_in_4bit": True})
# output = pipe(testing, do_sample=True, top_p=0.95)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [31]:
# print(output[0]["generated_text"])

On the team's first full day at the Olympics in Atlanta, the media announced that O'Neal would join the Los Angeles Lakers on a seven-year, $121 million contract. O'Neal insisted he did not choose Los Angeles for the money; discussing the signing he referred to a couple of his product endorsements, saying: "I'm tired of hearing about money, money, money, money, money. I just want to play the game, drink Pepsi, wear Reebok." The Lakers won 56 games during the 1996–97 season. O'Neal averaged 26.2 points and 12.5 rebounds in his first season with Los Angeles; however, he again missed over 30 games due to injury. The Lakers made the playoffs, but were eliminated in the second round by the Utah Jazz in five games. In his first playoff game for the Lakers, O'Neal scored 46 points against the Portland Trail Blazers, the most for the Lakers in a playoff game since Jerry West had 53 in 1969. On December 17, 1996, O'Neal shoved Dennis Rodman of the Chicago Bulls; Rodman's teammates Scottie Pippe

In [12]:
# from peft import PeftConfig, PeftModel
# from transformers import AutoTokenizer

# fine_tuned_model = PeftModel.from_pretrained(model, output_dir)
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# inputs = tokenizer.encode(testing, return_tensors="pt").to("cuda")
# outputs = model.generate(inputs, max_length= 1024)
# print(tokenizer.decode(outputs[0]))

<s> ### Human: On the team's first full day at the Olympics in Atlanta, the media announced that O'Neal would join the Los Angeles Lakers on a seven-year, $121 million contract. O'Neal insisted he did not choose Los Angeles for the money; discussing the signing he referred to a couple of his product endorsements, saying: "I'm tired of hearing about money, money, money, money, money. I just want to play the game, drink Pepsi, wear Reebok." The Lakers won 56 games during the 1996–97 season. O'Neal averaged 26.2 points and 12.5 rebounds in his first season with Los Angeles; however, he again missed over 30 games due to injury. The Lakers made the playoffs, but were eliminated in the second round by the Utah Jazz in five games. In his first playoff game for the Lakers, O'Neal scored 46 points against the Portland Trail Blazers, the most for the Lakers in a playoff game since Jerry West had 53 in 1969. On December 17, 1996, O'Neal shoved Dennis Rodman of the Chicago Bulls; Rodman's teammate

In [4]:
# modified from https://github.com/huggingface/trl/tree/main/examples/scripts/sft_trainer.py

# This is a previous experiment where I use PEFT to fine-tune Llama-2-7b on Hugging Face


# # Set variables
# model_name="meta-llama/Llama-2-7b-hf"
# dataset_text_field = "text" # The text field of the dataset
# log_with = None # use 'wandb' to log with wandb
# learning_rate = 2e-4
# batch_size = 2
# seq_length = 1024 # Input sequence length
# gradient_accumulation_steps = 2
# load_in_4bit = True # Load the model in 4 bits precision
# trust_remote_code = True
# output_dir = "trained_model"
# peft_lora_r = 64 # The r parameter of the LoRA adapters
# peft_lora_alpha = 16 # The alpha parameter of the LoRA adapters
# logging_steps = 1
# use_auth_token = True # Use HuggingFace auth token to access the model
# num_train_epochs = 3
# max_steps = -1 # The number of training steps
# quantization_config = BitsAndBytesConfig(load_in_4bit=load_in_4bit)
# device_map = {"": 0} # Fit the entire model on the GPU:0
# torch_dtype = torch.bfloat16

# # Load the model
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=quantization_config,
#     device_map=device_map,
#     trust_remote_code=trust_remote_code,
#     torch_dtype=torch_dtype,
#     use_auth_token=use_auth_token
# )

# # Load the dataset
# dataset = ds_train

# # Define the training arguments
# training_args = TrainingArguments(
#     output_dir=output_dir,
#     per_device_train_batch_size=batch_size,
#     gradient_accumulation_steps=gradient_accumulation_steps,
#     learning_rate=learning_rate,
#     logging_steps=logging_steps,
#     num_train_epochs=num_train_epochs,
#     max_steps=max_steps,
#     report_to=log_with,
# )

# # Define the LoRA configuration
# peft_config = LoraConfig(
#     r=peft_lora_r,
#     lora_alpha=peft_lora_alpha,
#     bias="none",
#     task_type="CAUSAL_LM",
# )

# # Define the trainer
# trainer = SFTTrainer(
#     model=model,
#     args=training_args,
#     max_seq_length=seq_length,
#     train_dataset=ds_train,
#     dataset_text_field=dataset_text_field,
#     peft_config=peft_config,
# )

# trainer.train()

# # Save the model
# trainer.save_model(output_dir)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.


Map:   0%|          | 0/64 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,1.8981
2,1.6273
3,1.9227
4,1.9635
5,2.0249
6,1.6156
7,1.7576
8,1.3083
9,1.749
10,1.6488
