### Running this notebook requires a T4 GPU with a High-RAM runtime, and your Hugging Face read and write tokens (plus your OpenAI API key) stored in the Colab Secrets

Mount Google Drive and install the required Python libraries

In [1]:
from google.colab import drive
from google.colab import userdata

In [2]:
# Mount Google Drive at /content/gdrive; later cells read data from and write
# checkpoints to relative `gdrive/...` paths.
# NOTE(review): those relative paths presumably rely on the working directory
# being /content - confirm.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# Install the third-party packages imported by this notebook.
# NOTE(review): no versions are pinned, so a fresh run may resolve to releases
# different from the ones that produced the outputs shown below.
!pip install chromadb
!pip install llama-index
!pip install transformers
# bitsandbytes is requested explicitly from the main PyPI index
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install accelerate
!pip install peft
!pip install trl
!pip install ragas
# LlamaIndex integration packages (LLMs, embeddings, Chroma vector store)
!pip install llama-index-llms-huggingface
!pip install llama-index-llms-openai
!pip install llama-index-embeddings-huggingface
!pip install llama-index-program-openai
!pip install llama-index-agent-openai
!pip install llama-index-vector-stores-chroma


Collecting chromadb
  Downloading chromadb-0.4.24-py3-none-any.whl (525 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m525.5/525.5 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.110.1-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.9/91.9 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)
  Downloading uvicorn-0.29.0-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.5.0-py2

In [3]:

import json
import chromadb
import torch
import pandas as pd
import matplotlib.pyplot as plt
import tensorboard

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, Settings
from llama_index.core.prompts import PromptTemplate
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.response.notebook_utils import display_response
from llama_index.readers.file import FlatReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.llms.openai import OpenAI
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from transformers import (BitsAndBytesConfig,
               AutoTokenizer,
               AutoModelForCausalLM,
               AutoConfig,
               TrainingArguments)
from peft import LoraConfig,PeftModel
from trl import SFTTrainer
from datasets import load_dataset, Dataset
from ragas import evaluate
from ragas.metrics import answer_relevancy, faithfulness, answer_similarity

import os
import copy
from pathlib import Path
import seaborn as sns
from tqdm import tqdm
import gc
from IPython.display import Markdown, display

In [4]:
# Define global variables and load access tokens/keys from the Colab Secrets
w_hf_token = userdata.get('HF_W_TOKEN')  # Hugging Face token used for push_to_hub (write)
r_hf_token = userdata.get('HF_TOKEN')  # Hugging Face token used for downloading models (read)
# Exported as an environment variable rather than passed explicitly.
# NOTE(review): presumably consumed by the OpenAI-backed Ragas evaluation - confirm.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

# Hugging Face repo for storing the finetuned LoRA parameters for the Llama2-7b model.
# You will need to change the repo IDs below to repos you can write to;
# the existing repos have been set to public, so they can still be read from.
FINETUNED_LLAMA_LORA_HF_REPO = 'LilVegeDog/finetuned_llama_lora_alpha_8_rank_32'
# Hugging Face repo for storing the merged, finetuned Llama2-7b model
FINETUNED_LLAMA_HF_REPO = 'LilVegeDog/finetuned_Llama_2_for_SEP775'
EMBED_MODLE_ID = "BAAI/bge-small-en-v1.5" # Embedding model ID (name is spelled "MODLE"; other cells reference it as-is)
# System prompt embedded into the Llama2 query wrapper prompt in the evaluation section
SYSTEM_PROMPT = """
You are an AI teaching Assistant for the course SEP 775.
You will provide an interactive platform for students to ask questions and receive guidance on course materials.
Your goal is to answer questions as accurately as possible based on the instructions and context provided.
If you do not know the answer, response with "I don't know."
"""

In [5]:
# Read the generated prompt-response pairs from Drive, then hold out 30% of them
# (shuffled with a fixed seed) as the validation split
pr_data_file = Path("gdrive/MyDrive/llm_finetune_data/prompt_response_data.jsonl").as_posix()
prompt_response_data = load_dataset("json", data_files=pr_data_file)
pr_data_split = prompt_response_data['train'].train_test_split(
    test_size=0.3,
    shuffle=True,
    seed=97,
)
pr_data_split

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt_response_text'],
        num_rows: 128
    })
    test: Dataset({
        features: ['prompt_response_text'],
        num_rows: 55
    })
})

## Load pretrained Llama2-7b model then finetune with the LoRA technique and SFTTrainer, using the generated prompt-response dataset

In [None]:
# Load the Llama2-7b chat tokenizer (gated repo, hence the read token)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf",token=r_hf_token)
# Reuse the EOS token as the padding token
# NOTE(review): presumably because the Llama2 tokenizer ships without a pad token - confirm
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Set up the 4-bit quantization config: NF4 quantization with double
# quantization enabled and float16 as the compute dtype
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [None]:
# Load Llama2-7b with the 4-bit quantization config to reduce the memory it takes
llama_4b = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",token=r_hf_token,quantization_config=quantization_config)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Turn off the `use_cache` config flag before finetuning
# NOTE(review): presumably because the key/value cache only helps generation - confirm
llama_4b.config.use_cache = False

In [None]:
# Inspect the config of the loaded 4-bit Llama2-7b model.
# The config is read from the model itself, so it already reflects the
# quantization settings; a separate AutoConfig download is not needed because
# its result was immediately overwritten by this assignment.
llama_config = llama_4b.config
llama_config

LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-chat-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,

In [None]:
def get_model_module_names_recursion(parent_name, model):
    """
    Print the dotted name of every sub-module nested under `model`.

    Children are visited depth-first: each child's name is printed as
    `<parent_name>.<child name>` before its own children are visited.

    Parameters:
    parent_name: Name prefix to print in front of each direct child
    model: Object exposing `named_children()`; a model or any of its modules

    Output:
    None, the names are written to stdout
    """
    for child_name, child_module in model.named_children():
        qualified_name = '.'.join((parent_name, child_name))
        print(qualified_name)
        # A child may contain sub-components of its own, so descend into it
        get_model_module_names_recursion(qualified_name, child_module)

In [None]:
# Print the names of all modules within the Llama2-7b model (prefixed with 'llama2');
# these names are what the LoRA target-module suffixes are matched against
get_model_module_names_recursion('llama2',llama_4b)

llama2.model
llama2.model.embed_tokens
llama2.model.layers
llama2.model.layers.0
llama2.model.layers.0.self_attn
llama2.model.layers.0.self_attn.q_proj
llama2.model.layers.0.self_attn.k_proj
llama2.model.layers.0.self_attn.v_proj
llama2.model.layers.0.self_attn.o_proj
llama2.model.layers.0.self_attn.rotary_emb
llama2.model.layers.0.mlp
llama2.model.layers.0.mlp.gate_proj
llama2.model.layers.0.mlp.up_proj
llama2.model.layers.0.mlp.down_proj
llama2.model.layers.0.mlp.act_fn
llama2.model.layers.0.input_layernorm
llama2.model.layers.0.post_attention_layernorm
llama2.model.layers.1
llama2.model.layers.1.self_attn
llama2.model.layers.1.self_attn.q_proj
llama2.model.layers.1.self_attn.k_proj
llama2.model.layers.1.self_attn.v_proj
llama2.model.layers.1.self_attn.o_proj
llama2.model.layers.1.self_attn.rotary_emb
llama2.model.layers.1.mlp
llama2.model.layers.1.mlp.gate_proj
llama2.model.layers.1.mlp.up_proj
llama2.model.layers.1.mlp.down_proj
llama2.model.layers.1.mlp.act_fn
llama2.model.layers.

In [None]:
# LoRA is applied to the q/k/v/o attention projections of the last 4 decoder
# layers of Llama2-7b (layers 31 down to 28). Each entry is a module-name suffix
# of the form "<layer index>.self_attn.<projection>".
lora_target_modules = [
    f"{layer_idx}.self_attn.{proj_name}"
    for layer_idx in (31, 30, 29, 28)
    for proj_name in ("q_proj", "k_proj", "v_proj", "o_proj")
]

# LoRA configuration: alpha 8, rank 32, dropout 0.1, no bias training,
# restricted to the target modules listed above.
# NOTE(review): with PEFT's default scaling (lora_alpha / r) this presumably
# gives a scaling factor of 8 / 32 = 0.25 - confirm.
peft_params = LoraConfig(
    lora_alpha=8,
    target_modules=lora_target_modules,
    lora_dropout=0.1,
    r=32,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
# Define training arguments, using paged_adamw_32bit as optimizer, 5 epochs, and 2e-4 as learning rate.
# Checkpoints and logs are written to Google Drive; metrics are reported to TensorBoard.
# NOTE(review): no evaluation strategy is set here, so the eval_dataset given to the
# trainer is presumably never evaluated during training - confirm.
training_params = TrainingArguments(
    output_dir="gdrive/MyDrive/llama_finetune_alpha_8_results",
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=80,  # write a checkpoint every 80 optimizer steps
    logging_steps=20,  # log the training loss every 20 steps
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,  # mixed-precision training is disabled for both half-precision formats
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,  # no step cap; training length is governed by num_train_epochs
    # NOTE(review): warmup presumably has no effect with the plain "constant"
    # scheduler ("constant_with_warmup" would apply it) - confirm.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [None]:
# Define the SFTTrainer on the 4-bit model with max sequence length 4096,
# the previously defined LoRA config and training arguments.
# The text to train on is read from the "prompt_response_text" column.
trainer = SFTTrainer(
    model=llama_4b,
    train_dataset=pr_data_split['train'],
    eval_dataset=pr_data_split['test'],
    peft_config=peft_params,
    dataset_text_field="prompt_response_text",
    max_seq_length=4096,
    tokenizer=tokenizer,
    args=training_params,
    # NOTE(review): presumably the batch size used when tokenizing the dataset,
    # not the training batch size (that is per_device_train_batch_size) - confirm
    dataset_batch_size = 1,
    packing=False,
)

Map:   0%|          | 0/55 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Finetune Llama2-7b with the trainer.
# Note that during training the output may report an unknown error,
# but the training should resume by itself within several seconds.
trainer.train()

Step,Training Loss
20,2.2354
40,2.216
60,2.2101
80,2.0882
100,2.0914
120,2.0676
140,1.9477
160,1.9947
180,1.8947
200,1.9327


TrainOutput(global_step=640, training_loss=1.9007563054561616, metrics={'train_runtime': 6128.4422, 'train_samples_per_second': 0.104, 'train_steps_per_second': 0.104, 'total_flos': 4.73712063891456e+16, 'train_loss': 1.9007563054561616, 'epoch': 5.0})

In [None]:
# Push the finetuned LoRA parameters for Llama2-7b, together with the tokenizer,
# to the Hugging Face Hub using the write token
trainer.model.push_to_hub(FINETUNED_LLAMA_LORA_HF_REPO, token=w_hf_token)
trainer.tokenizer.push_to_hub(FINETUNED_LLAMA_LORA_HF_REPO,token=w_hf_token)

adapter_model.safetensors:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/LilVegeDog/finetuned_llama_lora_alpha_8_rank_32/commit/c9bb5bb1248903c336c2edea1ac1f953621c6692', commit_message='Upload tokenizer', commit_description='', oid='c9bb5bb1248903c336c2edea1ac1f953621c6692', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Delete the 4-bit Llama2 model and clean the cache to have enough
# space to run the following code.
# `trainer` is deleted as well: it still holds the (PEFT-wrapped) 4-bit model,
# so removing only `llama_4b` would leave the weights alive on the GPU.
del trainer
del llama_4b
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Load the Llama2-7b model again, this time without quantization and in 16-bit
# (float16) precision, to avoid rounding error when merging with the finetuned
# LoRA parameters
llama_full = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",token=r_hf_token,
                          torch_dtype=torch.float16)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Display the config of the float16 model to check the dtype it was loaded with
llama_full.config

LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-chat-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 32000
}

In [None]:
# Load the finetuned LoRA parameters from the Hub on top of the float16 base model.
# At this point the adapter is attached but not yet merged, despite the variable name.
llama_merged_peft = PeftModel.from_pretrained(llama_full, FINETUNED_LLAMA_LORA_HF_REPO)

adapter_config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

In [None]:
# Merge the LoRA parameters into the original Llama2-7b weights
# to get a standalone finetuned Llama2-7b model
llama_merged_peft = llama_merged_peft.merge_and_unload()

In [None]:
# Display the module structure of the merged model
llama_merged_peft

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head):

In [None]:
# Push the merged, finetuned Llama2-7b model and its tokenizer to the
# Hugging Face repo, to be retrieved later through the LlamaIndex API
llama_merged_peft.push_to_hub(FINETUNED_LLAMA_HF_REPO, token=w_hf_token)
tokenizer.push_to_hub(FINETUNED_LLAMA_HF_REPO, token=w_hf_token)

model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/LilVegeDog/finetuned_Llama_2_for_SEP775/commit/6f5c567d90d7af4203ddaa7eb99e8bbbb63ecb75', commit_message='Upload tokenizer', commit_description='', oid='6f5c567d90d7af4203ddaa7eb99e8bbbb63ecb75', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Delete both names bound to the float16 Llama2 model (base and merged) and
# clean the cache to have enough space to run the following code.
del llama_full
del llama_merged_peft
gc.collect()
torch.cuda.empty_cache()

It is more reliable to simply restart the kernel, re-run the cells that import the libraries and define the global variables, and then run the cells below for evaluation.

## Evaluate the original pretrained and finetuned Llama2-7b model

### Load context embeddings from Chroma VectorDB, and sample test questions

In [6]:
# Open the persistent Chroma database stored on Google Drive.
# NOTE(review): this path uses "My Drive" while the training cells use "MyDrive";
# presumably both resolve to the same Drive folder - confirm.
db = chromadb.PersistentClient(path="gdrive/My Drive/course_materials_db")

In [7]:
# Fetch the collection holding the course-material embeddings.
# NOTE(review): get_or_create presumably returns an empty collection if the name
# does not exist yet, which would silently yield no retrieval context - confirm.
chroma_collection = db.get_or_create_collection("NLP_course_materials")

In [8]:
# Wrap the Chroma collection as a LlamaIndex vector store
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# NOTE(review): storage_context is not referenced by any later cell visible here
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [9]:
# Set up the global LlamaIndex text chunk settings and embedding model
Settings.chunk_size = 512
Settings.chunk_overlap = 20
# NOTE(review): the LLMs below are created with context_window=3900, which differs
# from this global value - confirm which one takes effect when building prompts.
Settings.context_window = 2048
Settings.embed_model = HuggingFaceEmbedding(model_name=EMBED_MODLE_ID, max_length=512)


In [10]:
# Build the index on top of the existing vector store, using the embedding
# model configured in Settings
index_from_vec_store = VectorStoreIndex.from_vector_store(vector_store, embed_model=Settings.embed_model)

In [11]:
# Load the previously generated reference question/response test data;
# the `questions` and `responses` columns are used by gen_QCR_data below
sample_QCR_df = pd.read_json("gdrive/My Drive/llm_finetune_data/ref_QCR_test_data.json")
sample_QCR_df.head()

Unnamed: 0,questions,responses
0,What were the main results achieved by the Ima...,The Image Transformer achieved a new state of ...
1,How does the proposed architecture in the pape...,I don't know.
2,What significant improvements do the proposed ...,The proposed models offer significant improvem...
3,What future directions are suggested for impro...,The future directions suggested for improving ...
4,What are the key components of the Transformer...,The key components of the Transformer architec...


In [18]:
def gen_QCR_data(query_engine, ref_QCR_df, num_of_Q_to_eval=None):
    """
    Function to generate a question-context-response dataset to be evaluated,
    the returned dataset also includes reference "ground-truth" response from the input.

    Parameters:
    query_engine: LlamaIndex query engine object
    ref_QCR_df: Dataframe that includes sample QCR data for testing,
                with a 'questions' and a 'responses' column
    num_of_Q_to_eval: Number of leading questions of ref_QCR_df to query.
                      None (default) queries every question. When it exceeds the
                      number of available questions, all of them are queried.

    Output:
    QCR_ds: question-context-response dataset with features [question, answer, contexts, ground_truth]
    """
    # Slice once up front so that every column has the same length and the
    # progress message reports the number of queries actually performed
    sample_questions = ref_QCR_df['questions'].tolist()[:num_of_Q_to_eval]
    ref_answers = ref_QCR_df['responses'].tolist()[:num_of_Q_to_eval]

    print("Performing queries for %d sample questions..." % len(sample_questions))

    contexts = []
    answers = []
    for Q in tqdm(sample_questions):
        response = query_engine.query(Q)
        # Keep the text of every retrieved source node as the context of this answer
        contexts.append([x.node.get_content() for x in response.source_nodes])
        answers.append(str(response))

    # Bundle question, generated answer, retrieved contexts and reference answer
    # for the later evaluations
    QCR_ds = Dataset.from_dict(
        {
            "question": sample_questions,
            "answer": answers,
            "contexts": contexts,
            "ground_truth": ref_answers,
        }
    )

    return QCR_ds

In [13]:
def eval_QCR_data(QCR_ds):
    """
    Score a question-context-response dataset with the Ragas
    answer_relevancy, faithfulness, and answer_similarity metrics.

    Parameters:
    QCR_ds: Dataset with features [question, answer, contexts, ground_truth]

    Outputs:
    ans_eval_result: Evaluation results of answer_relevancy and faithfulness metrics
    ans_sim_eval_result: Evaluation results of answer_similarity metric
    """
    print("Performing evaluation...")

    # Occasionally an evaluation hits an exception caused by a closed AsyncClient.
    # This was not found to affect the evaluation results, so exceptions are
    # deliberately not raised during the evaluations.
    context_metrics = [answer_relevancy, faithfulness]
    ans_eval_result = evaluate(QCR_ds, context_metrics, raise_exceptions=False)

    # answer_similarity is evaluated on a view of the dataset without the contexts column
    reference_view = QCR_ds.select_columns(["question", "answer", "ground_truth"])
    reference_metrics = [answer_similarity]
    ans_sim_eval_result = evaluate(reference_view, reference_metrics, raise_exceptions=False)

    for eval_result in (ans_eval_result, ans_sim_eval_result):
        print(eval_result)

    return ans_eval_result, ans_sim_eval_result

### Evaluate the finetuned Llama2 model

In [14]:
# Set up the 4-bit quantization config (NF4, double quantization, float16 compute)
# used to load the finetuned model for evaluation; same values as in the
# finetuning section, redefined so this section can run after a kernel restart
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [15]:
# Define the query wrapper prompt template (Llama2 [INST]/<<SYS>> markers around
# SYSTEM_PROMPT and the query), then use LlamaIndex's API
# to get the finetuned Llama2-7b as the LLM
query_wrapper_prompt = PromptTemplate(
        "[INST]<<SYS>>\n" + SYSTEM_PROMPT + "<</SYS>>\n\n{query_str}[/INST]"
        )
# Model and tokenizer are both pulled from the finetuned repo and loaded in 4-bit
finetuned_llm = HuggingFaceLLM(
    model_name=FINETUNED_LLAMA_HF_REPO,
    tokenizer_name=FINETUNED_LLAMA_HF_REPO,
    query_wrapper_prompt=query_wrapper_prompt,
    context_window=3900,
    max_new_tokens=512,
    model_kwargs={"token": r_hf_token, "quantization_config": quantization_config},
    # NOTE(review): no do_sample=True is passed, so the temperature is presumably
    # ignored under greedy decoding - confirm
    generate_kwargs={"temperature": 0.3},
    tokenizer_kwargs={"token": r_hf_token},
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
# Make the finetuned model the global LlamaIndex LLM, then initialize a query
# engine that retrieves the 3 most similar chunks and uses the "compact" response mode
Settings.llm = finetuned_llm
query_engine = index_from_vec_store.as_query_engine(similarity_top_k=3,response_mode="compact")

In [19]:
# Use the query engine with the finetuned Llama2-7b LLM to answer
# the first 10 sample test questions
num_of_Q_to_eval = 10
QCR_data_to_eval = gen_QCR_data(query_engine, sample_QCR_df, num_of_Q_to_eval)

Performing queries for 10 sample questions...


100%|██████████| 10/10 [06:32<00:00, 39.27s/it]


In [20]:
# Evaluate the finetuned model's query responses with the Ragas answer relevancy,
# answer semantic similarity and faithfulness metrics
finetuned_ans_eval, finetuned_ans_sim_eval = eval_QCR_data(QCR_data_to_eval)

Performing evaluation...


Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

{'answer_relevancy': 0.8416, 'faithfulness': 0.9250}
{'answer_similarity': 0.9010}


### Evaluate the original pretrained Llama2 model

In [22]:
# Delete the finetuned Llama2 model and clean the cache to have enough
# space to run the following code.
# Dropping only `finetuned_llm` is not enough: the query engine and the global
# LlamaIndex Settings still reference the same model object, so both are
# released first.
del query_engine
Settings.llm = None  # LlamaIndex swaps in a MockLLM placeholder until the next LLM is assigned
del finetuned_llm
gc.collect()
torch.cuda.empty_cache()

It is more straightforward to simply restart the kernel, re-run the cells that import the libraries and define the global variables, load the vector DB and sample questions, and then run the cells below for evaluation.

In [23]:
# Set up the 4-bit quantization config (NF4, double quantization, float16 compute)
# used to load the original pretrained model; same values as for the finetuned
# model so that both are evaluated under identical quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [24]:
# Define the query wrapper prompt template (same template as for the finetuned
# model), then use LlamaIndex's API
# to get the original pretrained Llama2-7b as the LLM
query_wrapper_prompt = PromptTemplate(
        "[INST]<<SYS>>\n" + SYSTEM_PROMPT + "<</SYS>>\n\n{query_str}[/INST]"
        )
# Same settings as the finetuned LLM; only the model/tokenizer repo differs
orig_llm = HuggingFaceLLM(
    model_name="meta-llama/Llama-2-7b-chat-hf",
    tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
    query_wrapper_prompt=query_wrapper_prompt,
    context_window=3900,
    max_new_tokens=512,
    model_kwargs={"token": r_hf_token, "quantization_config": quantization_config},
    # NOTE(review): no do_sample=True is passed, so the temperature is presumably
    # ignored under greedy decoding - confirm
    generate_kwargs={"temperature": 0.3},
    tokenizer_kwargs={"token": r_hf_token},
    device_map="auto",
)

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [25]:
# Make the original pretrained model the global LlamaIndex LLM, then rebuild the
# query engine with the same retrieval settings (top 3 chunks, "compact" mode)
Settings.llm = orig_llm
query_engine = index_from_vec_store.as_query_engine(similarity_top_k=3,response_mode="compact")

In [26]:
# Use the query engine with the original pretrained Llama2-7b LLM to answer
# the same first 10 sample test questions
num_of_Q_to_eval = 10
orig_QCR_data_to_eval = gen_QCR_data(query_engine, sample_QCR_df, num_of_Q_to_eval)

Performing queries for 10 sample questions...


100%|██████████| 10/10 [03:20<00:00, 20.08s/it]


In [27]:
# Evaluate the original model's query responses with the Ragas answer relevancy,
# answer semantic similarity and faithfulness metrics
orig_ans_eval, orig_ans_sim_eval = eval_QCR_data(orig_QCR_data_to_eval)

Performing evaluation...


Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

{'answer_relevancy': 0.8826, 'faithfulness': 0.9800}
{'answer_similarity': 0.9095}
