In [1]:
import json
import chromadb
import torch
import pandas as pd

from llama_index.core import VectorStoreIndex, StorageContext, Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.response.notebook_utils import display_response
from llama_index.readers.file import FlatReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from datasets import load_dataset, Dataset


import os
from dotenv import load_dotenv
from pathlib import Path
from tqdm import tqdm
from dotenv import load_dotenv
from IPython.display import Markdown, display

C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


### Initialize environment/global variables, load the course material documents, transform them into embedding vectors, and store the vectors in the vector DB

In [2]:
load_dotenv()  # read key=value pairs from a local .env file into the process environment
r_hf_token = os.getenv("HUGGINGFACE_READ_API")   # None when the variable is not defined
w_hf_token = os.getenv("HUGGINGFACE_WRITE_API")  # None when the variable is not defined

# Fail fast with a readable message when the OpenAI key is missing. The OpenAI LLM
# created further down is given no explicit key, so it presumably reads this
# variable from the environment.
if os.getenv("OPENAI_API_KEY") is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )

# NOTE(review): INPUT_DATA_PATH is not referenced by any visible cell; the loader cell
# reads '../data/materials_formatted.md' directly. Confirm whether it is still needed.
INPUT_DATA_PATH = '../data/course_materials_markdown' # Path to the course materials in markdown form
EMBED_MODLE_ID = "BAAI/bge-small-en-v1.5" # Embedding model ID
SYSTEM_PROMPT = """
You are an AI teaching Assistant for the course SEP 775. 
You will provide an interactive platform for students to ask questions and receive guidance on course materials.
Your goal is to answer questions as accurately as possible based on the instructions and context provided.
If you do not know the answer, response with "I don't know."
"""

In [5]:
md_doc = FlatReader().load_data(Path('../data/materials_formatted.md'))

In [6]:
# Set up text chunk settings and embedding model
# These values are assigned on llama_index's global `Settings` object rather than passed
# to individual calls, so the cells below that build or load the index rely on them.
Settings.chunk_size = 512
Settings.chunk_overlap = 20
# EMBED_MODLE_ID is the embedding model id from the configuration cell.
# NOTE(review): max_length=512 equals chunk_size — presumably so that a whole chunk fits
# into the embedding model's input; confirm against the model's limits.
Settings.embed_model = HuggingFaceEmbedding(model_name=EMBED_MODLE_ID, max_length=512)

In [7]:
# Initialize a Chroma VectorDB
# The client is pointed at the ../course_materials_db directory (the same directory the
# closing note of this notebook asks you to upload for the fine-tuning notebook).
db = chromadb.PersistentClient(path="../course_materials_db")
# Reuse the "NLP_course_materials" collection when it already exists, otherwise create it.
chroma_collection = db.get_or_create_collection("NLP_course_materials")
# Wrap the collection for llama_index and make it the vector store of the storage context.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [8]:
# Generate and store embedding vectors from the course material docs on the first run,
# or load the embeddings from the existing vector DB on later runs.
# An empty collection means nothing has been embedded yet; loading an index from it would
# silently give a query engine with no context to retrieve, so build the index instead.
if chroma_collection.count() == 0:
    index_from_vec_store = VectorStoreIndex.from_documents(md_doc, storage_context=storage_context)
else:
    index_from_vec_store = VectorStoreIndex.from_vector_store(vector_store, embed_model=Settings.embed_model)

### Generate training data for fine-tuning the Llama 2 model, using ChatGPT-generated questions and GPT-4 as the RAG system's LLM

In [9]:
# Load 230 sample questions generated by ChatGPT with GPT-4
questions_context_df = pd.read_csv('../data/question_context.csv')

In [10]:
questions_context_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Question  230 non-null    object
 1   Content   230 non-null    object
 2   Type      230 non-null    object
dtypes: object(3)
memory usage: 5.5+ KB


In [12]:
# Set up LLM and query engine
# GPT-4 answers under SYSTEM_PROMPT; its answers are what gen_QCR_data records as responses.
Settings.llm = OpenAI(model="gpt-4", temperature=0.3, system_prompt=SYSTEM_PROMPT)
# NOTE(review): 2048 looks smaller than what GPT-4 itself accepts — presumably chosen to
# match the model that is fine-tuned downstream; confirm.
Settings.context_window = 2048
# similarity_top_k=3 asks the retriever for three nodes per query; "compact" selects the
# response-synthesis mode (see the LlamaIndex documentation for its exact behaviour).
query_engine = index_from_vec_store.as_query_engine(similarity_top_k=3,response_mode="compact")

In [13]:
def gen_QCR_data(questions, save_to_file, engine=None):
    """
    Generate a question-context-response (QCR) dataset in JSON Lines format.

    Every question is sent to a query engine. The text of each returned source node
    and the answer are appended to `save_to_file` as one JSON object per line with
    the keys "question", "context" (list of strings) and "response".
    Answers containing the phrase "not provide" are counted as "no answer found in
    the context" and are not written.

    Args:
        questions: iterable of question strings.
        save_to_file: output path (str or pathlib.Path). Missing parent directories
            are created. Records are appended, so delete the file first when a fresh
            dataset is wanted.
        engine: object with a `query(question)` method whose result has
            `source_nodes` and a string form. Defaults to the notebook-level
            `query_engine`, which keeps existing two-argument calls working.

    Returns:
        None. A summary line with the number of skipped questions is printed.
    """
    if engine is None:
        # Backward-compatible default: the engine configured earlier in the notebook.
        engine = query_engine

    out_path = Path(save_to_file)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    no_ans_Q = 0
    for Q in tqdm(questions):
        response = engine.query(Q)
        answer = str(response)
        if "not provide" in answer:
            no_ans_Q += 1
            continue

        newitem = {
            "question": Q,
            "context": [x.node.get_content() for x in response.source_nodes],
            "response": answer,
        }
        # The file is re-opened in append mode for every record so that answers
        # already obtained are on disk if a later query fails. The encoding is
        # explicit so the output does not depend on the platform default.
        with open(out_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(newitem) + "\n")

    print("QCR data generated, roughly %d questions can't find answer from the context."%no_ans_Q)

In [14]:
gen_QCR_data(questions_context_df['Question'].values,Path("../llm_finetune_data/QCR_data.jsonl"))

100%|████████████████████████████████████████████████████████████████████████████████| 230/230 [17:47<00:00,  4.64s/it]

QCR data generated, roughly 7 questions can't find answer from the context.





In [30]:
ref_QCR_data = load_dataset("json", data_files=Path("../llm_finetune_data/QCR_data.jsonl").as_posix())

In [16]:
def display_prompt_dict(prompts_dict):
    """
    Show every prompt of a prompt dictionary in the notebook output.

    For each (key, prompt) pair a markdown header with the key is displayed, the
    prompt's template text is printed as plain text, and a markdown spacer follows.
    """
    spacer_md = "<br><br>"
    for prompt_key, prompt in prompts_dict.items():
        header_md = "**Prompt Key**: {}<br>**Text:** <br>".format(prompt_key)
        display(Markdown(header_md))
        # Plain print keeps the template's placeholders and line breaks verbatim.
        print(prompt.get_template())
        display(Markdown(spacer_md))

In [17]:
display_prompt_dict(query_engine.get_prompts())

**Prompt Key**: response_synthesizer:text_qa_template<br>**Text:** <br>

Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query_str}
Answer: 


<br><br>

**Prompt Key**: response_synthesizer:refine_template<br>**Text:** <br>

The original query is as follows: {query_str}
We have provided an existing answer: {existing_answer}
We have the opportunity to refine the existing answer (only if needed) with some more context below.
------------
{context_msg}
------------
Given the new context, refine the original answer to better answer the query. If the context isn't useful, return the original answer.
Refined Answer: 


<br><br>

In [37]:
# Split the QCR data into train and test sets: 40 shuffled examples are held out as the
# test set, with a fixed seed so the split is reproducible. This cell only splits; the
# held-out questions/responses are written to disk by a later cell.
ref_QCR_data_split = ref_QCR_data['train'].train_test_split(test_size=40, shuffle=True, seed=97)

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

270472

In [57]:
ref_QCR_data_split['test']

Dataset({
    features: ['question', 'context', 'response'],
    num_rows: 40
})

In [59]:
# Persist the held-out questions together with their recorded responses; the retrieved
# contexts are intentionally left out of this file.
ref_QCR_test_data_df = pd.DataFrame(
    dict(
        questions=ref_QCR_data_split['test']['question'],
        responses=ref_QCR_data_split['test']['response'],
    )
)
ref_QCR_test_data_df.to_json(Path("../llm_finetune_data/ref_QCR_test_data.json"))

In [43]:
def gen_prompt(qcr_item):
    """
    Function to inject a sample question-contexts-response data into 
    the LlamaIndex default prompt template with the predefined system prompt and response
    """
    context_str = ("filename: materials_formatted.md\nextension: .md\n\n"+
                   "\n\nfilename: materials_formatted.md\nextension: .md\n\n".join(qcr_item['context']))
    query_str = "[INST]<<SYS>>\n" + SYSTEM_PROMPT + "<</SYS>>\n\n"+qcr_item['question']+"[/INST] "
    
    return f"""Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query_str}
Answer: {qcr_item['response']}"""

In [44]:
print(gen_prompt(ref_QCR_data_split['train'][1]))

Context information is below.
---------------------
filename: materials_formatted.md
extension: .md

Our experiments show that using triplet loss does not affect the results much. More details can be found in Appendix B.  Cross-dataset generalization One interesting question regarding DPR’s discriminative training is how much performance degradation it may suf- fer from a non-iid setting. In other words, can it still generalize well when directly applied to a different dataset without additional ﬁne-tuning? To test the cross-dataset generalization, we train DPR on Natural Questions only and test it directly on the smaller WebQuestions and CuratedTREC datasets. We ﬁnd that DPR generalizes well, with 3-5 points loss from the best performing ﬁne-tuned model in top-20 retrieval accuracy (69.9/86.3 vs. 75.0/89.1 for WebQuestions and TREC, respec- tively), while still greatly outperforming the BM25 baseline (55.0/70.9).  5.3 Qualitative Analysis  Although DPR performs better than BM25 in gen

In [45]:
def gen_prompt_response_data(qcr_dataset, save_to_file):
    """
    Convert a question-context-response dataset into concatenated prompt+response texts.

    Every item of the "train" split is passed through `gen_prompt` and written to
    `save_to_file` as one JSON object per line, under the key "prompt_response_text".
    Other splits of `qcr_dataset` are ignored.

    Args:
        qcr_dataset: mapping with a "train" entry that iterates over QCR items.
        save_to_file: output path (str or pathlib.Path). Missing parent directories
            are created.

    Note:
        The file is opened in append mode, so running this twice against the same
        path duplicates every record. Delete the file first when regenerating;
        otherwise a later train/test split of the file can place copies of the same
        example on both sides.
    """
    out_path = Path(save_to_file)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Explicit encoding so the output does not depend on the platform default.
    with open(out_path, "a", encoding="utf-8") as f:
        for item in qcr_dataset["train"]:
            newitem = {"prompt_response_text": gen_prompt(item)}
            f.write(json.dumps(newitem) + "\n")

In [48]:
gen_prompt_response_data(ref_QCR_data_split, Path("../llm_finetune_data/prompt_response_data.jsonl"))

In [49]:
prompt_response_data = load_dataset("json", data_files=Path("../llm_finetune_data/prompt_response_data.jsonl").as_posix())

Generating train split: 0 examples [00:00, ? examples/s]

In [50]:
pr_data_split = prompt_response_data['train'].train_test_split(test_size=0.3, shuffle=True, seed=97)

In [51]:
pr_data_split

DatasetDict({
    train: Dataset({
        features: ['prompt_response_text'],
        num_rows: 128
    })
    test: Dataset({
        features: ['prompt_response_text'],
        num_rows: 55
    })
})

In [53]:
print(prompt_response_data['train'][1]['prompt_response_text'])

Context information is below.
---------------------
filename: materials_formatted.md
extension: .md

Our experiments show that using triplet loss does not affect the results much. More details can be found in Appendix B.  Cross-dataset generalization One interesting question regarding DPR’s discriminative training is how much performance degradation it may suf- fer from a non-iid setting. In other words, can it still generalize well when directly applied to a different dataset without additional ﬁne-tuning? To test the cross-dataset generalization, we train DPR on Natural Questions only and test it directly on the smaller WebQuestions and CuratedTREC datasets. We ﬁnd that DPR generalizes well, with 3-5 points loss from the best performing ﬁne-tuned model in top-20 retrieval accuracy (69.9/86.3 vs. 75.0/89.1 for WebQuestions and TREC, respec- tively), while still greatly outperforming the BM25 baseline (55.0/70.9).  5.3 Qualitative Analysis  Although DPR performs better than BM25 in gen

## After running all the code cells above, upload the `course_materials_db` directory, the `llm_finetune_data` directory, and the `Llama2_Finetune_Eval_pipeline_colab` notebook to Google Drive, then run that notebook on a T4 GPU with the High-RAM runtime