## Naive RAG : QA Pipeline 

In [1]:
#Init Libraries

#Env Variables
from dotenv import load_dotenv
import os

#Embeddings 
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


#LlamaIndex RAG Core Libraries
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core import Settings

#LLM
from groq import Groq
from groqllm import GroqLLM

#VectorStorage
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import VectorStoreIndex
from llama_index.core import PromptTemplate,Document 
import chromadb

#Chunking
from llama_index.core.node_parser import SentenceSplitter

#DML
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


#### Loading RAG Components

In [2]:
#Loading Embedding Model
embed_model = HuggingFaceEmbedding(model_name='Snowflake/snowflake-arctic-embed-m' 
                                   ,trust_remote_code=True
                                   )

#Loading LLM from Groq
llm = GroqLLM(model_name = "llama3-8b-8192"
             ,client =Groq(api_key=os.getenv("GROQ_API_KEY"))
             ,temperature =0.1
             ,system_prompt = ('You are a helpful assistant.'))

In [3]:
#Setting up temporary ChromaDB
db = chromadb.EphemeralClient() #Makes a temporary client which is not on disk
chroma_collection = db.get_or_create_collection("temp")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

### Setting LlamaIndex Contexts

In [4]:
storage_context = StorageContext.from_defaults(vector_store=vector_store)
Settings.llm = llm
Settings.embed_model = embed_model

#### Loading Context

In [5]:
df = pd.read_csv('../data/MainDataset/data/Context.csv')

context_list = []
for contexts in df['context']:
    context_list.append(Document(text=contexts))

In [6]:
# Chunking Pipeline
pipeline = IngestionPipeline(
    transformations=[
        #Splits chunks to 512 with 50 overlap
        SentenceSplitter(chunk_size=512, chunk_overlap=50), 
    ],
    vector_store=vector_store,
)

documents = pipeline.run(documents=context_list)

In [7]:
#Create VectorStoreIndex for RAG
index = VectorStoreIndex(documents, storage_context=storage_context,similarity_top_k=5) 

In [8]:
#Turn Index into a Query Engine
query_engine = index.as_query_engine(streaming=True,similarity_top_k=3)

In [38]:
#Set a RAG Prompt for the System
qa_prompt_template_str = """
Context: {context_str}
Instructions:
- You are under an examination setting, your goal is to accurately answer the questions to your best ability.
- You will be evaluated after the exam.
- Utilize the context provided for accurate and specific information.
- Keep your answers short.
- You will be penalised if you are writing unnessecary words
Question: {query_str}
"""

In [40]:
#Update initial prompt and remove the refining prompt
qa_prompt_template = PromptTemplate(qa_prompt_template_str)
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template":qa_prompt_template}
)


query_engine.update_prompts(
    {"response_synthesizer:refine_template":PromptTemplate("")}
)


### Loading Questions

In [11]:
df_qa = pd.read_csv('../data/MainDataset/data/QA.csv')

In [12]:
question_list = []
for questions in df_qa['question']:
    question_list.append(questions)
    

In [42]:
import time

for i in range(len(question_list)): 
    time.sleep(3)
    print(question_list[i])
    response = query_engine.query(question_list[i])
    response.print_response_stream()
    rowIndex = df_qa.index[i]

    df_qa.at[rowIndex, 'gen_answer'] = response.response_txt
    print('')
    print("--"*100)
    print(f'{i}/{128}')

who introduced modern methods of surgery, such as antiseptics, sterilization, and washing hands? brainly
Joseph Lister introduced modern methods of surgery, such as antiseptics, sterilization, and washing hands, with his paper "Antiseptic Principle of the Practice of Surgery" in 1867.
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
0/128
president adams
John Adams was the second President of the United States, serving from 1797 to 1801.
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1/128
define semitic
The term "Semitic" refers to a language family that includes languages such as Hebrew, Arabic, and Amharic, among others. It is not relevant to the provided context, which discusses ci

In [31]:
df_qa.head()

Unnamed: 0,question,answers,gen_answer
0,"who introduced modern methods of surgery, such...",Joseph Lister,Joseph Lister introduced modern methods of sur...
1,president adams,John Quincy Adams was an American statesman wh...,John Adams was the second President of the Uni...
2,define semitic,A member of any of a number of peoples of anci...,"The term ""Semitic"" refers to a language family..."
3,what is wudfhost service,It is a set of Microsoft tools that aid in the...,"I'm happy to help! However, I must point out t..."
4,what is an overactive pancreas,An overactive pancrease would produce more enz...,An overactive pancreas is a condition where th...


In [43]:
df_qa.to_csv(f'../data/MainDataset/results/Official/Naive_RAG1.csv',index=False)