## Naive RAG : QA Pipeline 

In [13]:
#Init Libraries

#Env Variables
from dotenv import load_dotenv
import os

#Embeddings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

#LlamaIndex RAG Core Libraries
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core import Settings

#LLM
from groq import Groq
from groqllm import GroqLLM

#VectorStorage
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import VectorStoreIndex
from llama_index.core import PromptTemplate,Document 
import chromadb

#Chunking
from llama_index.core.node_parser import SemanticSplitterNodeParser

#DML
import pandas as pd

#### Loading RAG Components

In [4]:
#Loading Embedding Model
embed_model = HuggingFaceEmbedding(model_name='Snowflake/snowflake-arctic-embed-m-long' 
                                   ,trust_remote_code=True
                                   )

#Loading LLM from Groq
llm = GroqLLM(model_name = "llama3-8b-8192"
             ,client =Groq(api_key=os.getenv("GROQ_API_KEY"))
             ,temperature =0.1
             ,system_prompt = ('You are a helpful assistant.'))

<All keys matched successfully>


In [5]:
#Setting up temporary ChromaDB
db = chromadb.EphemeralClient() #Makes a temporary client which is not on disk
chroma_collection = db.get_or_create_collection("temp")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

### Setting LlamaIndex Contexts

In [6]:
storage_context = StorageContext.from_defaults(vector_store=vector_store)
Settings.llm = llm
Settings.embed_model = embed_model

#### Loading Context

In [7]:
df = pd.read_csv('../data/MainDataset/data/Context.csv')

context_list = []
for contexts in df['context']:
    context_list.append(Document(text=contexts))

In [10]:
# Chunking Pipeline
pipeline = IngestionPipeline(
    transformations=[
        """SemanticSplitterNodeParser(
        embed_model=embed_model
)"""
    ],
    vector_store=vector_store,
)

documents = pipeline.run(documents=context_list)

In [34]:
reranker_model = FlagEmbeddingReranker(model="mixedbread-ai/mxbai-embed-large-v1", top_n=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mixedbread-ai/mxbai-embed-large-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
#Create VectorStoreIndex for RAG
index = VectorStoreIndex(documents, storage_context=storage_context,similarity_top_k=5,node_postprocessors=[reranker_model]) 

In [36]:
#Turn Index into a Query Engine
query_engine = index.as_query_engine(streaming=True,similarity_top_k=3)

In [42]:
#Set a RAG Prompt for the System
qa_prompt_template_str ="""
Context: {context_str}
Instructions:
- Be helpful and answer questions concisely. If you don't know the answer, say 'I don't know'
- Utilize the context provided for accurate and specific information.
- Incorporate your preexisting knowledge to enhance the depth and relevance of your response.
- Be concise and to the point.
Question: {query_str}
{query_str}
"""

In [43]:
#Update initial prompt and remove the refining prompt
qa_prompt_template = PromptTemplate(qa_prompt_template_str)
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template":qa_prompt_template}
)


query_engine.update_prompts(
    {"response_synthesizer:refine_template":PromptTemplate("")}
)


### Loading Questions

In [44]:
df_qa = pd.read_csv('../data/MainDataset/data/QA.csv')

In [45]:
question_list = []
for questions in df_qa['question']:
    question_list.append(questions)
    

In [46]:
import time

for i in range(len(question_list)): 
    time.sleep(3)
    print(question_list[i])
    response = query_engine.query(question_list[i])
    response.print_response_stream()
    rowIndex = df_qa.index[i]

    df_qa.at[rowIndex, 'gen_answer'] = response.response_txt
    print('')
    print("--"*100)
    print(f'{i}/{128}')

who introduced modern methods of surgery, such as antiseptics, sterilization, and washing hands? brainly
According to the provided context, Joseph Lister introduced modern methods of surgery, such as antiseptics, sterilization, and washing hands, following the publication of his paper "Antiseptic Principle of the Practice of Surgery" in 1867.
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
0/128
president adams
I don't think President Adams is mentioned in the provided text. The text appears to be discussing Vice President Agnew and the concept of "shop with confidence." If you're looking for information on President Adams, I'd be happy to help with that!
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [47]:
df_qa.head()

Unnamed: 0,question,answers,gen_answer
0,"who introduced modern methods of surgery, such...",Joseph Lister,"According to the provided context, Joseph List..."
1,president adams,John Quincy Adams was an American statesman wh...,I don't think President Adams is mentioned in ...
2,define semitic,A member of any of a number of peoples of anci...,"According to the provided context, the term ""S..."
3,what is wudfhost service,It is a set of Microsoft tools that aid in the...,"Based on the context, WUDFHost.exe is a proces..."
4,what is an overactive pancreas,An overactive pancrease would produce more enz...,"According to the provided context, an overacti..."


In [48]:
df_qa.to_csv(f'../data/MainDataset/results/Official/Advanced_RAG.csv',index=False)