Added Semantic Chunking, and Reranking

## Naive RAG : QA Pipeline 

In [3]:
#Init Libraries

#Env Variables
from dotenv import load_dotenv
import os

#Embeddings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

#LlamaIndex RAG Core Libraries
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core import Settings

#LLM
from groq import Groq
from groqllm import GroqLLM

#VectorStorage
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import VectorStoreIndex
from llama_index.core import PromptTemplate,Document 
import chromadb

#Chunking
from llama_index.core.node_parser import  SemanticSplitterNodeParser

#DML
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


#### Loading RAG Components

In [4]:
#Loading Embedding Model
embed_model = HuggingFaceEmbedding(model_name='Snowflake/snowflake-arctic-embed-s' 
                                   ,trust_remote_code=True
                                   )

#Loading LLM from Groq
llm = GroqLLM(model_name = "llama3-8b-8192"
             ,client =Groq(api_key=os.getenv("GROQ_API_KEY"))
             ,temperature =0.1)

In [5]:
#Setting up temporary ChromaDB
db = chromadb.EphemeralClient() #Makes a temporary client which is not on disk
chroma_collection = db.get_or_create_collection("temp")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

### Setting LlamaIndex Contexts

In [6]:
storage_context = StorageContext.from_defaults(vector_store=vector_store)
Settings.llm = llm
Settings.embed_model = embed_model

#### Loading Context

In [7]:
df = pd.read_csv('../data/MainDataset/data/Context.csv')

context_list = []
for contexts in df['context']:
    context_list.append(Document(text=contexts))

In [7]:
# Chunking Pipeline
pipeline = IngestionPipeline(
    transformations=[
      SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
)
    ],
    vector_store=vector_store,
)

documents = pipeline.run(documents=context_list)

In [22]:
df_test = pd.Series([node.text for node in documents])

In [25]:
df_test.str.len().mean()

1030.0277777777778

In [26]:
reranker_model = FlagEmbeddingReranker(model="mixedbread-ai/mxbai-embed-large-v1", top_n=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mixedbread-ai/mxbai-embed-large-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
#Create VectorStoreIndex for RAG
index = VectorStoreIndex(documents, storage_context=storage_context,similarity_top_k=10,node_postprocessors=[reranker_model]) 

In [29]:
#Turn Index into a Query Engine
query_engine = index.as_query_engine(streaming=True,similarity_top_k=5)

In [1]:
#Set a RAG Prompt for the System
qa_prompt_template_str ="""
Context: {context_str}
Instructions:
- You are under an examination setting, your goal is to accurately answer the questions to your best ability.
- You will be evaluated after the exam.
- Utilize the context provided for accurate and specific information.
- Keep your answers 2-3 words in length, else as short as possible.
Question: {query_str}
{query_str}
"""

In [2]:
#Update initial prompt and remove the refining prompt
qa_prompt_template = PromptTemplate(qa_prompt_template_str)
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template":qa_prompt_template}
)


query_engine.update_prompts(
    {"response_synthesizer:refine_template":PromptTemplate("")}
)


NameError: name 'PromptTemplate' is not defined

### Loading Questions

In [13]:
df_qa = pd.read_csv('../data/MainDataset/data/QA.csv')

In [14]:
question_list = []
for questions in df_qa['question']:
    question_list.append(questions)
    

In [30]:
import time

for i in range(len(question_list)): 
    time.sleep(3)
    print(question_list[i])
    response = query_engine.query(question_list[i])
    response.print_response_stream()
    rowIndex = df_qa.index[i]

    df_qa.at[rowIndex, 'gen_answer'] = response.response_txt
    print('')
    print("--"*100)
    print(f'{i}/{128}')

who introduced modern methods of surgery, such as antiseptics, sterilization, and washing hands? brainly
Joseph Lister.
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
0/128
president adams
The sixth President of the United States from 1825 to 1829.
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1/128
define semitic
A member of any of a number of peoples of ancient southwestern Asia, including the Akkadians, Phoenicians, Hebrews, and Arabs, or a descendant of these peoples.
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
2/128
what is

In [16]:
df_qa.head()

Unnamed: 0,question,answers,gen_answer
0,"who introduced modern methods of surgery, such...",Joseph Lister,"According to the provided context, Joseph List..."
1,president adams,John Quincy Adams was an American statesman wh...,"You're referring to John Quincy Adams, the 6th..."
2,define semitic,A member of any of a number of peoples of anci...,"According to the provided context, the term ""S..."
3,what is wudfhost service,It is a set of Microsoft tools that aid in the...,"Based on the context provided, WUDFHost.exe is..."
4,what is an overactive pancreas,An overactive pancrease would produce more enz...,"An overactive pancreas, also known as hyperins..."


In [31]:
df_qa.to_csv(f'../data/MainDataset/results/Official/Advanced_RAG_3.csv',index=False)