### Loading Libraries

In [1]:
#Init Libraries

#Standard Library
import time

#Fitting Libraries
from components import Components
from llama_index.core import Settings

#Additional Core Libraries
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import VectorStoreIndex
from llama_index.core import PromptTemplate,Document 
from llama_index.core.node_parser import SentenceSplitter

import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Build the shared components object (used below for the embedding model,
# the Groq LLM and the Chroma collection) from three model identifiers.
# The third identifier is the value reported by `rag_components.model_name`.
# NOTE(review): the first two are presumably the embedding model and the
# reranker model, in that order — confirm against components.Components.
rag_components = Components(
    "Snowflake/snowflake-arctic-embed-s",
    "mixedbread-ai/mxbai-embed-large-v1",
    "llama3-8b-8192",
)

In [6]:
# Display the model name stored on the components object; the same attribute
# is used at the end of the notebook to name the results CSV.
rag_components.model_name

'llama3-8b-8192'

### Setting Contexts


In [5]:
# Register the embedding model and the LLM on llama_index's global Settings
# object so later llama_index calls can pick them up.
# NOTE(review): the output recorded for this cell shows a GroqLLM
# ValidationError ("system_prompt: str type expected"), presumably raised
# inside get_groq_llm() — confirm and fix in components.py.
Settings.embed_model = rag_components.get_embedding_model()
Settings.llm = rag_components.get_groq_llm()

Embedding model loaded!


ValidationError: 1 validation error for GroqLLM
system_prompt
  str type expected (type=type_error.str)

In [None]:
# Wrap the collection supplied by the components object in a Chroma vector
# store, then expose that store through a default storage context.
chroma_collection = rag_components.get_db()
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

### Loading the MS MARCO Sample

In [7]:
# Read the MS MARCO sample file into a DataFrame.
sample_csv_path = '../data/MS MARCO/data/MS_MARCO128_sample2.csv'
df = pd.read_csv(sample_csv_path)

In [8]:
# Wrap each of the first 128 'context' values in a llama_index Document.
context_list = [Document(text=context_text) for context_text in df['context'][:128]]

NameError: name 'df' is not defined

In [9]:
# Ingestion pipeline with a single transformation: a sentence splitter
# configured with chunk_size=1000 and chunk_overlap=200. The pipeline is
# also handed the Chroma-backed vector store.
sentence_splitter = SentenceSplitter(chunk_size=1000, chunk_overlap=200)
pipeline = IngestionPipeline(
    transformations=[sentence_splitter],
    vector_store=vector_store,
)

In [10]:
# Run the ingestion pipeline over the wrapped contexts; the result is what
# the vector index is built from further down.
documents = pipeline.run(documents=context_list)

In [4]:
# LLM handle used for the direct `llm.complete(...)` calls below (the
# question loop queries it without going through the query engine).
llm = rag_components.get_groq_llm()

In [12]:
# Smoke test: send a trivial prompt to check that the LLM responds.
llm.complete("Hello!")

NameError: name 'llm' is not defined

In [12]:
# Build the vector index over the pipeline output, backed by the Chroma
# storage context.
# Retrieval depth is not an index-construction option: a `similarity_top_k`
# passed here is swallowed by the constructor's **kwargs and has no effect.
# It is chosen where the query engine is created
# (`index.as_query_engine(similarity_top_k=...)`), so it is not passed here.
# Reranking is currently disabled; to enable it, pass
# node_postprocessors=[rag_components.get_reranker()] when creating the
# query engine.
index = VectorStoreIndex(documents, storage_context=storage_context)
 

In [13]:
# Streaming query engine over the index, retrieving the 3 most similar
# entries per query.
query_engine = index.as_query_engine(
    streaming=True,
    similarity_top_k=3,
)

In [None]:
# Custom question-answering prompt. `{context_str}` and `{query_str}` are the
# placeholders filled in when the template is formatted.
qa_prompt_template_str = """
Context: {context_str}
Instructions:
- Be helpful and answer questions concisely. If you don't know the answer, say 'I don't know'
- Utilize the context provided for accurate and specific information.
- Incorporate your preexisting knowledge to enhance the depth and relevance of your response.
- Be concise and to the point.
Question: {query_str}
"""

# Install the custom prompt as the response synthesizer's text-QA template.
qa_prompt_template = PromptTemplate(qa_prompt_template_str)
prompt_overrides = {"response_synthesizer:text_qa_template": qa_prompt_template}
query_engine.update_prompts(prompt_overrides)


In [None]:
# Replace the response synthesizer's refine template with an empty prompt.
# NOTE(review): presumably meant to neutralise the refine step — confirm that
# an empty template is the intended way to do this.
blank_refine_template = PromptTemplate("")
query_engine.update_prompts({"response_synthesizer:refine_template": blank_refine_template})


In [8]:
# Gather the first 128 'question' values as a plain Python list.
question_list = list(df['question'][:128])
    

In [9]:
# Ask the bare LLM (no retrieval) every question and store each reply in the
# 'gen_answer' column of `df`, on the row the question came from.
# `time` is imported in the notebook's import cell.
REQUEST_DELAY_SECONDS = 3  # pause before every request; presumably for API rate limits — TODO confirm
total_questions = len(question_list)

for position, (row_label, question) in enumerate(zip(df.index, question_list), start=1):
    time.sleep(REQUEST_DELAY_SECONDS)
    print(question)
    response = llm.complete(question)
    print(response.text)

    # Write each answer as soon as it arrives so that a failure part-way
    # through the run keeps the answers gathered so far.
    df.at[row_label, 'gen_answer'] = response.text
    print('')
    print("--"*100)
    # Progress is 1-based and measured against the real number of questions,
    # not a hard-coded total.
    print(f'{position}/{total_questions}')
 


distance between dc to atlanta ga
The distance between Washington D.C. and Atlanta, GA is approximately 760 miles (1,223 km).

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
0/128
what is spiractin used for
Spiractin (also known as Spirapril) is a medication used to treat hypertension (high blood pressure) and heart failure.

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1/128
what state is birthplace to most us presidents
Virginia is the birthplace to the most US Presidents, with a total of 8 presidents born there.

------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [10]:
# Preview the first 128 rows, which now include the 'gen_answer' column
# filled in by the loop above.
df[:128]

Unnamed: 0,question,answer,context,gen_answer
0,distance between dc to atlanta ga,639 miles or 1 028 kilometers,"['Map of driving directions from Washington, D...",The distance between Washington D.C. and Atlan...
1,what is spiractin used for,It is used to treat fluid retention (oedema) c...,['Spiractin 25 and Spiractin 100 tablets also ...,Spiractin (also known as Spirapril) is a medic...
2,what state is birthplace to most us presidents,"Massachusetts, and New York.",['The number of presidents born per state are:...,Virginia is the birthplace to the most US Pres...
3,what is an ea for tax,An individual who has demonstrated technical c...,['What does the term “Enrolled Agent” mean? “E...,An EA (Enrolled Agent) is a tax professional w...
4,what does dtc stand for?,Data Transformation Corporation,['Automated Customer Account Transfer. The tra...,DTc stands for Direct-to-Consumer.
...,...,...,...,...
123,average price for a vented gas fireplace,$115 per set and designer vented gas logs can ...,['Basic Conversion: $225 - $900. • Fireplace T...,The average price for a vented gas fireplace c...
124,what movie is the song true blue on?,Its a title track from Madonna's third studio ...,"[""True Blue is a song by American singer Madon...","The song ""True Blue"" is by Madonna and is the ..."
125,definition of garb,It is defined as to dress someone.,"[""Acronym for Goofy ass retarded bitch. Person...","Garb refers to clothing or attire, especially ..."
126,how much do cna make?,"$24,890 a year",['Certified Nursing Assistant-Long-Term Care S...,The average hourly wage for a Certified Nursin...


In [None]:
# Keep only the first 128 rows before exporting. This rebinds `df`, so the
# full frame is no longer reachable under this name after this cell runs.
df = df[:128]

In [13]:
# Export the results without the index column; the file name embeds the
# model name taken from the components object.
results_csv_path = f'../data/MS MARCO/results/{rag_components.model_name}_128Q_Run_BaseLLMNORAG.csv'
df.to_csv(results_csv_path, index=False)