In [9]:
from components import Components
from llama_index.core import Settings

from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import VectorStoreIndex
from llama_index.core import PromptTemplate


from llama_index.core.node_parser import SentenceSplitter

In [2]:
#initialise components, reranker and retriever
rag_components = Components("Snowflake/snowflake-arctic-embed-m","mixedbread-ai/mxbai-embed-large-v1")

Setting Contexts


In [3]:
Settings.embed_model = rag_components.get_embedding_model()
Settings.llm = rag_components.get_groq_llm('gemma-7b-it')

Embedding model loaded!


In [4]:
vector_store = ChromaVectorStore(chroma_collection=rag_components.get_db())
storage_context = StorageContext.from_defaults(vector_store=vector_store)

Loading Papers

In [10]:
documents = SimpleDirectoryReader(input_dir='papers').load_data()
pipeline = IngestionPipeline(
    transformations=[
          SentenceSplitter(chunk_size=1000, chunk_overlap=200),
    ],
    vector_store=vector_store,
)

In [11]:
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context,similarity_top_k=10, node_postprocessors=[rag_components.get_reranker()]
) 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mixedbread-ai/mxbai-embed-large-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reranker model loaded!


In [12]:
query_engine = index.as_query_engine(streaming=True,similarity_top_k=10)


qa_prompt_template_str = """Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query based on the context given ONLY else you will be penalized. Provide your answer in an easy to read and understand format.
Further elaborate your answer by finding examples or information within the context if possible.
Query: {query_str}
Answer:
"""

qa_prompt_template = PromptTemplate(qa_prompt_template_str)
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template":qa_prompt_template}
)

In [27]:
response = query_engine.query("IS T.R.E.N.T Industries a real company?")
response.print_response_stream()
response.source_nodes[0].text

The provided text does not contain any information regarding T.R.E.N.T. Industries, so I am unable to determine whether it is a real company or not.

'Index 229\nD\ndebit, defined, 36\ndebit cards, 36–37\ndebt, 205\ndefault, 157, 166–167\ndeficit, 205\ndeposit, electronic, 33\n“dipped,” 67–68\ndividends\ndefined, 130, 144–145\ndividend-paying stocks, 145\nspecial dividends, 145–146\ntaxes on, 192\nDow Jones Industrial Average\n(Dow/Dow 30)\ndefined, 57–58\nindex funds, 56–59, 105–106\nstocks included in, 58–59\ndue diligence, 210\nduration, of U.S. Treasury notes\nversusbonds, 118–119\nE\nearly stage venture capital, 209\nearnings history, of companies, 129\nearnings per share (EPS), 131–137\nearnings season, 141–142\nEBITDA (Earnings Before Interest\nTaxes Depreciation and\nAmortization), 137–138\neconomy, 195–206\nbudgets and, 202–203\nbudget types, 203–204\ndeficit and national debt, 205\nFederal Reserve (Fed) and,\n195–197\ngross domestic product (GDP),\n198–202\nnational debt and, 204–205\neducated bet, 213education, 215\nelectronic banking, 33\nelectronic bill payment, 31–32\nelectronic funds transfer, 38\nenergy industry mutu