In [1]:
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
import os
import sys
from dotenv import load_dotenv
from typing import List
from llama_index.core import Settings
from llama_index.core.readers import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.schema import BaseNode, TransformComponent
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core import VectorStoreIndex
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.legacy.retrievers.bm25_retriever import BM25Retriever
from llama_index.core.retrievers import QueryFusionRetriever
import faiss

In [2]:
# Alternative: a local Llama 3.1 served by Ollama (kept for reference).
#Settings.llm = Ollama(model="llama3.1:latest", request_timeout=120.0)
from langchain_groq import ChatGroq
from langchain_cohere import CohereEmbeddings

# Load variables from a local .env file (if one exists) before any hosted
# client is constructed; variables already set in the environment are kept.
load_dotenv()

# Global LLM used by the query engines and the agent: Groq-hosted Llama 3.1 8B,
# temperature 0.
Settings.llm = ChatGroq(model="llama-3.1-8b-instant", temperature=0)

# The global embedding model is configured through `Settings.embed_model`;
# `Settings.embedding` is not a setting LlamaIndex itself reads.
Settings.embed_model = OllamaEmbedding(model_name="nomic-embed-text:v1.5")
# Later cells of this notebook pass `Settings.embedding` explicitly, so keep
# that name bound to the same model object.
Settings.embedding = Settings.embed_model

In [3]:
# Size of the vectors produced by the embedding model; the FAISS index below is
# created with this dimensionality and rejects vectors of any other size.
# nomic-embed-text v1.5 (the model configured above) emits 768-dimensional
# vectors, not 512.
EMBED_DIMENSION = 768


In [4]:
# Create a FaissVectorStore to store embeddings.
# The FAISS index is built with EMBED_DIMENSION as its vector size, so that
# constant has to match the output size of the embedding model in use.
fais_index = faiss.IndexFlatL2(EMBED_DIMENSION)
# Wrap the raw FAISS index in LlamaIndex's vector-store adapter.
vector_store = FaissVectorStore(faiss_index=fais_index)

In [5]:
class TextCleaner(TransformComponent):
    """
    Ingestion-pipeline transformation that tidies whitespace in node text.

    Every tab character becomes a single space, and every "space followed by
    newline" sequence collapses to a single space. Nodes are modified in place.
    """

    def __call__(self, nodes, **kwargs) -> List[BaseNode]:
        """Clean the text of each node in ``nodes`` and return the same nodes."""
        # (old, new) pairs, applied in this order to each node's text.
        substitutions = (('\t', ' '), (' \n', ' '))

        for node in nodes:
            cleaned = node.text
            for old, new in substitutions:
                cleaned = cleaned.replace(old, new)
            node.text = cleaned

        return nodes

In [6]:
# Ingestion pipeline: a sentence-based node parser followed by the custom text
# cleaner, with the FAISS-backed vector store attached.
# NOTE(review): `transformations` contains no embedding step -- confirm whether
# `vector_store` is actually expected to receive anything from this pipeline.
pipeline = IngestionPipeline(
    transformations=[SentenceSplitter(), TextCleaner()],
    vector_store=vector_store,
)

# Load the source documents: the Lyft PDF first, then the Uber PDF.
lyft_docs, uber_docs = (
    SimpleDirectoryReader(input_files=[pdf_path]).load_data()
    for pdf_path in ("../data/10k/lyft_2021.pdf", "../data/10k/uber_2021.pdf")
)


In [7]:
# Run each company's documents through the shared pipeline (Lyft, then Uber)
# to obtain the cleaned nodes.
nodes_lyft, nodes_uber = (
    pipeline.run(documents=company_docs)
    for company_docs in (lyft_docs, uber_docs)
)

In [8]:
# Build a vector index over the Lyft nodes, passing the embedding model that
# cell 2 stored on `Settings.embedding`.
# NOTE(review): no storage context / vector store is passed here, so the FAISS
# store created earlier does not appear to back this index -- confirm intent.
lyft_index = VectorStoreIndex(nodes= nodes_lyft, embed_model=Settings.embedding)
# Write the index's storage context to disk under ../storage/lyft.
lyft_index.storage_context.persist(persist_dir="../storage/lyft")




In [9]:
# Build a vector index over the Uber nodes, passing the embedding model that
# cell 2 stored on `Settings.embedding`.
# NOTE(review): no storage context / vector store is passed here, so the FAISS
# store created earlier does not appear to back this index -- confirm intent.
uber_index = VectorStoreIndex(nodes= nodes_uber, embed_model=Settings.embedding)
# Write the index's storage context to disk under ../storage/uber.
uber_index.storage_context.persist(persist_dir="../storage/uber")

In [10]:
# One query engine per company index. `similarity_top_k=3` is forwarded to the
# engine -- presumably the number of most-similar chunks retrieved per query.
lyft_engine = lyft_index.as_query_engine(similarity_top_k=3)
uber_engine = uber_index.as_query_engine(similarity_top_k=3)

In [11]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata

# One tool per company, in the order Lyft, Uber. The two tools differ only in
# the engine they wrap, their name, and the company named in the description.
query_engine_tools = [
    QueryEngineTool(
        query_engine=engine,
        metadata=ToolMetadata(
            name=tool_name,
            description=(
                f"Provides information about {company} financials for year 2021. "
                "Use a detailed plain text question as input to the tool. "
                "The input is used to power a semantic search engine."
            ),
        ),
    )
    for engine, tool_name, company in (
        (lyft_engine, "lyft_10k", "Lyft"),
        (uber_engine, "uber_10k", "Uber"),
    )
]

In [21]:
# Display the first tool (lyft_10k) converted to a LangChain structured tool.
query_engine_tools[0].to_langchain_structured_tool()

StructuredTool(name='lyft_10k', description='Provides information about Lyft financials for year 2021. Use a detailed plain text question as input to the tool. The input is used to power a semantic search engine.', args_schema=<class 'llama_index.core.tools.types.DefaultToolFnSchema'>, func=<BoundFunctionWrapper at 0x29c746800 for method at 0x28f665e40>)

In [22]:
!pip freeze G llama

accelerate==0.34.2
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
annotated-types==0.7.0
anyio==4.4.0
appnope==0.1.4
asgiref==3.8.1
asttokens==2.4.1
async-timeout==4.0.3
attrs==24.2.0
backoff==2.2.1
bcrypt==4.2.0
beautifulsoup4==4.12.3
boto3==1.35.20
botocore==1.35.20
build==1.2.2
cachetools==5.5.0
certifi==2024.8.30
charset-normalizer==3.3.2
chroma-hnswlib==0.7.6
chromadb==0.5.5
click==8.1.7
cohere==5.9.2
coloredlogs==15.0.1
comm==0.2.2
contourpy==1.3.0
cycler==0.12.1
dataclasses-json==0.6.7
datasets==3.0.0
debugpy==1.8.5
decorator==5.1.1
Deprecated==1.2.14
dill==0.3.8
dirtyjson==1.0.8
diskcache==5.6.3
distro==1.9.0
evaluate==0.4.3
exceptiongroup==1.2.2
executing==2.1.0
faiss-cpu==1.8.0.post1
fastapi==0.114.2
fastavro==1.9.7
filelock==3.16.0
flatbuffers==24.3.25
fonttools==4.53.1
frozenlist==1.4.1
fsspec==2024.6.1
google-auth==2.34.0
googleapis-common-protos==1.65.0
greenlet==3.1.0
groq==0.11.0
grpcio==1.66.1
h11==0.14.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.2
httpx-s

In [23]:

# needs llama_index-packs-agents-coa
from llama_index.packs.agents_coa import CoAAgentPack

# Agent pack given the two query-engine tools and the globally configured LLM.
pack = CoAAgentPack(tools=query_engine_tools, llm=Settings.llm)

In [26]:
# Display the LLM object currently registered on Settings.
Settings.llm

LangChainLLM(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x28e415360>, system_prompt=None, messages_to_prompt=<function messages_to_prompt at 0x16a189360>, completion_to_prompt=<function default_completion_to_prompt at 0x16a3a2710>, output_parser=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>, query_wrapper_prompt=None)

In [24]:
import nest_asyncio
# Patch asyncio before running the pack -- presumably because pack.run drives
# async code while the notebook's own event loop is already running; confirm.
nest_asyncio.apply()
# Ask a question that involves both companies and keep the agent's response.
response = pack.run("How did Ubers revenue growth compare to Lyfts in 2021?")

==== Available Parsed Functions ====
def lyft_10k(input: string):
   """Provides information about Lyft financials for year 2021. Use a detailed plain text question as input to the tool. The input is used to power a semantic search engine."""
    ...
def uber_10k(input: string):
   """Provides information about Uber financials for year 2021. Use a detailed plain text question as input to the tool. The input is used to power a semantic search engine."""
    ...
==== Generated Chain of Abstraction ====
Abstract plan of reasoning:
To answer the question, we need to compare the revenue growth of Uber and Lyft in 2021. We will first get the revenue information for Lyft in 2021 by asking [FUNC lyft_10k("What was Lyft's revenue in 2021 and how did it change from 2020 to 2021?") = y1]. Then, we will get the revenue information for Uber in 2021 by asking [FUNC uber_10k("What was Uber's revenue in 2021 and how did it change from 2020 to 2021?") = y2]. Finally, we will compare the revenue growth 

In [14]:
# Show the agent's final answer as plain text (print applies str() itself).
print(response)


Based on the previous reasoning, we can conclude that Lyft's revenue growth in 2021 was 36% compared to 2020. However, due to the lack of explicit information about Uber's total revenue in 2021, we cannot accurately compare the revenue growth of Uber and Lyft in 2021. We can only infer that Uber's revenue in 2021 was lower than in 2020, with a significant decrease in All Other revenue, but the exact amount of the decrease is not specified.
