#### TOOL CALLING WITH AGENTS

In [1]:
import dotenv
%load_ext dotenv
%dotenv
import nest_asyncio
nest_asyncio.apply()

In [2]:
#DEFINE TOOLS
def add(x: int, y: int) -> int:
    """Return the sum of x and y."""
    total = x + y
    return total

def substract(x: int, y: int) -> int:
    """Subtract y from x and return the difference."""
    difference = x - y
    return difference

def multiply(x: int, y: int) -> int:
    """Return the product of x and y."""
    product = x * y
    return product

def divide(x: int, y: int) -> float:
    """Divide x by y using true division.

    Args:
        x: The dividend.
        y: The divisor; must not be zero.

    Returns:
        The quotient as a float (for example, 7 divided by 2 is 3.5).

    Raises:
        ZeroDivisionError: If y is 0.
    """
    # `/` is true division and always yields a float, hence the float
    # return annotation (the previous `-> int` misdescribed the result).
    return x / y

def get_user_info(username: str) -> str:
    """Look up a user in the in-memory demo database, ignoring case.

    Args:
        username: The user's short name, e.g. "Antonio" or "nelson".

    Returns:
        A string of the form "Username: <username>, Info: <record>", where
        <record> is the user's dict, or the text "User not found" when no
        entry matches.
    """

    database = {
        "Antonio": {
            "name": "Antonio Lopez",
            "age": 30,
            "email": "antonio@example.com"
        },
        "Nelson": {
            "name": "Nelson Rodriguez",
            "age": 25,
            "email": "nelson@example.com"
        }
    }

    # The keys are capitalised, so lower-casing only the query (as before)
    # could never match. Normalise both sides for a case-insensitive lookup.
    records_by_lowercase_name = {
        name.lower(): record for name, record in database.items()
    }
    record = records_by_lowercase_name.get(username.lower(), 'User not found')

    return f"Username: {username}, Info: {record}"


# CREATE TOOLS FROM PYTHON FUNCTIONS
# Each plain function defined above is wrapped in a FunctionTool so it can be
# handed to the LLM tool-calling APIs used later in this notebook.
from llama_index.core.tools import FunctionTool

addition_tools = FunctionTool.from_defaults(fn=add)
substraction_tools = FunctionTool.from_defaults(fn=substract)
multiplication_tools = FunctionTool.from_defaults(fn=multiply)
divide_tools = FunctionTool.from_defaults(fn=divide)
get_user_info_tools = FunctionTool.from_defaults(fn=get_user_info)

# All arithmetic/user tools in one list (used by the commented-out
# predict_and_call example in the next cell).
tools = [addition_tools, substraction_tools, multiplication_tools, divide_tools, get_user_info_tools]

# TEST TOOLS
# `llm` is the notebook-global model handle; it is also passed to the agent
# workers created further down.
from llama_index.llms.openai import OpenAI

llm = OpenAI(model="gpt-3.5-turbo")


In [3]:
# response = llm.predict_and_call(
#     tools,
#     "Add 2 and 3", 
#     verbose=True
# )

# print(str(response))

#### VECTOR SEARCH WITH METADATA

In [4]:
from llama_index.core import SimpleDirectoryReader

# Read the LoRA paper from a local PDF (path is relative to the notebook's
# working directory).
# NOTE(review): the metadata filters below rely on a `page_label` key on each
# loaded document — presumably set by the PDF reader; confirm.
documents = SimpleDirectoryReader(input_files=["./datasets/lora_paper.pdf"]).load_data()

In [5]:
from llama_index.core.node_parser import SentenceSplitter

# Split the loaded documents into nodes with chunk_size=1024; `nodes` is the
# shared input for both the vector index and the summary index built below.
splitter = SentenceSplitter(chunk_size=1024)
nodes = splitter.get_nodes_from_documents(documents=documents)

In [6]:
len(nodes)

38

In [7]:
### LLM AND EMBEDDING MODEL
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Global defaults picked up by indexes and query engines created afterwards.
Settings.llm = OpenAI(model="gpt-3.5-turbo")
# The attribute LlamaIndex reads for embeddings is `embed_model`. Assigning to
# `Settings.embedding` (as before) only created an unused attribute, so the
# embedding model chosen here was never actually applied.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")



In [8]:
from llama_index.core import VectorStoreIndex

# Vector index over the parsed nodes. `vector_index` is a notebook-global that
# the query engine cell and vector_search_query() below both read.
# NOTE(review): building the index presumably embeds every node via the
# configured embedding model (network calls) — confirm before re-running often.
vector_index = VectorStoreIndex(nodes=nodes)

In [9]:
from llama_index.core.vector_stores import MetadataFilters

# Query engine restricted by a metadata filter: only nodes whose `page_label`
# equals the string "2" are eligible, and similarity_top_k is set to 3.
query_engine = vector_index.as_query_engine(
    similarity_top_k=3,
    filters=MetadataFilters.from_dicts(
        [
            {"key": "page_label", "value": "2"}
        ]
    )
)

# Example query (left disabled):
# response = query_engine.query("Tell me about the problem statement as explained in page 2")
# print(str(response))

In [10]:
# for n in response.source_nodes:
#     print(n.metadata)
#     print("==================")
#     print(n.get_text())
#     print("==================")

In [11]:
from typing import List, Optional
from llama_index.core.vector_stores import FilterCondition
from llama_index.core.vector_stores import MetadataFilters

def vector_search_query(
        query: str,
        page_numbers: Optional[List[str]] = None
) -> str:
    """
    Conduct a vector search across an index using the following parameters:

    query (str): This is the text string you want to embed and search for within the index.
    page_numbers (Optional[List[str]]): This parameter allows you to limit the search to
    specific pages. If omitted or left empty, the search will encompass all pages in the index.
    If page numbers are specified, the search will be filtered to only include those pages.

    Returns the response produced by the query engine; calling str() on it
    yields the answer text.
    """
    # NOTE(review): the signature is annotated `-> str`, but the object returned
    # by query_engine.query() is passed through unchanged. The annotation is
    # kept so the tool's declared interface stays stable.

    # Treat a missing argument like an empty list so "no page restriction"
    # works as the docstring promises instead of failing on None.
    requested_pages = page_numbers or []

    # Page labels are matched as strings elsewhere in this notebook (e.g. "2"),
    # so coerce each entry in case the caller supplies integers.
    metadata_dicts = [
        {"key": "page_label", "value": str(p)} for p in requested_pages
    ]

    # OR condition: a node qualifies if it matches any of the requested pages.
    query_engine = vector_index.as_query_engine(
        similarity_top_k=2,
        filters=MetadataFilters.from_dicts(
            metadata_dicts,
            condition=FilterCondition.OR
        )
    )

    response = query_engine.query(query)
    return response

In [12]:
# Expose vector_search_query() as a tool under an explicit tool name; this
# object is later passed to the agent workers alongside summary_tool.
vector_query_tool = FunctionTool.from_defaults(
    fn=vector_search_query,
    name="vector_search_query_tool"
)

In [13]:
# response=llm.predict_and_call(
#     [vector_query_tool],
#     "Explain the problem statement in page 2",
#     verbose=True

# )

In [14]:
# for n in response.source_nodes:
#     print(n.metadata)
#     print("==================")
#     print(n.get_text())
#     print("==================")

In [15]:
from llama_index.core import SummaryIndex
from llama_index.core.tools import QueryEngineTool

# Summary index built from the same nodes as the vector index.
summary_index = SummaryIndex(nodes=nodes)

# NOTE(review): despite its name, this variable holds the query engine, not a
# tool; the tool wrapper is `summary_tool` below.
summary_query_engine_tool = summary_index.as_query_engine(
    use_async=True,
    response_mode="tree_summarize"
)

# Wrap the summary query engine as a named tool; the description is the text
# that tells the model when this tool is appropriate.
summary_tool = QueryEngineTool.from_defaults(
    query_engine=summary_query_engine_tool,
    name="summary_tool",
    description="Useful for summarization questions related to the Lora paper."
)

In [16]:
# response = llm.predict_and_call(
#     [summary_tool, vector_query_tool],
#     "Summarize how to apply Lora to Transfomer in page 5 in 2 sentences",
#     verbose=True
# )

#### AGENT WORKER

In [18]:
from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.agent import AgentRunner

# Worker configured with the two paper tools and the notebook-global `llm`;
# verbose=True makes the tool calls show up in the cell output.
agent_worker = FunctionCallingAgentWorker.from_tools(
    tools=[summary_tool, vector_query_tool], 
    llm=llm,
    verbose=True,
)

# Runner that wraps the worker; this is the object queried in the next cells.
agent = AgentRunner(agent_worker)

In [19]:
# Ask the agent a single question; with verbose=True on the worker, the tool
# selection and tool output are echoed in the cell output before the answer.
response = agent.query(
    "Explain low-rank structures in Deep Learning and how it applies  for llms"
)

print(str(response))

Added user message to memory: Explain low-rank structures in Deep Learning and how it applies  for llms
=== Calling Function ===
Calling function: summary_tool with args: {"input": "Explain low-rank structures in Deep Learning and how it applies for LLMs"}
=== Function Output ===
Low-rank structures in Deep Learning involve representing weight matrices in neural networks using a reduced number of dimensions. This approach helps in decreasing computational complexity and memory requirements while retaining crucial information. In the context of Large Language Models (LLMs) like GPT-2 and GPT-3, low-rank structures are employed to efficiently adapt pre-trained models to specific downstream tasks. By updating only a low-rank approximation of the weight matrices during fine-tuning, LLMs can be effectively adapted to new tasks without significantly increasing the number of trainable parameters. This method enables the adaptation of LLMs to various tasks while maintaining performance and red

In [20]:
# Second question to the same `agent` object, this time through chat().
response = agent.chat(
    "Explain what is LoRA and why and how it's used. Are exisiting solutions not enough?",
)

print(str(response))

Added user message to memory: Explain what is LoRA and why and how it's used. Are exisiting solutions not enough?
=== Calling Function ===
Calling function: summary_tool with args: {"input": "Explain what is LoRA and why and how it's used. Are existing solutions not enough?"}
=== Function Output ===
LoRA, or Low-Rank Adaptation, is a method used to adapt large-scale pre-trained language models to specific tasks or domains. It involves freezing the pre-trained model weights and introducing trainable rank decomposition matrices into each layer of the Transformer architecture. This approach significantly reduces the number of trainable parameters for downstream tasks, making training more efficient. LoRA allows for optimizing the rank decomposition matrices of dense layers during adaptation while keeping the pre-trained weights frozen. When deployed, LoRA does not introduce additional inference latency compared to fully fine-tuned models.

Existing solutions for adapting language models, 

In [29]:
from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.agent import AgentRunner

# Same construction as the earlier agent cell (same tools, same llm). It
# rebinds `agent_worker` and `agent`, so the step-by-step cells below operate
# on a new AgentRunner instance rather than the one used for query()/chat().
agent_worker = FunctionCallingAgentWorker.from_tools(
    tools=[summary_tool, vector_query_tool], 
    llm=llm,
    verbose=True,
)

agent = AgentRunner(agent_worker)

In [30]:
# Create a task without running it; the steps are executed manually in the
# following cells. Adjacent string literals are joined with no separator, so
# the first fragment ends with a space to keep the two sentences apart
# (previously the prompt read "...used.Are existing...").
task = agent.create_task(
    "Explain what is LoRA and why and how it's used. "
    "Are existing solutions not enough?"
)

In [31]:
# Execute one step of the task created above.
# NOTE(review): the recorded output shows this `input` text ("Summarize the
# LoRA paper") being added to memory, not the question given to create_task —
# confirm that substituting the prompt for this step is intended.
step_output = agent.run_step(
    task.task_id, 
    input = "Summarize the LoRA paper"
)

Added user message to memory: Summarize the LoRA paper
=== Calling Function ===
Calling function: summary_tool with args: {"input": "LoRA paper"}
=== Function Output ===
The LoRA paper introduces a method for efficient adaptation of large language models by freezing pre-trained model weights and incorporating trainable rank decomposition matrices into each layer of the Transformer architecture. This approach reduces the number of trainable parameters for downstream tasks while maintaining or even surpassing the performance of traditional fine-tuning methods. The paper includes empirical investigations into rank-deficiency during language model adaptation and demonstrates the efficacy of LoRA through experiments on tasks like E2E NLG Challenge, MNLI, and WikiSQL. Additionally, the paper discusses the Backward Feature Correction method and its application in deep learning tasks, highlighting the benefits of LoRA modules with varying ranks in adapting pre-trained models such as GPT-2 and 

In [32]:
# Run the next step of the same task; rebinds `step_output` to the newest step.
step_output = agent.run_step(task.task_id)

=== LLM Response ===
The LoRA paper introduces a method for efficient adaptation of large language models by incorporating trainable rank decomposition matrices into each layer of the Transformer architecture. This approach reduces the number of trainable parameters for downstream tasks while maintaining or surpassing performance. The paper includes empirical investigations and experiments on tasks like E2E NLG Challenge, MNLI, and WikiSQL, demonstrating the efficacy of LoRA. Additionally, it discusses the Backward Feature Correction method and the benefits of LoRA modules with varying ranks in adapting pre-trained models like GPT-2 and GPT-3.


In [33]:
# Inspect what the task has done so far. Despite the singular name,
# `completed_step` is the collection returned by get_completed_steps().
completed_step = agent.get_completed_steps(task.task_id)

print(task.task_id)
if len(completed_step) > 0:
    # NOTE(review): the task id is printed a second time here (see the
    # duplicated id in the output); one of the two prints is likely redundant.
    print(task.task_id)
    # Raw output of the first tool call recorded on the first completed step.
    # NOTE(review): `sources[0]` assumes that step invoked at least one tool —
    # confirm, or guard against an empty `sources` list.
    print(completed_step[0].output.sources[0].raw_output)

fa2869ea-cf7a-471b-962f-97b898bb9a82
fa2869ea-cf7a-471b-962f-97b898bb9a82
The LoRA paper introduces a method for efficient adaptation of large language models by freezing pre-trained model weights and incorporating trainable rank decomposition matrices into each layer of the Transformer architecture. This approach reduces the number of trainable parameters for downstream tasks while maintaining or even surpassing the performance of traditional fine-tuning methods. The paper includes empirical investigations into rank-deficiency during language model adaptation and demonstrates the efficacy of LoRA through experiments on tasks like E2E NLG Challenge, MNLI, and WikiSQL. Additionally, the paper discusses the Backward Feature Correction method and its application in deep learning tasks, highlighting the benefits of LoRA modules with varying ranks in adapting pre-trained models such as GPT-2 and GPT-3.


In [34]:
# Show the `is_last` flag of the most recent step output.
# NOTE(review): presumably True means the task has no further steps to run — confirm.
print(step_output.is_last)

True


In [35]:
# List any steps still queued for the task and, if there are some, show the
# input of the first one. No output is recorded for this cell, which suggests
# the list was empty on the saved run.
upcoming_steps = agent.get_upcoming_steps(task.task_id)

if len(upcoming_steps) > 0:
    print(upcoming_steps[0].input)

In [37]:
# step_output = agent.run_step(task.task_id)
# print(step_output.is_last)

In [28]:
# Turn the finished task into the final agent response and print it.
# NOTE(review): this cell's execution count (28) is lower than the cells above
# it (29-37), so the saved output presumably comes from an earlier run with a
# different agent/task. Re-run the notebook top to bottom to refresh it.
response = agent.finalize_response(task.task_id)
print(str(response))

LoRA is a method used to adapt pre-trained language models for downstream tasks by injecting trainable rank decomposition matrices into each layer of the Transformer architecture. This approach reduces the number of trainable parameters while maintaining or improving model quality compared to traditional fine-tuning methods. LoRA amplifies task-specific directions in the weight matrices of pre-trained models by updating a low-rank matrix to emphasize important but not emphasized directions in the original weight matrix. The amplification factor in LoRA varies based on the chosen rank for the adaptation matrix, with lower ranks generally amplifying task-specific directions more effectively.

As for why and how LoRA is used, it offers a more efficient way to adapt pre-trained language models for specific tasks by enhancing task-specific information in the model while reducing the number of parameters that need to be fine-tuned. This can lead to improved performance and faster adaptation 