#### Cache Using LangChain

In [1]:
from langchain.cache import InMemoryCache
from langchain.globals import set_llm_cache
from typing import Any, Dict, Tuple

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0.0,
    max_tokens=1000,
    streaming=True,
    verbose=True,
    request_timeout=60,
    max_retries=3
)
print("LLM initialized successfully.")
response = llm.invoke("What is the capital of France?")
print(response)

LLM initialized successfully.
content='The capital of France is Paris.' additional_kwargs={} response_metadata={'finish_reason': 'stop', 'model_name': 'gpt-3.5-turbo-0125', 'service_tier': 'default'} id='run--0d1b2fc5-f813-4fb4-8256-78ffc3e59153-0'


In [3]:
class DebuggableCache(InMemoryCache):
    def __init__(self):
        super().__init__()
        self._cache: Dict[Tuple[str, str], Any] = {}

    def lookup(self, prompt: str, llm_string: str):
        return self._cache.get((prompt, llm_string))

    def update(self, prompt: str, llm_string: str, return_val: Any):
        self._cache[(prompt, llm_string)] = return_val

    def view_cache(self):  # this is our custom method
        return self._cache

In [4]:
dbg_cache = DebuggableCache()
set_llm_cache(dbg_cache)

In [5]:
response = llm.invoke("What is the capital of France?")
response

AIMessage(content='The capital of France is Paris.', additional_kwargs={}, response_metadata={'finish_reason': 'stop', 'model_name': 'gpt-3.5-turbo-0125', 'service_tier': 'default'}, id='run--9550fe50-94c6-40c0-9831-2465b27bf271-0')

In [6]:
print("\nCache Contents:")
for k, v in dbg_cache.view_cache().items():
    print(f"Prompt: {k[0]} | Cached Output: {v}")


Cache Contents:
Prompt: [{"lc": 1, "type": "constructor", "id": ["langchain", "schema", "messages", "HumanMessage"], "kwargs": {"content": "What is the capital of France?", "type": "human"}}] | Cached Output: [ChatGeneration(text='The capital of France is Paris.', generation_info={'finish_reason': 'stop', 'model_name': 'gpt-3.5-turbo-0125', 'service_tier': 'default'}, message=AIMessage(content='The capital of France is Paris.', additional_kwargs={}, response_metadata={'finish_reason': 'stop', 'model_name': 'gpt-3.5-turbo-0125', 'service_tier': 'default'}, id='run--9550fe50-94c6-40c0-9831-2465b27bf271-0'))]


In [7]:
response = llm.invoke("What is the capital of France?")

In [8]:
print("\nCache Contents:")
for k, v in dbg_cache.view_cache().items():
    print(f"Prompt: {k[0]} | Cached Output: {v}")


Cache Contents:
Prompt: [{"lc": 1, "type": "constructor", "id": ["langchain", "schema", "messages", "HumanMessage"], "kwargs": {"content": "What is the capital of France?", "type": "human"}}] | Cached Output: [ChatGeneration(text='The capital of France is Paris.', generation_info={'finish_reason': 'stop', 'model_name': 'gpt-3.5-turbo-0125', 'service_tier': 'default'}, message=AIMessage(content='The capital of France is Paris.', additional_kwargs={}, response_metadata={'finish_reason': 'stop', 'model_name': 'gpt-3.5-turbo-0125', 'service_tier': 'default'}, id='run--9550fe50-94c6-40c0-9831-2465b27bf271-0', usage_metadata={'total_cost': 0}))]


In [10]:
response = llm.invoke("What is the capital of United States?")
print("LLM Response:", response)

LLM Response: content='The capital of the United States is Washington, D.C.' additional_kwargs={} response_metadata={'finish_reason': 'stop', 'model_name': 'gpt-3.5-turbo-0125', 'service_tier': 'default'} id='run--12d26486-2309-4965-8b0b-44ed7239d892-0'


In [11]:
print("\nCache Contents:")
for k, v in dbg_cache.view_cache().items():
    print(f"Prompt: {k[0]} | Cached Output: {v}")


Cache Contents:
Prompt: [{"lc": 1, "type": "constructor", "id": ["langchain", "schema", "messages", "HumanMessage"], "kwargs": {"content": "What is the capital of France?", "type": "human"}}] | Cached Output: [ChatGeneration(text='The capital of France is Paris.', generation_info={'finish_reason': 'stop', 'model_name': 'gpt-3.5-turbo-0125', 'service_tier': 'default'}, message=AIMessage(content='The capital of France is Paris.', additional_kwargs={}, response_metadata={'finish_reason': 'stop', 'model_name': 'gpt-3.5-turbo-0125', 'service_tier': 'default'}, id='run--9550fe50-94c6-40c0-9831-2465b27bf271-0', usage_metadata={'total_cost': 0}))]
Prompt: [{"lc": 1, "type": "constructor", "id": ["langchain", "schema", "messages", "HumanMessage"], "kwargs": {"content": "What is the capital of india?", "type": "human"}}] | Cached Output: [ChatGeneration(text='The capital of India is New Delhi.', generation_info={'finish_reason': 'stop', 'model_name': 'gpt-3.5-turbo-0125', 'service_tier': 'defau

In [12]:
response = llm.invoke("What is the capital of United States?")
print("LLM Response:", response)

LLM Response: content='The capital of the United States is Washington, D.C.' additional_kwargs={} response_metadata={'finish_reason': 'stop', 'model_name': 'gpt-3.5-turbo-0125', 'service_tier': 'default'} id='run--12d26486-2309-4965-8b0b-44ed7239d892-0' usage_metadata={'total_cost': 0}


In [13]:
response = llm.invoke("give me 1000 lines of essay on science and give the importance of it regarding mathematics?")
print("LLM Response:", response)

LLM Response: content='Science is a vast field of study that encompasses a wide range of disciplines, including biology, chemistry, physics, and astronomy. It is the systematic study of the natural world through observation and experimentation, with the goal of understanding how the world works and why things happen the way they do. Science has played a crucial role in shaping our understanding of the universe and has led to countless technological advancements that have improved our quality of life.\n\nOne of the key components of science is mathematics, which serves as the language of science and provides the tools necessary for making sense of the natural world. Mathematics is the study of numbers, quantities, shapes, and patterns, and it is used in science to describe and analyze the relationships between different variables. Without mathematics, it would be impossible to quantify and measure the phenomena that scientists observe, making it an essential tool for conducting scientif

In [14]:
from langchain.callbacks import get_openai_callback
with get_openai_callback() as cb:
    response = llm.invoke("Tell me a joke about LangChain and ECS")
    print("Response:", response.content)
    print("Token Usage Stats:", cb)

Response: Why did LangChain and ECS break up? Because they couldn't find a common language to communicate in!
Token Usage Stats: Tokens Used: 0
	Prompt Tokens: 0
		Prompt Tokens Cached: 0
	Completion Tokens: 0
		Reasoning Tokens: 0
Successful Requests: 0
Total Cost (USD): $0.0


In [15]:
questions = [
    "What is Retrieval-Augmented Generation?",
    "How does FAISS indexing work?",
    "What is the difference between fine-tuning and RAG?"
]

with get_openai_callback() as cb:
    for q in questions:
        answer = llm.invoke(q)
        print(f"\nQ: {q}\nA: {answer.content}")

    print("\n=== Token Usage Summary ===")
    print(f"Total Tokens: {cb.total_tokens}")
    print(f"Prompt Tokens: {cb.prompt_tokens}")
    print(f"Completion Tokens: {cb.completion_tokens}")
    print(f"Total Cost (USD): ${cb.total_cost:.6f}")


Q: What is Retrieval-Augmented Generation?
A: Retrieval-Augmented Generation (RAG) is a natural language processing model that combines the capabilities of both retrieval-based and generation-based models. In RAG, a retrieval component is used to retrieve relevant information from a large database or knowledge base, which is then used by a generation component to generate a response or answer to a given query or prompt. This approach allows the model to leverage the benefits of both retrieval and generation models, resulting in more accurate and contextually relevant responses. RAG has been shown to outperform traditional generation models in tasks such as question answering and dialogue generation.

Q: How does FAISS indexing work?
A: FAISS (Facebook AI Similarity Search) is a library for efficient similarity search and clustering of large datasets. It works by creating an index that organizes the data in a way that makes it faster to search for similar items.

The indexing process i