- Have a maximum capacity
- Store key-value pairs (prompt -> response)
- When the cache is full and a new key is added, evict the least recently used item
- Update the "recently used" status when a key is accessed (get) or updated (put)

In [18]:
class LLMResponseCache:
    cache = {}
    recently_used = []

    def __init__(self, capacity: int):
        self.capacity = capacity

    def get(self, prompt: str) -> str | None:
        self.recently_used.append(prompt)
        return self.cache[prompt]

    def put(self, prompt: str, response: str) -> None:
        if self.capacity == len(self.cache) and prompt not in self.cache:
            self.cache.pop(self.recently_used.pop(0))
        self.recently_used.append(prompt)
        self.cache[prompt] = response

In [19]:
# Two slots only, so adding a third distinct prompt has to push one entry out.
cache = LLMResponseCache(capacity=2)

cache.put(prompt="What is AI?", response="AI stands for Artificial Intelligence...")
cache.put(prompt="Define ML", response="ML is Machine Learning...")

# Reading "What is AI?" is meant to refresh it, leaving "Define ML" as the stalest entry.
ai_answer = cache.get("What is AI?")
print(ai_answer)  # expected: "AI stands for Artificial Intelligence..."

# The cache is full at this point; under LRU rules "Define ML" is the entry to evict.
cache.put(prompt="What is NLP?", response="NLP is Natural Language Processing...")

ml_answer = cache.get("Define ML")
print(ml_answer)  # expected: None (evicted)
nlp_answer = cache.get("What is NLP?")
print(nlp_answer)  # expected: "NLP is Natural Language Processing..."

AI stands for Artificial Intelligence...
ML is Machine Learning...
NLP is Natural Language Processing...


In [20]:
# Three slots, filled completely before anything is read back.
cache = LLMResponseCache(capacity=3)

initial_entries = [
    ("prompt1", "response1"),
    ("prompt2", "response2"),
    ("prompt3", "response3"),
]
for prompt, response in initial_entries:
    cache.put(prompt, response)

cache.get("prompt1")  # touch prompt1 so it is no longer the oldest entry

# Under LRU rules prompt2 is now the stalest entry and is the one to be dropped.
cache.put(prompt="prompt4", response="response4")

# Look up each prompt in turn; the intended result is noted next to each key.
for prompt in (
    "prompt2",  # expected: None
    "prompt1",  # expected: "response1"
    "prompt3",  # expected: "response3"
    "prompt4",  # expected: "response4"
):
    print(cache.get(prompt))

KeyError: 'What is AI?'