### 1. Connecting to Milvus and creating a database

In [51]:
#Setup database & collection
from pymilvus import connections
from pymilvus import db,Collection

from pymilvus import utility

# Names for the connection alias, the database and the collection
conn_name = "cache_conn"
db_name = "cache_db"
collection_name = "llm_cache"

# Register the connection settings under the alias held in conn_name.
# The alias is supplied through ** so the registered name always matches the
# one passed to connect() and to every later `using=conn_name` argument.
# NOTE(review): the credentials below are placeholders; load real ones from
# the environment instead of committing them to the notebook.
connections.add_connection(
    **{
        conn_name: {
            "host": "localhost",
            "port": "19530",
            "username": "username",
            "password": "password",
        }
    }
)

# Open the connection registered above
connections.connect(conn_name)

# Create the database only if it is not already present
current_dbs = db.list_database(using=conn_name)

if db_name not in current_dbs:
    print("Creating database :", db_name)
    db.create_database(db_name, using=conn_name)  # the built-in database is named "default"
else:
    print(db_name, ": Database already exists")

# Switch the connection to the cache database
db.using_database(db_name, using=conn_name)

cache_db : Database already exists


### 2. Creating collections

In [52]:
#Create a Collection for cache
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection
import json

# Define the fields stored for every cache entry

# Auto-generated primary key for each entity.
# (max_length only applies to VARCHAR fields, so it is not set on this INT64 key.)
cache_id = FieldSchema(
    name="cache_id",
    dtype=DataType.INT64,
    auto_id=True,
    is_primary=True)

# Text of the input prompt
prompt_text = FieldSchema(
    name="prompt_text",
    dtype=DataType.VARCHAR,
    max_length=2048)

# Text of the LLM response
response_text = FieldSchema(
    name="response_text",
    dtype=DataType.VARCHAR,
    max_length=2048)

# Embedding of the input prompt
prompt_embedding = FieldSchema(
    name="prompt_embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=1536  # must equal the vector length produced by the embedding model in use
)

# Define the schema for the cache collection
cache_schema = CollectionSchema(
    fields=[cache_id, prompt_text, response_text, prompt_embedding],
    description="Cache for LLM",
    enable_dynamic_field=True
)

# Create the collection on the cache connection.
# NOTE(review): confirm that `shard_num` is the keyword honoured by the
# installed pymilvus version (its documentation uses `shards_num` / `num_shards`).
cache_collection = Collection(
    name=collection_name,
    schema=cache_schema,
    using=conn_name,
    shard_num=2
)

print("Schema : ", cache_collection.schema, "\n")

# Build an index on the prompt embedding field; the metric chosen here has to
# agree with the metric_type used when searching the collection.
index_params = {
    "metric_type": "L2",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 1024}
}

cache_collection.create_index(
    field_name="prompt_embedding",
    index_params=index_params
)

# Flush the collection to persist
cache_collection.flush()
# Load the collection into memory so it can be searched
cache_collection.load()

Schema :  {'auto_id': True, 'description': 'Cache for LLM', 'fields': [{'name': 'cache_id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'prompt_text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2048}}, {'name': 'response_text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2048}}, {'name': 'prompt_embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1536}}], 'enable_dynamic_field': True} 



## 3. Save the [prompt, response, prompt embedding] into Milvus as a cache

In [53]:
from transformers import AutoTokenizer
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain.embeddings import OpenAIEmbeddings
import os
import time

 

# If you use the free tier, you may hit rate limits with the number of requests

# Prefer a key that is already exported in the environment; the blank fallback
# is only a placeholder, so a real key never has to be saved in the notebook
# and an existing environment value is no longer overwritten with a blank.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", " ")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Chat model that answers prompts which are not found in the cache.
# (The completion model `text-davinci-003` has been deprecated, hence ChatOpenAI.)
llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-3.5-turbo", temperature=0)

# Embedding model used to turn prompts into vectors
embeddings_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model="text-embedding-ada-002")

# Threshold handed to Milvus as the range-search "radius": with the L2 metric
# only entries whose distance to the query is below this value count as hits.
similarity_threshold = 0.3

search_params = {
    "metric_type": "L2",
    "offset": 0,
    "ignore_growing": False,
    "params": {"nprobe": 20, "radius": similarity_threshold}
}

# Function that runs the inference loop, consulting the cache before the LLM
def get_response(question):
    """Answer `question`, serving the answer from the Milvus cache when possible.

    The prompt is embedded and the closest cached prompt is searched for using
    the module-level `search_params`. On a hit the cached response is returned.
    On a miss the LLM is asked, and the prompt, response and embedding are
    inserted into the cache before the LLM's answer is returned.

    Args:
        question: The prompt text to answer.

    Returns:
        The response text, either read from the cache or produced by the LLM.
    """
    start_time = time.time()

    # Step 1: create the embedding for the incoming prompt
    prompt_embed = embeddings_model.embed_query(question)

    # Step 2: check whether the cache already holds a close enough prompt
    cache_results = cache_collection.search(
        data=[prompt_embed],            # embedding of the input query to search for
        anns_field="prompt_embedding",  # field to search with ANN
        param=search_params,            # search parameters, including metric_type
        limit=1,                        # look for the top result only
        expr=None,                      # placeholder for additional scalar conditions
        output_fields=["prompt_text", "response_text"],
        consistency_level="Strong"
    )

    if len(cache_results[0]) > 0:
        # Step 3a: cache hit, reuse the stored response
        print(question, " :\n Cache hit : ", cache_results[0])
        returned_response = cache_results[0][0].entity.get("response_text")
    else:
        # Step 3b: cache miss, ask the chat model
        messages = [
            ("human", f"{question}"),
        ]
        llm_response = llm.invoke(messages).content

        print(question, ":\n LLM returned :", llm_response)
        # Return the model's answer (previously the question itself was returned)
        returned_response = llm_response

        # Step 4: store prompt/response/embedding as one new cache entry.
        # The data is column-based, in the order the non-auto fields are declared.
        cache_collection.insert([[question], [llm_response], [prompt_embed]])

    end_time = time.time()
    print("Time elapsed :", end_time - start_time, "\n")
    return returned_response
    

In [54]:
# Build up the cache: a prompt that is not cached yet is answered by the LLM
# and then stored, so later similar prompts can be served from Milvus.
for seed_question in (
    "In which year was Abraham Lincoln born?",
    "What is distance between the sun and the moon?",
    "How many years have Lebron James played in the NBA?",
    "What are the advantages of the python language?",
    "What is the typical height of an elephant",
):
    response = get_response(seed_question)


In which year was Abraham Lincoln born? :
 LLM returned : Abraham Lincoln was born on February 12, 1809.
Time elapsed : 2.2014362812042236 

What is distance between the sun and the moon? :
 LLM returned : The average distance between the sun and the moon is approximately 238,855 miles (384,400 kilometers).
Time elapsed : 1.9139692783355713 

How many years have Lebron James played in the NBA? :
 LLM returned : LeBron James has played in the NBA for 19 seasons as of the 2021-2022 season.
Time elapsed : 2.0807125568389893 

What are the advantages of the python language? :
 LLM returned : 1. Easy to learn and use: Python has a simple and easy-to-read syntax, making it a great language for beginners to learn programming.

2. Versatile: Python can be used for a wide range of applications, including web development, data analysis, artificial intelligence, machine learning, and more.

3. Large standard library: Python comes with a large standard library that provides a wide range of modules

## 4. Cache hit

In [55]:
# Rephrased versions of two prompts that were asked while building the cache
for rephrased_question in (
    "List some advantages of the python language",
    "How tall is an elephant?",
):
    response = get_response(rephrased_question)

List some advantages of the python language  :
 Cache hit :  ["id: 451225468120105048, distance: 0.04900672659277916, entity: {'prompt_text': 'What are the advantages of the python language?', 'response_text': '1. Easy to learn and use: Python has a simple and easy-to-read syntax, making it a great language for beginners to learn programming.\\n\\n2. Versatile: Python can be used for a wide range of applications, including web development, data analysis, artificial intelligence, machine learning, and more.\\n\\n3. Large standard library: Python comes with a large standard library that provides a wide range of modules and packages for various tasks, reducing the need to write code from scratch.\\n\\n4. Community support: Python has a large and active community of developers who contribute to its development, provide support, and create libraries and frameworks that extend its capabilities.\\n\\n5. Cross-platform compatibility: Python is a cross-platform language, meaning that code writt

## 5. Cache miss

In [56]:
# A prompt unrelated to anything asked so far
response = get_response("How old are you?")

How old are you? :
 LLM returned : I am an AI digital assistant, so I do not have an age in the traditional sense.
Time elapsed : 2.1015889644622803 

