# Vector Search Basics

# Create the Index

## Setup

In [None]:
#!pip install "openai<1" faiss-cpu pandas jupyter-datatables cassandra-driver python-dotenv

## Imports

In [None]:
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from cassandra.query import dict_factory
from cassandra.query import SimpleStatement
import openai
import pandas as pd

## Keys & Environment Variables

In [None]:
import os
from dotenv import load_dotenv

load_dotenv()

# Astra DB connection settings.
# os.environ[...] (rather than .get) makes a missing variable fail right
# here with a KeyError instead of later, in the middle of connecting.
ASTRA_DB_KEYSPACE = os.environ['ASTRA_DB_KEYSPACE']
ASTRA_DB_SECURE_BUNDLE_PATH = os.environ['ASTRA_DB_SECURE_BUNDLE_PATH']
ASTRA_DB_APPLICATION_TOKEN = os.environ['ASTRA_DB_APPLICATION_TOKEN']

# OpenAI credentials: keep the key in a notebook variable and hand the same
# value to the openai module in one step.
openai.api_key = openai_api_key = os.environ['OPENAI_KEY']

## Select a model to compute embeddings

Embeddings are numerical representations of concepts converted to number sequences, which make it easy for computers to understand the relationships between those concepts.

This embedding model from OpenAI - `text-embedding-ada-002` - replaces five separate models for text search, text similarity, and code search, and outperforms OpenAI's previous most capable embedding model, Davinci, at most tasks, while being priced 99.8% lower.

In [None]:
# Name of the OpenAI embedding model. It is passed as `model=` to every
# openai.Embedding.create call in this notebook, so the vectors stored in the
# table and the query vector are produced by the same model. The schema
# declares openai_description_embedding as vector<float, 1536>, so whatever
# model is named here must emit 1536-dimensional vectors.
model_id = "text-embedding-ada-002"

## Connect to Astra DB

In [None]:
# Connection settings: the driver is pointed at the secure connect bundle
# whose path was read from the environment.
cloud_config = {
    'secure_connect_bundle': ASTRA_DB_SECURE_BUNDLE_PATH,
}

# Authenticate with the literal user name 'token' and the application token
# as the password.
auth_provider = PlainTextAuthProvider('token', ASTRA_DB_APPLICATION_TOKEN)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
session = cluster.connect()

# Use the keyspace configured in the environment instead of a hard-coded
# name, so the notebook follows ASTRA_DB_KEYSPACE like the other settings.
session.set_keyspace(ASTRA_DB_KEYSPACE)

# Bare expression so the notebook shows the session object as cell output.
session

## Create Database Schema

Note the data type `vector` in the schema below.

In [None]:
# only use this to reset the schema
#session.execute(f"""DROP INDEX IF EXISTS openai_desc""")
#session.execute(f"""DROP INDEX IF EXISTS minilm_desc""")
#session.execute(f"""DROP TABLE IF EXISTS products_table""")

In [None]:
# Table definition: one row per (product, chunk), holding the product fields
# plus one vector column per embedding model (1536 and 384 floats).
create_table_cql = """CREATE TABLE IF NOT EXISTS products_table
(product_id int,
 chunk_id int,

 product_name text,
 description text,
 price text,

 openai_description_embedding vector<float, 1536>,
 minilm_description_embedding vector<float, 384>,

 PRIMARY KEY (product_id, chunk_id))"""
session.execute(create_table_cql)

# One storage-attached index per vector column, created in the same order as
# before: openai_desc first, then minilm_desc.
sai_index_class = 'org.apache.cassandra.index.sai.StorageAttachedIndex'
vector_indexes = (
    ("openai_desc", "openai_description_embedding"),
    ("minilm_desc", "minilm_description_embedding"),
)
for index_name, column_name in vector_indexes:
    session.execute(
        f"CREATE CUSTOM INDEX IF NOT EXISTS {index_name} "
        f"ON products_table ({column_name}) USING '{sai_index_class}'"
    )


## Load the table with data and create text embeddings

In [None]:
# Load the product catalogue from ProductDataset.csv (relative path, so it is
# resolved against the notebook's working directory). The loading loop later
# in this notebook reads the product_id, product_name, description and price
# columns from this DataFrame.
products_list = pd.read_csv('ProductDataset.csv')
# Bare expression so the notebook renders the DataFrame as the cell output.
products_list

In [None]:
# For every product row: split the description into fixed-size chunks, embed
# each chunk (with the price appended) through the OpenAI Embedding API, and
# insert one table row per chunk.

# Maximum number of description characters embedded per chunk.
text_chunk_length = 2500

# The INSERT text is identical for every row, so the statement is built once
# instead of once per chunk. Values are bound through the %s placeholders.
insert_statement = SimpleStatement(
                f"""
                INSERT INTO products_table
                (product_id, chunk_id, product_name, description, price, openai_description_embedding)
                VALUES (%s, %s, %s, %s, %s, %s)
                """
            )

for _, row in products_list.iterrows():
    description = row.description
    if not isinstance(description, str):
        # A missing description is presumably loaded by pandas as NaN (a
        # float), which supports neither len() nor slicing. There is no text
        # to embed, so skip the product instead of raising a TypeError.
        continue

    # Same rule as for the description: a non-string price is stored as an
    # empty string. It does not depend on the chunk, so compute it once.
    pricevalue = row.price if isinstance(row.price, str) else ""

    for chunk_id, start in enumerate(range(0, len(description), text_chunk_length)):
        chunk = description[start:start + text_chunk_length]
        # The price is appended so that it becomes part of the embedded text.
        full_chunk = f"{chunk} price: {pricevalue}"
        embedding = openai.Embedding.create(input=full_chunk, model=model_id)['data'][0]['embedding']

        # Every chunk row stores the full description next to the embedding
        # of its own chunk.
        session.execute(
            insert_statement,
            (row.product_id, chunk_id, row.product_name, description, pricevalue, embedding),
        )



---


# Use the index

In the steps up to this point, we have been creating a schema and loading the table with data, including embeddings we generated through the OpenAI Embedding API.
Now we are going to query that table and use the results to give ChatGPT some context to support its response.

## Convert a query string into a text embedding to use as part of the query

This is where the real fun starts.  Provide a question or request to be used as the query.  The source sample database is mostly consumer electronics and appliances, so imagine you're talking to a customer service rep at Best Buy or another electronics store.

Here we use the same API that we used to calculate embeddings for each row in the database, but this time we are using your input question to calculate a vector to use in a query.

In [None]:
# The customer's question, in natural language.
customer_input = "What equipement would you recommend for a computer workstation setup costing less than $2000?"

# Embed the question with the same model used for the stored product
# embeddings, then pull the vector out of the first entry of the response.
embedding_response = openai.Embedding.create(input=customer_input, model=model_id)
embedding = embedding_response['data'][0]['embedding']
display(embedding)

## Find the top 5 results using ANN Similarity

Let's take a look at what a query against a vector index could look like.  The query vector has the same dimensions (number of entries in the list) as the embeddings we generated a few steps ago for each row in the database.

In [None]:
# Ask the vector index for the 5 rows whose OpenAI description embedding is
# closest to the query embedding. The vector is bound through a %s
# placeholder, the same way the INSERT in the loading cell binds its values,
# rather than being formatted into the CQL text by hand.
query = SimpleStatement(
    f"""
    SELECT *
    FROM products_table
    ORDER BY openai_description_embedding ANN OF %s LIMIT 5;
    """
    )
#display(query)

results = session.execute(query, (embedding,))
# Materialise the rows by iterating the result set (public API) instead of
# reading the private `_current_rows` attribute.
top_5_products = list(results)

for row in top_5_products:
  print(f"""{row.product_id}, {row.product_name}, {row.description}\n""")

## Ask ChatGPT for some help

- Here we build a prompt with which we'll query ChatGPT.  Note the "roles" in this little conversation give the LLM more context about who that part of the conversation is coming from.
- This may take 10-20 seconds to return, so be patient.

In [None]:

# Build the conversation sent to ChatGPT: a system instruction, the customer's
# question plus extra user instructions, then assistant messages carrying the
# descriptions of the products returned by the ANN query.
message_objects = [
    {"role": "system",
     "content": "You're a chatbot helping customers with questions and helping them with product recommendations"},
    {"role": "user",
     "content": customer_input},
    {"role": "user",
     "content": "Please give me a detailed explanation of your recommendations"},
    {"role": "user",
     "content": "Please be friendly and talk to me like a person, don't just give me a list of recommendations"},
    {"role": "user",
     "content": "The computer component itself should be one from the recommended products I will provide"},
    {"role": "assistant",
     "content": "I found these 5 products I would recommend"},
]

# One assistant message per retrieved product. A dedicated name is used here
# so the `products_list` DataFrame loaded from the CSV is not overwritten;
# otherwise re-running the loading cell after this one would iterate nothing.
product_messages = [
    {"role": "assistant", "content": f"{row.description}"}
    for row in top_5_products
]
message_objects.extend(product_messages)

# Final assistant turn that the model is expected to continue from.
message_objects.append({"role": "assistant", "content":"Here's my summarized recommendation of products, and why it would suit you:"})

completion = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=message_objects
)
print(completion.choices[0].message['content'])