# Vector Search Descriptions

1. Source consumer-focused descriptions for each product using an LLM.
2. Create embeddings from the improved descriptions.
3. Store the new embeddings.

## Imports

In [None]:
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from cassandra.query import dict_factory
from cassandra.query import SimpleStatement
import openai
import pandas as pd

## Keys & Environment Variables

In [None]:
import os
from dotenv import load_dotenv

# Load settings from a .env file before reading them from the environment.
load_dotenv()

# Astra DB: all three settings are mandatory; a missing one raises KeyError.
ASTRA_DB_KEYSPACE, ASTRA_DB_SECURE_BUNDLE_PATH, ASTRA_DB_APPLICATION_TOKEN = (
    os.environ[setting]
    for setting in (
        'ASTRA_DB_KEYSPACE',
        'ASTRA_DB_SECURE_BUNDLE_PATH',
        'ASTRA_DB_APPLICATION_TOKEN',
    )
)

# OpenAI token: kept in a variable and registered on the openai module.
openai_api_key = openai.api_key = os.environ['OPENAI_API_KEY']

## Select a model to compute embeddings

Embeddings are numerical representations of concepts converted to number sequences, which make it easy for computers to understand the relationships between those concepts.

This embedding model from OpenAI - `text-embedding-ada-002` - replaces five separate models for text search, text similarity, and code search, and outperforms OpenAI's previous most capable model, Davinci, at most tasks, while being priced 99.8% lower.

In [None]:
# Name of the OpenAI embedding model. The same model is used for the stored
# product embeddings and for the query embedding, so the vectors are comparable.
# The products_table schema declares vector<float, 1536> columns, so switching
# to a model with a different output size would also require a schema change.
model_id = "text-embedding-ada-002"

## Connect to Astra DB

In [None]:
# Open a session against Astra DB using the secure connect bundle and the
# application token read from the environment.
cloud_config = {
    'secure_connect_bundle': ASTRA_DB_SECURE_BUNDLE_PATH,
}
auth_provider = PlainTextAuthProvider('token', ASTRA_DB_APPLICATION_TOKEN)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
session = cluster.connect()

# Use the keyspace configured in ASTRA_DB_KEYSPACE instead of a hard-coded
# name, so the notebook follows the environment settings it already requires.
session.set_keyspace(ASTRA_DB_KEYSPACE)

# Bare expression: shows the session object as the cell output.
session

## Database Schema

> **Note:** The following blocks only need to be run when you create the schema. Otherwise, use them at your discretion.

Note the data type `vector` in the schema below.

### Drop Schema

> **Note:** Only run this block when you want to DROP the schema.

In [None]:
# only use this to DROP the schema
# Indexes are dropped first, then the table itself. IF EXISTS makes every
# statement a no-op when the object is absent.
for index_name in ("openai_desc", "consumer_desc", "combined_desc", "minilm_desc"):
    session.execute(f"DROP INDEX IF EXISTS {index_name}")

session.execute("DROP TABLE IF EXISTS products_table")

### Create Schema

> **Note:** Only run this block when you want to CREATE the schema.

In [None]:
# CREATE the schema

# One row per (product_id, chunk_id). Each description variant has its own
# 1536-dimension float vector column.
create_table_cql = """CREATE TABLE IF NOT EXISTS products_table
(product_id int,
 chunk_id int,

 product_name text,
 description text,
 consumer_description text,
 price text,
 
 openai_description_embedding vector<float, 1536>,
 consumer_description_embedding vector<float, 1536>,
 combined_description_embedding vector<float, 1536>,

 PRIMARY KEY (product_id, chunk_id))"""
session.execute(create_table_cql)

# Create Index: one storage-attached index per embedding column, all built
# from the same statement template.
sai_index_cql = "CREATE CUSTOM INDEX IF NOT EXISTS {index} ON products_table ({column}) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'"
session.execute(sai_index_cql.format(index="openai_desc", column="openai_description_embedding"))
session.execute(sai_index_cql.format(index="consumer_desc", column="consumer_description_embedding"))
session.execute(sai_index_cql.format(index="combined_desc", column="combined_description_embedding"))


## Create embeddings and Store in DB 

### Read CSV file

In [None]:
# Load the product dataset into a DataFrame; the bare expression on the last
# line shows it as the cell output.
products_list = pd.read_csv(filepath_or_buffer='ProductDataset.csv')
products_list

### Generate consumer description from the product name using OpenAI

In [None]:
products_list['consumer_description'] = ""

# The INSERT text is identical for every product, so the statement is built
# once here instead of on every loop iteration.
insert_product = SimpleStatement(
                """
                INSERT INTO products_table
                (product_id, chunk_id, product_name, description, consumer_description, price, openai_description_embedding, consumer_description_embedding, combined_description_embedding)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                """
            )

# For each product: generate a consumer description with the chat model,
# embed three text variants (original, consumer, combined), and store the row.
# The loop variable is `idx` rather than `id` to avoid shadowing the builtin.
for idx, row in products_list.iterrows():

    print(row.product_name)

    ### GENERATE CONSUMER DESCRIPTION ###
    print("    - generating consumer description")

    # Create Prompt
    # NOTE(review): the prompt is built from the product name only;
    # row.description is not sent to the model - confirm this is intended.
    message_objects = [
        {
            "role": "user",
            "content": f"Provide a single paragraph consumer level description of the product: {row.product_name}",
        }
    ]

    # Generate consumer description
    completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=message_objects)
    consumer_description = completion.choices[0].message['content']

    # Update DataFrame with completion
    products_list.at[idx, 'consumer_description'] = consumer_description

    ### GENERATE EMBEDDINGS ###
    print("    - generating embeddings")

    # Normalise the price to text. Strings are kept as-is and missing values
    # become "". Numeric prices are stringified: previously any non-string
    # price was silently replaced by "", which dropped the price entirely
    # when pandas parsed the column as numbers.
    if isinstance(row.price, str):
        pricevalue = row.price
    elif pd.isna(row.price):
        pricevalue = ""
    else:
        pricevalue = str(row.price)

    # append price to description
    original = f"{row.description} price: {pricevalue}"
    # append price to consumer description
    consumer = f"{consumer_description} price: {pricevalue}"
    # append price to combined description
    combined = f"{consumer_description} {row.description} price: {pricevalue}"

    # Create embedding
    embedding = openai.Embedding.create(input=original, model=model_id)['data'][0]['embedding']
    # Create consumer embedding
    embedding_consumer = openai.Embedding.create(input=consumer, model=model_id)['data'][0]['embedding']
    # Create combined embedding
    embedding_combined = openai.Embedding.create(input=combined, model=model_id)['data'][0]['embedding']

    ### WRITE TO DATABASE ###
    print("    - writing to database")

    # Insert Data and Embedding into database. chunk_id is always 0 here.
    # product_id is coerced to a plain int because the column is declared as
    # CQL int and pandas may hand back a non-int numeric value.
    session.execute(
        insert_product,
        (
            int(row.product_id),
            0,
            row.product_name,
            row.description,
            consumer_description,
            pricevalue,
            embedding,
            embedding_consumer,
            embedding_combined,
        ),
    )


In [None]:
# Save the product table, now including the consumer_description column,
# to a new CSV file.
products_list.to_csv(path_or_buf='ProductDatasetCombined.csv')

## Convert a query string into a text embedding to use as part of the query

In [None]:
# Embed the customer's request with the same model used for the products,
# then show the resulting vector as the cell output.
customer_input = "recommend a camera for novice photographer"
query_response = openai.Embedding.create(input=customer_input, model=model_id)
embedding = query_response['data'][0]['embedding']
display(embedding)

## Find the top 5 results using ANN Similarity

Let's take a look at what a query against a vector index could look like.  The query vector has the same dimensions (number of entries in the list) as the embeddings we generated a few steps ago for each row in the database.

In [None]:
# Fetch the five products whose consumer_description_embedding is nearest to
# the query embedding (ANN ordering), and select the dot-product similarity
# of each hit as `sim`.
query = SimpleStatement(
    f"""
    SELECT product_id, product_name, description, consumer_description, price, similarity_dot_product(consumer_description_embedding, {embedding}) as sim
    FROM products_table
    ORDER BY consumer_description_embedding ANN OF {embedding} LIMIT 5;
    """
    )

results = session.execute(query)

# Read rows through the public iterator instead of the private
# ResultSet._current_rows attribute, which is a driver-internal detail.
top_5_products = list(results)

for row in top_5_products:
    print(f"""{row.sim}: {row.product_name}\n{row.consumer_description}\n\n""")