# Populate Vector DB

## Fetch Data from Web Source

We load a number of HTML pages using the `requests` module. Each of those pages contains a lot of superfluous content, so we extract only the relevant article content.

The parsed data is saved into a Pickle file so that we don't have to crawl the website again if we need to recreate the vector database.

#### Utility functions

In [None]:
import tiktoken

"""
Function to calculate the number of tokens in a text string.
"""

encoding = tiktoken.get_encoding("cl100k_base")

def num_tokens_from_string(string: str) -> int:
    """Return how many tokens `string` encodes to with the module-level `encoding`."""
    return len(encoding.encode(string))

In [None]:
import re
import unicodedata

def clean_text(text: str) -> str:
    """
    Clean text scraped from web pages.

    The text is NFKC-normalized (which turns non-breaking spaces into regular
    spaces), carriage returns are converted to new lines, runs of spaces are
    collapsed to a single space, leading and trailing spaces/tabs are removed
    from every line, and empty lines are dropped.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text.
    """

    # Normalize first, so that non-breaking spaces have already become regular
    # spaces when the whitespace rules below run. Doing this last would leave
    # runs of former non-breaking spaces uncollapsed.
    text = unicodedata.normalize('NFKC', text)

    # Normalize line breaks to \n\n (two new lines); the surplus empty lines
    # are removed again further down
    text = text.replace("\r\n", "\n\n")
    text = text.replace("\r", "\n\n")

    # Replace two or more spaces with a single space
    text = re.sub(r" {2,}", " ", text)

    # Remove leading spaces before removing trailing spaces
    text = re.sub(r"^[ \t]+", "", text, flags=re.MULTILINE)

    # Remove trailing spaces before removing empty lines
    text = re.sub(r"[ \t]+$", "", text, flags=re.MULTILINE)

    # Remove empty lines (raw string: "\s" is not a valid str escape sequence)
    text = re.sub(r"^\s+", "", text, flags=re.MULTILINE)

    return text

#### Get list of URLs from File

In [None]:
# Read the product URL paths (one per line) from urls_tv.txt
with open('urls_tv.txt', 'r', encoding='utf-8') as file:
    # Skip blank lines: an empty path would otherwise turn into the bare
    # site URL and be crawled as if it were a product page
    urls = [line.strip() for line in file if line.strip()]

# Prepend the site origin "https://www.thegoodguys.com.au" to each path
# (no trailing slash is added, so the paths are expected to start with "/")
urls = ["https://www.thegoodguys.com.au" + url for url in urls]

# For debugging, override the list and use only a single URL
#urls = ["https://www.thegoodguys.com.au/lg-50-inches-ut8050-4k-uhd-led-smart-tv-24-50ut8050psb",
#        "https://www.thegoodguys.com.au/hisense-100-inches-q7nau-4k-qled-smart-tv-24-100q7nau",
#        "https://www.thegoodguys.com.au/apple-watch-se-gps-40mm-starlight-aluminium-case-with-starlight-sport-band-sm-mr9u3zpa"]

print(f"Number of URLs: {len(urls)}")

#### Crawl URLs

Iterate through the URLs and create a LangChain Document object for each page.

In [None]:
from website_parser import website_parser
from langchain.docstore.document import Document
import pickle

import os

# Every page is pickled into its own file below, so make sure the target
# directory exists before the crawl starts
os.makedirs("pickles", exist_ok=True)

data = []
for url in urls:

    # Parse the page with the project's website_parser helper; the result is
    # indexed like a dict with the keys used below
    item = website_parser(url)

    # Keep only the site-relative path; it is used both as the document
    # source and as the pickle file name
    source = url.replace("https://www.thegoodguys.com.au", "")

    metadata = {
        'source': source,
        'title': item['title'],
        'price': item['price'],
        'img': item['img'],
        'features': item['key_features'],
        'specs': item['tech_specs'],
        'product_features': item['product_features'],
    }

    document = Document(page_content=item['description'], metadata=metadata)
    # Collect the document so that the count printed below (and any later
    # dump of `data`) reflects what was actually crawled
    data.append(document)

    # One pickle file per page, so an interrupted crawl keeps the pages that
    # were already fetched
    with open(f"pickles/{source}.pkl", "wb") as f:
        pickle.dump(document, f)

print(f"Number of Documents: {len(data)}")

**Write Crawled Data to Disk**

*WARNING: Only run this block if you want to recreate the Pickle file*

In [None]:
import pickle

# Serialize the whole `data` list into one pickle file so that it can later be
# loaded back as a list of documents
with open("website_data.pkl", "wb") as pickle_file:
    pickle.dump(data, pickle_file)

## Create the Vector Store

**Read Crawled Data from Disk**

In [None]:
# Load website_data.pkl and rebuild `data` as a list of documents.
# NOTE: only unpickle files you created yourself; pickle can execute code on load.
import pickle

with open("website_data.pkl", "rb") as pickle_file:
    data = pickle.load(pickle_file)

print(len(data))

In [None]:
# Read Pickle Files in pickles directory and recreate data object as list of documents
import os
import pickle

data = []
for filename in os.listdir("pickles"):
    # Ignore anything in the directory that is not a pickle file
    if not filename.endswith(".pkl"):
        continue

    with open(f"pickles/{filename}", "rb") as pickle_file:
        document = pickle.load(pickle_file)
    data.append(document)

### Reformat Data

We need to try different embedding approaches to see what works best:
- Text Embedding
    - Description only
    - Concatenate description, features, specs, price
- Knowledge Graph
    - Description only
    - Concatenate description, features, specs, price

#### Concatenated

In [None]:
# Generate consumer description
from openai import OpenAI
client = OpenAI()


def generate_consumer_description(product, model="gpt-4o"):
    """
    Ask an OpenAI chat model for a one-paragraph, consumer-level product description.

    Uses the module-level `client` to send a single user message.

    Args:
        product: Text identifying the product; it is inserted verbatim into the prompt.
        model: Name of the chat completion model to use. Defaults to "gpt-4o".

    Returns:
        The text of the first completion choice, or an empty string when the
        response carries no text content.
    """
    # Create Prompt
    message_objects = [
        {
            "role": "user",
            "content": f"Provide a single paragraph consumer level description of the product: {product}",
        }
    ]

    completion = client.chat.completions.create(model=model, messages=message_objects)

    # The message content may be None (e.g. when the model returns no text);
    # fall back to "" so callers never embed the literal string "None"
    return completion.choices[0].message.content or ""

In [None]:
from langchain.docstore.document import Document

data_concatenated = []
for document in data:
    title = document.metadata['title']

    # Generate a consumer-level description from the product title
    consumer_description = generate_consumer_description(title)

    # The embedded text is the page content followed by the generated
    # description; title, features, specs and product features are not included
    content = clean_text(f"{document.page_content}\n\n{consumer_description}\n\n")

    # Store the source without the site origin
    source = document.metadata['source'].replace("https://www.thegoodguys.com.au", "")

    # The brand is the part of the title before the first non-breaking space
    brand = title.split("\xa0")[0]

    data_concatenated.append(
        Document(
            page_content=content,
            metadata={
                'source': source,
                'brand': clean_text(brand),
                'title': clean_text(title),
                'price': document.metadata['price'],
                'img': document.metadata['img'],
            },
        )
    )

print(len(data_concatenated))

**Remove Non Television Products**

In [59]:
# Documents whose title contains any of these keywords are dropped
excluded_keywords = ("antenna", "mount", "wall bracket", "stand", "bracket")

data_new = []
for d in data_concatenated:
    lowered_title = d.metadata['title'].lower()
    has_excluded_keyword = any(keyword in lowered_title for keyword in excluded_keywords)

    # Keep only titles that mention "tv" and none of the excluded keywords
    if "tv" in lowered_title and not has_excluded_keyword:
        data_new.append(d)

### Split the data into chunks

In [63]:
from langchain.text_splitter import TokenTextSplitter

# Token-based chunking parameters
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 100

# Split every remaining document into token-sized chunks
print("Splitting Data")
text_splitter = TokenTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
docs = text_splitter.split_documents(data_new)
print(f"Number of chunks: {len(docs)}")

Splitting Data
Number of chunks: 385


### AstraDB Connectivity

In [64]:
import os
from dotenv import load_dotenv

# Pull settings from a .env file into the environment before reading them
load_dotenv()

# Fixed keyspace and collection used by this notebook
ASTRA_DB_KEYSPACE = "cashrewards"
ASTRA_DB_COLLECTION = "goodguys_ai_description"

# Connection settings and credentials; each is None when the variable is unset
ASTRA_DB_API_ENDPOINT = os.getenv("ASTRA_DB_API_ENDPOINT")
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Set LANGCHAIN_TRACING_V2 for the rest of this session
os.environ["LANGCHAIN_TRACING_V2"] = "true"

### Define the Vector Store

In [65]:
#from langchain_community.vectorstores.astradb import AstraDB
from langchain_astradb import AstraDBVectorStore

from langchain_openai import OpenAIEmbeddings
# Embedding model used to vectorize the chunks
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    api_key=OPENAI_API_KEY,
)

# Set up the vector store
print(f"Setup Vector Store: {ASTRA_DB_KEYSPACE} - {ASTRA_DB_COLLECTION}")
vectorstore = AstraDBVectorStore(
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    namespace=ASTRA_DB_KEYSPACE,
    collection_name=ASTRA_DB_COLLECTION,
    embedding=embeddings,
    # The similarity metric is left at the library default;
    # metric="dot_product" was tried here and is disabled
)

Setup Vector Store: cashrewards - goodguys_ai_description


### Store data and embeddings in Astra DB

In [66]:
import time

print("Adding texts to Vector Store")

BLOCK_SIZE = 50
# Insert the chunks in batches of at most BLOCK_SIZE
for i in range(0, len(docs), BLOCK_SIZE):
    batch = docs[i:i+BLOCK_SIZE]

    # Report the real end index: the last batch may hold fewer than BLOCK_SIZE chunks
    print(f"Adding {i} to {i+len(batch)}", end=' ')

    texts = [doc.page_content for doc in batch]
    metadatas = [doc.metadata for doc in batch]
    inserted_ids = vectorstore.add_texts(texts=texts, metadatas=metadatas)
    print(f"Inserted {len(inserted_ids)} documents.")

    # Pause for 1 second between batches
    time.sleep(1)

Adding texts to Vector Store
Adding 0 to 50 Inserted 50 documents.
Adding 50 to 100 Inserted 50 documents.
Adding 100 to 150 Inserted 50 documents.
Adding 150 to 200 Inserted 50 documents.
Adding 200 to 250 Inserted 50 documents.
Adding 250 to 300 Inserted 50 documents.
Adding 300 to 350 Inserted 50 documents.
Adding 350 to 400 Inserted 35 documents.


## Delete all Documents in Collection

*WARNING*: This code will delete all documents from the collection

In [None]:
from astrapy.db import AstraDB

# Create the AstraDB client for the configured keyspace
db = AstraDB(
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    namespace=ASTRA_DB_KEYSPACE,
)

# Look up which collections exist
collections_response = db.get_collections()
existing_collections = collections_response["status"]["collections"]

# Only touch the collection when it is actually present
if ASTRA_DB_COLLECTION in existing_collections:
    print(f"Collection \"{ASTRA_DB_COLLECTION}\" exists")

    collection = db.collection(ASTRA_DB_COLLECTION)

    # Delete all documents in the collection (the filter is empty)
    res = collection.delete_many(filter={})

    # Show what the delete call returned
    print(res)