# Populate Vector DB

`conda activate env_ragstack`

## Fetch Data from Web Source

We load a number of HTML pages using the `requests` module. Each of those pages contains a lot of superfluous content, so we extract only the relevant article content.

#### Utility functions

In [1]:
import re
import unicodedata

"""
Function to clean text from web pages
"""
def clean_text(text: str) -> str:
    """
    Clean text scraped from a web page.

    The text is NFKC-normalized first, so that non-breaking spaces (and other
    compatibility characters) are already plain characters when the whitespace
    clean-up runs. Afterwards line breaks are normalized, runs of spaces are
    collapsed, leading and trailing spaces/tabs are stripped from every line
    and empty lines are removed.

    Parameters:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    # Remove unicode Non Breaking Space up front; normalizing at the end would
    # re-introduce spaces that the steps below could no longer collapse or strip
    text = unicodedata.normalize('NFKC', text)

    # Normalize line breaks to \n\n (two new lines)
    text = text.replace("\r\n", "\n\n")
    text = text.replace("\r", "\n\n")

    # Replace two or more spaces with a single space
    text = re.sub(r" {2,}", " ", text)

    # Remove leading spaces before removing trailing spaces
    text = re.sub(r"^[ \t]+", "", text, flags=re.MULTILINE)

    # Remove trailing spaces before removing empty lines
    text = re.sub(r"[ \t]+$", "", text, flags=re.MULTILINE)

    # Remove empty lines (the match swallows the line breaks of blank lines)
    text = re.sub(r"^\s+", "", text, flags=re.MULTILINE)

    return text

In [2]:
import tiktoken

"""
Function to calculate the number of tokens in a text string.
"""

# Tokenizer used by num_tokens_from_string; created once at module level
encoding = tiktoken.get_encoding("cl100k_base")

def num_tokens_from_string(string: str) -> int:
    """Return the number of cl100k_base tokens that `string` encodes to."""
    return len(encoding.encode(string))

#### Website to TXT

In [3]:
import logging
import requests
from bs4 import BeautifulSoup

def website_to_txt(job_url: str, timeout: float = 30):
    """
    Download a product page and extract the fields that are stored later on.

    Parameters:
        job_url: URL of the page to fetch.
        timeout: Seconds to wait for the server; passed to requests.get() so
            that an unresponsive host cannot block the crawl forever.

    Returns:
        A dict with the keys 'title', 'price', 'img' and 'description'.
        A field whose element is not found on the page is an empty string.
        'description' combines the cleaned product description and the
        product details (one list item per line). If the server does not
        answer with status 200, a message is printed and all fields are empty.

    Raises:
        SystemExit: if fetching or parsing raises an exception; the error is
            logged before the process is terminated.
    """
    # Defaults for everything that may be missing on the page
    item_title = ""
    item_price = ""
    item_img = ""
    product_description = ""
    product_details = ""

    try:
        page = requests.get(job_url, timeout=timeout)

        if page.status_code != 200:
            # An error page holds no product data, so it is not parsed
            print(f"Failed to retrieve the job posting at {job_url}. Status code: {page.status_code}")
        else:
            # Parse the HTML content of the page using BeautifulSoup
            soup = BeautifulSoup(page.text, 'html.parser')

            # Page title
            title_tag = soup.find('h1', {'class': 'product__title'})
            if title_tag is not None:
                item_title = title_tag.text.strip()

            # Item price: text of the first <span> inside the price box
            price_box = soup.find('div', {'class': 'product__price'})
            if price_box is not None:
                price_spans = price_box.find_all('span')
                if price_spans:
                    # Strip the currency sign only if it is really there
                    item_price = price_spans[0].text.strip().lstrip('$')

            # The accordion holds both the description and the details panel
            accordion = soup.find('div', {'class': 'product__accordion'})
            if accordion is not None:

                # Product Description
                description_panel = accordion.find('div', {'id': 'accordion-panel-1'})
                if description_panel is not None:
                    product_description = description_panel.text.strip()

                # Product Details: the text of each list item on its own line
                details_panel = accordion.find('div', {'id': 'accordion-panel-2'})
                if details_panel is not None:
                    product_details = "\n".join(li.text for li in details_panel.find_all('li'))

            # Item image: the src attribute without its query string
            media = soup.find('div', {'class': 'product__media'})
            if media is not None:
                img_tag = media.find('img', {'class': 'component-image__image'})
                if img_tag is not None:
                    item_img = img_tag.get('src', '').split('?')[0]

    except Exception as e:
        logging.error(f"Could not get the description from the URL: {job_url}")
        logging.error(e)
        # Abort like the former exit() call, but without relying on the
        # interactive-shell helper and with a non-zero exit status
        raise SystemExit(1) from e

    item = {}
    item['title'] = item_title
    item['price'] = item_price
    item['img'] = item_img
    item['description'] = f"Description:\n{clean_text(product_description)}\n\nDetails:\n{product_details}"

    return item

#### Get URLs from File

In [5]:
# Read the product paths (one per line) from links2.txt.
# Blank lines are skipped so that they do not turn into bare-domain URLs.
with open('links2.txt', 'r', encoding='utf-8') as file:
    urls = [line.strip() for line in file if line.strip()]

# prepend "https://blueillusion.com" to each path (the paths supply the leading "/")
urls = ["https://blueillusion.com" + url for url in urls]

# For debugging, use only a single URL
#urls = ["https://blueillusion.com/products/waist-tab-linen-culotte-216716lnm-chambray-cross-dye"]

print (f"Number of URLs: {len(urls)}")

Number of URLs: 227


#### Crawl URLs

In [6]:
from langchain.docstore.document import Document

# Fetch every product page and wrap the extracted fields in a Document
data = []
for url in urls:
    item = website_to_txt(url)

    # Everything except the description travels along as metadata
    metadata = {'source': url}
    metadata.update({field: item[field] for field in ('title', 'price', 'img')})
    metadata['language'] = 'en'

    data.append(Document(page_content=item['description'], metadata=metadata))

print (f"Number of Documents: {len(data)}")

Number of Documents: 227


In [8]:
# Show one crawled document (index 199) as a sanity check
sample_document = data[199]
print(sample_document)

page_content='Description:\nIntroducing ‘The Trapeze’ from French eyewear specialists IZIPIZI. These comfortable, lightweight frames are perfect for navigating the warm weather. Expertly crafted in a lightweight rubber texture, these sunglasses offer 100% UV protection. With a large, structured trapeze silhouette that is flattering to many, these sunglasses also showcase a bright pink colour that makes for a modern aesthetic. Take these sunglasses with you on your next outing or holiday.Style/SKU: A19230.493\n\nDetails:\nIncludes storage pouch\nRubber texture\nTrapeze silhouette\nFlexible arms\n100% UV protection\nPink' metadata={'source': 'https://blueillusion.com/products/trapeze-sunglasses-pink-a19230-pink', 'title': 'Trapeze Sunglasses Pink', 'price': '40.00', 'img': '//blueillusion.com/cdn/shop/files/trapezesunglassespink00006.jpg', 'language': 'en'}


**Write Crawled Data to Disk**

In [9]:
import pickle

# Serialize the list of documents to disk so that pickle can restore it later
with open("help_data2.pkl", "wb") as f:
    f.write(pickle.dumps(data))

**Read Crawled Data from Disk**

In [None]:
# Read help_data2.pkl (the file produced by the "Write Crawled Data to Disk"
# cell) and recreate the data object as a list of documents.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
import pickle
with open("help_data2.pkl", "rb") as f:
    data = pickle.load(f)

print(len(data))
print(data[10])

## Create the Vector Store

#### Split the data into chunks

In [10]:
from langchain.text_splitter import TokenTextSplitter

# Maximum number of tokens per chunk
CHUNK_SIZE = 1000

# Split the documents into token-based chunks; neighbouring chunks overlap by 50 tokens
print("Splitting Data")
text_splitter = TokenTextSplitter(
    chunk_overlap=50,
    chunk_size=CHUNK_SIZE,
)
docs = text_splitter.split_documents(data)
print(f"Number of chunks: {len(docs)}")

Splitting Data
Number of chunks: 227


#### AstraDB Connectivity

In [11]:
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment
load_dotenv()

# Required settings: os.environ[...] raises KeyError when a variable is missing
ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
ASTRA_VECTOR_ENDPOINT = os.environ["ASTRA_VECTOR_ENDPOINT_BO"]
# Keyspace and collection used for the vector store
ASTRA_DB_KEYSPACE = "blueillusion"
ASTRA_DB_COLLECTION = "catalogue"
# NOTE(review): the three values below are not referenced by any visible cell;
# presumably they only verify that the credentials are set - confirm
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
AWS_ACCESS_KEY_ID = os.environ["AWS_ACCESS_KEY_ID"]
AWS_SECRET_ACCESS_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]

#### Define the Vector Store

In [12]:
from langchain_community.vectorstores.astradb import AstraDB

#from langchain_community.embeddings import CohereEmbeddings
#embeddings = CohereEmbeddings(model="embed-english-v3.0")

#from langchain_openai import OpenAIEmbeddings
#embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

from langchain_community.embeddings import BedrockEmbeddings
# Embedding model accessed through Bedrock with the "fieldops" credentials profile
embeddings = BedrockEmbeddings(
    region_name="us-east-1",
    credentials_profile_name="fieldops",
)

# Announce the target, then create the vector store for that keyspace/collection
print(f"Setup Vector Store: {ASTRA_DB_KEYSPACE} - {ASTRA_DB_COLLECTION}")
vectorstore = AstraDB(
    api_endpoint=ASTRA_VECTOR_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    namespace=ASTRA_DB_KEYSPACE,
    collection_name=ASTRA_DB_COLLECTION,
    embedding=embeddings,
)

Setup Vector Store: blueillusion - catalogue


#### Store data and embeddings in Astra DB

In [13]:
import time

print("Adding texts to Vector Store")

BLOCK_SIZE = 50
# iterate through docs in sets of BLOCK_SIZE
for i in range(0, len(docs), BLOCK_SIZE):
    batch = docs[i:i+BLOCK_SIZE]
    # Report the real upper bound: the last batch may hold fewer than BLOCK_SIZE docs
    print(f"Adding {i} to {i+len(batch)}", end=' ')
    # Pass lists rather than tuples to add_texts
    texts = [doc.page_content for doc in batch]
    metadatas = [doc.metadata for doc in batch]
    inserted_ids = vectorstore.add_texts(texts=texts, metadatas=metadatas)
    print(f"Inserted {len(inserted_ids)} documents.")
    # pause for 1 second between batches
    time.sleep(1)

Adding texts to Vector Store
Adding 0 to 50 Inserted 50 documents.
Adding 50 to 100 Inserted 50 documents.
Adding 100 to 150 Inserted 50 documents.
Adding 150 to 200 Inserted 50 documents.
Adding 200 to 250 Inserted 27 documents.


## Delete all Documents in Collection

In [None]:
from astrapy.db import AstraDB

# Initialize the AstraDB client
db = AstraDB(
    namespace=ASTRA_DB_KEYSPACE,
    token=ASTRA_DB_APPLICATION_TOKEN,
    api_endpoint=ASTRA_VECTOR_ENDPOINT,
)

# Retrieve collections
collections_response = db.get_collections()

# validate that ASTRA_DB_COLLECTION exists in collections_response["status"]["collections"]
if ASTRA_DB_COLLECTION in collections_response["status"]["collections"]:
    print(f"Collection \"{ASTRA_DB_COLLECTION}\" exists")

    # Access an existing collection
    collection = db.collection(ASTRA_DB_COLLECTION)

    # Delete all documents in the collection (the filter is empty)
    res = collection.delete_many(filter={})

    # Print the result
    print(res)
else:
    # Make the no-op visible instead of finishing silently
    print(f"Collection \"{ASTRA_DB_COLLECTION}\" not found; nothing deleted")