# Create Vector Database with data

- Crawl website
- Extract relevant content section from HTML
- Split into chunks
- Generate vectors using Embedding model
- Store vectors in Astra DB

## Fetch Data from Web Source

#### Utility functions

In [1]:
import re

#
# Function to clean text from web pages
#

def clean_text(text: str) -> str:
    """Normalise the whitespace of text extracted from a web page.

    The steps run in a fixed order because each one relies on the previous:
    carriage-return line endings are converted to newlines, runs of spaces
    are collapsed, every line is stripped on both sides, and finally blank
    lines are removed.

    Args:
        text: Raw text, typically the ``.text`` of an HTML element.

    Returns:
        The cleaned text. Non-blank lines are separated by a single newline;
        a trailing newline in the input is kept.
    """
    # Normalize CRLF and bare CR line breaks to two new lines. The extra
    # newlines are harmless: blank lines are removed in the last step.
    text = text.replace("\r\n", "\n\n")
    text = text.replace("\r", "\n\n")

    # Replace two or more spaces with a single space
    text = re.sub(r" {2,}", " ", text)

    # Remove leading spaces/tabs on every line
    text = re.sub(r"^[ \t]+", "", text, flags=re.MULTILINE)

    # Remove trailing spaces/tabs on every line, so that whitespace-only
    # lines become truly empty before the next step
    text = re.sub(r"[ \t]+$", "", text, flags=re.MULTILINE)

    # Remove empty lines: at each line start, swallow all following
    # whitespace (including newlines) up to the next visible character.
    # Raw strings are used so "\s" is not an invalid string escape.
    text = re.sub(r"^\s+", "", text, flags=re.MULTILINE)

    return text


In [2]:
import tiktoken

#
# Token counting helper built on the "cl100k_base" tiktoken encoding.
#

# Encoding object created once and reused by every call below.
encoding = tiktoken.get_encoding("cl100k_base")


def num_tokens_from_string(string: str) -> int:
    """Return the number of tokens ``encoding`` produces for ``string``."""
    return len(encoding.encode(string))

#### Get data files

We load a number of HTML pages using the LangChain WebBaseLoader. Each page contains a lot of superfluous content, so we extract only the relevant article content.

In [None]:
#
# Build a list of URLs to scrape from a text file.
#

# Read the site-relative paths from the text file, one per line.
# The encoding is given explicitly so the result does not depend on the
# platform default.
with open('help_urls.txt', encoding='utf-8') as f:
    urls = f.readlines()

# For debugging - Override urls from text file
#urls = ["/help/personal/home-loans/apply-for-a-home-loan/choosing-which-home-loan-is-right-for-you.html"]


# Remove whitespace characters like `\n` at the end of each line, and drop
# blank lines: otherwise an empty line in the file would turn into the bare
# site root URL below and be scraped as if it were a help page.
urls = [x.strip() for x in urls]
urls = [x for x in urls if x]

# prepend "https://www.macquarie.com.au" to each url
urls = ["https://www.macquarie.com.au" + x for x in urls]

display(urls)

In [None]:
#
# Scrape web data from the URLs
#

from langchain.document_loaders import WebBaseLoader
import re

# Browser-like User-Agent header passed to every WebBaseLoader below.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
}

print("Loading Data")
url_loaders = WebBaseLoader(urls, header_template=headers)
data = url_loaders.load()

#
# Extract only the actual Article content from the web page and clean
#
print(f"Cleaning Data for {len(data)} documents")

for i, d in enumerate(data):
    # Discard the loaded page text; it is replaced below by the article
    # container only. The document's metadata is kept as loaded.
    d.page_content = ""
    source = d.metadata['source']
    # NOTE(review): each page appears to be downloaded twice - once by
    # load() above and once by scrape() here - confirm whether that is wanted.
    thedoc = WebBaseLoader(source, header_template=headers).scrape()
    # Use the first <div> whose class attribute is exactly "parsys"
    # (divs that carry additional classes are skipped).
    for t in thedoc.find_all('div', class_='parsys'):
        if t['class'] == ['parsys']:
            d.page_content = clean_text(t.text)
            # Report the document index (padded to 3 characters) and its token count
            print(f"Doc: {i:3d}    Tokens: {num_tokens_from_string(d.page_content):6d}")
            break
    else:
        # No matching container: the document keeps empty content. Say so
        # instead of silently producing a page with no text.
        print(f"Doc: {i:3d}    WARNING: no article container found in {source}")

#### Split the data into chunks

In [None]:
from langchain.text_splitter import TokenTextSplitter

# Chunk size and overlap, both measured in tokens.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# Chunk the data
print("Splitting Data")
# Pass the encoding explicitly so that chunk sizes are measured with the
# same "cl100k_base" tokenizer used for the token counts elsewhere in this
# notebook, rather than the splitter's default encoding.
text_splitter = TokenTextSplitter(
    encoding_name="cl100k_base",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
docs = text_splitter.split_documents(data)
print(f"Number of chunks: {len(docs)}")

## Store data in Astra Vector DB

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a .env file via python-dotenv before reading them.
load_dotenv()

# Required settings: a missing variable raises KeyError right here.
ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
ASTRA_DB_API_ENDPOINT = os.environ["ASTRA_DB_API_ENDPOINT"]

# Name of the Astra DB collection used by this notebook.
ASTRA_DB_COLLECTION = "mac_help"

#### Embedding function

Define the embedding model and create a function that generates the vector for a piece of text.

In [None]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding client, authenticated with the OpenAI API key.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)


def embed(text_to_embed):
    """Return the embedding of ``text_to_embed`` as a new list."""
    vector = embeddings.embed_query(text_to_embed)
    return list(vector)

#### Initialise Astra Vector DB

In [None]:
# Initialise Database

from astrapy.db import AstraDB

# Client object for Astra DB, built from the application token and API endpoint
db = AstraDB(token=ASTRA_DB_APPLICATION_TOKEN, api_endpoint=ASTRA_DB_API_ENDPOINT)

#### Initialise the Collection to use in Astra

In [None]:
# Create Collection

# Vector dimension of the collection.
# NOTE(review): presumably the output size of the embedding model - confirm.
VECTOR_DIMENSION = 1536

col = db.create_collection(
    ASTRA_DB_COLLECTION,
    dimension=VECTOR_DIMENSION,
)

#### Assemble and Write data

- Assemble chunks into JSON format, with the vector generated for each chunk
- Store the chunks into Astra Vector DB

In [None]:
# Insert Multiple Chunks

# Number of documents sent per insert request. The insert is split into
# batches so that a large number of chunks does not exceed the per-request
# document limit of the database API.
INSERT_BATCH_SIZE = 20

# initialise documents list
documents = []

# iterate over chunks, calculate embedding, and append to documents list
for d in docs:
    meta = d.metadata
    documents.append(
        {
            "source": meta['source'],
            # A page may lack these optional fields; store an empty string
            # rather than failing the whole run with a KeyError.
            "title": meta.get('title', ""),
            "description": meta.get('description', ""),
            "language": meta.get('language', ""),
            "content": d.page_content,
            "$vector": embed(d.page_content),
        }
    )

# insert documents list into collection, one batch at a time;
# `res` collects the response of every batch, in order
res = []
for start in range(0, len(documents), INSERT_BATCH_SIZE):
    batch = documents[start:start + INSERT_BATCH_SIZE]
    res.append(col.insert_many(batch))