# Help Chat

**Use natural language to query Help Centre**

## Fetch Data from Web Source

#### Utility functions

In [1]:
#
# Function to clean text from web pages
#

import re

def clean_text(text: str):
    # Normalize line breaks to \n\n (two new lines)
    text = text.replace("\r\n", "\n\n")
    text = text.replace("\r", "\n\n")

    # Replace two or more spaces with a single space
    text = re.sub(" {2,}", " ", text)

    # Remove leading spaces before removing trailing spaces
    text = re.sub("^[ \t]+", "", text, flags=re.MULTILINE)

    # Remove trailing spaces before removing empty lines
    text = re.sub("[ \t]+$", "", text, flags=re.MULTILINE)

    # Remove empty lines
    text = re.sub("^\s+", "", text, flags=re.MULTILINE)

    return text


In [2]:
#
# Function to calculate the number of tokens in a text string.
#

import tiktoken

"""
Function to calculate the number of tokens in a text string.
"""

encoding = tiktoken.get_encoding("cl100k_base")

def num_tokens_from_string(string: str) -> int:
    num_tokens = len(encoding.encode(string))
    return num_tokens

#### Get data files

We load a number of HTML pages using the LangChain WebBaseLoader. Each of those pages contains lots of superfluous content so we extract only the relevant article context.

In [3]:
#
# Build a list of URLs to scrape from a text file.
#

# read urls from txt file
with open('help_urls.txt') as f:
    urls = f.readlines()

# For debugging - Overide urls from text file 
urls = ["/help/personal/home-loans/apply-for-a-home-loan/choosing-which-home-loan-is-right-for-you.html"]


# remove whitespace characters like `\n` at the end of each line
urls = [x.strip() for x in urls]

# prepend "https://www.macquarie.com.au/" to each url
urls = ["https://www.macquarie.com.au/" + x for x in urls]

display(urls)

['https://www.macquarie.com.au//help/personal/home-loans/apply-for-a-home-loan/choosing-which-home-loan-is-right-for-you.html']

In [4]:
#
# Scrape web data from the URLs
#

from langchain.document_loaders import WebBaseLoader
import re

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
}

print("Loading Data")
url_loaders = WebBaseLoader(urls, header_template=headers)
data = url_loaders.load()

#
# Extract only the actual Article content from the web page and clean
#
print(f"Cleaning Data for {len(data)} documents")
for i, d in enumerate(data):
    d.page_content = ""
    #source = 
    thedoc = WebBaseLoader(d.metadata['source'], header_template=headers).scrape()
    # extract only the Container Article content from the web page
    td = thedoc.findAll('div', class_='parsys')
    for t in td:
        if len(t['class']) == 1 and t['class'][0] == 'parsys':
            d.page_content = clean_text(t.text)
            data[i] = d
            print (f"Doc: {i:3d}    Tokens: {num_tokens_from_string(d.page_content):6d}")
            break

Loading Data
Cleaning Data for 1 documents
Doc:   0    Tokens:    341


#### Split the data into chunks

In [5]:
from langchain.text_splitter import TokenTextSplitter

CHUNK_SIZE = 500

# Chunk the data
print("Splitting Data")
text_splitter = TokenTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=50)
docs = text_splitter.split_documents(data)
print(f"Number of chunks: {len(docs)}")

Splitting Data
Number of chunks: 1


## Create the Vector Store

#### Astra DB Connectivity

In [6]:
import os
from dotenv import load_dotenv

ASTRA_DB_KEYSPACE = os.environ['ASTRA_DB_KEYSPACE']
ASTRA_DB_SECURE_BUNDLE_PATH = os.environ['ASTRA_DB_SECURE_BUNDLE_PATH']
ASTRA_DB_APPLICATION_TOKEN = os.environ['ASTRA_DB_APPLICATION_TOKEN']
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

In [7]:
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider


def getCluster():
    """
    Create a Cluster instance to connect to Astra DB.
    Uses the secure-connect-bundle and the connection secrets.
    """
    cloud_config = {"secure_connect_bundle": ASTRA_DB_SECURE_BUNDLE_PATH}
    auth_provider = PlainTextAuthProvider("token", ASTRA_DB_APPLICATION_TOKEN)
    return Cluster(cloud=cloud_config, auth_provider=auth_provider)


def get_astra():
    """
    This function is used by LangChain Vectorstore.
    """
    cluster = getCluster()
    astraSession = cluster.connect()
    return astraSession, ASTRA_DB_KEYSPACE

#### Define the Vector Store

In [8]:
from langchain.vectorstores import Cassandra
from langchain.embeddings.openai import OpenAIEmbeddings

# define Embedding model
embeddings = OpenAIEmbeddings()

# Set up the vector store
print("Setup Vector Store")
session, keyspace = get_astra()
vectorstore = Cassandra(
    embedding=embeddings,
    session=session,
    keyspace=keyspace,
    table_name="helpcentre_db",
)

Setup Vector Store


## Store data and embeddings in Astra DB

In [None]:
print("Adding texts to Vector Store")
texts, metadatas = zip(*((doc.page_content, doc.metadata) for doc in docs))
vectorstore.add_texts(texts=texts, metadatas=metadatas)