# Help Chat

**Use natural language to query the Help Centre**

## Fetch Data from Web Source

#### Utility functions

In [1]:
import re

"""
Function to clean text from web pages
"""
def clean_text(text: str) -> str:
    """Clean text extracted from a web page.

    Steps, applied in order:
      1. carriage-return line breaks (CRLF and bare CR) become two newlines;
      2. runs of two or more spaces collapse to a single space;
      3. leading spaces/tabs are stripped from every line;
      4. trailing spaces/tabs are stripped from every line;
      5. whitespace at the start of every line is removed, which also drops
         the empty lines left behind by the earlier steps.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text.
    """
    # Normalize line breaks to \n\n (two new lines)
    text = text.replace("\r\n", "\n\n")
    text = text.replace("\r", "\n\n")

    # Replace two or more spaces with a single space
    text = re.sub(r" {2,}", " ", text)

    # Remove leading spaces before removing trailing spaces
    text = re.sub(r"^[ \t]+", "", text, flags=re.MULTILINE)

    # Remove trailing spaces before removing empty lines
    text = re.sub(r"[ \t]+$", "", text, flags=re.MULTILINE)

    # Remove empty lines. The pattern is a raw string so that `\s` reaches the
    # regex engine as-is instead of being an invalid string escape sequence.
    text = re.sub(r"^\s+", "", text, flags=re.MULTILINE)

    return text


In [2]:
import tiktoken

"""
Function to calculate the number of tokens in a text string.
"""

# Tokenizer used by num_tokens_from_string; created once at cell execution
encoding = tiktoken.get_encoding("cl100k_base")


def num_tokens_from_string(string: str) -> int:
    """Return how many tokens `string` encodes to with the module-level `encoding`."""
    return len(encoding.encode(string))

#### Get data files

We load a number of HTML pages using the LangChain WebBaseLoader. Each of those pages contains lots of superfluous content, so we extract only the relevant article content.

In [3]:
# Root URL that every path read from help_urls.txt is appended to
ROOT_URL = "https://www.westpac.com.au"

# Read the URL paths from the txt file, one per line. Whitespace characters
# like `\n` at the end of each line are stripped, and blank lines are skipped
# so they do not turn into bare root-URL entries.
with open('help_urls.txt') as f:
    url_paths = [line.strip() for line in f if line.strip()]

# for debugging
#url_paths = ["/help/disaster-help/disaster-preparation/"]

# prepend Root URL to each url
urls = [ROOT_URL + path for path in url_paths]

display(urls)

['https://www.westpac.com.au/about-westpac',
 'https://www.westpac.com.au/about-westpac/careers',
 'https://www.westpac.com.au/about-westpac/global-locations',
 'https://www.westpac.com.au/about-westpac/inclusion-and-diversity',
 'https://www.westpac.com.au/about-westpac/indigenous',
 'https://www.westpac.com.au/about-westpac/innovation',
 'https://www.westpac.com.au/about-westpac/investor-centre',
 'https://www.westpac.com.au/about-westpac/media',
 'https://www.westpac.com.au/about-westpac/our-foundations',
 'https://www.westpac.com.au/about-westpac/sustainability',
 'https://www.westpac.com.au/about-westpac/sustainability/initiatives-for-you/indigenous-banking',
 'https://www.westpac.com.au/about-westpac/sustainability/our-positions-and-perspectives/difficult-circumstances',
 'https://www.westpac.com.au/about-westpac/westpac-group',
 'https://www.westpac.com.au/about-westpac/westpac-wire',
 'https://www.westpac.com.au/business-banking',
 'https://www.westpac.com.au/business-banking/b

In [4]:
from langchain.document_loaders import WebBaseLoader
import re

# Request headers passed to every WebBaseLoader as its header template
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
}

print("Loading Data")
url_loaders = WebBaseLoader(urls, header_template=headers)
data = url_loaders.load()

# Extract only the actual Article content from the web page and clean
print("Cleaning Data")
missing_content = []  # sources whose page has no <div class="content">
for d in data:
    source = d.metadata['source']
    # NOTE(review): load() above has already been run for these URLs and
    # scrape() is called again per source here — presumably a second request
    # per page; confirm whether a single pass is possible.
    thedoc = WebBaseLoader(source, header_template=headers).scrape()
    # extract only the Content Div from the web page
    content_div = thedoc.find('div', class_='content')
    if content_div is None:
        # find() returns None when nothing matches; record the page instead of
        # failing on `.text` and aborting the whole run.
        missing_content.append(source)
        continue
    d.page_content = clean_text(content_div.text)

if missing_content:
    print(f"No content div found for {len(missing_content)} page(s), skipping: {missing_content}")
    data = [d for d in data if d.metadata['source'] not in missing_content]

# Report the documents actually kept, not the number of URLs requested
print(f"Number of documents: {len(data)}")
if data:
    print(f"Number of tokens: {num_tokens_from_string(data[0].page_content)}")


Loading Data
Cleaning Data
Number of documents: 825
Number of tokens: 461


#### Split the data into chunks

In [5]:
from langchain.text_splitter import TokenTextSplitter

# Chunking parameters passed to the token-based splitter
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# Chunk the data
print("Splitting Data")
# NOTE(review): no encoding is passed to TokenTextSplitter, so it uses its own
# default tokenizer, which may differ from the cl100k_base encoding used by
# num_tokens_from_string — confirm the two are meant to agree.
text_splitter = TokenTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
docs = text_splitter.split_documents(data)
print(f"Number of chunks: {len(docs)}")

Splitting Data
Number of chunks: 1158


## Create the Vector Store

#### Astra DB Connectivity

In [6]:
import os
from dotenv import load_dotenv

load_dotenv()

# Validate every required setting up front so a misconfigured environment is
# reported once, with all missing names, rather than one KeyError at a time.
_REQUIRED_ENV_VARS = (
    "ASTRA_DB_APPLICATION_TOKEN",
    "ASTRA_VECTOR_ENDPOINT",
    "ASTRA_DB_KEYSPACE",
    "ASTRA_DB_COLLECTION",
    "OPENAI_API_KEY",
)
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if name not in os.environ]
if _missing_env_vars:
    # KeyError is what the direct os.environ[...] lookups below would raise
    raise KeyError(f"Missing required environment variables: {', '.join(_missing_env_vars)}")

ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
ASTRA_VECTOR_ENDPOINT = os.environ["ASTRA_VECTOR_ENDPOINT"]
ASTRA_DB_KEYSPACE = os.environ["ASTRA_DB_KEYSPACE"]
ASTRA_DB_COLLECTION = os.environ["ASTRA_DB_COLLECTION"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

#### Define the Vector Store

In [7]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.astradb import AstraDB


# Embedding model, constructed with the library defaults
embeddings = OpenAIEmbeddings()

print("Setup Vector Store")
# Vector store built from the Astra DB settings read from the environment,
# with the embedding model above attached to it
vectorstore = AstraDB(
    api_endpoint=ASTRA_VECTOR_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    namespace=ASTRA_DB_KEYSPACE,
    collection_name=ASTRA_DB_COLLECTION,
    embedding=embeddings,
)

Setup Vector Store


## Store data and embeddings in Astra DB

In [8]:
print("Adding texts to Vector Store")
# add_documents takes the chunk list directly, so there is no manual
# zip(*...) unpacking that fails with ValueError when `docs` is empty.
# NOTE(review): re-running this cell presumably inserts the same chunks again
# under new ids — confirm before re-executing against a populated collection.
vectorstore.add_documents(docs)

Adding texts to Vector Store


['e0c9e4af20ec41ca89453879ff32ed3d',
 'a66c86501e764a9b8a2f527d751ed10e',
 '45fb35359fc642eea5ab8be52b89d465',
 '3b2b35f3d9bc45dc98f8d65b7b422afc',
 'c6c136489cb548d2ad06df15ef6a07e9',
 'a8882e9d600f4a6e8b6b9498b194d1c7',
 '28d2fbe032054b5fba5cb083a3a668ae',
 'e9854e72df4349faa512d4a2996654f2',
 '1c28912670d54f47acea52f51a15467a',
 '2437ee5f049b4c9ab4a25e261750ab58',
 'f9c4aa7e0c624162809c806ceb4f08c0',
 '6f3be05a01b9447ebeb129b410bffa76',
 '5635bf6c86ab489c86b3ba9e863e14a9',
 '02664570b7c042a297d4e5e8028195b0',
 '60ad16c6deb342e193b8a9eb1871dcbf',
 '4cbdaaaa9cea4a51b08ea50a3aa310d7',
 'de7bf721e3cb406189d2292203db2466',
 'd1c77b6336fd4ef28e6a89a5f68cc7d4',
 'de56c0408ce74a6fa0f123f5d59833b3',
 '8565bf6566c14ffca4794b6692de88bb',
 '5f78a3261ad34a4b8a250be721340867',
 '080bd1b4c69145bd9bd2f46b741822c6',
 '0ee817dfb0a84b46bf3e4a59f65ebd9d',
 '53f3f72bcee84d0e82938bcc5550b1dc',
 'f0eed058d5db4e74bb209b9b76e5c9c6',
 '1870a11f45be491a985d2c7adf621374',
 '710185de0e99481691dd2defe171050f',
 