# OpenAI driven search of ICWA Legistation
This uses the Western Australian [Strata Titles Act 1985](https://www.legislation.wa.gov.au/legislation/statutes.nsf/main_mrtitle_938_homepage.html)


# Initialisation

Now load the .env file to get the API keys in a secure way. The path should be the full path to the .env file. If this work it returns `True`

In [1]:
from dotenv import load_dotenv
load_dotenv() #get API keys

True

# Prepare documents and create database

In [2]:

def counts(texts):
  '''Create some basic statistics on the corpus'''

  if len(texts) == 0:
    print("No texts")
    return

  charCounts = [len(text.page_content) for text in texts]
  wordCounts = [len(text.page_content.split()) for text in texts]
  print(f"There are {len(texts)} chunks\nAverage character count {sum(charCounts)/len(charCounts):.0f}\nAverage word count {sum(wordCounts)/len(wordCounts):.0f}")
  

## Prepare context 



In [3]:
import docx
import re

def readDocx(file_path, n=5):
    '''Break document at headings up to level n (5) and return a plain text 
       document with paragraphs seperated by two newlines (\n\n)'''
    
    doc = docx.Document(file_path)
    skip_toc=True
    sections = []
    current_section = {'heading': "Document", 'level': 0, 'content': ""}

    for paragraph in doc.paragraphs:
        text = re.sub(r"\s+", ' ', paragraph.text)
        if paragraph.style.name.startswith(tuple(f"Heading {i+1}" for i in range(n))) or \
            paragraph.text.startswith(("Schedule", "Notes", "Defined terms")) or \
                re.search(r'^\d+\.', paragraph.text):
            #save old section 
            if current_section['heading'] or current_section['content']:
                sections.append(current_section)
            
            # and start a new section
            current_section = {'heading' : text,
                               #'level'   : int(re.search("Heading (\d+)", paragraph.style.name).group(1)),
                               'content' : text
                            }
        else:
            # join this paragraph text to prior ones in this section
            current_section['content'] = "\n\n".join([current_section['content'], text])

    # Add the last section
    if  current_section['heading'] or current_section['content']:
        sections.append(current_section)

    #Return list of sections
    return sections 

from langchain.schema import Document
def makeDocument(filepath, source, URL, n=5):
    '''Break legistation by headings down to level n. This chunks up the 
       document to sizes chatGPT can digest while ensuring the clauses in
        the legislation are kept together '''

    return [Document(page_content = section['content'], metadata = {'title':section['heading'], 'source': source, 'URL': URL}) 
                for section in readDocx(filepath, n) ]


### Strata Act
The legislation is a word document that can be readily manipulated using the `docx` module.

In [4]:

legislationDocs = makeDocument(
    filepath= r'documents/Strata Titles Act 1985.docx', 
    source= 'Strata Titles Act 1985',
    URL=r'https://www.legislation.wa.gov.au/legislation/prod/filestore.nsf/FileURL/mrdoc_45344.htm/$FILE/Strata%20Titles%20Act%201985%20-%20%5B07-m0-00%5D.html?OpenElement')
tocLen = [i for i, doc in enumerate(legislationDocs) if  doc.metadata['title'] == 'Defined terms']
print("drop first", tocLen[0], "items")
legislationDocs = legislationDocs[tocLen[0]:]
counts(legislationDocs)


drop first 321 items
There are 384 chunks
Average character count 1331
Average word count 232


### Strata Regulations

Create docus from [regulations](Strata Titles (General) Regulations 2019)

In [5]:
regsDocs = makeDocument(
    filepath=r'documents/Strata Titles (General) Regulations 2019.docx',
    source='Strata Titles (General) Regulations 2019',
    URL=r'https://www.legislation.wa.gov.au/legislation/prod/filestore.nsf/FileURL/mrdoc_46058.htm/$FILE/Strata%20Titles%20(General)%20Regulations%202019%20-%20%5B00-j0-00%5D.html?OpenElement') #drop toc and title
tocLen = [i for i, doc in enumerate(regsDocs) if  doc.metadata['title'] == 'Defined terms']
print("drop first", tocLen[0], "items")
regsDocs = regsDocs[tocLen[0]:]
counts(regsDocs)

drop first 231 items
There are 264 chunks
Average character count 858
Average word count 143


In [14]:

def pp(self) -> str:
    return f'''<details><summary><a href="{self.metadata.get('URL')}" target="_blank">{self.metadata.get('source')}</a>: {self.metadata.get('title')}</summary>''' +\
                f'''{self.page_content} </details>''' 
    



In [16]:
from IPython.core.display import HTML

HTML(pp(legislationDocs[0]))

### Make FAQ

Ingest the web page [faq](https://strata.wa.gov.au/strata-titles/support-and-resources/faqs)


In [6]:
import requests
from bs4 import BeautifulSoup
from langchain.schema import Document

def getFAQdocs():
    # Send a GET request to the URL
    url = r"https://strata.wa.gov.au/strata-titles/support-and-resources/faqs"
    response = requests.get(url)

    # Create BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    questions = soup.find_all("dt", class_="faq__question")
    answers = soup.find_all("dd", class_="faq__answer")

    return [Document(page_content = '\n\n'.join([ question.get_text(" ", strip=True), answer.get_text(" ", strip=True)]), 
            metadata = {'title':question.get_text(" ", strip=True), 
                        'source':'Landgate FAQ', 
                        'URL':url}
        ) 
        for question, answer in zip(questions, answers) ]

faqDocs =  getFAQdocs()
counts(faqDocs)

There are 104 chunks
Average character count 570
Average word count 96


###  Landgate Guide to Strata
Add the text from the pdf from Landgage called [Guide to strata titles]{https://www0.landgate.wa.gov.au/docvault.nsf/web/PS_STPM/$file/2725LAND_Landgate_Guide-to-Strata-Titles_Revised_July2022.pdf}

In [7]:
guideDocs = makeDocument(
    filepath=r'documents/2725LAND_Landgate_Guide-to-Strata-Titles_Revised_July2022.docx',
    source='Landgate Guide to Strata Titles Revised July2022',
    URL=r'https://www0.landgate.wa.gov.au/docvault.nsf/web/PS_STPM/$file/2725LAND_Landgate_Guide-to-Strata-Titles_Revised_July2022.pdf',
    n=3
) 
tocLen = [i for i, doc in enumerate(guideDocs) if  doc.metadata['title'] == 'Terms and Definitions']
print("drop first", tocLen[0], "items")
regsDocs = guideDocs[tocLen[0]:]
counts(guideDocs)

drop first 6 items
There are 97 chunks
Average character count 1570
Average word count 263


## Create the Pinecone database
Initialise the pinecode instance base on the API keys in .env. 

Depending on the user input, use the existing index or create a new one from the documents. Create a "similarity" document retriever based on the database.

In [8]:
import pinecone 
import os

pinecone.init(
    api_key= os.environ.get('PINECONE_API_KEY') ,  # find at app.pinecone.io
    environment=os.environ.get('PINECONE_ENV')     # next to api key in console
)

INDEX = os.environ.get('INDEX')
NAMESPACE="SCA_H5"

  from tqdm.autonotebook import tqdm


In [9]:
#pinecone.create_index(INDEX, dimension=1536)
pinecone.Index(INDEX).delete(namespace=NAMESPACE, deleteAll=True)

{}

In [10]:
allDocs = legislationDocs + regsDocs + faqDocs + guideDocs
counts(allDocs)
for name, docs in zip(["legislationDocs", "regsDocs", "faqDocs", "guideDocs"], [legislationDocs, regsDocs, faqDocs, guideDocs]):
    print()
    print(name)
    counts(docs)

There are 676 chunks
Average character count 1290
Average word count 222

legislationDocs
There are 384 chunks
Average character count 1331
Average word count 232

regsDocs
There are 91 chunks
Average character count 1643
Average word count 276

faqDocs
There are 104 chunks
Average character count 570
Average word count 96

guideDocs
There are 97 chunks
Average character count 1570
Average word count 263


In [11]:
allDocs[10].metadata

{'title': '8. Freehold schemes and leasehold schemes',
 'source': 'Strata Titles Act 1985',
 'URL': 'https://www.legislation.wa.gov.au/legislation/prod/filestore.nsf/FileURL/mrdoc_45344.htm/$FILE/Strata%20Titles%20Act%201985%20-%20%5B07-m0-00%5D.html?OpenElement'}

In [12]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
    
Pinecone.from_documents(allDocs, 
                        index_name=os.environ.get('INDEX'), 
                        namespace="SCA_H5", 
                        embedding=OpenAIEmbeddings())


<langchain.vectorstores.pinecone.Pinecone at 0x7b29f16473a0>

# Create and test SImon


Define a Q&A chain that 'stuffs' the retrieved chunks into the prompt to provide context. Using OpenAI deterministic (temperature=0) model `gpt-3.5-turbo`.  According to OpenAI 'gpt-3.5-turbo' is the 
> Most capable GPT-3.5 model and optimized for chat at 1/10th the cost of text-davinci-003.

In [1]:
import pinecone
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

pinecone.init(
    api_key= os.environ.get('PINECONE_API_KEY') ,  # find at app.pinecone.io
    environment=os.environ.get('PINECONE_ENV')     # next to api key in console
)

db = Pinecone.from_existing_index(index_name=os.environ.get('INDEX'), 
                                   namespace='SCA_H5', 
                                   embedding=OpenAIEmbeddings())

  from tqdm.autonotebook import tqdm


In [3]:
from langchain.prompts import PromptTemplate
prompt_template = """
You are a helpful Strata legal expert in Western Australia answering questions about the "Strata Titles Act 1985" from a lot owner.

Start the answer with "An owner should always refer to their bylaws and strata plan in conjenction with the legislation".

Provide a detailed answer using the information from the legislation provided below. List relevant sections of the act. 

Do not make up answers. If you do not know say "I do not know".

{context}

Question: {question}

Answer in plain english"
"""
PROMPT = PromptTemplate(template=prompt_template, 
                        input_variables=["context", "question"])

retriever=db.as_retriever(search_type="similarity", 
                            search_kwargs={"k":4})

#retriever=db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"k":3, "score_threshold":0.5})

qa = RetrievalQA.from_chain_type(
                    llm=ChatOpenAI(temperature=0), # uses 'gpt-3.5-turbo' which is cheaper and better 
                    chain_type="stuff", 
                    retriever=retriever, 
                    chain_type_kwargs={"prompt": PROMPT}, 
                    return_source_documents=True)

In [4]:
topics = db.similarity_search_with_score("pets", k = 4, namespace='SCA_H5')
topics

[(Document(page_content='\n\n\n\n\n\n', metadata={'URL': 'https://www0.landgate.wa.gov.au/docvault.nsf/web/PS_STPM/$file/2725LAND_Landgate_Guide-to-Strata-Titles_Revised_July2022.pdf', 'source': 'Landgate Guide to Strata Titles Revised July2022', 'title': 'Document'}),
  0.807949603),
 (Document(page_content='Where to get assistance', metadata={'URL': 'https://www0.landgate.wa.gov.au/docvault.nsf/web/PS_STPM/$file/2725LAND_Landgate_Guide-to-Strata-Titles_Revised_July2022.pdf', 'source': 'Landgate Guide to Strata Titles Revised July2022', 'title': 'Where to get assistance'}),
  0.762131333),
 (Document(page_content='Where to get assistance', metadata={'URL': 'https://www0.landgate.wa.gov.au/docvault.nsf/web/PS_STPM/$file/2725LAND_Landgate_Guide-to-Strata-Titles_Revised_July2022.pdf', 'source': 'Landgate Guide to Strata Titles Revised July2022', 'title': 'Where to get assistance'}),
  0.762131333),
 (Document(page_content='Special by-laws\n\nShort-stay accommodation by-laws\n\nUnder the 

In [7]:
qa("I want to make keep a pet.")

NotImplementedError: 

In [5]:
from IPython.display import display, Markdown

import textwrap

def wrap_text_preserve_newlines(text, width=110):

    lines = text.split('\n')
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
    wrapped_text = '\n'.join(wrapped_lines)
    return wrapped_text

def process_llm_response(llm_response, sources=True, content=False):
    display(Markdown(wrap_text_preserve_newlines(llm_response['result'])))
    if sources:
      display(Markdown('\n\nSources:'))
      for source in llm_response["source_documents"]:
        display(Markdown(f"{source.metadata.get('title')} ({source.metadata.get('score')})"))
        if content:
          display(Markdown(f'{wrap_text_preserve_newlines(source.page_content)}'))

def Simon(query, sources=True, content=False):
  
  result = qa(query)
  process_llm_response(result, sources=sources, content=content)
  return (result)

In [6]:
result = Simon("What does the legisltation cover", sources=True, content=False)

NotImplementedError: 

In [None]:
result = Simon("I am an owner in a 250 lot complex. I want to renovate. Do I need approval? How long do I need to wait", 
               sources=True, content=True)

In [None]:
result = Simon("I am an owner in a 250 lot complex. I want to own a pet. Do I need approval? How long do I need to wait", 
               sources=True, content=True)