# OpenAI driven search of ICWA Legislation
This uses the Western Australian [Strata Titles Act 1985](https://www.legislation.wa.gov.au/legislation/statutes.nsf/main_mrtitle_938_homepage.html)


# Initialisation

Now load the .env file to get the API keys in a secure way. If a path is passed to `load_dotenv`, it should be the full path to the .env file. If this works, it returns `True`.

In [None]:
from dotenv import load_dotenv
# Load the variables defined in the .env file into the process environment so
# the API keys read later through os.environ never appear in the notebook.
load_dotenv() #get API keys

# Prepare documents and create database

In [6]:

def counts(texts):
    '''Print the chunk count and the mean character and word counts of a corpus.

    texts is a sized collection of objects exposing a `page_content` string.
    When it is empty, "No texts" is printed instead. Nothing is returned.
    '''
    total = len(texts)
    if total == 0:
        print("No texts")
        return

    bodies = [doc.page_content for doc in texts]
    mean_chars = sum(map(len, bodies)) / total
    # Words are whatever str.split() yields, i.e. whitespace-delimited tokens.
    mean_words = sum(len(body.split()) for body in bodies) / total

    report = "\n".join([
        f"There are {total} chunks",
        f"Average character count {mean_chars:.0f}",
        f"Average word count {mean_words:.0f}",
    ])
    print(report)
  

## Prepare legislation

The legislation is a word document that can be readily manipulated using the `docx` module.

In [17]:
import docx
import re
legislation_path = r'documents/Strata Titles Act 1985.docx'

Based on the following we see the document uses 4 levels of headings as follows
- Heading 2: The Parts of the legislation.
- Heading 3: Divisions
- Heading 5: These are used to create sub-headings within the level 2 & 3 headings.

The cover page and TOC are the first part of the document and are discarded.

In [None]:
# Parse the .docx once and keep only the heading-styled paragraphs; both
# listings below are derived from this single pass over the document.
heading_paragraphs = [paragraph for paragraph in docx.Document(legislation_path).paragraphs
                      if paragraph.style.name.startswith("Heading")]

# Distinct heading style names that occur in the document.
headings = tuple({paragraph.style.name for paragraph in heading_paragraphs})
print(sorted(headings))

# Every heading in document order as "<style name>: <heading text>", with
# runs of whitespace in the heading text collapsed to a single space.
toc = [": ".join([paragraph.style.name, re.sub(r"\s+", ' ', paragraph.text)])
       for paragraph in heading_paragraphs]
toc

### Chunk up the legislation

In [18]:
import docx
import re
legislation_path = r'documents/Strata Titles Act 1985.docx'

def readLegislation(file_path, n=5):
    '''Split a Word document into sections, opening a new one at each heading.

    A paragraph opens a new section when any of these holds:
      * its style name starts with "Heading 1" ... "Heading n";
      * its text starts with "Schedule", "Notes" or "Defined terms";
      * its text starts with digits followed by a full stop (e.g. "12.").
    Every other paragraph is appended to the current section, separated from
    the text before it by a blank line. Runs of whitespace inside a paragraph
    are collapsed to a single space.

    file_path: path of the .docx file to read.
    n: deepest heading level that opens a new section (default 5).

    Returns a list of dicts with the keys 'heading' (text of the opening
    paragraph) and 'content' (the opening paragraph followed by the rest of
    the section). The first entry is a placeholder headed "Document" (it also
    carries 'level': 0) that collects whatever precedes the first detected
    heading.
    '''

    # These do not change per paragraph, so build them once up front.
    heading_styles = tuple(f"Heading {level}" for level in range(1, n + 1))
    section_openers = ("Schedule", "Notes", "Defined terms")
    clause_number = re.compile(r'^\d+\.')

    sections = []
    current_section = {'heading': "Document", 'level': 0, 'content': ""}

    for paragraph in docx.Document(file_path).paragraphs:
        text = re.sub(r"\s+", ' ', paragraph.text)

        opens_section = (
            paragraph.style.name.startswith(heading_styles)
            or paragraph.text.startswith(section_openers)
            or clause_number.search(paragraph.text)
        )

        if opens_section:
            # Save the section collected so far, unless it is completely empty.
            if current_section['heading'] or current_section['content']:
                sections.append(current_section)

            # The heading line is also the first line of the section content.
            current_section = {'heading': text, 'content': text}
        else:
            # Join this paragraph to the earlier ones in the same section.
            current_section['content'] = "\n\n".join([current_section['content'], text])

    # Add the last section.
    if current_section['heading'] or current_section['content']:
        sections.append(current_section)

    return sections

from langchain.schema import Document
def makeLegislationDocs(n):
    '''Turn the legislation into one LangChain Document per section.

    The file at the module-level `legislation_path` is split by
    readLegislation at headings down to level n, so each clause stays in one
    piece. Each section's content becomes the Document's page_content and its
    heading is stored under the 'title' metadata key.
    '''
    docs = []
    for section in readLegislation(legislation_path, n):
        docs.append(Document(page_content=section['content'],
                             metadata={'title': section['heading']}))
    return docs

# Build the heading-delimited chunks and skip the first 325 sections, which
# hold the title page and table of contents.
# NOTE(review): 325 is a hard-coded offset, presumably valid only for this
# document split at n=5 — re-check it if either changes.
chunk_H5 = makeLegislationDocs(5)[325:] #drop toc and title
counts(chunk_H5)


There are 380 chunks
Average character count 1343
Average word count 235


In [None]:

from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
# Re-split the heading-level chunks with chunk_size=2000 and chunk_overlap=300,
# then print the same statistics for comparison with the unsplit chunks.
# NOTE(review): chunk_H5_split is not referenced again in the cells shown here;
# the database further down is built from chunk_H5 — confirm that is intended.
chunk_H5_split = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=300).split_documents(chunk_H5)
counts(chunk_H5_split)

### Make FAQ

Ingest the web page [faq](https://strata.wa.gov.au/strata-titles/support-and-resources/faqs)


In [8]:
import requests
from bs4 import BeautifulSoup
from langchain.schema import Document

def getFAQdocs(url="https://strata.wa.gov.au/strata-titles/support-and-resources/faqs", timeout=30):
    '''Download the strata FAQ page and return one Document per question.

    url: address of the FAQ page; defaults to the strata.wa.gov.au FAQ.
    timeout: seconds to wait for the server before giving up.

    Returns a list of Documents whose page_content is the question, a blank
    line, then the answer, and whose 'title' metadata is the question.

    Raises a requests exception when the request times out or the server
    answers with an HTTP error status, so a failed download is not mistaken
    for a page without FAQ entries.
    '''
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Parse the HTML and pick out the question and answer elements.
    soup = BeautifulSoup(response.content, "html.parser")
    questions = soup.find_all("dt", class_="faq__question")
    answers = soup.find_all("dd", class_="faq__answer")

    docs = []
    # Questions and answers are paired purely by their position on the page.
    for question, answer in zip(questions, answers):
        title = question.get_text(" ", strip=True)
        body = answer.get_text(" ", strip=True)
        docs.append(Document(page_content='\n\n'.join([title, body]),
                             metadata={'title': title}))
    return docs

# Fetch the FAQ entries as Documents and print their size statistics.
faqDocs =  getFAQdocs()
counts(faqDocs)

There are 104 chunks
Average character count 570
Average word count 96


## Create the Pinecone database
Initialise the Pinecone instance based on the API keys in .env.

Depending on the user input, use the existing index or create a new one from the documents. Create a "similarity" document retriever based on the database.

In [43]:
import pinecone 
import os

# Connect the Pinecone client using credentials taken from the environment.
# os.environ.get yields None for a missing variable, so nothing is validated here.
pinecone.init(
    api_key= os.environ.get('PINECONE_API_KEY') ,  # find at app.pinecone.io
    environment=os.environ.get('PINECONE_ENV')     # next to api key in console
)

# Name of the Pinecone index, read from the INDEX environment variable.
INDEX = os.environ.get('INDEX')
# Namespace inside that index used for this notebook's vectors.
NAMESPACE="SCA_H5"

In [44]:
# Create the index only when it does not exist yet, so this cell can be re-run
# against an existing index instead of failing inside create_index.
# The dimension must match the vectors produced by the embedding model used to
# load the documents — presumably 1536 for the default OpenAIEmbeddings model.
if INDEX not in pinecone.list_indexes():
    pinecone.create_index(INDEX, dimension=1536)

# Remove every vector in this notebook's namespace so the next load starts clean.
pinecone.Index(INDEX).delete(namespace=NAMESPACE, deleteAll=True)

{}

In [45]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
    
# Embed every legislation chunk and FAQ entry and store the vectors in the index.
# INDEX and NAMESPACE are the constants defined with the Pinecone setup, so the
# namespace that was just cleared is guaranteed to be the one filled here.
Pinecone.from_documents(chunk_H5 + faqDocs,
                        index_name=INDEX,
                        namespace=NAMESPACE,
                        embedding=OpenAIEmbeddings())


<langchain.vectorstores.pinecone.Pinecone at 0x7d288bb93940>

# Create and test Simon


Define a Q&A chain that 'stuffs' the retrieved chunks into the prompt to provide context. The chain uses OpenAI's `gpt-3.5-turbo` model with `temperature=0` so that answers are as deterministic as possible. According to OpenAI, `gpt-3.5-turbo` is the
> Most capable GPT-3.5 model and optimized for chat at 1/10th the cost of text-davinci-003.

In [49]:
import pinecone
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Initialise the Pinecone client again; this cell repeats its own imports and
# setup instead of reusing the earlier cells.
# NOTE(review): presumably so querying can start from here without rebuilding
# the database — confirm.
pinecone.init(
    api_key= os.environ.get('PINECONE_API_KEY') ,  # find at app.pinecone.io
    environment=os.environ.get('PINECONE_ENV')     # next to api key in console
)

# Open the already-populated index and namespace as a LangChain vector store,
# using OpenAIEmbeddings for the embedding step.
db = Pinecone.from_existing_index(index_name=os.environ.get('INDEX'), 
                                   namespace='SCA_H5', 
                                   embedding=OpenAIEmbeddings())

In [50]:
from langchain.prompts import PromptTemplate
# Prompt for the 'stuff' chain: {context} receives the retrieved chunks and
# {question} the user's query. The model is told to repeat the opening sentence
# verbatim, so any typo in it would show up in every answer.
prompt_template = """
You are a helpful Strata legal expert in Western Australia answering questions about the "Strata Titles Act 1985" from a lot owner.

Start the answer with "An owner should always refer to their bylaws and strata plan in conjunction with the legislation".

Provide a detailed answer using the information from the legislation provided below. List relevant sections of the act. 

Do not make up answers. If you do not know say "I do not know".

{context}

Question: {question}

Answer in plain english
"""
PROMPT = PromptTemplate(template=prompt_template,
                        input_variables=["context", "question"])

# Plain similarity search returning the 4 closest chunks for each query.
retriever = db.as_retriever(search_type="similarity",
                            search_kwargs={"k": 4})

# Alternative kept for experimentation: drop matches below a score threshold.
#retriever=db.as_retriever(search_type="similarity_score_threshold", 
#                          search_kwargs={"k":3, "score_threshold":0.5})

# Retrieval QA chain; return_source_documents=True keeps the retrieved chunks
# in the result so they can be listed next to the answer.
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(temperature=0),  # uses 'gpt-3.5-turbo' which is cheaper and better
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True)

In [54]:
# Check retrieval on its own, without the LLM: fetch the 4 chunks closest to
# "pets" together with their similarity scores.
topics = db.similarity_search_with_score("pets", k = 4, namespace='SCA_H5')
topics

[(Document(page_content='12. Additional duties of owners and occupiers\n\n An owner or occupier of a lot must not —\n\n (a) use the lot for a purpose that may be illegal or injurious to the reputation of the building; or\n\n (b) make undue noise in or about the lot or common property; or\n\n (c) keep animals on the lot or the common property after notice in that behalf given to that person by the council.\n\n [Bylaw 12 inserted: No. 58 of 1995 s. 88(5); amended: No. 74 of 2003 s. 112(22); No. 30 of 2018 s. 110.]', metadata={'title': '12. Additional duties of owners and occupiers'}),
  0.753297567),
 (Document(page_content='10. Floor coverings\n\n An owner of a lot must ensure that all floor space within the lot (other than that comprising kitchen, laundry, lavatory or bathroom) is covered or otherwise treated to an extent sufficient to prevent the transmission therefrom of noise likely to disturb the peaceful enjoyment of an owner or occupier of another lot.\n\n [By-law 10, formerly by

In [52]:
# Call the chain directly to inspect the raw result dict
# ('query', 'result' and 'source_documents').
qa("I want to make keep a pet.")

{'query': 'I want to make keep a pet.',
 'result': 'An owner or occupier of a lot can keep a pet on the lot or common property unless the council has given notice that it is not allowed. However, it is important to check the bylaws and strata plan for any specific rules about pets.',
 'source_documents': [Document(page_content='12. Additional duties of owners and occupiers\n\n An owner or occupier of a lot must not —\n\n (a) use the lot for a purpose that may be illegal or injurious to the reputation of the building; or\n\n (b) make undue noise in or about the lot or common property; or\n\n (c) keep animals on the lot or the common property after notice in that behalf given to that person by the council.\n\n [Bylaw 12 inserted: No. 58 of 1995 s. 88(5); amended: No. 74 of 2003 s. 112(22); No. 30 of 2018 s. 110.]', metadata={'title': '12. Additional duties of owners and occupiers'}),
  Document(page_content='A developer has approached our strata to re-develop, but I can’t afford a lawyer

In [None]:
from IPython.display import display, Markdown

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    '''Wrap text to `width` columns while keeping its existing line breaks.

    Each newline-delimited line is wrapped on its own, so blank lines and
    paragraph boundaries in the input survive in the output.
    '''
    return '\n'.join(textwrap.fill(piece, width=width) for piece in text.split('\n'))

def process_llm_response(llm_response, sources=True, content=False):
    '''Render a QA chain result in the notebook as Markdown.

    llm_response: dict with a 'result' string and, when sources are shown, a
        'source_documents' list of documents.
    sources: when true, list the title of every source document.
    content: when true, also print each source document's text.
    '''
    display(Markdown(wrap_text_preserve_newlines(llm_response['result'])))
    if not sources:
        return

    display(Markdown('\n\nSources:'))
    for source in llm_response["source_documents"]:
        title = source.metadata.get('title')
        score = source.metadata.get('score')
        # The documents built in this notebook only carry a 'title', so a
        # score is mentioned only when one is actually present.
        label = title if score is None else f"{title} ({score})"
        display(Markdown(str(label)))
        if content:
            display(Markdown(wrap_text_preserve_newlines(source.page_content)))

def Simon(query, sources=True, content=False):
    '''Ask the QA chain a question, display the answer and return the raw result.

    query: the question to put to the chain.
    sources / content: passed on to process_llm_response to control whether
        source titles and source text are displayed.
    '''
    llm_response = qa(query)
    process_llm_response(llm_response, sources=sources, content=content)
    return llm_response

In [None]:
# Broad question; list the source titles but hide the source text.
result = Simon("What does the legisltation cover", sources=True, content=False)

In [None]:
# Multi-part renovation question; also show the text of each source chunk so
# the answer can be checked against it.
result = Simon("I am an owner in a 250 lot complex. I want to renovate. Do I need approval? How long do I need to wait", 
               sources=True, content=True)

In [None]:
# Same question shape as the renovation example, this time about keeping a
# pet, again showing the text of each source chunk.
result = Simon("I am an owner in a 250 lot complex. I want to own a pet. Do I need approval? How long do I need to wait", 
               sources=True, content=True)