# OpenAI-driven search of ICWA Legislation
This uses the Western Australian [Strata Titles Act 1985](https://www.legislation.wa.gov.au/legislation/statutes.nsf/main_mrtitle_938_homepage.html)


# Initialisation

Now load the .env file to get the API keys in a secure way. The path should be the full path to the .env file. If this works, it returns `True`.

In [1]:
from dotenv import load_dotenv
# Load the API keys from a .env file; no path is passed here, so the
# library's default .env lookup is used.
load_dotenv() #get API keys

True

# Prepare legislation

The legislation is a Word document that can be readily manipulated using the `docx` module.

In [None]:
import docx
import re
# Location of the Act as a Word (.docx) file, given relative to the
# notebook's working directory.
legislation_path = r'documents/Strata Titles Act 1985.docx'

Based on the following we see the document uses 4 levels of headings as follows
- Heading 2: The Parts of the legislation.
- Heading 3: Divisions
- Heading 5: These are used to create sub-headings within the level 2 and 3 headings.

The cover page and TOC are the first part of document and are discarded. 

In [None]:
# Parse the Word document once and keep only the paragraphs whose style name
# starts with "Heading"; the style summary and the outline below are both
# derived from this list.
heading_paragraphs = [
    paragraph
    for paragraph in docx.Document(legislation_path).paragraphs
    if paragraph.style.name.startswith("Heading")
]

# Distinct heading style names used in the document.
headings = tuple({paragraph.style.name for paragraph in heading_paragraphs})
print(sorted(headings))

# Outline of the document as "<style name>: <heading text>", with every run of
# whitespace in the heading text collapsed to a single space.
toc = [
    ": ".join([paragraph.style.name, re.sub(r"\s+", ' ', paragraph.text)])
    for paragraph in heading_paragraphs
]
toc

## Chunk up the legislation

In [None]:
import re

def read_document_sections(file_path, n=5):
    """Split a Word document into sections at headings and numbered clauses.

    A paragraph opens a new section when any of the following holds:

    * its style name starts with "Heading 1" .. "Heading n";
    * its text starts with "Schedule", "Notes" or "Defined terms";
    * its text starts with digits followed by a full stop (e.g. "13.").

    Every other paragraph is appended to the section that is currently open.

    Args:
        file_path: Path of the .docx file to read.
        n: Deepest heading level (1-based) that starts a new section.

    Returns:
        A list of dicts in document order. Each dict has a 'heading' (the
        whitespace-normalised text of the paragraph that opened the section)
        and a 'content' (the section's paragraphs, opening paragraph
        included, separated by two newlines). The first entry is always a
        placeholder with heading "Document" and an extra 'level' key of 0;
        it collects any text found before the first section break.
    """
    whitespace = re.compile(r"\s+")
    numbered_clause = re.compile(r'^\d+\.')
    heading_styles = tuple(f"Heading {level}" for level in range(1, n + 1))
    break_prefixes = ("Schedule", "Notes", "Defined terms")

    sections = []

    def close(section, paragraphs):
        """Attach the joined text to *section* and keep it unless it is empty."""
        section['content'] = "\n\n".join(paragraphs)
        if section['heading'] or section['content']:
            sections.append(section)

    # Placeholder for anything that precedes the first heading. Its paragraph
    # list starts with an empty string so the joined content begins with the
    # paragraph separator, as callers have always received it.
    current = {'heading': "Document", 'level': 0}
    parts = [""]

    for paragraph in docx.Document(file_path).paragraphs:
        text = whitespace.sub(' ', paragraph.text)

        # Section detection looks at the raw paragraph text; only the stored
        # text is whitespace-normalised.
        starts_section = (
            paragraph.style.name.startswith(heading_styles)
            or paragraph.text.startswith(break_prefixes)
            or numbered_clause.search(paragraph.text)
        )
        if not starts_section:
            parts.append(text)
            continue

        # Save the section that just ended and open a new one whose content
        # begins with its own heading text.
        close(current, parts)
        current = {'heading': text}
        parts = [text]

    # The last open section has not been saved yet.
    close(current, parts)
    return sections

from langchain.schema import Document
def makeDocs(n):
    """Turn the legislation into one LangChain ``Document`` per section.

    The file at ``legislation_path`` is split by ``read_document_sections`` at
    headings down to level ``n``. The intent is to get chunks small enough for
    the chat model while keeping each clause of the legislation together.

    Each section's text becomes the document's ``page_content`` and its
    heading is stored in the metadata under ``'title'``.
    """
    documents = []
    for section in read_document_sections(legislation_path, n):
        documents.append(
            Document(page_content=section['content'],
                     metadata={'title': section['heading']})
        )
    return documents


def counts(texts):
    """Print the number of chunks and their average character and word counts.

    Each item in ``texts`` must expose its text as ``page_content``. For an
    empty collection "No texts" is printed instead. Nothing is returned.
    """
    total = len(texts)
    if total == 0:
        print("No texts")
        return

    char_total = sum(len(doc.page_content) for doc in texts)
    word_total = sum(len(doc.page_content.split()) for doc in texts)
    print(f"There are {total} chunks\nAverage character count {char_total/total:.0f}\nAverage word count {word_total/total:.0f}")
  

In [None]:
# Split at headings down to level 5, then skip the first 325 sections, which
# hold the title page and table of contents.
# NOTE(review): 325 looks like a hand-picked offset for this copy of the Act;
# re-check it whenever the source document changes.
chunk_H5 = makeDocs(5)[325:] #drop toc and title
counts(chunk_H5)

In [None]:

from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
# Cut the sections further with a recursive splitter (chunk_size=2000,
# chunk_overlap=300) and report the statistics of the result.
splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=300)
chunk_H5_split = splitter.split_documents(chunk_H5)
counts(chunk_H5_split)

## Create the Pinecone database
Initialise the Pinecone instance based on the API keys in .env. 

Depending on the user input, use the existing index or create a new one from the documents. Create a "similarity" document retriever based on the database.

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import pinecone 
import os

# Connect to Pinecone with the credentials held in the environment.
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),      # find at app.pinecone.io
    environment=os.getenv('PINECONE_ENV'),      # next to api key in console
)

def create_namespace(namespace, documents, embeddings):
    """Rebuild one namespace of the Pinecone index named by the INDEX env var.

    The index is created (dimension 1536) when it is not among the existing
    indexes. Everything already stored in ``namespace`` is then deleted and
    ``documents`` are embedded and uploaded in its place.

    Args:
        namespace: Name of the namespace to (re)populate.
        documents: LangChain documents to embed and store.
        embeddings: Embedding model used to vectorise the documents.

    Returns:
        The vector store returned by ``Pinecone.from_documents``.

    Raises:
        ValueError: If the INDEX environment variable is missing or empty.
    """
    # Read the index name once so that every call below targets the same index.
    index_name = os.environ.get('INDEX')
    if not index_name:
        raise ValueError(
            "The INDEX environment variable must name the Pinecone index to use"
        )

    if index_name not in pinecone.list_indexes():
        print(f"Creating new index {index_name}")
        # NOTE(review): 1536 presumably matches the vector size of the
        # embedding model passed in; confirm if the model is changed.
        pinecone.create_index(index_name, dimension=1536)

    # Empty the namespace first so a re-run does not leave stale or duplicate
    # chunks next to the fresh ones.
    pinecone.Index(index_name).delete(namespace=namespace, deleteAll=True)

    return Pinecone.from_documents(documents,
                                   index_name=index_name,
                                   namespace=namespace,
                                   embedding=embeddings)

# Embed the split chunks with OpenAI and store them in the "SCA_H5" namespace.
embeddings = OpenAIEmbeddings()
db5 = create_namespace("SCA_H5", chunk_H5_split, embeddings=embeddings)

### Testing retrieval

# Create and test Simon


Define a Q&A chain that 'stuffs' the retrieved chunks into the prompt to provide context. It uses OpenAI's `gpt-3.5-turbo` model with `temperature=0` so that answers are deterministic. According to OpenAI, `gpt-3.5-turbo` is the
> Most capable GPT-3.5 model and optimized for chat at 1/10th the cost of text-davinci-003.

In [2]:
import pinecone
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Connect to Pinecone with the credentials held in the environment.
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),      # find at app.pinecone.io
    environment=os.getenv('PINECONE_ENV'),      # next to api key in console
)

# Open the already-populated "SCA_H5" namespace of the index named by INDEX;
# nothing is uploaded here.
db = Pinecone.from_existing_index(
    index_name=os.getenv('INDEX'),
    namespace='SCA_H5',
    embedding=OpenAIEmbeddings(),
)

  from tqdm.autonotebook import tqdm


In [6]:
from langchain.prompts import PromptTemplate
# Prompt for the "stuff" chain: {context} receives the retrieved chunks of the
# Act and {question} the owner's question.
prompt_template = """
You are a helpful Strata legal expert in Western Australia answering questions about the "Strata Titles Act 1985" from a lot owner.

Start the answer with "An owner should always refer to their bylaws and strata plan in conjunction with the legislation".

Provide a detailed answer using the information from the legislation provided below. List relevant sections of the act. 

Do not make up answers. If you do not know say "I do not know".

{context}

Question: {question}

Answer in plain english
"""
PROMPT = PromptTemplate(template=prompt_template, 
                        input_variables=["context", "question"])

# Retrieve the 4 most similar chunks for every question.
retriever=db.as_retriever(search_type="similarity", 
                            search_kwargs={"k":4})

# Alternative kept for experimentation: only return chunks above a score threshold.
#retriever=db.as_retriever(search_type="similarity_score_threshold", 
#                          search_kwargs={"k":3, "score_threshold":0.5})

# Q&A chain that places the retrieved chunks into PROMPT and also returns the
# source documents alongside the answer.
qa = RetrievalQA.from_chain_type(
                    llm=ChatOpenAI(temperature=0), # uses 'gpt-3.5-turbo' which is cheaper and better 
                    chain_type="stuff", 
                    retriever=retriever, 
                    chain_type_kwargs={"prompt": PROMPT}, 
                    return_source_documents=True)

In [4]:
# Look at the raw retrieval step on its own: the 4 closest chunks, each paired
# with its similarity score.
topics = db.similarity_search_with_score(
    "I want to make alterations",
    k=4,
    namespace='SCA_H5',
)
topics

[(Document(page_content='13. Notice of alteration to lot\n\n An owner of a lot must not alter or permit the alteration of the structure of the lot except as may be permitted and provided for under the Act and the bylaws and in any event must not alter the structure of the lot without giving to the strata company, not later than 14 days before commencement of the alteration, a written notice describing the proposed alteration.\n\n [Bylaw 13 inserted: No. 58 of 1995 s. 88(5); amended: No. 30 of 2018 s. 111.]', metadata={'title': '13. Notice of alteration to lot'}),
  0.767253637),
 (Document(page_content='90. Order dispensing with approval for structural alteration of lot\n\n (1) The Tribunal may, on the application of an owner of a lot in a strata titles scheme, by order, exempt a particular structural alteration to the lot from the application of this Division.\n\n (2) An order may be made under this section —\n\n (a) whether or not the necessary approval for the alteration has been so

In [7]:
# Run the full chain on the same question; the raw response (a dict with
# 'query' and 'result' among its keys) is shown as the cell output.
qa("I want to make alterations.")

{'query': 'I want to make alterations.',
 'result': 'An owner should always refer to their bylaws and strata plan in conjunction with the legislation. According to the Strata Titles Act 1985, an owner must not alter the structure of their lot without giving written notice to the strata company at least 14 days before the commencement of the alteration. The owner must also obtain prior written approval from the owner of the other lot and the strata company, expressed by resolution without dissent, for a strata scheme with more than two lots. For a surveystrata scheme, the owner must obtain prior written approval from the owner of the other lot and the strata company, expressed by resolution without dissent, if the structures on the lot will not conform to plot ratio restrictions or open space requirements for the lot. The Tribunal may exempt a particular structural alteration from the application of this Division if it is reasonable and will not cause significant inconvenience or detrim

In [None]:
from IPython.display import display, Markdown

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap every line of ``text`` to ``width`` columns, keeping its line breaks.

    The text is split on newline characters, each piece is wrapped on its
    own, and the pieces are joined with newlines again.
    """
    wrapper = textwrap.TextWrapper(width=width)
    return '\n'.join(wrapper.fill(piece) for piece in text.split('\n'))

def process_llm_response(llm_response, sources=True, content=False):
    """Render a response from the Q&A chain as Markdown in the notebook.

    Args:
        llm_response: Mapping with the answer under 'result' and, when
            ``sources`` is true, the retrieved documents under
            'source_documents'. Each document must provide a ``metadata``
            dict and a ``page_content`` string.
        sources: When true, list the title of every source after the answer.
        content: When true (and ``sources`` is true), also show the text of
            every source.
    """
    display(Markdown(wrap_text_preserve_newlines(llm_response['result'])))
    if not sources:
        return

    display(Markdown('\n\nSources:'))
    for source in llm_response["source_documents"]:
        title = source.metadata.get('title')
        score = source.metadata.get('score')
        # Source metadata does not necessarily carry a score; show it only
        # when present rather than printing "(None)" after every title.
        label = f"{title}" if score is None else f"{title} ({score})"
        display(Markdown(label))
        if content:
            display(Markdown(wrap_text_preserve_newlines(source.page_content)))

def Simon(query, sources=True, content=False):
    """Put ``query`` to the Q&A chain, display the answer and return the response.

    ``sources`` and ``content`` are passed on to ``process_llm_response`` and
    control whether source titles and source text are displayed.
    """
    response = qa(query)
    process_llm_response(response, sources=sources, content=content)
    return response

In [None]:
# General question; source titles are listed but their text is not shown.
# NOTE(review): "legisltation" in the query is misspelt; left unchanged here.
result = Simon("What does the legisltation cover", sources=True, content=False)

In [None]:
# Renovation question; content=True also displays the text of each source.
result = Simon("I am an owner in a 250 lot complex. I want to renovate. Do I need approval? How long do I need to wait", 
               sources=True, content=True)

In [None]:
# Pet-ownership question; content=True also displays the text of each source.
result = Simon("I am an owner in a 250 lot complex. I want to own a pet. Do I need approval? How long do I need to wait", 
               sources=True, content=True)