# OpenAI driven search of ICWA Legistation



# Initialisation

Now load the .env file to get the API keys in a secure way. The path should be the full path to the .env file. If this work it returns `True`

In [1]:
legislation_path = r'documents/Motor Vehicle (Catastrophic Injuries) Act 2016 - [00-f0-00].docx_3FOpenElement.docx'

import docx
from dotenv import load_dotenv
load_dotenv() #get API keys

True

# Prepare legislation

The legislation "Motor Vehicle (Catastrophic Injuries) Act 2016." is a word document that can be readily manipulated using the `docx` module.

In [2]:
print(set( paragraph.style.name for paragraph in docx.Document(legislation_path).paragraphs if paragraph.style.name.startswith("Heading")))
[":".join([paragraph.style.name, paragraph.text]) for paragraph in docx.Document(legislation_path).paragraphs if paragraph.style.name.startswith(('Heading 2', 'Heading 3'))]

{'Heading 3', 'Heading 5', 'Heading 2'}


['Heading 2:Part 1\xa0—\xa0Preliminary',
 'Heading 2:Part 2\xa0—\xa0Participation in catastrophic injuries support\xa0scheme',
 'Heading 2:Part 3\xa0—\xa0Assessment of treatment, care and support\xa0needs',
 'Heading 2:Part 4\xa0—\xa0Payments under catastrophic injuries support scheme',
 'Heading 2:Part 5\xa0—\xa0Dispute resolution',
 'Heading 2:Part 6\xa0—\xa0Miscellaneous',
 'Heading 2:Part 7\xa0—\xa0Other Acts amended',
 'Heading 3:Division 1\xa0—\xa0Civil Liability Act\xa02002 amended',
 'Heading 3:Division 2\xa0—\xa0Insurance Commission of Western Australia Act\xa01986 amended',
 'Heading 3:Division 3\xa0—\xa0Motor Vehicle (Third Party Insurance) Act\xa01943\xa0amended']


The document uses 3 levels of headings as follows
- Heading 2: The Parts of the legislation.
- Heading 3: Divisions
- Heading 5: These are used as to create sub-headings in the level 2 & 3 headings. 

The cover page and TOC are the first part of document and are discarded. 

In [3]:
import re

def read_document_sections(file_path, n=5):
    '''Break document at headings up to level n (5) and return a plain text 
       document with paragraphs seperated by two newlines (\n\n)'''
    
    doc = docx.Document(file_path)
    skip_toc=True
    sections = []
    current_section = {'heading': "Document", 'level': 0, 'content': ""}

    for paragraph in doc.paragraphs:

        if paragraph.style.name.startswith(tuple(f"Heading {i+1}" for i in range(n))):
            #save old section 
            if current_section['heading'] or current_section['content']:
                sections.append(current_section)
            
            # and start a new section
            current_section = {'heading' : paragraph.text,
                               'level'   : int(re.search("Heading (\d+)", paragraph.style.name).group(1)),
                               'content' : paragraph.text
                            }
        else:
            # join this paragraph text to prior ones in this section
            current_section['content'] = "\n\n".join([current_section['content'], paragraph.text])

    # Add the last section
    if  current_section['heading'] or current_section['content']:
        sections.append(current_section)

    #Retrun list of section discarding the coverpage and TOC
    return sections 

from langchain.schema import Document
def makeDocs(n):
    '''Break legistation by headings down to level n. This chunks up the 
       document to sizes chatGPT can digest while ensuring the clauses in
        the legislation are kept together '''

    return [Document(page_content = section['content'], metadata = {'title':section['heading']}) 
                for section in read_document_sections(legislation_path, n) ]


def counts(texts):
  '''Create some basic statistics on the corpus'''

  if len(texts) == 0:
    print("No texts")
    return

  charCounts = [len(text.page_content) for text in texts]
  wordCounts = [len(text.page_content.split()) for text in texts]
  print(f"There are {len(texts)} chunks\nAverage character count {sum(charCounts)/len(charCounts):.0f}\nAverage word count {sum(wordCounts)/len(wordCounts):.0f}")


In [21]:
chunk_H3 = makeDocs(3)[1:] #Skip the TOC
counts(chunk_H3)

There are 10 chunks
Average character count 5137
Average word count 847


In [6]:
chunk_H5 = makeDocs(5)[1:]#Skip the TOC
counts(chunk_H5)

There are 64 chunks
Average character count 801
Average word count 132


# Create the Pinecone database
Initialise the pinecode instance base on the API keys in .env. 

Depending on the user input, use the existing index or create a new one from the documents. Create a "similarity" document retriever based on the database.

In [8]:
from langchain.vectorstores import Pinecone
import pinecone 
import os

def create_namespace(namespace, documents, embeddings):
    
    INDEX = os.environ.get('INDEX')
    
    if INDEX not in pinecone.list_indexes():
        print(f"Creating new index {INDEX}")
        pinecone.create_index(INDEX, dimension=1536)
    
    pinecone.Index(INDEX).delete(namespace=namespace, deleteAll=True)
    
    return Pinecone.from_documents(documents, embeddings, index_name=INDEX, namespace=namespace)

  from tqdm.autonotebook import tqdm


In [12]:

from langchain.embeddings.openai import OpenAIEmbeddings

pinecone.init(
    api_key= os.environ.get('PINECONE_API_KEY') ,  # find at app.pinecone.io
    environment=os.environ.get('PINECONE_ENV')     # next to api key in console
)


embeddings = OpenAIEmbeddings()

In [22]:
db3 = create_namespace("H3", chunk_H3[1:], embeddings)

In [15]:
db5 = create_namespace("H5", chunk_H5[1:], embeddings)

# Create and test the Alice

Define a Q&A chain that 'stuffs' the retrieved chunks into the prompt to provide context. Using OpenAI deterministic (temperature=0) model `gpt-3.5-turbo`.  According to OpenAI 'gpt-3.5-turbo' is the 
> Most capable GPT-3.5 model and optimized for chat at 1/10th the cost of text-davinci-003.

In [16]:
from langchain.llms import OpenAIChat
from langchain.chains import RetrievalQA

qa = RetrievalQA.from_chain_type(
                    llm=OpenAIChat(temperature=0.0), #, uses 'gpt-3.5-turbo' which is cheaper and better 
                    chain_type="stuff", 
                    retriever=db5.as_retriever(search_type="similarity", search_kwargs={"k":10}), 
                    return_source_documents=True)



In [17]:
from IPython.display import display, Markdown

import textwrap

def wrap_text_preserve_newlines(text, width=110):

    lines = text.split('\n')
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
    wrapped_text = '\n'.join(wrapped_lines)
    return wrapped_text

def process_llm_response(llm_response, sources=True, content=False):
    display(Markdown(wrap_text_preserve_newlines(llm_response['result'])))
    if sources:
      display(Markdown('\n\nSources:'))
      for source in llm_response["source_documents"]:
        display(Markdown(f"{source.metadata.get('title')}"))
        if content:
          display(Markdown(f'{wrap_text_preserve_newlines(source.page_content)}'))

def Alice(query, sources=True, content=False):
  
  instructions = '''You are an expert in Western Australia "Motor Vehicle (Third Party Insurance) Act" 
                    answering questions from a citizen. Only use information provided to you from the 
                    legislation below. If you do not know say "I do not know"'''
  result = qa({"query": f'{instructions} \n\n {query}'})
  process_llm_response(result, sources=sources, content=content)
  return (result)

In [18]:
result = Alice("What does the legisltation cover", sources=False, content=False)

The Motor Vehicle (Third Party Insurance) Act 1943 covers motor vehicle injuries resulting from a motor
vehicle accident that occurs in Western Australia on or after the day on which the relevant section comes into
operation, with certain exceptions such as injuries resulting from a motor sports event or a terrorist act. It
also establishes requirements for compulsory third party insurance for motor vehicles and outlines the powers
and responsibilities of the Insurance Commission of Western Australia in relation to motor vehicle accidents
and injuries.

In [19]:
result = Alice("When was it enacted", sources=False, content=False)

The information provided does not include the date when the Motor Vehicle (Third Party Insurance) Act was
enacted.

In [20]:
result = Alice("What am I covered for?", sources=False, content=False)

The Motor Vehicle (Third Party Insurance) Act 1943 applies to motor vehicle injuries resulting from a motor
vehicle accident that occurs in Western Australia on or after the day on which the relevant section comes into
operation. However, there are exceptions to this coverage, such as if the owner or driver of a motor vehicle
has incurred liability for negligence in respect of the injury, if the motor vehicle accident occurs on
private land and no motor vehicle involved has insurance coverage, or if the injury results from a motor
vehicle taking part in a motor sports event or a terrorist act. For more specific information on coverage, it
is recommended to consult the Act or seek legal advice.

# Experimental 

This is not yet working

In [None]:

from langchain.chains.question_answering import load_qa_chain
from langchain.chains import AnalyzeDocumentChain
from langchain.chat_models import ChatOpenAI


qa_chain = load_qa_chain(llm=ChatOpenAI(temperature=0.0), #, uses 'gpt-3.5-turbo' which is cheaper and better 
                         chain_type="map_reduce")

qa_document_chain = AnalyzeDocumentChain(combine_docs_chain=qa_chain)
doc =  docx.Document(legislation_path)
text_doc = "\n\n".join([para.text for para in doc.paragraphs])
qa_document_chain.run(input_document=text_doc, question="What is the purpose of this legislation")