# OpenAI driven search of ICWA Legistation



# Initialisation

Now load the .env file to get the API keys in a secure way. The path should be the full path to the .env file. If this work it returns `True`

In [1]:
legislation_path = r'documents/Motor Vehicle (Catastrophic Injuries) Act 2016 - [00-f0-00].docx_3FOpenElement.docx'

import docx
from dotenv import load_dotenv
load_dotenv() #get API keys

True

# Prepare legislation

The legislation "Motor Vehicle (Catastrophic Injuries) Act 2016." is a word document that can be readily manipulated using the `docx` module.

In [2]:
print(set( paragraph.style.name for paragraph in docx.Document(legislation_path).paragraphs if paragraph.style.name.startswith("Heading")))
[":".join([paragraph.style.name, paragraph.text]) for paragraph in docx.Document(legislation_path).paragraphs if paragraph.style.name.startswith(('Heading 2', 'Heading 3', 'Heading 5'))]

{'Heading 5', 'Heading 3', 'Heading 2'}


['Heading 2:Part 1\xa0—\xa0Preliminary',
 'Heading 5:1.\tShort title',
 'Heading 5:2.\tCommencement',
 'Heading 5:3.\tTerms used',
 'Heading 5:4.\tMotor vehicle accident',
 'Heading 5:5.\tMotor vehicle injury to which Act applies',
 'Heading 5:6.\tTreatment, care and support needs',
 'Heading 5:7.\tAct binds Crown',
 'Heading 2:Part 2\xa0—\xa0Participation in catastrophic injuries support\xa0scheme',
 'Heading 5:8.\tEligibility to participate',
 'Heading 5:9.\tApplication to participate',
 'Heading 5:10.\tAcceptance as participant',
 'Heading 5:11.\tInterim participation',
 'Heading 5:12.\tLifetime participation',
 'Heading 5:13.\tFormer interim participant may apply to become participant',
 'Heading 5:14.\tSuspension of participation',
 'Heading 2:Part 3\xa0—\xa0Assessment of treatment, care and support\xa0needs',
 'Heading 5:15.\tAssessment of participant’s treatment, care and support needs',
 'Heading 5:16.\tRegulations about assessment of treatment, care and support needs',
 'Headi


The document uses 3 levels of headings as follows
- Heading 2: The Parts of the legislation.
- Heading 3: Divisions
- Heading 5: These are used as to create sub-headings in the level 2 & 3 headings. 

The cover page and TOC are the first part of document and are discarded. 

In [3]:
import re

def read_document_sections(file_path, n=5):
    '''Break document at headings up to level n (5) and return a plain text 
       document with paragraphs seperated by two newlines (\n\n)'''
    
    doc = docx.Document(file_path)
    skip_toc=True
    sections = []
    current_section = {'heading': "Document", 'level': 0, 'content': ""}

    for paragraph in doc.paragraphs:

        if paragraph.style.name.startswith(tuple(f"Heading {i+1}" for i in range(n))):
            #save old section 
            if current_section['heading'] or current_section['content']:
                sections.append(current_section)
            
            # and start a new section
            current_section = {'heading' : paragraph.text,
                               'level'   : int(re.search("Heading (\d+)", paragraph.style.name).group(1)),
                               'content' : paragraph.text
                            }
        else:
            # join this paragraph text to prior ones in this section
            current_section['content'] = "\n\n".join([current_section['content'], paragraph.text])

    # Add the last section
    if  current_section['heading'] or current_section['content']:
        sections.append(current_section)

    #Retrun list of section discarding the coverpage and TOC
    return sections 

from langchain.schema import Document
def makeDocs(n):
    '''Break legistation by headings down to level n. This chunks up the 
       document to sizes chatGPT can digest while ensuring the clauses in
        the legislation are kept together '''

    return [Document(page_content = section['content'], metadata = {'title':section['heading']}) 
                for section in read_document_sections(legislation_path, n) ]


def counts(texts):
  '''Create some basic statistics on the corpus'''

  if len(texts) == 0:
    print("No texts")
    return

  charCounts = [len(text.page_content) for text in texts]
  wordCounts = [len(text.page_content.split()) for text in texts]
  print(f"There are {len(texts)} chunks\nAverage character count {sum(charCounts)/len(charCounts):.0f}\nAverage word count {sum(wordCounts)/len(wordCounts):.0f}")


In [4]:
chunk_H5 = makeDocs(5)[1:]#Skip the TOC
counts(chunk_H5)

There are 64 chunks
Average character count 801
Average word count 132


# Create the Pinecone database
Initialise the pinecode instance base on the API keys in .env. 

Depending on the user input, use the existing index or create a new one from the documents. Create a "similarity" document retriever based on the database.

In [5]:
from langchain.vectorstores import Pinecone
import pinecone 
import os

def create_namespace(namespace, documents, embeddings):
    
    INDEX = os.environ.get('INDEX')
    
    if INDEX not in pinecone.list_indexes():
        print(f"Creating new index {INDEX}")
        pinecone.create_index(INDEX, dimension=1536)
    
    pinecone.Index(INDEX).delete(namespace=namespace, deleteAll=True)
    
    return Pinecone.from_documents(documents, embeddings, index_name=INDEX, namespace=namespace)

  from tqdm.autonotebook import tqdm


In [6]:

from langchain.embeddings.openai import OpenAIEmbeddings

pinecone.init(
    api_key= os.environ.get('PINECONE_API_KEY') ,  # find at app.pinecone.io
    environment=os.environ.get('PINECONE_ENV')     # next to api key in console
)


embeddings = OpenAIEmbeddings()

In [7]:
db5 = create_namespace("ICWA5", chunk_H5, embeddings)

# Create and test the Alice

Define a Q&A chain that 'stuffs' the retrieved chunks into the prompt to provide context. Using OpenAI deterministic (temperature=0) model `gpt-3.5-turbo`.  According to OpenAI 'gpt-3.5-turbo' is the 
> Most capable GPT-3.5 model and optimized for chat at 1/10th the cost of text-davinci-003.

In [8]:
import pinecone
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

pinecone.init(
    api_key= os.environ.get('PINECONE_API_KEY') ,  # find at app.pinecone.io
    environment=os.environ.get('PINECONE_ENV')     # next to api key in console
)

db = Pinecone.from_existing_index(index_name=os.environ.get('INDEX'), 
                                   namespace='ICWA5', 
                                   embedding=OpenAIEmbeddings())

In [10]:
topics = db.similarity_search_with_score("I've had a car crash and have an injury", k = 4, namespace='ICWA5')
topics

[(Document(page_content='Part 2\xa0—\xa0Participation in catastrophic injuries support\xa0scheme', metadata={'title': 'Part 2\xa0—\xa0Participation in catastrophic injuries support\xa0scheme'}),
  0.819114208),
 (Document(page_content='8.\tEligibility to participate\n\n\t(1)\tA person is eligible to be a participant in the CISS if\xa0— \n\n\t(a)\tthe person suffers a motor vehicle injury to which this Act applies; and\n\n\t(b)\tthe motor vehicle injury is a catastrophic injury.\n\n\t(2)\tParticipation in the CISS is as an interim participant or as a lifetime participant and, for that purpose, the regulations may establish criteria for eligibility for interim participation in the CISS and criteria for eligibility for lifetime participation in the CISS.\n\n\t(3)\tA person is not eligible to be a participant in the CISS in respect of a motor vehicle injury if the person has been awarded damages, pursuant to a final judgment entered by a court or a binding settlement, in respect of the fut

In [13]:
from langchain.prompts import PromptTemplate
prompt_template = """
You are an expert in Western Australia "Motor Vehicle (Third Party Insurance) Act" answering questions from a citizen. 

Provide a detailed answer using the information from the legislation provided below. List relevant sections of the act. 

Do not make up answers. If you do not know say "I do not know".

{context}

Question: {question}

Answer in plain english"
"""
PROMPT = PromptTemplate(template=prompt_template, 
                        input_variables=["context", "question"])

retriever=db.as_retriever(search_type="similarity", 
                            search_kwargs={"k":4})

qa = RetrievalQA.from_chain_type(
                    llm=ChatOpenAI(temperature=0), # uses 'gpt-3.5-turbo' which is cheaper and better 
                    chain_type="stuff", 
                    retriever=retriever, 
                    chain_type_kwargs={"prompt": PROMPT}, 
                    return_source_documents=True)

In [14]:
from IPython.display import display, Markdown

import textwrap

def wrap_text_preserve_newlines(text, width=110):

    lines = text.split('\n')
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
    wrapped_text = '\n'.join(wrapped_lines)
    return wrapped_text

def process_llm_response(llm_response, sources=True, content=False):
    display(Markdown(wrap_text_preserve_newlines(llm_response['result'])))
    if sources:
      display(Markdown('\n\nSources:'))
      for source in llm_response["source_documents"]:
        display(Markdown(f"{source.metadata.get('title')} ({source.metadata.get('score')})"))
        if content:
          display(Markdown(f'{wrap_text_preserve_newlines(source.page_content)}'))

def Alice(query, sources=True, content=False):
  
  result = qa(query)
  process_llm_response(result, sources=sources, content=content)
  return (result)

In [15]:
result = Alice("What does the legisltation cover", sources=False, content=False)

The legislation covers regulations that can adopt codes or other laws related to motor vehicle insurance in
Western Australia. It also allows for the Governor to make regulations related to matters necessary for giving
effect to the Motor Vehicle (Third Party Insurance) Act, including provisions for applications, information
provision, and penalties for offenses.

In [16]:
result = Alice("When was it enacted", sources=False, content=False)

The Motor Vehicle (Third Party Insurance) Act was amended in 1943. The exact date of enactment is not provided
in the information given.

In [17]:
result = Alice("What am I covered for?", sources=False, content=False)

If you have a policy of insurance that complies with the Motor Vehicle (Third Party Insurance) Act, you are
also covered for the risk of suffering a catastrophic injury that results from a motor vehicle accident
involving the vehicle mentioned in the policy. This coverage only extends to necessary and reasonable expenses
incurred by or on behalf of the injured person in relation to their assessed treatment, care, and support
needs under the Catastrophic Injuries Support Scheme.

# Experimental 

This is not yet working

In [None]:

from langchain.chains.question_answering import load_qa_chain
from langchain.chains import AnalyzeDocumentChain
from langchain.chat_models import ChatOpenAI


qa_chain = load_qa_chain(llm=ChatOpenAI(temperature=0.0), #, uses 'gpt-3.5-turbo' which is cheaper and better 
                         chain_type="map_reduce")

qa_document_chain = AnalyzeDocumentChain(combine_docs_chain=qa_chain)
doc =  docx.Document(legislation_path)
text_doc = "\n\n".join([para.text for para in doc.paragraphs])
qa_document_chain.run(input_document=text_doc, question="What is the purpose of this legislation")