# OpenAI driven search of ICWA Legistation



# Initialisation
Set up libraries and load API keys

In [2]:
import docx

from langchain.llms import OpenAIChat
from langchain.embeddings.openai import OpenAIEmbeddings

from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, MarkdownTextSplitter
from langchain.chains import RetrievalQA

from langchain.vectorstores import Pinecone
import pinecone 
#Ignore TQdm Warning

import os


Now load the .env file to get the API keys in a secure way. The path should be the full path to the .env file. If this work it returns `True`

In [3]:
from dotenv import load_dotenv
load_dotenv() #get API keys
legislation_path = r'documents/Motor Vehicle (Catastrophic Injuries) Act 2016 - [00-f0-00].docx_3FOpenElement.docx'

# Prepare legislation

The legislation "Motor Vehicle (Catastrophic Injuries) Act 2016." is a word document that can be readily manipulated using the `docx` module.

In [4]:

print(set( paragraph.style.name for paragraph in docx.Document(legislation_path).paragraphs if paragraph.style.name.startswith("Heading")))
[":".join([paragraph.style.name, paragraph.text]) for paragraph in docx.Document(legislation_path).paragraphs if paragraph.style.name.startswith(('Heading 2', 'Heading 3'))]

{'Heading 2', 'Heading 3', 'Heading 5'}


['Heading 2:Part 1\xa0—\xa0Preliminary',
 'Heading 2:Part 2\xa0—\xa0Participation in catastrophic injuries support\xa0scheme',
 'Heading 2:Part 3\xa0—\xa0Assessment of treatment, care and support\xa0needs',
 'Heading 2:Part 4\xa0—\xa0Payments under catastrophic injuries support scheme',
 'Heading 2:Part 5\xa0—\xa0Dispute resolution',
 'Heading 2:Part 6\xa0—\xa0Miscellaneous',
 'Heading 2:Part 7\xa0—\xa0Other Acts amended',
 'Heading 3:Division 1\xa0—\xa0Civil Liability Act\xa02002 amended',
 'Heading 3:Division 2\xa0—\xa0Insurance Commission of Western Australia Act\xa01986 amended',
 'Heading 3:Division 3\xa0—\xa0Motor Vehicle (Third Party Insurance) Act\xa01943\xa0amended']


The document uses 3 levels of headings as follows
- Heading 2: The Parts of the legislation.
- Heading 3: Divisions
- Heading 5: These are used as to create sub-headings in the level 2 & 3 headings. 

The cover page and TOC are the first part of document and are discarded. 

In [21]:
def read_document_sections(file_path, n=5):
    '''Break document at headings up to level n (5) and return a plain text 
       document with paragraphs seperated by two newlines (\n\n)'''
    
    doc = docx.Document(file_path)
    sections = []
    current_section = ""

    for paragraph in doc.paragraphs:
        if paragraph.style.name.startswith(tuple(f"Heading {i+1}" for i in range(n))):
            #save old section 
            if len(current_section) > 0:
                sections.append(current_section)
            
            # and start a new section with the heading text
            current_section = paragraph.text
        else:
            # join this paragraph text to prior ones in this section
            current_section = "\n\n".join([current_section, paragraph.text])

    # Add the last section
    if len(current_section) > 0:
        sections.append(current_section)

    #Retrun list of section discarding the coverpage and TOC
    return sections 

document = read_document_sections(legislation_path, 5)

# Print the sections
for section in document[1:]:
    print(section)
    print('---')

Part 1 — Preliminary
---
1.	Short title

		This is the Motor Vehicle (Catastrophic Injuries) Act 2016.
---
2.	Commencement

		This Act comes into operation as follows —

	(a)	sections 1 and 2 — on the day on which this Act receives the Royal Assent;

	(b)	the rest of the Act — on a day fixed by proclamation, and different days may be fixed for different provisions.
---
3.	Terms used

	(1)	In this Act, unless the contrary intention appears — 

	assessed treatment, care and support needs has the meaning given in section 18(2);

	catastrophic injuries support scheme (CISS) means the scheme provided for in this Act for the lifetime care and support of certain people catastrophically injured in motor vehicle accidents;

	catastrophic injury means a motor vehicle injury that satisfies the prescribed criteria for eligibility for participation in the CISS;

	Commission means the body continued as the Insurance Commission of Western Australia under the Insurance Commission of Western Australia 

In [23]:
from typing import Dict, List, Union
from langchain.docstore.document import Document

import re

def counts(texts):
  '''Create some basic statistics on the corpus'''

  if len(texts) == 0:
    print("No texts")
    return

  charCounts = [len(text.page_content) for text in texts]
  wordCounts = [len(text.page_content.split()) for text in texts]
  print(f"There are {len(texts)} chunks\nAverage character count {sum(charCounts)/len(charCounts):.0f}\nAverage word count {sum(wordCounts)/len(wordCounts):.0f}")


In [None]:

chunk_H0 = [Document(page_content=section) for section in document if read_document_sections(legislation_path, 0)]
counts(chunk_H0)

In [24]:

chunk_H1 = [Document(page_content=section) for section in document if read_document_sections(legislation_path, 1)]
counts(chunk_H1)

There are 11 chunks
Average character count 4998
Average word count 821


In [25]:

chunk_H5= [Document(page_content=section) for section in document if read_document_sections(legislation_path, 2)[1:]]
counts(chunk_H5)

There are 65 chunks
Average character count 844
Average word count 139


In [26]:

chunk_H3= [Document(page_content=section) for section in document if read_document_sections(legislation_path, 3)[1:]]
counts(chunk_H3)

There are 65 chunks
Average character count 844
Average word count 139


In [27]:

chunk_H5= [Document(page_content=section) for section in document if read_document_sections(legislation_path, 5)[1:]]
counts(chunk_H5)

There are 65 chunks
Average character count 844
Average word count 139


# Create the Pinecone database
Initialise the pinecode instance base on the API keys in .env. 

Depending on the user input, use the existing index or create a new one from the documents. Create a "similarity" document retriever based on the database.

In [None]:
def create_namespace(namespace, documents, embeddings):
    
    INDEX = os.environ.get('INDEX')
    
    if INDEX not in pinecone.list_indexes():
        print(f"Creating new index {INDEX}")
        pinecone.create_index(INDEX, dimension=1536)
    
    pinecone.Index(INDEX).delete(namespace=namespace, deleteAll=True)
    
    return Pinecone.from_documents(documents, embeddings, index_name=INDEX, namespace=namespace)

In [None]:
pinecone.init(
    api_key= os.environ.get('PINECONE_API_KEY') ,  # find at app.pinecone.io
    environment=os.environ.get('PINECONE_ENV')     # next to api key in console
)


embeddings = OpenAIEmbeddings()
db = create_namespace("markdown-1200", chunks, embeddings)

retriever = db.as_retriever(search_type="similarity", search_kwargs={"k":5})


# Create and test the Gideon Chatbot

Define a Q&A chain that 'stuffs' the retrieved chunks into the prompt to provide context. Using OpenAI deterministic (temperature=0) model `gpt-3.5-turbo`.  According to OpenAI 'gpt-3.5-turbo' is the 
> Most capable GPT-3.5 model and optimized for chat at 1/10th the cost of text-davinci-003.

In [None]:
qa = RetrievalQA.from_chain_type(
                    llm=OpenAIChat(temperature=0.0), #, uses 'gpt-3.5-turbo' which is cheaper and better 
                    chain_type="stuff", 
                    retriever=retriever, 
                    return_source_documents=True)


In [None]:
from IPython.display import display, Markdown

import textwrap

def wrap_text_preserve_newlines(text, width=110):

    lines = text.split('\n')
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
    wrapped_text = '\n'.join(wrapped_lines)
    return wrapped_text

def process_llm_response(llm_response, sources=True, content=False):
    display(Markdown(wrap_text_preserve_newlines(llm_response['result'])))
    if sources:
      display(Markdown('\n\nSources:'))
      for source in llm_response["source_documents"]:
        display(Markdown(f"### {source.metadata.get('title')} [{source.metadata.get('source')}]"))
        if content:
          display(Markdown(f'{wrap_text_preserve_newlines(source.page_content)}'))

def IRBSBot(query, sources=True, content=False):
  
  instructions = '''You are an experienced IBRS industry analyst speaking to an IT professional in this topic. 
                    If you do not know say "I do not know"'''
  result = qa({"query": f'{instructions} \n\n {query}'})
  process_llm_response(result, sources=sources, content=content)
  return (result)

In [None]:
result = IRBSBot('how do I protect against losing personal information (PII)', content=False)

In [None]:
result = IRBSBot('how do I protect against losing personal information (PII). Explain how to do this thinking it through step by step', content=False)