In [1]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
# Raises KeyError if OPENAI_API_KEY is still missing from the environment after the .env load.
openai.api_key = os.environ['OPENAI_API_KEY']

In [2]:
# account for deprecation of LLM model
import datetime

# Cut-over day: only dates strictly after it select the generic model alias
target_date = datetime.date(2024, 6, 12)

# Today's date according to the local clock
current_date = datetime.datetime.now().date()

# Choose the model name in a single conditional expression
llm_model = "gpt-3.5-turbo" if current_date > target_date else "gpt-3.5-turbo-0301"

In [3]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [4]:
# Relative directory name, presumably meant for a persisted Chroma store.
# NOTE(review): no other cell visible in this notebook reads this variable —
# confirm whether the vector store was supposed to be created with it.
persist_directory = 'docs/chroma'

In [5]:
# PDF files to ingest, given as names relative to the working directory.
# NOTE(review): the lecture file names differ in letter case
# ('machinelearning-lecture01' vs 'MachineLearning-Lecture02') — verify they
# match the files on disk if the file system is case-sensitive.
document0 = 'ISO9001.pdf'
document1 = 'machinelearning-lecture01.pdf'
document2 = 'MachineLearning-Lecture02.pdf'
document3 = 'MachineLearning-Lecture03.pdf'

In [6]:
# Files handed to the loading loop, in ingestion order.
documents = [document0, document1, document2, document3]

In [7]:
# Splitter settings, embedding client and vector store are created ONCE, before
# the loop. Previously they were rebuilt for every PDF, which rebound `db` on
# each iteration (so `db` was only guaranteed to reference the store created for
# the last file) and left `db` undefined when `documents` was empty.
chunk_size = 1000
chunk_overlap = 150
# Raw string: same characters as before, but without the invalid `\.` escape
# that newer Python versions warn about.
separators = ['\n\n', '\n', r'(?<=\.)', ' ', '']
rc_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=separators
)
embedding = OpenAIEmbeddings()
db = Chroma(embedding_function=embedding)

# Load each PDF, cut it into overlapping chunks and add them to the shared store.
for document in documents:
    loader = PyPDFLoader(document)
    docs = loader.load()
    splits = rc_text_splitter.split_documents(docs)
    db.add_documents(splits)

In [8]:
# Show what the vector store currently holds (the recorded output starts with an 'ids' list).
# NOTE(review): this prints the complete store contents into the cell output.
db.get()

{'ids': ['059e693a-ac90-11ee-ad63-40c2ba0a25ec',
  '059e693b-ac90-11ee-9012-40c2ba0a25ec',
  '059e693c-ac90-11ee-86bd-40c2ba0a25ec',
  '059e693d-ac90-11ee-9521-40c2ba0a25ec',
  '059e693e-ac90-11ee-a54c-40c2ba0a25ec',
  '059e693f-ac90-11ee-9f68-40c2ba0a25ec',
  '059e6940-ac90-11ee-bb20-40c2ba0a25ec',
  '059e6941-ac90-11ee-9d66-40c2ba0a25ec',
  '059e6942-ac90-11ee-8106-40c2ba0a25ec',
  '059e6943-ac90-11ee-bced-40c2ba0a25ec',
  '059e6944-ac90-11ee-b372-40c2ba0a25ec',
  '059e6945-ac90-11ee-ac02-40c2ba0a25ec',
  '059e6946-ac90-11ee-afae-40c2ba0a25ec',
  '059e6947-ac90-11ee-8bca-40c2ba0a25ec',
  '059e6948-ac90-11ee-9c11-40c2ba0a25ec',
  '059e6949-ac90-11ee-90a6-40c2ba0a25ec',
  '059e694a-ac90-11ee-a643-40c2ba0a25ec',
  '059e694b-ac90-11ee-a287-40c2ba0a25ec',
  '059e694c-ac90-11ee-9a6f-40c2ba0a25ec',
  '059e694d-ac90-11ee-b334-40c2ba0a25ec',
  '059e694e-ac90-11ee-a773-40c2ba0a25ec',
  '059e694f-ac90-11ee-8049-40c2ba0a25ec',
  '059e6950-ac90-11ee-971b-40c2ba0a25ec',
  '059e6951-ac90-11ee-b3ba-

PDF preprocessing

In [9]:
import fitz  # PyMuPDF
import regex as re
import tiktoken

In [10]:
# PDF that is opened with fitz in the outline-based text extraction.
pdf_path = 'ISO9001.pdf'

In [11]:
def correct_text(curr_title, previous_bold_text, bolded_string):
    """Remove a stray section number from the end of the current section's text.

    The extraction loop appends every bold span that is not recognised as a
    title to the running text of `curr_title`, followed by one space. When a
    heading is split into two bold spans (a bare number such as "4.1", then the
    heading words), the number therefore ends up at the tail of the previous
    section. This function cuts that tail off again.

    Args:
        curr_title: key in the global `titles_with_text` whose last fragment
            may carry the stray number.
        previous_bold_text: text of the bold span seen just before
            `bolded_string`.
        bolded_string: text of the bold span that was recognised as a title.

    Returns:
        None. `titles_with_text[curr_title][-1]` is modified in place.
    """
    # Raw string: identical regex, without invalid-escape warnings.
    pattern = r'(\d+(\.\d+)*)'
    matches_previous = re.findall(pattern, previous_bold_text)
    matches_now = re.findall(pattern, bolded_string)

    # Only act when the previous span is nothing but a number and the current
    # span carries no number of its own.
    previous_is_bare_number = (
        len(matches_previous) != 0
        and matches_previous[0][0] == previous_bold_text.strip()
    )
    if not previous_is_bare_number or len(matches_now) != 0:
        return

    # No fragments recorded for this title (e.g. no title seen yet, curr_title
    # is ''): nothing was appended, so there is nothing to remove.
    fragments = titles_with_text.get(curr_title)
    if not fragments:
        return

    # The loop appended exactly `previous_bold_text + ' '`; only strip when the
    # fragment really ends with it, so unrelated text is never truncated.
    appended = previous_bold_text + ' '
    if fragments[-1].endswith(appended):
        fragments[-1] = fragments[-1][:-len(appended)]

In [12]:
def find_title(titles_on_page, bolded_string, curr_title, previous_bold_text):
    """Map a bold text span to one of the outline titles expected on the page.

    The span is split on section numbers (e.g. "4.1") and whatever follows the
    last number is compared with each outline title. Outline titles of the form
    "<number>\\t<title>" are reduced to the part after the tab first.

    Two strings are handled explicitly before the outline lookup: the Annex A
    heading is returned as-is, and the second half of the Annex B heading is
    returned glued to the preceding bold span.

    Args:
        titles_on_page: outline titles that start on the current page.
        bolded_string: text of the bold span under inspection.
        curr_title: title currently being filled; forwarded to `correct_text`.
        previous_bold_text: text of the bold span seen just before this one.

    Returns:
        The matched title string, or None when the span is not a title.
    """
    # Raw string: identical regex, without invalid-escape warnings.
    pattern = r'(\d+(\.\d+)*)'
    bolded_title = re.split(pattern, bolded_string)[-1].strip()
    stripped = bolded_string.strip()

    # Debug trace; kept so the recorded cell output stays the same.
    print('---', bolded_title, '---')
    if stripped == 'Clarification of new structure, terminology and concepts':
        return stripped
    if stripped == 'management systems developed by ISO/TC 176':
        print('ovde')
        # The heading is spread over two bold spans; join it with the first half.
        return previous_bold_text + stripped

    for outline_title in titles_on_page:
        parts = outline_title.split('\t')
        # Only the tab-separated form is stripped; a plain title is compared as-is.
        candidate = parts[1].strip() if len(parts) > 1 else outline_title
        if candidate == bolded_title:
            # Undo a section number that was appended to the previous section.
            correct_text(curr_title, previous_bold_text, bolded_string)
            return candidate

    return None

In [13]:
# Accumulator filled by the loop below: maps a section title to a list of text
# fragments. A new (initially empty) fragment is started each time the title is
# recognised again.
titles_with_text = {}

# Spans whose stripped text equals one of these strings are skipped entirely.
margin_texts = [
    'ISO 9001:2015(E)',
    'Normen-Download-Beuth-DQS GmbH Deutsche Gesellschaft zur Zertifizierung von-KdNr.1838919-LfNr.7191817001-2015-09-23 10:48',
    '© ISO 2015 – All rights reserved'
]

# NOTE(review): this module-level pattern is not read anywhere else in the
# visible notebook (the helper functions define their own local `pattern`).
pattern = '(\d+(\.\d+)+)'

with fitz.open(pdf_path) as pdf_document:
    # Each outline entry is indexed as [?, title, page, ...]: index 1 is used as
    # the title and index 2 (minus one) as a zero-based page index.
    outlines = pdf_document.get_toc()
    # Raises IndexError if the PDF has no outline entries.
    first_title_page = outlines[0][2]-1
    last_title_page = outlines[-1][2]-1
    # Title currently being filled; '' means no title has been seen yet, in
    # which case text is dropped (see the final `if` of the loop body).
    curr_title = ''
    previous_bold_text = ''
   
    for page_number in range(first_title_page, last_title_page+1):
        # Outline titles whose page entry points at this page.
        titles_on_page = [outline[1] for outline in outlines if outline[2]-1 == page_number]

        # read page text as a dictionary, suppressing extra spaces in CJK fonts
        page = pdf_document.load_page(page_number)
        blocks = page.get_text("dict", flags=11)["blocks"]
        for b in blocks:  # iterate through the text blocks
            for l in b["lines"]:  # iterate through the text lines
                for s in l["spans"]:  # iterate through the text spans
                    # Replace the Unicode replacement character with a space.
                    s['text'] = s['text'].replace('�', ' ')
                    
                    if s['text'].strip() in margin_texts:
                        continue

                    # Flag: 1 once this span has been consumed as a title.
                    ind = 0
                    if 'Bold' in s['font']:
                        title = find_title(titles_on_page, s['text'], curr_title, previous_bold_text)
                        previous_bold_text = s['text']
                        
                        if title is not None:    
                            curr_title = title

                            # Start a fresh fragment for this title.
                            if curr_title in titles_with_text.keys():
                                titles_with_text[curr_title].append('')
                            else:
                                titles_with_text[curr_title] = ['']
                            ind = 1
                            # Leaves only the innermost loop: the remaining
                            # spans of this line are skipped.
                            break
                        
                    # Reached for normal spans and for bold spans that were not
                    # recognised as a title: append the text plus one space.
                    if curr_title != '' and ind == 0:
                        titles_with_text[curr_title][-1] += s['text']
                        titles_with_text[curr_title][-1] += ' ' 
#titles_with_text

---  ---
--- Foreword ---
---  ---
---  ---
---  ---
--- Introduction ---
--- General ---
---  ---
---  ---
---  ---
---  ---
--- Quality management principles ---
--- Process approach ---
--- General ---
---  ---
---  ---
---  ---
--- — Schematic representation of the elements of a single process ---
--- Plan-Do-Check-Act cycle ---
--- — Representation of the structure of this International Standard in the PDCA cycle ---
---  ---
---  ---
---  ---
---  ---
--- Plan ---
--- Do ---
--- Check ---
--- Act ---
--- Risk-based thinking ---
--- Relationship with other management system standards ---
---  ---
---  ---
---  ---
---  ---
---  ---
---  ---
---  ---
--- Quality management systems — Requirements ---
--- Scope ---
--- Normative references ---
--- Terms and definitions ---
--- Context of the organization ---
--- Understanding the organization and its context ---
--- INTERNATIONAL STANDARD ---
---  ---
---  ---
---  ---
--- Understanding the needs and expectations of interested partie

Retrieving

In [14]:
def split(txt, seps):
    """Split `txt` on any of the separators in `seps` and strip each piece.

    Every separator after the first is rewritten to the first one, then the
    text is split on that single separator.
    """
    primary = seps[0]
    normalized = txt
    # seps[0] is the target of the rewrite, so start from the second entry.
    for alternate in seps[1:]:
        normalized = normalized.replace(alternate, primary)
    pieces = normalized.split(primary)
    return list(map(str.strip, pieces))

In [15]:
def add_texts_to_titles(titles, texts, insert_pos=-1):
    """Store each text under its title in the global `titles_with_text`.

    Titles and texts are paired positionally. A title that is not yet present
    gets a new one-element list. For an existing title the text is appended
    when `insert_pos` is -1; otherwise it is placed at `insert_pos`, and
    `insert_pos` is then advanced by one for the titles that follow.
    """
    for heading, body in zip(titles, texts):
        if heading not in titles_with_text.keys():
            # First occurrence of this heading: start its fragment list.
            titles_with_text[heading] = ['']
            slot = -1
        elif insert_pos == -1:
            titles_with_text[heading].append('')
            slot = -1
        else:
            titles_with_text[heading].insert(insert_pos, '')
            slot = insert_pos
            insert_pos += 1
        # Fill the empty fragment that was just created.
        titles_with_text[heading][slot] += body

In [16]:
# Sub-headings of the Introduction, written the way they occur in the extracted text.
splits = [
    '0.1   General',
    '0.2   Quality management principles',
    '0.3   Process approach',
    '0.3.1   General',
    '0.3.2   Plan-Do-Check-Act cycle',
    '0.3.3   Risk-based thinking',
    '0.4   Relationship with other management system standards',
]
# Heading words without the leading clause number.
titles = [heading.split('   ')[1] for heading in splits]

# Cut the Introduction's first fragment at every sub-heading.
texts = split(titles_with_text['Introduction'][0], splits)
# Pieces after the first belong to the sub-headings, stored starting at position 0.
add_texts_to_titles(titles, texts[1:], 0)
# Whatever precedes the first sub-heading stays with the Introduction itself.
titles_with_text['Introduction'][0] = texts[0]

# Drop everything from the document title onwards in the last sub-section.
relationship_key = 'Relationship with other management system standards'
titles_with_text[relationship_key][0] = (
    titles_with_text[relationship_key][0]
    .partition('Quality management systems — Requirements')[0]
)

In [17]:
# Sub-headings of Annex A, written the way they occur in the extracted text.
splits = [
    'A.1 Structure and terminology',
    'A.2 Products and services',
    'A.3 Understanding the needs and expectations of interested parties',
    'A.4 Risk-based thinking',
    'A.5 Applicability',
    'A.6 Documented information',
    'A.7 Organizational knowledge',
    'A.8 Control of externally provided processes, products and services',
]
# Heading words without the leading "A.n" label.
titles = [heading.partition(' ')[2] for heading in splits]

annex_a_key = 'Clarification of new structure, terminology and concepts'
# Cut the annex's first fragment at every sub-heading.
texts = split(titles_with_text[annex_a_key][0], splits)
# Pieces after the first are appended to the fragments of their headings.
add_texts_to_titles(titles, texts[1:], -1)
# Whatever precedes A.1 stays with the annex heading itself.
titles_with_text[annex_a_key][0] = texts[0]

# Keep only the part before 'Annex B'.
# NOTE(review): fragment [1] is trimmed but the result is written to fragment
# [0], so [0] is overwritten and [1] stays untrimmed — confirm this is intended.
control_key = 'Control of externally provided processes, products and services'
titles_with_text[control_key][0] = titles_with_text[control_key][1].partition('Annex B')[0]

In [18]:
# Break the Annex B text into one fragment per list item ('. — ' separates the
# items) and cut the last item off where 'Table B.1' starts.
iso_tc176_key = 'Other International Standards on quality management and quality management systems developed by ISO/TC 176'
iso_tc176_items = titles_with_text[iso_tc176_key][0].split('. — ')
iso_tc176_items[-1] = iso_tc176_items[-1].partition('Table B.1')[0]
titles_with_text[iso_tc176_key] = iso_tc176_items

In [19]:
# Remove the Bibliography entry; the removed value is what the cell displays.
# Raises KeyError if the key is absent (e.g. when this cell is run a second time).
titles_with_text.pop('Bibliography')

['[1]  ISO 9004,  Managing for the sustained success of an organization — A quality management approach [2]  ISO 10001,  Quality management — Customer satisfaction — Guidelines for codes of conduct for  organizations [3]  ISO 10002,  Quality management — Customer satisfaction — Guidelines for complaints handling in  organizations [4]  ISO 10003,  Quality management — Customer satisfaction — Guidelines for dispute resolution  external to organizations [5]  ISO 10004,  Quality management — Customer satisfaction — Guidelines for monitoring and measuring [6]  ISO 10005,  Quality management systems — Guidelines for quality plans [7]  ISO 10006,  Quality management systems — Guidelines for quality management in projects [8]  ISO 10007,  Quality management systems — Guidelines for configuration management [9]  ISO 10008,  Quality management — Customer satisfaction — Guidelines for business-to-consumer  electronic commerce transactions [10]  ISO 10012,  Measurement management systems — Require

In [20]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return how many tokens `string` encodes to under the named tiktoken encoding."""
    return len(tiktoken.get_encoding(encoding_name).encode(string))

# Print the cl100k_base token count of every stored fragment, one line per fragment.
for title, fragments in titles_with_text.items():
    for fragment in fragments:
        print(title, ': ', num_tokens_from_string(fragment, "cl100k_base"))

Foreword :  473
Introduction :  0
Scope :  165
Normative references :  73
Terms and definitions :  23
Context of the organization :  0
Understanding the organization and its context :  144
Understanding the needs and expectations of interested parties :  84
Understanding the needs and expectations of interested parties :  164
Determining the scope of the quality management system :  207
Quality management system and its processes :  255
Leadership :  0
Leadership and commitment :  0
General :  460
General :  332
General :  242
General :  57
General :  103
General :  28
General :  174
General :  92
General :  38
General :  99
Customer focus :  78
Policy :  0
Establishing the quality policy :  67
Communicating the quality policy :  42
Organizational roles, responsibilities and authorities :  134
Planning :  0
Actions to address risks and opportunities :  280
Quality objectives and planning to achieve them :  163
Planning of changes :  85
Support :  0
Resources :  0
People :  30
Infrastru

Prompting

In [35]:
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI

In [38]:
# Chat model client; temperature 0.0 and the model name chosen in `llm_model`.
chat = ChatOpenAI(temperature=0.0, model=llm_model)

In [128]:
delimiter = '```'

# The section text is pasted into a string that is later parsed as a prompt
# template. Literal braces in the text would be read as template variables, so
# they are doubled here; formatting the template turns them back into single
# braces. Text without braces is left unchanged.
subsection_text = titles_with_text['Quality management system and its processes'][0]
subsection_text = subsection_text.replace('{', '{{').replace('}', '}}')

# Instruction prompt with the section text embedded between the delimiters.
template_str = f"""
    You will be company manager helper.
    You will be given subsection of an ISO 9001 document delimited by {delimiter}.
    Your task is to determine steps our company needs to do to satisfy this subsection.
    Also, identify if there are any notes to follow.
    Format your answer as JSON document.
    Subsection: {delimiter}{subsection_text}{delimiter}
"""

In [129]:
# Parse the already-filled prompt text as a chat prompt template.
prompt_template = ChatPromptTemplate.from_template(template_str)

In [130]:
# No arguments are passed: the template text was completed beforehand with an f-string.
request = prompt_template.format_messages()

In [131]:
# Send the messages to the chat model and keep the text of the reply.
summarized_text = (chat(request).content)
print(summarized_text)

{
  "Steps": [
    "Establish a quality management system that meets the requirements of ISO 9001",
    "Determine the processes needed for the quality management system and their application throughout the organization",
    "Determine the inputs required and the outputs expected from these processes",
    "Determine the sequence and interaction of these processes",
    "Determine and apply the criteria and methods needed to ensure the effective operation and control of these processes",
    "Determine the resources needed for these processes and ensure their availability",
    "Assign the responsibilities and authorities for these processes",
    "Address the risks and opportunities as determined in accordance with the requirements of 6.1",
    "Evaluate these processes and implement any changes needed to ensure that these processes achieve their intended results",
    "Improve the processes and the quality management system",
    "Maintain documented information to support the opera

Self-query retriever

In [80]:
# Metadata fields described to the self-query retriever.
# NOTE(review): `contents` is not defined in any cell visible in this notebook;
# on a fresh kernel this cell would raise NameError — confirm where it comes from.
# NOTE(review): no visible cell writes a `section` key into the metadata of the
# chunks stored in `db`; verify that the field exists before filtering on it.
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The document the chunk is from, should be `ISO9001.pdf`",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the document",
        type="integer",
    ),
    AttributeInfo(
        name="section",
        description=f"Name of a section from a document. You should look up for section name in this list: {contents}.",
        type="string",
    )
]

In [81]:
# Build a self-query retriever over `db`, using the content description and the
# metadata field list; verbose output is switched on.
document_content_description = "ISO 9001 certification document"
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    db,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [86]:
# Query text for the retrievers (plain literal; nothing is interpolated)
question = 'Tell me requirements from `Context of the organization` section.'

In [87]:
# Run the question through the self-query retriever; the recorded output of
# this cell is an empty list.
retriever.get_relevant_documents(question)

[]

In [79]:
#for document in docs:
#    print(document.metadata)
#    print(document.page_content)
#    print('------', '\n')

Contextual compression retriever

In [22]:
def pretty_print_docs(docs):
    """Print each document's `page_content` under a numbered header.

    Consecutive documents are separated by a line of 100 dashes.
    """
    divider = "\n" + "-" * 100 + "\n"
    rendered = []
    for position, doc in enumerate(docs, start=1):
        rendered.append("Document " + str(position) + ":\n\n" + doc.page_content)
    print(divider.join(rendered))

In [23]:
# Wrap our vectorstore
llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)

In [24]:
# Combine the compressor with the self-query `retriever` as its base retriever.
# NOTE(review): `retriever` is defined in a cell with a higher execution count
# than this one — confirm the notebook still runs top to bottom on a fresh kernel.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=retriever
)

In [25]:
# Retrieve documents for `question` through the compression retriever and print them.
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)



Document 1:

"intended for, or required by, a customer." "The organization shall determine external and internal issues that are relevant to its purpose and its strategic direction and that affect its ability to achieve the intended result(s) of its quality management system."
----------------------------------------------------------------------------------------------------
Document 2:

"This International Standard specifies requirements for a quality management system when an organization: a) needs to demonstrate its ability to consistently provide products and services that meet customer and applicable statutory and regulatory requirements, and b) aims to enhance customer satisfaction through the effective application of the system, including processes for improvement of the system and the assurance of conformity to customer and applicable statutory and regulatory requirements."
----------------------------------------------------------------------------------------------------
Doc

In [21]:
# IPython magic: persist `titles_with_text` so another kernel can restore it with `%store -r`.
%store titles_with_text

Stored 'titles_with_text' (dict)


In [3]:
import json

In [4]:
# Write the title -> fragments mapping to disk as indented UTF-8 JSON,
# keeping non-ASCII characters unescaped.
with open('titles_with_text_ISO_9001.json', mode='w', encoding='utf-8') as json_file:
    json_file.write(json.dumps(titles_with_text, ensure_ascii=False, indent=4))