In [65]:
# %pip install -Uq \
# langchain langchain-community langchain-huggingface langchain-chroma \
# pypdf transformers accelerate Xformers InstructorEmbedding \
# sentencepiece bitsandbytes tiktoken chromadb typer semantic_split \
# cryptography \
# sentence-transformers openai

# Document Pre-Processing

Document pre-processing is split into two parts:
- Cleaning trailing/extra whitespace
- Splitting the text into smaller chunks

In [66]:
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

In [67]:
# Load every PDF found directly under `path`; each file is read with PyPDFLoader.
path = "../assets/ncvs_documents/"
loader = DirectoryLoader(
    path=path,
    glob="*.pdf",
    loader_cls=PyPDFLoader,
)
documents = loader.load()

# Display how many document objects were loaded.
len(documents)

626

## Clean trailing or extra whitespace

In [68]:
import re

# Both patterns are raw strings so that "\s" and "\W" reach the regex engine
# as regex escapes instead of being parsed as (invalid) Python string escapes.
_STRAY_SYMBOL = re.compile(r"\s\W\s")   # one non-word character surrounded by whitespace (e.g. a stray dash)
_WHITESPACE_RUN = re.compile(r"\s+")    # any run of whitespace, including newlines

for doc in documents:
    page_text = _STRAY_SYMBOL.sub(" ", doc.page_content)   # drop isolated symbols
    page_text = _WHITESPACE_RUN.sub(" ", page_text)        # collapse whitespace runs to one space
    # Collapsing still leaves a single space at either end of the page; strip it
    # so that leading/trailing whitespace is actually removed.
    doc.page_content = page_text.strip()

## Splitting into Chunks

Two splitters are recommended here: the `RecursiveCharacterTextSplitter` from LangChain and [`semantic-split`](https://github.com/agamm/semantic-split) by Agamm.

> Known issues:  
> `semantic-split` takes plain strings as arguments, meaning we _could_ very well lose the page metadata of the content itself. A workaround is in progress.

In [69]:
# Chunker configuration: chunks of 1000 with an overlap of 250.
# The separator list is passed in the order written below and ends with the
# empty string.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=250,
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",
    ],
)

In [70]:
# Run the splitter over the cleaned documents and report the chunk count.
text = splitter.split_documents(documents)
chunk_total = len(text)
print(f"Recursive: {chunk_total}")

Recursive: 3954


# ChromaDB Collections

Text chunks processed will be passed through an embedding model and saved into
a ChromaDB database (collection).

In [71]:
from os import walk

mypath = "../assets/ncvs_documents/"

# Take only the first entry produced by walk() (the directory itself) and keep
# its file-name list; if walk() yields nothing, fall back to an empty list.
filenames = sorted(next(walk(mypath), (None, None, []))[2])
filenames

['CHAPTER-1_INTRODUCTION_v.4.4_1708919173888_0.pdf',
 'CHAPTER-2_CONSTRUCTION_v._4.4_1708919219897_0.pdf',
 'CHAPTER-3_EQUIPMENT_v.4.4_1708919228097_0.pdf',
 'CHAPTER-4_LIFE_SAVING_APPLIANCES_v.4.4_1708919237619_0.pdf',
 'CHAPTER-5_MACHINERY_AND_ELECTRICAL_v.4.4_1708919243730_0.pdf',
 'CHAPTER-6_LOAD_LINES_v.4.4_1708919269763_0.pdf',
 'CHAPTER-7-TONNAGE_MEASUREMENT_V.4.4_1708919257504_0.pdf',
 'CHAPTER-8_MANNING_v.4.4_1708919279022_0.pdf',
 'CHAPTER-9_MANAGEMENT_OPERATIONAL_v.4.4_1708919289106_0.pdf',
 'SK_Dirjen_Hubla_No._UM.008-9-20-DJPL_-_2012_1708919307382_0.pdf']

In [3]:
import os
import chromadb
from chromadb.utils import embedding_functions
from chromadb import Documents, EmbeddingFunction, Embeddings
from sentence_transformers import SentenceTransformer

# Directory handed to the persistent Chroma client.
# Named DB_DIR rather than `dir` so the builtin dir() stays usable in later cells.
DB_DIR = "db"
client = chromadb.PersistentClient(path=DB_DIR)

class MyEmbeddingFunction(EmbeddingFunction[Documents]):
    """Chroma embedding function backed by a SentenceTransformer model.

    The model is loaded once, when the instance is created, and reused for
    every call.

    Args:
        *args, **kwargs: forwarded unchanged to the base-class initializer.
        model_name: keyword-only; name of the SentenceTransformer model to
            load. Defaults to ``'BAAI/bge-m3'``.
        device: keyword-only; device string passed to SentenceTransformer
            (for example ``'cuda'`` or ``'cpu'``). The default ``None`` leaves
            device selection to SentenceTransformer, so the class can also be
            instantiated on machines without a CUDA GPU.
    """

    def __init__(self, *args, model_name="BAAI/bge-m3", device=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = SentenceTransformer(model_name, device=device)
        # consider using voyage-law-2

    def __call__(self, input: Documents) -> Embeddings:
        """Embed ``input`` and return the result converted with ``.tolist()``."""
        embeddings = self.model.encode(input)
        return embeddings.tolist()

# Instantiate the embedding function once and attach it to the collection,
# which is created if it does not exist yet.
embedding_function = MyEmbeddingFunction()

collection = client.get_or_create_collection(
    name="ncvs-indonesia",
    embedding_function=embedding_function,
)

In [73]:
# If something went wrong, remove the collections
# client.delete_collection(name="ncvs-indonesia")

## Batching and Inserting

Create batches of the chunks and upsert them into the database (add if nonexistent, update if it already exists).


In [4]:
from chromadb.utils.batch_utils import create_batches
# One id per chunk, numbered from 1 and zero-padded to four digits (NCVS0001, ...).
chunk_ids = ["NCVS{n:04}".format(n=position) for position, _chunk in enumerate(text, start=1)]
chunk_documents = [chunk.page_content for chunk in text]
chunk_metadatas = [chunk.metadata for chunk in text]

batches = create_batches(
    api=client,
    ids=chunk_ids,
    documents=chunk_documents,
    metadatas=chunk_metadatas,
)

for batch in batches:
    # Positions used from each batch: 0 -> ids, 2 -> metadatas, 3 -> documents.
    batch_ids, batch_metadatas, batch_documents = batch[0], batch[2], batch[3]
    collection.upsert(
        ids=batch_ids,
        documents=batch_documents,
        metadatas=batch_metadatas,
    )

NameError: name 'text' is not defined

# Generation Model

In [5]:
from openai import OpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import RetrievalQA

# API key is read from the environment; os.getenv returns None when it is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Prompt skeleton with four placeholders: {context}, {history}, {source}, {query}.
# The trailing backslashes join the first three text lines into a single line.
PROMPT_TEMPLATE = """
Use the following context (delimited by <ctx></ctx>), \
chat history (delimited by <hs></hs>) and source \
(delimited by <src></src>) to answer the question:
---------------------
{context}
---------------------
{history}
---------------------
{source}
---------------------
Question: {query}
"""

In [6]:
def generate_prompt_items(query: str, n_results: int = 5):
    """Query the collection and format the hits for use in a prompt.

    Args:
        query: the question text, sent as the only entry of ``query_texts``.
        n_results: value passed as ``n_results`` to ``collection.query``
            (defaults to 5).

    Returns:
        A ``(context, source)`` tuple of strings. ``context`` holds every
        returned document wrapped in ``<ctx></ctx>``; ``source`` holds one
        ``<src>Source: ..., page: ...</src>`` entry per returned metadata
        dict. Entries are separated by a blank line. Both strings are empty
        when the query result has no documents / metadatas.

    Raises:
        KeyError: if a metadata dict lacks the ``"source"`` or ``"page"`` key.
    """
    retrieve = collection.query(
        query_texts=[query],
        n_results=n_results,
    )

    # The result holds lists of lists; treat a missing or None entry as "no hits".
    document_groups = retrieve.get("documents") or []
    metadata_groups = retrieve.get("metadatas") or []

    # Flatten all groups and use one separator throughout, so entries from
    # different groups are never glued together without a gap.
    context = "\n\n".join(
        f"<ctx>{doc}</ctx>"
        for group in document_groups
        for doc in group
    )
    source = "\n\n".join(
        f"<src>Source: {meta['source']}, page: {meta['page']}</src>"
        for group in metadata_groups
        for meta in group
    )
    return context, source
    

In [7]:
def generate_openai_response(query, model="gpt-3.5-turbo"):
    """Answer ``query`` with an OpenAI chat model using retrieved context.

    Args:
        query: the user's question; also used to retrieve context and sources
            through ``generate_prompt_items``.
        model: chat model name passed to the API (defaults to
            ``"gpt-3.5-turbo"``).

    Returns:
        The message content of the first choice in the completion.
    """
    context, source = generate_prompt_items(query)
    # Named openai_client so it is not confused with the Chroma `client` global.
    openai_client = OpenAI(api_key=OPENAI_API_KEY)

    # Built from adjacent literals instead of backslash continuations inside a
    # single literal, so source-code indentation does not end up in the prompt.
    system_prompt = (
        "You will be provided a context (delimited by <ctx></ctx>) and the context source "
        "(delimited by <src></src>). Answer the question only based on the context given. "
        "Include the sources used in the answer you generated after the final paragraph, "
        "formatted with bullets for each different sources and sort it in a ascending manner."
    )

    # NOTE(review): context and sources are sent with the "assistant" role;
    # confirm that this is intended rather than sending them as user content.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "assistant", "content": context},
        {"role": "assistant", "content": source},
        {"role": "user", "content": query},
    ]

    completion = openai_client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    return completion.choices[0].message.content

In [8]:
# Ask about lifeboat categories and print the question next to the generated answer.
question = "How many categories does a lifeboat have?"
answer = generate_openai_response(question)
print("Question: {}\n\nAnswer: {}".format(question, answer))

Add of existing embedding ID: NCVS0001
Add of existing embedding ID: NCVS0002
Add of existing embedding ID: NCVS0003
Add of existing embedding ID: NCVS0004
Add of existing embedding ID: NCVS0005
Add of existing embedding ID: NCVS0006
Add of existing embedding ID: NCVS0007
Add of existing embedding ID: NCVS0008
Add of existing embedding ID: NCVS0009
Add of existing embedding ID: NCVS0010
Add of existing embedding ID: NCVS0011
Add of existing embedding ID: NCVS0012
Add of existing embedding ID: NCVS0013
Add of existing embedding ID: NCVS0014
Add of existing embedding ID: NCVS0015
Add of existing embedding ID: NCVS0016
Add of existing embedding ID: NCVS0017
Add of existing embedding ID: NCVS0018
Add of existing embedding ID: NCVS0019
Add of existing embedding ID: NCVS0020
Add of existing embedding ID: NCVS0021
Add of existing embedding ID: NCVS0022
Add of existing embedding ID: NCVS0023
Add of existing embedding ID: NCVS0024
Add of existing embedding ID: NCVS0025
Add of existing embedding

Question: How many categories does a lifeboat have?

Answer: A lifeboat has two categories: Category A and Category B. Category A lifeboats are required to comply with international conventions, codes, and amendments related to lifeboats, while Category B lifeboats are open boats constructed with rigid sides and designed to have ample stability and sufficient freeboard when loaded with equipment and the specified number of persons according to its capacity. These categories have specific construction and capacity requirements outlined for each type of lifeboat.

Sources:
- CHAPTER-4_LIFE_SAVING_APPLIANCES_v.4.4_1708919237619_0.pdf, page: 7


In [79]:
# Ask about life jacket categories and print the question next to the generated answer.
question = "Can you elaborate to me what are the differences of each life jacket categories?" 
answer = generate_openai_response(question)
print("Question: {}\n\nAnswer: {}".format(question, answer))

Question: Can you elaborate to me what are the differences of each life jacket categories?

Answer: The differences between each life jacket category are outlined in the context provided. Here is a breakdown of the key distinctions:

1. Category A Life Jackets:
   - These life jackets are required to have buoyancy that is not decreased by more than 5 percent after being immersed in fresh water for 24 hours.
   - They should enable the wearer to swim short distances and climb up a lifeboat.
   - Each life jacket must be equipped with a whistle attached by a line and a self-illuminating light when immersed in water.

2. Category B Life Jackets:
   - Category B life jackets must comply with specific requirements, including having a whistle firmly attached to each life jacket.
   - They should be marked with clear instructions for donning and must be suitable for all body weight ranges.

3. Category C Life Jackets:
   - Category C life jackets, excluding those relying on inflation for buoy

In [80]:
# Ask about minimum anchor mass and print the question next to the generated answer.
question = "Explain in detail about a vessel's minimum anchor mass."
answer = generate_openai_response(question)
print("Question: {}\n\nAnswer: {}".format(question, answer))

Question: Explain in detail about a vessel's minimum anchor mass.

Answer: The minimum anchor mass for a vessel is determined based on its measured length and operating profile. For vessels more than 24 meters in length, the minimum mass per anchor is calculated using the equipment number specified in the regulations. On the other hand, for vessels up to 24 meters in length, the minimum mass per anchor is determined by either applying values from specific tables for vessels up to 24 meters or calculating the mass from the equipment number as per the regulations.

The formula used to calculate the mass of a single anchor is MA = MT * fV * fA, where MA is the mass of a single anchor in kilograms, MT is the tabular mass of the anchor from specific tables, fV is the speed factor (1 if maximum speed is less than Vm, 0.75 if maximum speed is Vm or more), and fA varies depending on the type of anchor used (1 for standard anchors, 0.7 for high holding power anchors, and 0.55 for extra high hol