In [1]:
# %pip install -Uq \
# langchain langchain-community langchain-huggingface langchain-chroma \
# pypdf transformers accelerate Xformers InstructorEmbedding \
# sentencepiece bitsandbytes tiktoken chromadb typer semantic_split \
# cryptography

# REMEMBER TO INSTALL TORCH WITH CUDA

# Document Pre-Processing

Document pre-processing is split into two parts: 
- Clean trailing/extra spaces
- Splitting text into smaller chunks

In [2]:
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

In [3]:
path = "../assets/ncvs_documents/"
loader = DirectoryLoader(path=path,
                         glob="*.pdf",
                         loader_cls=PyPDFLoader)
documents = loader.load()
len(documents)

626

## Clean trailing or extra whitespaces.

In [4]:
import re

for i in range(len(documents)):
  cleaned_docs = re.sub("\s+", " ", documents[i].page_content)    # remove trailing spaces
  documents[i].page_content = cleaned_docs

## Splitting into Chunks

We split the text into smaller chunks. For this experiment, we will stick to 512 `chunk_size` and 250 `chunk_overlap` to persist context between retrievals.
This is also because we have to take into account the embedding model's maximum sequence length. 

In [5]:
splitter = RecursiveCharacterTextSplitter(chunk_size=512,
                                          chunk_overlap=250,
                                          separators=["\n\n",
                                                      "\n",
                                                      " ",
                                                      ".",
                                                      ",",
                                                      "\u200b",  # Zero-width space
                                                      "\uff0c",  # Fullwidth comma
                                                      "\u3001",  # Ideographic comma
                                                      "\uff0e",  # Fullwidth full stop
                                                      "\u3002",  # Ideographic full stop
                                                      ""])

In [6]:
text = splitter.split_documents(documents)
print(f"Recursive: {len(text)}")

Recursive: 10410


# ChromaDB Collections

Text chunks processed will be passed through an embedding model and saved into
a ChromaDB database (collection).

we define a custom embedding function to use for embedding the texts before storing it into the database. 

In [7]:
import os
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
from chromadb.utils.batch_utils import create_batches
from sentence_transformers import SentenceTransformer

dir = "db"
client = chromadb.PersistentClient(path=dir)

class MyEmbeddingFunction(EmbeddingFunction[Documents]):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = SentenceTransformer('all-distilroberta-v1', device='cuda')
        # consider using voyage-law-2

    def __call__(self, input: Documents) -> Embeddings:
        # embed the documents
        sentences = input
        embeddings = self.model.encode(sentences)
        return embeddings.tolist()

embedding_function = MyEmbeddingFunction()


retrieve an existing collection from the database if it exsists. if not, we shall create a new collection
and populate it with the documents.

In [8]:
try: 
    collection = client.get_collection(name="ncvs-idn", embedding_function=embedding_function)

except ValueError: 
    collection = client.create_collection(name="ncvs-idn", embedding_function=embedding_function)
    batches = create_batches(api=client, 
                            ids=["NCVS{n:03}".format(n=i) for i in range(1, len(text)+1)],
                            documents=[s.page_content for s in text], 
                            metadatas=[s.metadata for s in text])

    for batch in batches: 
        collection.upsert(
            ids=batch[0], 
            documents=batch[3], 
            metadatas=batch[2])

# Generation Model

In theory, we could use any generative models. But since there are hardware constraints, we opted to use OpenAI's models.

In [10]:
from openai import OpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import RetrievalQA
from pprint import pprint

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PROMPT_TEMPLATE = """
Use the following context (delimited by <ctx></ctx>), \
chat history (delimited by <hs></hs>) and source \
(delimted by <src></src>) to answer the question:
---------------------
{context}
---------------------
{history}
---------------------
{source}
---------------------
Question: {query}
"""
k=7

define a function to format the retrieved documents from the database

In [11]:
def generate_prompt_items(query: str): 
    retrieve = collection.query(
        query_texts=[query], 
        n_results=k
    )

    context = [["<ctx>" + s + "</ctx>" for s in chunk] for chunk in retrieve.get("documents")]
    context = "".join("\n\n".join(chunk) for chunk in context)

    source = [["<src>" + "Source: " + s["source"] + ", page: " + str(s["page"]) + "</src>" for s in chunk] for chunk in retrieve.get("metadatas")]
    source  = "".join("\n\n".join(chunk) for chunk in source)
    return context, source
    

define a function to abstract the process of generating a response from the OpenAI model. 

In [12]:
def generate_openai_response(query): 
    context, source = generate_prompt_items(query)
    client = OpenAI(api_key=OPENAI_API_KEY)

    return client.chat.completions.create(
        model="gpt-3.5-turbo", 
        messages = [
            {
                "role":"system", 
                "content": "You will be provided a context (delimited by <ctx></ctx>) and the context source \
                            (delimited by <src></src>). Answer the question only based on the context given. \
                            Include the sources used in the answer you generated after the final paragraph, \
                            formatted with bullets for each different sources and sort it in a ascending manner."
            },
            {
                "role": "assistant",
                "content": context
            },
            {
                "role": "assistant",
                "content": source
            },
            {
                "role": "user",
                "content": query
            }
        ],
        temperature=0
    ).choices[0].message.content

# Evaluating the retrievals

## Generate a dataset for evaluating the answer given

To evaluate the retrieval quality, we will use `hitrate@k`, `precision@k`, `recall@k`, and mean reciprocal rank (MRR). Creating 
a custom dataset for example queries and expected answer source will be sufficient for this purpose.

In [13]:
import pandas as pd
data = {
    'Question': [
        'What is Gross Tonnage and how do I calculate it?', 
        'Can you explain to me what are the differences of each life jacket categories?', 
        'How can I determine if my crew count is sufficient?', 
        'What kinds of certificates are stated in the NCVS Book?', 
        'Explain to me more about vessel operational area', 
        'Bottom shell plating and how to calculate them', 
        'List the requirements for radio equipments on a vessel', 
        'How does one construct and evaluate a MES?', 
        'Does a life raft have categories? If so how much?', 
    ], 
    'Answer': [
        ['chapter-7'], 
        ['chapter-4'], 
        ['chapter-8'], 
        ['chapter-9'], 
        ['chapter-1'], 
        ['chapter-2'],
        ['chapter-3'],
        ['chapter-4'],
        ['chapter-4']
    ]
}
df = pd.DataFrame(data)

queries = df['Question']
targets = df['Answer']

## Define function to calculate `hitrate@k`

In [14]:
def hitrate_at_k(query, source_targets): 
    retrieval_results = collection.query(
        query_texts=[query], 
        n_results=k)
    
    source = [["<src>" + "Source: " + s["source"] + ", page: " + str(s["page"]) + "</src>" for s in chunk] for chunk in retrieval_results.get("metadatas")]

    count = 0
    for target in source_targets: 
        for retrieved in source[0]: 
            if target in retrieved.lower(): 
                count += 1
                continue
    
    return count/k


## Define function to calculate `recall@k`

In [15]:
def recall_at_k(query, source_targets): 
    retrieval_results = collection.query(
        query_texts=[query], 
        n_results=k)
    
    source = [["<src>" + "Source: " + s["source"] + ", page: " + str(s["page"]) + "</src>" for s in chunk] for chunk in retrieval_results.get("metadatas")]

    count = 0
    for target in source_targets: 
        for retrieved in source[0]: 
            if target in retrieved.lower(): 
                count += 1
                break
    
    return count/len(source_targets)

## Define function to calculate `precision@k`

In [16]:
def precision_at_k(query, source_targets): 
    retrieval_results = collection.query(
        query_texts=[query], 
        n_results=k)
    
    source = [["<src>" + "Source: " + s["source"] + ", page: " + str(s["page"]) + "</src>" for s in chunk] for chunk in retrieval_results.get("metadatas")]

    count = 0
    for target in source_targets: 
        for retrieved in source[0]: 
            if target in retrieved.lower(): 
                count += 1
                continue
    
    return count/len(source[0])

## Define function to calculate mean reciprocal ranks

In [17]:
def mean_reciprocal_rank(query, source_targets): 
    retrieval_results = collection.query(
        query_texts=[query], 
        n_results=k)
    
    source = [["<src>" + "Source: " + s["source"] + ", page: " + str(s["page"]) + "</src>" for s in chunk] for chunk in retrieval_results.get("metadatas")]

    reciprocal_ranks = [1/(i+1) for target in source_targets for i, result in enumerate(source[0]) if target in result.lower()]
    try: 
        return sum(reciprocal_ranks)/len(reciprocal_ranks)
    except ZeroDivisionError: 
        return 0

## Calculate average of each metrics

In [18]:
hit = []
pre = [] 
rec = []
mrr = []

for i, (q, a)in enumerate(zip(queries, targets)): 
    hit.append(hitrate_at_k(q, a))
    pre.append(precision_at_k(q, a))
    rec.append(recall_at_k(q, a))
    mrr.append(mean_reciprocal_rank(q, a))

print(f"Average hitrate over {k} queries: {sum(hit)/len(hit)}")
print(f"Average precision over {k} queries: {sum(pre)/len(pre)}")
print(f"Average recall over {k} queries: {sum(rec)/len(rec)}")
print(f"Average mean reciprocal ranks over {k} queries: {sum(mrr)/len(mrr)}")

Average hitrate over 7 queries: 0.8095238095238095
Average precision over 7 queries: 0.8095238095238095
Average recall over 7 queries: 0.8888888888888888
Average mean reciprocal ranks over 7 queries: 0.32507243638196015


# Testing the output

In [20]:
for q, ans in zip(queries, targets): 
    print("\n===================================\n")
    print("Question: {}\n\nAnswer: {}".format(q, 
                                           generate_openai_response(q)))



Question: What is Gross Tonnage and how do I calculate it?

Answer: Gross Tonnage (GT) is the total volume of a ship's enclosed spaces, both below and above the deck. To calculate Gross Tonnage, you can use the following formula: GT = 0.25 x (V), where V is the total volume of enclosed spaces under the deck (V1) plus the volume of enclosed spaces above the deck (V2). The cubic volume of the spaces under the deck (V1) can be determined using measurements from the vessel's Lines Plan or by multiplying the Length (L), Width (W), and Depth (D). The function of rooms on the vessel is no longer considered in the calculation of Gross Tonnage.

Sources:
- CHAPTER-7-TONNAGE_MEASUREMENT_V.4.4_1708919257504_0.pdf (page 3)


Question: Can you explain to me what are the differences of each life jacket categories?

Answer: The different categories of life jackets have specific requirements and purposes:

1. Category A Life Jacket:
   - Must not burn or melt after being engulfed by fire in 2 second