In [9]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

import pymupdf
from time import sleep
import re
import json
import uuid


from llama_cpp import Llama
from openai import OpenAI
from langchain.embeddings.base import Embeddings
from langchain_experimental.text_splitter import SemanticChunker
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

from optimization.src.optimizer import ChunkOptimizer
from Evaluation.src.prompts import *
from Evaluation.src.data_generator import DataGenerator

In [10]:
# Location of the all-MiniLM-L6-v2 GGUF embedding model on the local machine.
# NOTE(review): this is an absolute, user-specific path — the notebook only runs
# where this exact file exists.
embed_model_path = 'C:/Users/shour/.cache/lm-studio/models/second-state/All-MiniLM-L6-v2-Embedding-GGUF/all-MiniLM-L6-v2-Q4_0.gguf'

# Load the model through llama-cpp in embedding mode, with verbose output off.
embed_model = Llama(
    model_path=embed_model_path,
    embedding=True,
    verbose=False,
)

In [11]:
# Cleaned text of each source PDF, keyed by a short document name.
documents = {}

# Raw string: the plain literal contained the invalid escape sequence "\I",
# which makes Python emit an invalid-escape warning. The path value is unchanged.
insurance_act = r'insurance-information\Insurance Act,1938 - incorporating all amendments till 20212021-08-12.pdf'

# Open the PDF as a context manager so the document is closed as soon as the
# text has been extracted.
complete_text = ""
with pymupdf.open(insurance_act) as doc:
    # Iteration starts at page index 12; earlier pages are skipped.
    for page in doc.pages(12):
        text = page.get_text()
        text = text[2:]  # drop the first two characters of the page text
        # Split on the '\n \n \n1' marker and discard the final piece
        # (presumably the page's footnote block — TODO confirm against the PDF).
        paras = text.split('\n \n \n1')[:-1]
        complete_text += "".join(paras)

# Remove a digit followed by one to three asterisks.
text = re.sub(r'\d\*{1,3}', '', complete_text)
# Remove any run of digits that immediately follows a newline (newline included).
text = re.sub(r'\n\d+', '', text)
# Remove parenthesised numbers such as "(1)".
text = re.sub(r'\(\d+\)', '', text)
# Literal clean-ups, applied in this exact order: stray asterisks, blank-ish
# lines, doubled spaces, square brackets, trailing spaces, and doubled full stops.
text = (
    text.replace('*', '')
    .replace('\n \n', '\n')
    .replace('  ', ' ')
    .replace('  ', ' ')
    .replace('[', '')
    .replace(']', '')
    .replace(' \n', '\n')
    .replace(' .', '.')
    .replace('..', '.')
)
text = text.strip()

documents['insurance_act'] = text

  insurance_act = 'insurance-information\Insurance Act,1938 - incorporating all amendments till 20212021-08-12.pdf'


In [12]:
# Raw string: the plain literal contained the invalid escape sequence "\D",
# which makes Python emit an invalid-escape warning. The path value is unchanged.
policyholder_file = r'insurance-information\Draft IRDAI(Protection of Policyholders’ Interests and Allied Matters of Insurers) Regulations, 2024.pdf'

# Open the PDF as a context manager so the document is closed once the text
# has been read. Iteration starts at page index 2; earlier pages are skipped.
with pymupdf.open(policyholder_file) as doc:
    complete_text = "".join(page.get_text() for page in doc.pages(2))
# Drop the first 235 characters of the concatenated text
# (presumably a leading header/title section — TODO confirm against the PDF).
complete_text = complete_text[235:]

# Remove "<n> | P a g e" style page footers (letters may be space-separated).
text = re.sub(r'\d+\s*\|\s*P\s*a\s*g\s*e', '', complete_text)
# Remove parenthesised numbers such as "(1)".
text = re.sub(r'\(\d+\)', '', text)
# Remove any run of digits that immediately follows a newline (newline included).
text = re.sub(r'\n\d+', '', text)
# Literal clean-ups, applied in this exact order.
text = (
    text.replace('  ', '')
    .replace('\n \n', '\n')
    .replace('\n\n', '\n')
    .replace('. \n', '')
    .replace('*','')
    .replace('__', '_')
)

# Stored in the shared `documents` dict created in the Insurance Act cell.
documents['policyholder_file'] = text

  policyholder_file = 'insurance-information\Draft IRDAI(Protection of Policyholders’ Interests and Allied Matters of Insurers) Regulations, 2024.pdf'


In [13]:
# Raw string: the plain literal contained the invalid escape sequence "\L",
# which makes Python emit an invalid-escape warning. The path value is unchanged.
handbook_path = r'insurance-information\Life Insurance Handbook (English).pdf'

# Open the PDF as a context manager so the document is closed once the text
# has been read. Iteration starts at page index 2; earlier pages are skipped.
with pymupdf.open(handbook_path) as doc:
    complete_text = "".join(page.get_text() for page in doc.pages(2))

# Remove any run of digits that immediately follows a newline (newline included).
text = re.sub(r'\n\d+', '', complete_text)
# Literal clean-ups, applied in this exact order; the last one strips bullet glyphs.
text = (
    text.replace('  ', '')
    .replace('\n \n', '\n')
    .replace('\n\n', '\n')
    .replace('. \n', '')
    .replace('•', '')
)

# Stored in the shared `documents` dict created in the Insurance Act cell.
documents['handbook_path'] = text

  handbook_path = 'insurance-information\Life Insurance Handbook (English).pdf'


In [14]:
class MyLocalEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by the notebook-level ``embed_model``."""

    def embed_documents(self, texts):
        """Return one embedding (a NumPy array) per entry of ``texts``, in order."""
        vectors = []
        for text in texts:
            response = embed_model.create_embedding(text)
            vectors.append(np.array(response['data'][0]['embedding']))
        return vectors

    def embed_query(self, text):
        """Return the embedding of a single ``text`` as a NumPy array."""
        response = embed_model.create_embedding(text)
        return np.array(response['data'][0]['embedding'])

# Adapter instance that lets LangChain embed text with the local model.
embeddings = MyLocalEmbeddings()
# Semantic splitter driven by that adapter.
semantic_splitter = SemanticChunker(embeddings)
# Project-local chunk optimizer; it receives the same GGUF path as embed_model
# (presumably it loads its own copy of the model — TODO confirm in ChunkOptimizer).
optimizer = ChunkOptimizer(embed_model_path=embed_model_path)

In [15]:
def find_scores(split_embeddings):
    """Score each split against the split that follows it.

    Args:
        split_embeddings: dict keyed by consecutive integers starting at 0.
            Each value is a dict with an 'Embedding' entry in the form
            ``cosine_similarity`` accepts (the notebook stores single-row
            2-D arrays). Every value gains a 'Score' entry as a side effect.

    Returns:
        A list with one score per split, in key order. Each score is the
        cosine similarity between a split and its successor; the last split
        has no successor and is given a score of 0.
    """
    split_count = len(split_embeddings)
    similarity_scores = []
    for position in range(split_count):
        if position + 1 < split_count:
            current_vec = split_embeddings[position]['Embedding']
            following_vec = split_embeddings[position + 1]['Embedding']
            # cosine_similarity returns a matrix; [0][0] is the single pairwise value.
            score = cosine_similarity(current_vec, following_vec)[0][0]
        else:
            # Final split: nothing to compare against.
            score = 0
        similarity_scores.append(score)
        split_embeddings[position]['Score'] = score
    return similarity_scores

In [16]:
# Chunk every cleaned document: semantic split -> drop short pieces -> score
# neighbouring pieces -> hand the pieces to the chunk optimizer.
doc_splits = {}
for doc, doc_text in documents.items():
    raw_splits = semantic_splitter.split_text(doc_text)
    # Keep only pieces longer than 100 characters.
    semantic_splits = [piece for piece in raw_splits if len(piece) > 100]
    print(len(semantic_splits))

    # Embed each remaining piece; the vector is stored as a single-row 2-D array.
    split_embeddings = {}
    for idx, piece in enumerate(semantic_splits):
        vector = np.array(embed_model.create_embedding(piece)['data'][0]['embedding'])
        split_embeddings[idx] = {'Text': piece, 'Embedding': vector.reshape(1, -1)}

    # The 90th percentile of the neighbour-similarity scores is passed to the optimizer.
    semantic_scores = find_scores(split_embeddings)
    pct = np.percentile(semantic_scores, 90)
    # NOTE(review): 1200 and 200 are presumably chunk-size bounds — confirm against ChunkOptimizer.
    optimized_chunks = optimizer.optimize_chunks(semantic_splits, 1200, 200, pct)
    doc_splits[doc] = optimized_chunks

22
After 1 iterations, the number of splits are : 92. The highest similarity score is : 0.7347771055203827
8
After 1 iterations, the number of splits are : 23. The highest similarity score is : 0.7523336281335997
9
After 1 iterations, the number of splits are : 22. The highest similarity score is : 0.7420542612206414


In [17]:
# Sanity check: show which document names now have optimized chunks.
doc_splits.keys()

dict_keys(['insurance_act', 'policyholder_file', 'handbook_path'])

In [18]:
# Local, on-disk Qdrant instance stored under the "insurance-reference" directory.
collection_name = "references"
client = QdrantClient(path="insurance-reference")

# Start from a clean slate: drop the collection if it already exists.
# A ValueError from the client is reported as a missing collection.
try:
    existing = client.get_collection(collection_name)
    if existing:
        client.delete_collection(collection_name=collection_name)
except ValueError:
    print("Collection not found")

# Recreate the collection for 384-dimensional vectors compared by cosine distance.
# NOTE(review): 384 must match the embedding model's output size — confirm if the model changes.
vector_settings = VectorParams(size=384, distance=Distance.COSINE)
client.create_collection(
    collection_name=collection_name,
    vectors_config=vector_settings,
)

Collection not found


True

In [19]:
# Re-check the available document names before inspecting and uploading chunks.
doc_splits.keys()

dict_keys(['insurance_act', 'policyholder_file', 'handbook_path'])

In [20]:
# Peek at the first 'insurance_act' chunk only: show its fields and confirm the
# embedding array converts to a plain Python list.
first_chunk_only = list(doc_splits['insurance_act'].values())[:1]
for chunk in first_chunk_only:
    print(chunk.keys())
    embedding_array = chunk['Embedding']
    print(type(embedding_array))
    flat_list = embedding_array.flatten().tolist()
    print(type(flat_list))
    print(flat_list)

dict_keys(['Text', 'Embedding', 'Score'])
<class 'numpy.ndarray'>
<class 'list'>
[-0.08339032530784607, 0.03319903835654259, -0.12843045592308044, -0.048387423157691956, -0.0861189067363739, 0.08272852748632431, 0.1226252093911171, 0.10190168023109436, -0.05110381171107292, 0.1296033412218094, 0.11121117323637009, -0.010384131222963333, 0.034575507044792175, -0.044297073036432266, 0.0406312420964241, -0.018348803743720055, -0.008194297552108765, 0.00021478976123034954, -0.06266988068819046, 0.21246035397052765, 0.14120078086853027, 0.049494821578264236, -0.05115668475627899, -0.0938510000705719, 0.05027088522911072, -0.12999308109283447, -0.015067113563418388, -0.08437658101320267, -0.06898294389247894, 0.014538431540131569, 0.01794782653450966, 0.020928770303726196, 0.020816704258322716, 0.054151929914951324, 0.03513755276799202, -0.04842439293861389, -0.06459749490022659, -0.01624877378344536, 0.03209373727440834, -0.030868005007505417, -0.016592228785157204, 0.09819316864013672, -0.

In [21]:
# Upload every document's chunks to Qdrant, one upsert call per document.
for key in doc_splits.keys():
    points = [
        PointStruct(
            # Fresh random ID per chunk, so re-running this cell without
            # recreating the collection adds the chunks again.
            id=str(uuid.uuid4()),
            # Flatten the stored NumPy array into a plain list of floats.
            vector=chunk['Embedding'].flatten().tolist(),
            payload={"text": chunk['Text']},
        )
        for chunk in doc_splits[key].values()
    ]

    # Use the shared `collection_name` variable rather than repeating the
    # literal, so the upload always targets the collection created earlier.
    # wait=True is passed so the call waits for the write to be applied.
    operation_info = client.upsert(
        collection_name=collection_name,
        wait=True,
        points=points,
    )