In [24]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import json
import pymupdf
from llama_cpp import  Llama
import re
from langchain.embeddings.base import Embeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from optimization.src.optimizer import ChunkOptimizer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import uuid
from openai import OpenAI
import re
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from langchain import HuggingFacePipeline
import nest_asyncio
import torch

In [25]:
embed_model = Llama(model_path='C:/Users/shour/.cache/lm-studio/models/second-state/All-MiniLM-L6-v2-Embedding-GGUF/all-MiniLM-L6-v2-Q4_0.gguf', 
                    embedding=True,
                    verbose=False)

In [26]:
documents = {}

insurance_act = 'insurance-information\Insurance Act,1938 - incorporating all amendments till 20212021-08-12.pdf'

doc = pymupdf.open(insurance_act)

complete_text = ""
for page in doc.pages(12):
    text = page.get_text()
    text = text[2:]
    paras = text.split('\n \n \n1')[:-1]
    for para in paras:
        complete_text += para

text = re.sub(r'\d\*{1,3}', '', complete_text)
text = re.sub(r'\n\d+', '', text)
text = re.sub(r'\(\d+\)', '', text)
text = text.replace('*', '').replace('\n \n', '\n').replace('  ', ' ').replace('  ', ' ').replace('[', '').replace(']', '').replace(' \n', '\n').replace(' .', '.').replace('..', '.')
text = text.strip()
documents['insurance_act'] = text

  insurance_act = 'insurance-information\Insurance Act,1938 - incorporating all amendments till 20212021-08-12.pdf'


In [27]:
policyholder_file = 'insurance-information\Draft IRDAI(Protection of Policyholders’ Interests and Allied Matters of Insurers) Regulations, 2024.pdf'

doc = pymupdf.open(policyholder_file)

complete_text = ""
for page in doc.pages(2):
    text = page.get_text()
    complete_text += text
complete_text = complete_text[235:]

text = re.sub(r'\d+\s*\|\s*P\s*a\s*g\s*e', '', complete_text)
text = re.sub(r'\(\d+\)', '', text)
text = re.sub(r'\n\d+', '', text)
text = text.replace('  ', '').replace('\n \n', '\n').replace('\n\n', '\n').replace('. \n', '').replace('*','').replace('__', '_')
documents['policyholder'] = text

  policyholder_file = 'insurance-information\Draft IRDAI(Protection of Policyholders’ Interests and Allied Matters of Insurers) Regulations, 2024.pdf'


In [28]:
handbook_path = 'insurance-information\Life Insurance Handbook (English).pdf'

doc = pymupdf.open(handbook_path)

complete_text = ""
for page in doc.pages(2):
    text = page.get_text()
    complete_text += text

text = re.sub(r'\n\d+', '', complete_text)
text = text.replace('  ', '').replace('\n \n', '\n').replace('\n\n', '\n').replace('. \n', '').replace('•', '')
documents['handbook'] = text

  handbook_path = 'insurance-information\Life Insurance Handbook (English).pdf'


In [29]:
class MyLocalEmbeddings(Embeddings):
    def embed_documents(self, texts):
        return [np.array(embed_model.create_embedding(text)['data'][0]['embedding']) for text in texts]

    def embed_query(self, text):
        return np.array(embed_model.create_embedding(text)['data'][0]['embedding'])
    
embeddings = MyLocalEmbeddings()
semantic_splitter = SemanticChunker(embeddings)

char_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=2000,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)

token_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=2000,
    chunk_overlap=0,
    is_separator_regex=False
)

recursive_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    model_name="gpt-4",
    chunk_size=2000,
    chunk_overlap=0
)

In [30]:
semantic_splits  = semantic_splitter.split_text(documents['insurance_act'])
char_splits      = char_splitter.split_text(documents['insurance_act'])
token_splits     = token_splitter.split_text(documents['insurance_act'])
recursive_splits = recursive_splitter.split_text(documents['insurance_act'])

In [31]:
splits = [semantic_splits, char_splits, token_splits, recursive_splits]

for split in splits:
    print("Statistics for method are: ")
    split_lens = {len(x): len(embed_model.tokenize(text=x.encode('utf-8'))) for x in split}
    print(f"Number of splits are : {len(split)}")
    print(f"Minimum length of split is : {min(split_lens.keys())}")
    print(f"Maximum length of split is : {max(split_lens.keys())}")
    print(f"The average length of split is : {sum(list(split_lens.keys())) / len(list(split_lens.keys()))}")
    print(f"Minimum length of tokens is : {min(split_lens.values())}")
    print(f"Maximum length of tokens is : {max(split_lens.values())}")
    print(f"The average length of tokens is : {sum(list(split_lens.values())) / len(list(split_lens.keys()))}")

Statistics for method are: 
Number of splits are : 32
Minimum length of split is : 7
Maximum length of split is : 36823
The average length of split is : 4776.689655172414
Minimum length of tokens is : 8
Maximum length of tokens is : 7605
The average length of tokens is : 1021.2068965517242
Statistics for method are: 
Number of splits are : 72
Minimum length of split is : 702
Maximum length of split is : 1999
The average length of split is : 1917.5862068965516
Minimum length of tokens is : 168
Maximum length of tokens is : 586
The average length of tokens is : 412.6034482758621
Statistics for method are: 
Number of splits are : 16
Minimum length of split is : 3802
Maximum length of split is : 9480
The average length of split is : 8645.8
Minimum length of tokens is : 913
Maximum length of tokens is : 1947
The average length of tokens is : 1842.9333333333334
Statistics for method are: 
Number of splits are : 21
Minimum length of split is : 2049
Maximum length of split is : 9645
The averag

In [32]:
def find_scores(split_embeddings):
    for i in range(len(split_embeddings.keys())):
        if i == len(split_embeddings.keys()) - 1:
            score = 0
        else:
            score = cosine_similarity(split_embeddings[i]['Embedding'], split_embeddings[i+1]['Embedding'])[0][0]        
        split_embeddings[i]['Score'] = score
    return split_embeddings

In [33]:
split_methods = ['semantic', 'char', 'token', 'recursive']

split_dict = {'semantic' : {'splits' : semantic_splits},
              'char' : {'splits' : char_splits},
              'token' : {'splits' : token_splits},
              'recursive' : {'splits' : recursive_splits}}

for split_type in split_dict.keys():
    split_dict[split_type]['base_embeddings'] =  {idx : {'Text' : x, 'Embedding' : np.array(embed_model.create_embedding(x)['data'][0]['embedding']).reshape(1, -1), 'can_join' : True} for idx, x in enumerate(split_dict[split_type]['splits'])}
    split_dict[split_type]['base_embeddings'] = find_scores(split_dict[split_type]['base_embeddings'])

In [34]:
def combine_smaller(split_embeddings, min_chunk_size):
    to_remove = []
    for i in split_embeddings.keys():
        if len(split_embeddings[i]['Text']) < min_chunk_size:
            if((split_embeddings[i]['Score'] >= split_embeddings[max(i-1,0)]['Score']) or ((i-1) in to_remove)):
                split_embeddings[i+1]['Text'] += split_embeddings[i]['Text']
                split_embeddings[i+1]['Embedding'] = np.add(split_embeddings[i]['Embedding'], split_embeddings[i+1]['Embedding'])
                to_remove.append(i)
            else:
                split_embeddings[i-1]['Text'] += split_embeddings[i]['Text']
                split_embeddings[i-1]['Embedding'] = np.add(split_embeddings[i]['Embedding'], split_embeddings[i-1]['Embedding'])
                to_remove.append(i)
    split_embeddings = {k: v for k, v in split_embeddings.items() if k not in to_remove}
    return split_embeddings

In [35]:
def find_max(split_embeddings):
    return max(d['Score'] for d in split_embeddings.values())

In [36]:
def find_split(text1, text2='', min_len = 200, max_len=2000, overlap=0):
    text = text1 + text2
    splits = []
    n_splits = len(text) / min_len
    n_splits = np.random.randint(2,n_splits)
    part_size = len(text) // n_splits
    splits = [text[i:i+part_size+overlap] for i in range(0, len(text), part_size)]
        
    embeddings = []
    for split in splits:
        embeddings.append(np.array(embed_model.create_embedding(split)['data'][0]['embedding']).reshape(1, -1))
    scores = []
    for i in range(len(embeddings) - 1):
        score = cosine_similarity(embeddings[i], embeddings[i+1])[0][0]
        scores.append((sum(scores) + score) / (len(scores) + 1))
    try:
        split_index = scores.index(min(scores))
        if split_index != 0:
            text1 = ' '.join(splits[:split_index])
            text2 = ' '.join(splits[split_index:])
        else:
            text1 = ' '.join(splits[:split_index+1])
            text2 = ' '.join(splits[split_index+1:])
    except:
        print(scores)
        print(len(splits))
        print(len(embeddings))
        print(split_index)
    response = []
    if(len(text1) < max_len):
        response.extend([text1])
    elif len(text1) > max_len:
        text1 = find_split(text1=text1, min_len=min_len, max_len=max_len, overlap=overlap)
        response.extend(text1)
    if(len(text2) < max_len):
        response.extend([text2])
    elif len(text2) > max_len:
        text2 = find_split(text1=text2, min_len=min_len, max_len=max_len, overlap=overlap)
        response.extend(text2)
    return list(set(response))

In [37]:
def break_bigger(split_embeddings, min_len, max_len, overlap):
    to_add = []
    to_remove = []
    for i in split_embeddings.keys():
        if len(split_embeddings[i]['Text']) > max_len:
            smaller_chunks = find_split(split_embeddings[i]['Text'], min_len=min_len, max_len=max_len, overlap=overlap)
            to_add.extend(smaller_chunks)
            to_remove.append(i)
    to_add = list(set(to_add))
    for i in range(len(to_add)):
        text = to_add[i]
        if len(text) > max_len:
            raise ValueError("Length of new chunk is greater than maximum length")
        embed = np.array(embed_model.create_embedding(text)['data'][0]['embedding']).reshape(1, -1)
        split_embeddings[i+len(split_embeddings.keys())] = {'Text' : text, 'Embedding' : embed, 'Score' : 0, 'can_join' : False}
    split_embeddings = {k: v for k, v in split_embeddings.items() if k not in to_remove}
    return split_embeddings

In [38]:
def combine_embeddings(max_len, threshold, split_embeddings, min_len, overlap, repeat=False):
    split_embeddings = combine_smaller(split_embeddings, min_len)
    split_embeddings = break_bigger(split_embeddings, min_len, max_len, overlap)
    temp_dict = {i: split_embeddings[k] for i, k in enumerate(sorted(split_embeddings.keys()))}
    split_embeddings = temp_dict
    split_embeddings = find_scores(split_embeddings)
    max_score = find_max(split_embeddings)
    counter = 1
    while max_score > threshold:
        to_delete = []
        texts_add = []
        for i in split_embeddings.keys():
            if(split_embeddings[i]['Score'] > threshold and split_embeddings[i]['can_join'] == True):
                if(len(split_embeddings[i]['Text'] + split_embeddings[i+1]['Text'])) < max_len:
                    split_embeddings[i+1]['Embedding'] = np.add(split_embeddings[i]['Embedding'], split_embeddings[i+1]['Embedding'])
                    split_embeddings[i+1]['Text']      = split_embeddings[i]['Text'] + split_embeddings[i+1]['Text']
                    to_delete.append(i)
                else:
                    smaller_chunks = find_split(split_embeddings[i]['Text'], split_embeddings[i+1]['Text'], min_len=min_len, max_len=max_len, overlap=overlap)
                    texts_add.append(smaller_chunks)
                    to_delete.extend([i, i+1])
        to_delete = set(to_delete)
        num_chunks = list(split_embeddings.keys())[-1]
        new_chunks = 1
        for j in range(len(texts_add)):
            for i in range(len(texts_add[j])):
                text = texts_add[i]
                embed = np.array(embed_model.create_embedding(text)['data'][0]['embedding']).reshape(1, -1)
                split_embeddings[num_chunks + new_chunks] = {'Text' : text, 'Embedding' : embed, 'Score' : 0}
                if i == len(texts_add[j]) - 1:
                    split_embeddings[num_chunks + new_chunks]['can_join'] = True
                else:
                    split_embeddings[num_chunks + new_chunks]['can_join'] = False
                new_chunks += 1
        if(len(to_delete) == 0) and repeat:
            return split_embeddings
        elif len(to_delete) == 0:
            repeat = True
            continue
        split_embeddings = {k: v for k, v in split_embeddings.items() if k not in to_delete}
        final_len = len(split_embeddings.keys())
        temp_dict = {i: split_embeddings[k] for i, k in enumerate(sorted(split_embeddings.keys()))}
        split_embeddings = temp_dict
        find_scores(split_embeddings)
        max_score = find_max(split_embeddings)
        print(f"After {counter} iterations, the number of splits are : {final_len}. The highest similarity score is : {max_score}")
        counter += 1
        repeat = False
    return split_embeddings

In [39]:
for split_type in split_dict.keys():
    split_dict[split_type]['optimize_embeddings'] =  combine_embeddings(2000, 0.7, split_dict[split_type]['base_embeddings'], 500, 0)

After 1 iterations, the number of splits are : 195. The highest similarity score is : 0.7296680204798144


In [40]:
for split_type in split_dict.keys():
    base_lens = [len(x['Text']) for x in split_dict[split_type]['base_embeddings'].values()]
    optimized_lens = [len(x['Text']) for x in split_dict[split_type]['optimize_embeddings'].values()]
    print(f"For split type {split_type}, the number of base chunks are : {len(base_lens)} and optimized chunks are : {len(optimized_lens)}")
    print(f"The minimum length for base is {min(base_lens)} and optimized is {min(optimized_lens)}")
    print(f"The maximum length for base is {max(base_lens)} and optimized is {max(optimized_lens)}")
    print(f"The average length for base is {sum(base_lens) / len(base_lens)} and optimized is {sum(optimized_lens) / len(optimized_lens)}")

For split type semantic, the number of base chunks are : 32 and optimized chunks are : 117
The minimum length for base is 10 and optimized is 540
The maximum length for base is 37032 and optimized is 1988
The average length for base is 4410.90625 and optimized is 1145.2051282051282
For split type char, the number of base chunks are : 72 and optimized chunks are : 195
The minimum length for base is 702 and optimized is 3
The maximum length for base is 1999 and optimized is 1922
The average length for base is 1925.611111111111 and optimized is 16.8
For split type token, the number of base chunks are : 16 and optimized chunks are : 122
The minimum length for base is 3802 and optimized is 565
The maximum length for base is 9480 and optimized is 1991
The average length for base is 8669.1875 and optimized is 1140.7377049180327
For split type recursive, the number of base chunks are : 21 and optimized chunks are : 123
The minimum length for base is 2049 and optimized is 545
The maximum length

In [41]:
client = QdrantClient(path="rag_eval")

client.create_collection(collection_name='semantic', vectors_config=VectorParams(size=384, distance=Distance.COSINE))
client.create_collection(collection_name='char', vectors_config=VectorParams(size=384, distance=Distance.COSINE))
client.create_collection(collection_name='token', vectors_config=VectorParams(size=384, distance=Distance.COSINE))
client.create_collection(collection_name='recursive', vectors_config=VectorParams(size=384, distance=Distance.COSINE))
client.create_collection(collection_name='optimized-semantic', vectors_config=VectorParams(size=384, distance=Distance.COSINE))
client.create_collection(collection_name='optimized-char', vectors_config=VectorParams(size=384, distance=Distance.COSINE))
client.create_collection(collection_name='optimized-token', vectors_config=VectorParams(size=384, distance=Distance.COSINE))
client.create_collection(collection_name='optimized-recursive', vectors_config=VectorParams(size=384, distance=Distance.COSINE))

True

In [45]:
split_dict.keys()

dict_keys(['semantic', 'char', 'token', 'recursive'])

In [47]:
# Create collection for semantic splits
points = [
  PointStruct(
    id = str(uuid.uuid4()),
    vector = chunk['Embedding'].flatten().tolist(),
    payload = {
      "text": chunk['Text']
    }
  )
  for chunk in split_dict['semantic']['base_embeddings'].values()]

operation_info = client.upsert(
    collection_name='semantic',
    wait=True,
    points=points
)

points = [
  PointStruct(
    id = str(uuid.uuid4()),
    vector = chunk['Embedding'].flatten().tolist(),
    payload = {
      "text": chunk['Text']
    }
  )
  for chunk in split_dict['char']['base_embeddings'].values()]

operation_info = client.upsert(
    collection_name='char',
    wait=True,
    points=points
)

points = [
  PointStruct(
    id = str(uuid.uuid4()),
    vector = chunk['Embedding'].flatten().tolist(),
    payload = {
      "text": chunk['Text']
    }
  )
  for chunk in split_dict['token']['base_embeddings'].values()]

operation_info = client.upsert(
    collection_name='token',
    wait=True,
    points=points
)

# Create collection for recursive splits
points = [
  PointStruct(
    id = str(uuid.uuid4()),
    vector = chunk['Embedding'].flatten().tolist(),
    payload = {
      "text": chunk['Text']
    }
  )
  for chunk in split_dict['recursive']['base_embeddings'].values()]

operation_info = client.upsert(
    collection_name='recursive',
    wait=True,
    points=points
)

#Create collection for optimized semantic splits
points = [
  PointStruct(
    id = str(uuid.uuid4()),
    vector = chunk['Embedding'].flatten().tolist(),
    payload = {
      "text": chunk['Text']
    }
  )
  for chunk in split_dict['semantic']['optimize_embeddings'].values()]

operation_info = client.upsert(
    collection_name='optimized-semantic',
    wait=True,
    points=points
)

#Create collection for optimized char splits
points = [
  PointStruct(
    id = str(uuid.uuid4()),
    vector = chunk['Embedding'].flatten().tolist(),
    payload = {
      "text": chunk['Text']
    }
  )
  for chunk in split_dict['char']['optimize_embeddings'].values()]

operation_info = client.upsert(
    collection_name='optimized-char',
    wait=True,
    points=points
)

#Create collection for optimized token splits
points = [
  PointStruct(
    id = str(uuid.uuid4()),
    vector = chunk['Embedding'].flatten().tolist(),
    payload = {
      "text": chunk['Text']
    }
  )
  for chunk in split_dict['token']['optimize_embeddings'].values()]

operation_info = client.upsert(
    collection_name='optimized-token',
    wait=True,
    points=points
)

#Create collection for optimized recursive splits
points = [
  PointStruct(
    id = str(uuid.uuid4()),
    vector = chunk['Embedding'].flatten().tolist(),
    payload = {
      "text": chunk['Text']
    }
  )
  for chunk in split_dict['recursive']['optimize_embeddings'].values()]

operation_info = client.upsert(
    collection_name='optimized-recursive',
    wait=True,
    points=points
)