In [1]:
import pandas as pd
import pymupdf
import pymupdf4llm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from time import sleep
import re
from llama_cpp import Llama
import re
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from optimization.src.optimizer import ChunkOptimizer
from Evaluation.src.prompts import *
from Evaluation.src.data_generator import DataGenerator
import json

In [2]:
embed_model = Llama(model_path='C:/Users/shour/.cache/lm-studio/models/second-state/All-MiniLM-L6-v2-Embedding-GGUF/all-MiniLM-L6-v2-Q4_0.gguf', 
                    embedding=True,
                    verbose=False)

In [16]:
documents = {}

insurance_act = 'insurance-information\Insurance Act,1938 - incorporating all amendments till 20212021-08-12.pdf'

  insurance_act = 'insurance-information\Insurance Act,1938 - incorporating all amendments till 20212021-08-12.pdf'


In [17]:
doc = pymupdf.open(insurance_act)

complete_text = ""
for page in doc.pages(12):
    text = page.get_text()
    text = text[2:]
    paras = text.split('\n \n \n1')[:-1]
    for para in paras:
        complete_text += para

In [18]:
text = re.sub(r'\d\*{1,3}', '', complete_text)
text = re.sub(r'\n\d+', '', text)
text = re.sub(r'\(\d+\)', '', text)
text = text.replace('*', '').replace('\n \n', '\n').replace('  ', ' ').replace('  ', ' ').replace('[', '').replace(']', '').replace(' \n', '\n').replace(' .', '.').replace('..', '.')
text = text.strip()
documents['insurance_act'] = text

In [19]:
policyholder_file = 'insurance-information\Draft IRDAI(Protection of Policyholders’ Interests and Allied Matters of Insurers) Regulations, 2024.pdf'

doc = pymupdf.open(policyholder_file)

complete_text = ""
for page in doc.pages(2):
    text = page.get_text()
    complete_text += text
complete_text = complete_text[235:]

  policyholder_file = 'insurance-information\Draft IRDAI(Protection of Policyholders’ Interests and Allied Matters of Insurers) Regulations, 2024.pdf'


In [20]:
text = re.sub(r'\d+\s*\|\s*P\s*a\s*g\s*e', '', complete_text)
text = re.sub(r'\(\d+\)', '', text)
text = re.sub(r'\n\d+', '', text)
text = text.replace('  ', '').replace('\n \n', '\n').replace('\n\n', '\n').replace('. \n', '').replace('*','').replace('__', '_')
documents['policyholder'] = text

In [21]:
handbook_path = 'insurance-information\Life Insurance Handbook (English).pdf'

doc = pymupdf.open(handbook_path)

complete_text = ""
for page in doc.pages(2):
    text = page.get_text()
    complete_text += text

  handbook_path = 'insurance-information\Life Insurance Handbook (English).pdf'


In [22]:
text = re.sub(r'\n\d+', '', complete_text)
text = text.replace('  ', '').replace('\n \n', '\n').replace('\n\n', '\n').replace('. \n', '').replace('•', '')
documents['handbook'] = text

In [23]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1200,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)

token_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    model_name="gpt-4",
    chunk_size=150,
    chunk_overlap=0,
)

splits = text_splitter.split_text(documents['insurance_act'])
split_lens = {len(x): len(embed_model.tokenize(text=x.encode('utf-8'))) for x in splits}

In [24]:
print(f"Number of splits are : {len(splits)}")
print(f"Minimum length of split is : {min(split_lens.keys())}")
print(f"Maximum length of split is : {max(split_lens.keys())}")
print(f"The average length of split is : {sum(list(split_lens.keys())) / len(list(split_lens.keys()))}")
print(f"Minimum length of tokens is : {min(split_lens.values())}")
print(f"Maximum length of tokens is : {max(split_lens.values())}")
print(f"The average length of tokens is : {sum(list(split_lens.values())) / len(list(split_lens.keys()))}")

Number of splits are : 121
Minimum length of split is : 262
Maximum length of split is : 1200
The average length of split is : 1139.4931506849316
Minimum length of tokens is : 70
Maximum length of tokens is : 352
The average length of tokens is : 246.05479452054794


In [None]:
sns.barplot(x=np.arange(len(split_lens.keys())), y=split_lens.keys())
plt.figure(figsize=(12, 14))
plt.show();

In [None]:
split_embeddings = {idx : {'Text' : x, 'Embedding' : np.array(embed_model.create_embedding(x)['data'][0]['embedding']).reshape(1, -1)} for idx, x in enumerate(splits)}

In [None]:
def find_scores(split_embeddings):
    similarity_scores = []
    for i in range(len(split_embeddings.keys())):
        if i == len(split_embeddings.keys()) - 1:
            score = 0
        else:
            score = cosine_similarity(split_embeddings[i]['Embedding'], split_embeddings[i+1]['Embedding'])[0][0]
        similarity_scores.append(score)
        
        split_embeddings[i]['Score'] = score
    return similarity_scores


In [None]:
similarity_scores = find_scores(split_embeddings)
plt.plot(similarity_scores)

In [None]:
percentile = 90
print(f"The maximum score is : {max(similarity_scores)}")
print(f"The minimum score is : {min(similarity_scores)}")
print(f"The {percentile} is : {np.percentile(similarity_scores, percentile)}")

In [None]:
optimizer = ChunkOptimizer(embed_model)
split_embeddings = optimizer.optimize_chunks(splits, 1200, 200, 0.8)

In [None]:
len(split_embeddings.keys())

In [None]:
generator_model = 'meta-llama-3.1-8b-instruct'
discriminator_model = 'meta-llama-3.1-8b-instruct'
base_url = "http://192.168.84.106:1234/v1"

generator = DataGenerator(generator_model_id=generator_model, discriminator_model_id=discriminator_model, base_url=base_url)

In [None]:
#generator.context_preprocess(splits, 'base-chunks.json')

In [None]:
file_path = 'base-chunks.json'  # Replace with the actual file path

with open(file_path, 'r') as file:
    data = json.load(file)

In [None]:
len(data)

In [None]:
context_rich_splits = [x['text'] for x in data]

In [None]:
#generator.generate_qa(context_rich_splits, 'qac.json')

In [None]:
file_path = 'qac.json' 

with open(file_path, 'r') as file:
    qac = json.load(file)

In [None]:
qac[0]

In [None]:
generator.qac_evaluator(qac, 'question_eval.json')

In [None]:
def find_max(split_embeddings):
    return max(d['Score'] for d in split_embeddings.values())

In [None]:
find_max(split_embeddings)

In [None]:
def find_split(text1, text2='', chars=['.'], min_len = 200, max_len=2000):
    text = text1 + text2
    splits = []
    i = 0
    while((len(splits) < 2) and (i < len(chars))):
        token_splitter = CharacterTextSplitter(
        separator=chars[i],
        chunk_size=(min_len * 3),
        chunk_overlap=0,
        length_function=len,
        is_separator_regex=False,
        )
        splits = token_splitter.split_text(text)
        i += 1
    if len(splits) < 2:
        n_splits = len(text) / min_len
        n_splits = np.random.randint(2,n_splits)
        part_size = len(text) // n_splits
        splits = [text[i:i+part_size] for i in range(0, len(text), part_size)]
        
    embeddings = []
    for split in splits:
        embeddings.append(np.array(embed_model.create_embedding(split)['data'][0]['embedding']).reshape(1, -1))
    scores = []
    for i in range(len(embeddings) - 1):
        score = cosine_similarity(embeddings[i], embeddings[i+1])[0][0]
        scores.append((sum(scores) + score) / (len(scores) + 1))
    try:
        split_index = scores.index(min(scores))
        if split_index != 0:
            text1 = ' '.join(splits[:split_index])
            text2 = ' '.join(splits[split_index:])
        else:
            text1 = ' '.join(splits[:split_index+1])
            text2 = ' '.join(splits[split_index+1:])
    except:
        print(scores)
        print(len(splits))
        print(len(embeddings))
        print(split_index)
    response = []
    if((len(text1) > min_len) and (len(text1) < max_len)):
        response.extend([text1])
    elif len(text1) > max_len:
        print("Entering nested splitter", len(text1))
        text1 = find_split(text1, chars=chars)
        response.extend(text1)
    if((len(text2) > min_len) and (len(text2) < max_len)):
        response.extend([text2])
    elif len(text2) > max_len:
        print("Entering nested splitter", len(text2))
        text2 = find_split(text2, chars=chars)
        response.extend(text2)
    return list(set(response))

In [None]:
def combine_embeddings(max_len, threshold, split_embeddings, min_len, repeat=False):
    max_score = find_max(split_embeddings)
    counter = 1
    while max_score > threshold:
        to_delete = []
        texts_add = []
        for i in split_embeddings.keys():
            if split_embeddings[i]['Score'] > threshold:
                if len(split_embeddings[i]['Text'] + split_embeddings[i+1]['Text']) < max_len:
                    split_embeddings[i+1]['Embedding'] = np.add(split_embeddings[i]['Embedding'], split_embeddings[i+1]['Embedding'])
                    split_embeddings[i+1]['Text']      = split_embeddings[i]['Text'] + ' ' + split_embeddings[i+1]['Text']
                    to_delete.append(i)
                else:
                    smaller_chunks = find_split(split_embeddings[i]['Text'], split_embeddings[i+1]['Text'], ['.', ';', ','])
                    smaller_chunks = [x for x in smaller_chunks if len(x) > min_len]
                    texts_add.extend(smaller_chunks)
                    to_delete.extend([i, i+1])
        to_delete = set(to_delete)
        for i in range(len(texts_add)):
            text = texts_add[i]
            if len(text) > max_len:
                raise ValueError("Length of new chunk is greater than maximum length")
            embed = np.array(embed_model.create_embedding(text)['data'][0]['embedding']).reshape(1, -1)
            split_embeddings[i+len(split_embeddings.keys())] = {'Text' : text, 'Embedding' : embed, 'Score' : 0}
        if(len(to_delete) == 0) and repeat:
            return split_embeddings
        elif len(to_delete) == 0:
            repeat = True
        split_embeddings = {k: v for k, v in split_embeddings.items() if k not in to_delete}
        final_len = len(split_embeddings.keys())
        temp_dict = {i: split_embeddings[k] for i, k in enumerate(sorted(split_embeddings.keys()))}
        split_embeddings = temp_dict
        split_embeddings = temp_dict
        find_scores(split_embeddings)
        max_score = find_max(split_embeddings)
        print(f"After {counter} iterations, the number of splits are : {final_len}. The highest similarity score is : {max_score}")
        counter += 1
        repeat = False
    return split_embeddings

In [None]:
split_embeddings = combine_embeddings(2000, 0.75, split_embeddings, 200)

In [None]:
splits = [x['Text'] for x in split_embeddings.values()]

In [None]:
lens = [len(x['Text']) for x in split_embeddings.values()]

In [None]:
print(f"The length of shortest chunk is : {min(lens)}")
print(f"The length of the longest chunk is : {max(lens)}")
print(f"The average length of chunks is : {sum(lens) / len(lens)}")

In [None]:
splits[lens.index(min(lens))]

In [None]:
len(split_embeddings.keys())

In [None]:
similarity_scores = find_scores(split_embeddings)
plt.plot(similarity_scores)

In [None]:
percentile = 90
print(f"The maximum score is : {max(similarity_scores)}")
print(f"The minimum score is : {min(similarity_scores)}")
print(f"The {percentile} is : {np.percentile(similarity_scores, percentile)}")

In [None]:
splits = documents['policyholder_file'].split('\n')
splits = [x.strip() for x in splits if ((len(x) > 10) and (len(embed_model.tokenize(text=x.encode('utf-8'))) > 5))]
split_lens = {len(x) : len(embed_model.tokenize(text=x.encode('utf-8'))) for x in splits}

In [None]:
print(f"Number of splits are : {len(splits)}")
print(f"Minimum length of split is : {min(split_lens.keys())}")
print(f"Maximum length of split is : {max(split_lens.keys())}")
print(f"Minimum length of tokens is : {min(split_lens.values())}")
print(f"Maximum length of tokens is : {max(split_lens.values())}")

In [None]:
splits