## 1. <font color = red> Install and Import the Required Libraries

In [1]:
!pip install -U -q pdfplumber tiktoken openai chromadb sentence-transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.2/661.2 kB[0m [31m39.1 MB/s[0m eta [36m0:

In [2]:
# Import all the required Libraries

import pdfplumber
from pathlib import Path
import pandas as pd
from operator import itemgetter
import json
import tiktoken
import openai
import chromadb

In [3]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## 2. <font color = red> Read, Process and Checnk the PDF File

In [4]:
# Define the path of the PDF
single_pdf_path = '/content/drive/MyDrive/ColabNotebooks/HelpMate/HelpMate/Principal-Sample-Life-Insurance-Policy.pdf'


In [5]:
# Open the PDF file
with pdfplumber.open(single_pdf_path) as pdf:

    # Get one of the pages from the PDF and examine it
    single_page = pdf.pages[6]

    # Extract text from the first page
    text = single_page.extract_text()

    # Extract tables from the first page
    tables = single_page.extract_tables()

    # Print the extracted text
    print(text)



Section A – Eligibility
Member Life Insurance Article 1
Member Accidental Death and Dismemberment Insurance Article 2
Dependent Life Insurance Article 3
Section B - Effective Dates
Member Life Insurance Article 1
Member Accidental Death and Dismemberment Insurance Article 2
Dependent Life Insurance Article 3
Section C - Individual Terminations
Member Life Insurance Article 1
Member Accidental Death and Dismemberment Insurance Article 2
Dependent Life Insurance Article 3
Termination for Fraud Article 4
Coverage While Outside of the United States Article 5
Section D - Continuation
Member Life Insurance Article 1
Dependent Insurance - Developmentally Disabled or
Physically Handicapped Children Article 2
Section E - Reinstatement
Reinstatement Article 1
Federal Required Family and Medical Leave Act (FMLA) Article 2
Reinstatement of Coverage for a Member or Dependent When
Coverage Ends due to Living Outside of the United States Article 3
Section F - Individual Purchase Rights
Member Life In

In [6]:
# Function to check whether a word is present in a table or not for segregation of regular text and tables

def check_bboxes(word, table_bbox):
    # Check whether word is inside a table bbox.
    l = word['x0'], word['top'], word['x1'], word['bottom']
    r = table_bbox
    return l[0] > r[0] and l[1] > r[1] and l[2] < r[2] and l[3] < r[3]

In [7]:
# Function to extract text from a PDF file.
# 1. Declare a variable p to store the iteration of the loop that will help us store page numbers alongside the text
# 2. Declare an empty list 'full_text' to store all the text files
# 3. Use pdfplumber to open the pdf pages one by one
# 4. Find the tables and their locations in the page
# 5. Extract the text from the tables in the variable 'tables'
# 6. Extract the regular words by calling the function check_bboxes() and checking whether words are present in the table or not
# 7. Use the cluster_objects utility to cluster non-table and table words together so that they retain the same chronology as in the original PDF
# 8. Declare an empty list 'lines' to store the page text
# 9. If a text element in present in the cluster, append it to 'lines', else if a table element is present, append the table
# 10. Append the page number and all lines to full_text, and increment 'p'
# 11. When the function has iterated over all pages, return the 'full_text' list

def extract_text_from_pdf(pdf_path):
    p = 0
    full_text = []


    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_no = f"Page {p+1}"
            text = page.extract_text()

            tables = page.find_tables()
            table_bboxes = [i.bbox for i in tables]
            tables = [{'table': i.extract(), 'top': i.bbox[1]} for i in tables]
            non_table_words = [word for word in page.extract_words() if not any(
                [check_bboxes(word, table_bbox) for table_bbox in table_bboxes])]
            lines = []

            for cluster in pdfplumber.utils.cluster_objects(non_table_words + tables, itemgetter('top'), tolerance=5):

                if 'text' in cluster[0]:
                    try:
                        lines.append(' '.join([i['text'] for i in cluster]))
                    except KeyError:
                        pass

                elif 'table' in cluster[0]:
                    lines.append(json.dumps(cluster[0]['table']))


            full_text.append([page_no, " ".join(lines)])
            p +=1

    return full_text

In [8]:
# Initialize an empty list to store the extracted texts and document names
data = []

# Process the PDF file
print(f"...Processing {single_pdf_path}")

# Call the function to extract the text from the PDF
extracted_text = extract_text_from_pdf(single_pdf_path)

# Convert the extracted list to a PDF, and add a column to store document names
extracted_text_df = pd.DataFrame(extracted_text, columns=['Page No.', 'Page_Text'])

# Append the extracted text and document name to the list
data.append(extracted_text_df)

# Print a message to indicate progress
print(f"Finished processing {single_pdf_path}")

# Print a message to indicate all PDFs have been processed
print("PDF have been processed.")



...Processing /content/drive/MyDrive/ColabNotebooks/HelpMate/HelpMate/Principal-Sample-Life-Insurance-Policy.pdf




Finished processing /content/drive/MyDrive/ColabNotebooks/HelpMate/HelpMate/Principal-Sample-Life-Insurance-Policy.pdf
PDF have been processed.


In [9]:
data

[   Page No.                                          Page_Text
 0    Page 1  DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...
 1    Page 2                 This page left blank intentionally
 2    Page 3  POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...
 3    Page 4                 This page left blank intentionally
 4    Page 5  PRINCIPAL LIFE INSURANCE COMPANY (called The P...
 ..      ...                                                ...
 59  Page 60  I f a Dependent who was insured dies during th...
 60  Page 61  Section D - Claim Procedures Article 1 - Notic...
 61  Page 62  A claimant may request an appeal of a claim de...
 62  Page 63                 This page left blank intentionally
 63  Page 64  Principal Life Insurance Company Des Moines, I...
 
 [64 rows x 2 columns]]

In [10]:
# Concatenate all the DFs in the list 'data' together

insurance_pdfs_data = pd.concat(data, ignore_index=True)

In [11]:
insurance_pdfs_data.head(5)

Unnamed: 0,Page No.,Page_Text
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...
1,Page 2,This page left blank intentionally
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...
3,Page 4,This page left blank intentionally
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...


In [12]:
# Check one of the extracted page texts to ensure that the text has been correctly read

insurance_pdfs_data.Page_Text[2]

'POLICY RIDER GROUP INSURANCE POLICY NO: S655 COVERAGE: Life EMPLOYER: RHODE ISLAND JOHN DOE Effective on the later of the Date of Issue of this Group Policy or March 1, 2005, the following will apply to your Policy: From time to time The Principal may offer or provide certain employer groups who apply for coverage with The Principal a Financial Services Hotline and Grief Support Services or any other value added service for the employees of that employer group. In addition, The Principal may arrange for third party service providers (i.e., optometrists, health clubs), to provide discounted goods and services to those employer groups who apply for coverage with The Principal or who become insureds/enrollees of The Principal. While The Principal has arranged these goods, services and/or third party provider discounts, the third party service providers are liable to the applicants/insureds/enrollees for the provision of such goods and/or services. The Principal is not responsible for the

In [13]:
# Let's also check the length of all the texts as there might be some empty pages or pages with very few words that we can drop

insurance_pdfs_data['Text_Length'] = insurance_pdfs_data['Page_Text'].apply(lambda x: len(x.split(' ')))

In [14]:
# Retain only the rows with a text length of at least 10

insurance_pdfs_data = insurance_pdfs_data.loc[insurance_pdfs_data['Text_Length'] >= 10]
insurance_pdfs_data

Unnamed: 0,Page No.,Page_Text,Text_Length
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,30
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,230
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,110
5,Page 6,TABLE OF CONTENTS PART I - DEFINITIONS PART II...,153
6,Page 7,Section A – Eligibility Member Life Insurance ...,176
7,Page 8,Section A - Member Life Insurance Schedule of ...,171
8,Page 9,P ART I - DEFINITIONS When used in this Group ...,387
9,Page 10,T he legally recognized union of two eligible ...,251
10,Page 11,(2) has been placed with the Member or spouse ...,299
11,Page 12,An institution that is licensed as a Hospital ...,352


In [15]:
# Store the metadata for each page in a separate column

insurance_pdfs_data['Metadata'] = insurance_pdfs_data.apply(lambda x: { 'Page_No.': x['Page No.']}, axis=1)

In [16]:
insurance_pdfs_data

Unnamed: 0,Page No.,Page_Text,Text_Length,Metadata
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,30,{'Page_No.': 'Page 1'}
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,230,{'Page_No.': 'Page 3'}
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,110,{'Page_No.': 'Page 5'}
5,Page 6,TABLE OF CONTENTS PART I - DEFINITIONS PART II...,153,{'Page_No.': 'Page 6'}
6,Page 7,Section A – Eligibility Member Life Insurance ...,176,{'Page_No.': 'Page 7'}
7,Page 8,Section A - Member Life Insurance Schedule of ...,171,{'Page_No.': 'Page 8'}
8,Page 9,P ART I - DEFINITIONS When used in this Group ...,387,{'Page_No.': 'Page 9'}
9,Page 10,T he legally recognized union of two eligible ...,251,{'Page_No.': 'Page 10'}
10,Page 11,(2) has been placed with the Member or spouse ...,299,{'Page_No.': 'Page 11'}
11,Page 12,An institution that is licensed as a Hospital ...,352,{'Page_No.': 'Page 12'}


In [17]:
# Function to split text into fixed-size chunks
def split_text_into_chunks(text, chunk_size):
    chunks = []
    words = text.split()  # Split the text into words

    current_chunk = []  # Store words for the current chunk
    current_chunk_word_count = 0  # Count of words in the current chunk

    for word in words:
        if current_chunk_word_count + len(word) + 1 <= chunk_size:
            current_chunk.append(word)
            current_chunk_word_count += len(word) + 1
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_chunk_word_count = len(word)

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks



In [18]:
def process_page(page_no):
    page = insurance_pdfs_data[insurance_pdfs_data['Page No.'] == page_no].Page_Text.values[0]
    metadata = insurance_pdfs_data[insurance_pdfs_data['Page No.'] == page_no].Metadata.values[0]

    if page is not None:
        # setting chunk size as 500
        chunk_size = 500
        text_chunks = split_text_into_chunks(page, chunk_size)

        # Creating a DataFrame to store the chunks, page title and page metadata
        data = {'Page No.': [], 'Page_Text': [], 'Metadata': []}

        for index, chunk in enumerate(text_chunks):
            data['Page No.'].append(page_no)
            data['Page_Text'].append(chunk)
            # adding chunk no as part of metadata
            metadata['Chunk_No.'] = index
            data['Metadata'].append(metadata)

        return pd.DataFrame(data)

In [19]:
# creating a dataframe after calling process
page_nos = insurance_pdfs_data["Page No."]
page_nos
all_dfs = []
for page_no in page_nos:
    df = process_page(page_no)
    if df is not None:
        all_dfs.append(df)

fixed_chunk_df = pd.concat(all_dfs, ignore_index=True)
fixed_chunk_df

Unnamed: 0,Page No.,Page_Text,Metadata
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,"{'Page_No.': 'Page 1', 'Chunk_No.': 0}"
1,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,"{'Page_No.': 'Page 3', 'Chunk_No.': 2}"
2,Page 3,arrange for third party service providers (i.e...,"{'Page_No.': 'Page 3', 'Chunk_No.': 2}"
3,Page 3,the provision of such goods and/or services no...,"{'Page_No.': 'Page 3', 'Chunk_No.': 2}"
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,"{'Page_No.': 'Page 5', 'Chunk_No.': 1}"
...,...,...,...
222,Page 61,incomplete claim. Written notification will be...,"{'Page_No.': 'Page 61', 'Chunk_No.': 4}"
223,Page 62,A claimant may request an appeal of a claim de...,"{'Page_No.': 'Page 62', 'Chunk_No.': 3}"
224,Page 62,"requested additional information, The Principa...","{'Page_No.': 'Page 62', 'Chunk_No.': 3}"
225,Page 62,may have the Member or Dependent whose loss is...,"{'Page_No.': 'Page 62', 'Chunk_No.': 3}"


## 3. <font color = red> Generate and Store Embeddings using OpenAI and ChromaDB

In this section, we will embed the pages in the dataframe through OpenAI's `text-embedding-ada-002` model, and store them in a ChromaDB collection.

In [20]:
# Set the API key
#filepath = "/content/drive/MyDrive/HelpMate/
filepath = '/content/drive/MyDrive/ColabNotebooks/HelpMate/HelpMate/'


with open(filepath + "OpenAI_API_Key.txt", "r") as f:
  openai.api_key = ' '.join(f.readlines())

In [21]:
# Import the OpenAI Embedding Function into chroma

from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

In [22]:
# Define the path where chroma collections will be stored

chroma_data_path = '/content/drive/MyDrive/HelpMate/ChromaDB_Data'

In [23]:
import chromadb

In [24]:
# Call PersistentClient()

client = chromadb.PersistentClient()

In [25]:
# Set up the embedding function using the OpenAI embedding model

model = "text-embedding-ada-002"
embedding_function = OpenAIEmbeddingFunction(api_key=openai.api_key, model_name=model)

In [26]:
# Initialise a collection in chroma and pass the embedding_function to it so that it used OpenAI embeddings to embed the documents

insurance_collection = client.get_or_create_collection(name='RAG_on_Insurance', embedding_function=embedding_function)

In [27]:
# Convert the page text and metadata from your dataframe to lists to be able to pass it to chroma

documents_list = fixed_chunk_df["Page_Text"].tolist()
metadata_list = fixed_chunk_df['Metadata'].tolist()

In [28]:
documents_list

['DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/01/2014 711 HIGH STREET GEORGE RI 02903 GROUP POLICY FOR: RHODE ISLAND JOHN DOE ALL MEMBERS Group Member Life Insurance Print Date: 07/16/2014',
 'POLICY RIDER GROUP INSURANCE POLICY NO: S655 COVERAGE: Life EMPLOYER: RHODE ISLAND JOHN DOE Effective on the later of the Date of Issue of this Group Policy or March 1, 2005, the following will apply to your Policy: From time to time The Principal may offer or provide certain employer groups who apply for coverage with The Principal a Financial Services Hotline and Grief Support Services or any other value added service for the employees of that employer group. In addition, The Principal may',
 'arrange for third party service providers (i.e., optometrists, health clubs), to provide discounted goods and services to those employer groups who apply for coverage with The Principal or who become insureds/enrollees of The Principal. While The Principal has arranged these goods, services and/or third

In [29]:
# Add the documents and metadata to the collection alongwith generic integer IDs. You can also feed the metadata information as IDs by combining the policy name and page no.

insurance_collection.add(
    documents= documents_list,
    ids = [str(i) for i in range(0, len(documents_list))],
    metadatas = metadata_list
)

In [30]:
# Let's take a look at the first few entries in the collection

insurance_collection.get(
    ids = ['0','1','2'],
    include = ['embeddings', 'documents', 'metadatas']
)

{'ids': ['0', '1', '2'],
 'embeddings': array([[-0.02259402,  0.01867824, -0.02725379, ..., -0.03693881,
          0.00290257, -0.00138684],
        [-0.02493387,  0.0016555 , -0.00959047, ..., -0.01926274,
         -0.00387163,  0.0051156 ],
        [ 0.00231976, -0.00615884,  0.00045036, ...,  0.00821858,
         -0.00602628, -0.0001812 ]]),
 'documents': ['DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/01/2014 711 HIGH STREET GEORGE RI 02903 GROUP POLICY FOR: RHODE ISLAND JOHN DOE ALL MEMBERS Group Member Life Insurance Print Date: 07/16/2014',
  'POLICY RIDER GROUP INSURANCE POLICY NO: S655 COVERAGE: Life EMPLOYER: RHODE ISLAND JOHN DOE Effective on the later of the Date of Issue of this Group Policy or March 1, 2005, the following will apply to your Policy: From time to time The Principal may offer or provide certain employer groups who apply for coverage with The Principal a Financial Services Hotline and Grief Support Services or any other value added service for the employees o

In [31]:
cache_collection = client.get_or_create_collection(name='Insurance_Cache', embedding_function=embedding_function)

In [32]:
cache_collection.peek()

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'included': ['metadatas', 'documents', 'embeddings'],
 'data': None,
 'metadatas': []}

In [33]:

# Implementing Cache in Semantic Search

# Set a threshold for cache search

def sementicsearch(query):

  threshold = 0.2

  ids = []
  documents = []
  distances = []
  metadatas = []
  results_df = pd.DataFrame()

  # try to find from cache
  cache_results = cache_collection.query(
      query_texts=query,
      n_results=1
  )

  # If the distance is greater than the threshold, then return the results from the main collection.

  if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
        # Query the collection against the user query and return the top 10 results
        results = insurance_collection.query(
        query_texts=query,
        n_results=10
        )

        # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
        # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
        Keys = []
        Values = []

        for key, val in results.items():
          if val is None:
            continue
          for i in range(9):
            Keys.append(str(key)+str(i))
            Values.append(str(val[0][i]))


        cache_collection.add(
            documents= [query],
            ids = [query],  # Or if you want to assign integers as IDs 0,1,2,.., then you can use "len(cache_results['documents'])" as will return the no. of queries currently in the cache and assign the next digit to the new query."
            metadatas = dict(zip(Keys, Values))
        )

        print("Not found in cache. Found in main collection.")

        result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
        results_df = pd.DataFrame.from_dict(result_dict)
        #results_df


  # If the distance is, however, less than the threshold, you can return the results from cache

  elif cache_results['distances'][0][0] <= threshold:
        cache_result_dict = cache_results['metadatas'][0][0]

        # Loop through each inner list and then through the dictionary
        for key, value in cache_result_dict.items():
            if 'ids' in key:
                ids.append(value)
            elif 'documents' in key:
                documents.append(value)
            elif 'distances' in key:
                distances.append(value)
            elif 'metadatas' in key:
                metadatas.append(value)

        print("Found in cache!")

        # Create a DataFrame
        results_df = pd.DataFrame({
          'IDs': ids,
          'Documents': documents,
          'Distances': distances,
          'Metadatas': metadatas
        })

  return results_df


In [34]:
from sentence_transformers import CrossEncoder, util

In [35]:
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

In [36]:
# Define the function to generate the response. Provide a comprehensive prompt that passes the user query and the top 3 results to the model

def generate_response(query, results_df):
    """
    Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
    """
    messages = [
                {"role": "system", "content":  "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
                {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
                                                You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{results_df}'. These search results are essentially one page of an insurance document that may be relevant to the user query.

                                                The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the policy name and source page. The text inside the document may also contain tables in the format of a list of lists where each of the nested lists indicates a row.

                                                Use the documents in '{results_df}' to answer the query '{query}'. Frame an informative answer and also, use the dataframe to return the relevant policy names and page numbers as citations.

                                                Follow the guidelines below when performing the task.
                                                1. Try to provide relevant/accurate numbers if available.
                                                2. You don’t have to necessarily use all the information in the dataframe. Only choose information that is relevant.
                                                3. If the document text has tables with relevant information, please reformat the table and return the final information in a tabular in format.
                                                3. Use the Metadatas columns in the dataframe to retrieve and cite the policy name(s) and page numbers(s) as citation.
                                                4. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
                                                5. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.

                                                The generated response should answer the query directly addressing the user and avoiding additional information. If you think that the query is not relevant to the document, reply that the query is irrelevant. Provide the final response as a well-formatted and easily readable text along with the citation. Provide your complete response first with all information, and then provide the citations.
                                                """},
              ]

    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages
    )

    return response.choices[0].message.content.split('\n')

In [37]:
# Input (query, response) pairs for each of the top 20 responses received from the semantic search to the cross encoder
# Generate the cross_encoder scores for these pairs
query = "Can you please explain on accidental injury sustained while driving "

def retrieveresults(query):
  results_df = sementicsearch(query)

  cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
  cross_inputs = [[query, response] for response in results_df['Documents']]
  cross_rerank_scores = cross_encoder.predict(cross_inputs)

  results_df['Reranked_scores'] = cross_rerank_scores

  top_3_semantic = results_df.sort_values(by='Distances')
  top_3_rerank = results_df.sort_values(by='Reranked_scores', ascending=False)
  top_RAG = top_3_rerank[["Documents", "Metadatas"]][:3]

  response = generate_response(query, top_RAG)
  response = generate_response(query, top_RAG)

  #print("\n".join(response))
  #print (response)
  return (response,top_RAG)



In [38]:
query = "Can you please explain on accidental injury sustained while driving "
(response,top_RAG) = retrieveresults(query)
print ("TOP 3 SEARCH RESULTS\n")

print (top_RAG)
print ("\n \n RESPONSE TO QUERY FROM LLM \n")
print ("\n".join(response))

Not found in cache. Found in main collection.
TOP 3 SEARCH RESULTS

                                           Documents  \
0  Exposure Exposure to the elements will be pres...   
2  Article 2 - Benefit Qualification To qualify f...   
1  vehicle, station wagon, pick-up truck, or van-...   

                                 Metadatas  
0  {'Chunk_No.': 3, 'Page_No.': 'Page 55'}  
2  {'Page_No.': 'Page 53', 'Chunk_No.': 3}  
1  {'Chunk_No.': 3, 'Page_No.': 'Page 55'}  

 
 RESPONSE TO QUERY FROM LLM 

Accidental injury sustained while driving refers to any injury that occurs while operating a vehicle. If you sustain an accidental injury while driving, it may be covered under your insurance policy depending on the terms and conditions specified in your policy document.

Here are some general steps to follow in the event of an accidental injury sustained while driving:
1. Seek immediate medical attention for any injuries.
2. Contact your insurance company to report the incident and file a

In [39]:
query = "Can you please explain on accidental injury sustained while driving "
(response,top_RAG) = retrieveresults(query)
print ("TOP 3 SEARCH RESULTS\n")

print (top_RAG)
print ("\n \n RESPONSE TO QUERY FROM LLM \n")
print ("\n".join(response))

Found in cache!
TOP 3 SEARCH RESULTS

                                           Documents  \
3  Exposure Exposure to the elements will be pres...   
1  Article 2 - Benefit Qualification To qualify f...   
5  vehicle, station wagon, pick-up truck, or van-...   

                                 Metadatas  
3  {'Chunk_No.': 3, 'Page_No.': 'Page 56'}  
1  {'Chunk_No.': 2, 'Page_No.': 'Page 58'}  
5  {'Chunk_No.': 3, 'Page_No.': 'Page 55'}  

 
 RESPONSE TO QUERY FROM LLM 

Accidental injuries sustained while driving are typically covered under the "Accidental Death and Dismemberment" section of an insurance policy. This coverage provides benefits in the event of accidental bodily injuries resulting from a covered accident, including those sustained while driving. The benefits may vary based on the severity of the injury and the terms outlined in the policy.

Here is an overview of the Accidental Death and Dismemberment coverage based on the insurance documents provided:

| Insurance Poli

In [40]:
query = "What are the conditions under which a policy can be reinstated "
(response,top_RAG) = retrieveresults(query)
print ("TOP 3 SEARCH RESULTS\n")

print (top_RAG)
print ("\n \n RESPONSE TO QUERY FROM LLM \n")
print ("\n".join(response))

Not found in cache. Found in main collection.
TOP 3 SEARCH RESULTS

                                           Documents  \
0  Section E - Reinstatement Article 1 - Reinstat...   
1  basis as that being provided on the date cover...   
3  for individual purchase under PART III, Sectio...   

                                 Metadatas  
0  {'Page_No.': 'Page 40', 'Chunk_No.': 3}  
1  {'Page_No.': 'Page 41', 'Chunk_No.': 2}  
3  {'Chunk_No.': 4, 'Page_No.': 'Page 47'}  

 
 RESPONSE TO QUERY FROM LLM 

In order to reinstate a policy, the following conditions may apply based on the provided documents:

1. The policyholder must adhere to the guidelines outlined in Section E - Reinstatement Article 1 of the policy document.
2. Reinstatement may be subject to specific terms and conditions mentioned in the policy document, such as those provided under PART III, Section.

Here is a structured response along with the citations:

**Conditions for Policy Reinstatement:**
- Refer to Section E - Re

In [41]:
query = "Under what conditions can a policy be terminated "
(response,top_RAG) = retrieveresults(query)
print ("TOP 3 SEARCH RESULTS\n")

print (top_RAG)
print ("\n \n RESPONSE TO QUERY FROM LLM \n")
print ("\n".join(response))

Found in cache!
TOP 3 SEARCH RESULTS

                                           Documents  \
7  for individual purchase under PART III, Sectio...   
2  I f coverage for a Member or Dependent termina...   
6  and b. will not be used to satisfy any provisi...   

                                 Metadatas  
7  {'Page_No.': 'Page 40', 'Chunk_No.': 3}  
2  {'Page_No.': 'Page 40', 'Chunk_No.': 3}  
6  {'Page_No.': 'Page 40', 'Chunk_No.': 3}  

 
 RESPONSE TO QUERY FROM LLM 

The policy can be terminated under the following conditions:

1. Coverage termination for a Member or Dependent
2. The policy will not be used to satisfy any provisions

These conditions are mentioned in the insurance document found on Page 40 under PART III, Section C.

Citation:
- Policy Name: [Insert Policy Name]
- Page Number: Page 40
