In [1]:
import pandas as pd
from langchain.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredExcelLoader

In [2]:
# One loader per FAQ source file: a Word document and an Excel workbook.
loader = Docx2txtLoader("/home/TeamNLP/Trainer_Examiner/AITrainer/Client/Prudential/faq/data/british_gas/HighMark_FAQ.docx")
excel_loader = UnstructuredExcelLoader("/home/TeamNLP/Trainer_Examiner/AITrainer/Client/Prudential/faq/data/british_gas/FAQ.xlsx")

# Load the sources in order (Word first, then Excel) into a single flat list.
documents = []
for source_loader in (loader, excel_loader):
    documents.extend(source_loader.load())

In [3]:
documents

[Document(metadata={'source': '/home/TeamNLP/Trainer_Examiner/AITrainer/Client/Prudential/faq/data/british_gas/HighMark_FAQ.docx'}, page_content='Understanding Common Insurance Terms\n\nHealth insurance and health insurance terminology can be tricky and sometimes confusing. That’s why we’ve broken down the most common insurance terms in easy-to-understand language. For more information on the rest of the most common insurance terms, please visit\xa0the Health Insurance Glossary.\n\nWhat is the Affordable Care Act (ACA)?\n\nThe\xa0Affordable Care Act (ACA), also referred to as “Obamacare” is care that aims to expand access to coverage, control health care costs and improve health care delivery for U.S. citizens and legal residents. Most U.S. citizens and legal residents are now required to have health insurance coverage or pay a penalty to the government. ACA legislation includes the expansion of\xa0Medicaid\xa0eligibility, the establishment of health insurance exchanges and protects he

In [3]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel
from langchain_openai import AzureChatOpenAI,AzureOpenAIEmbeddings
from config import configuration
import os

# Expose the Azure OpenAI settings held in the project config through the
# environment variables that the chat client below picks up.
os.environ["OPENAI_API_TYPE"] = configuration['open_ai_cred']['OPENAI_API_TYPE']
os.environ["AZURE_OPENAI_ENDPOINT"] = configuration['open_ai_cred']['OPENAI_API_BASE']
os.environ["OPENAI_API_KEY"] = configuration['open_ai_cred']['OPENAI_API_KEY']

# Chat model; deployment name and API version come from the project config.
model = AzureChatOpenAI(
    azure_deployment=configuration['open_ai_config']['deployment_name'],  # or your deployment
    api_version=configuration['open_ai_config']['openai_api_version'],  # or your api version
    temperature=0,
    logprobs=False,
)

# Embedding client. It is given its own endpoint and key explicitly instead of
# using the environment values set above. The key must be supplied through the
# AZURE_OPENAI_EMBEDDINGS_API_KEY environment variable: secrets must never be
# committed inside the notebook (a missing variable raises KeyError here).
embeddings = AzureOpenAIEmbeddings(
    azure_deployment="text-embedding-3-small",
    model="text-embedding-3-small",
    api_key=os.environ["AZURE_OPENAI_EMBEDDINGS_API_KEY"],
    azure_endpoint="https://azureai7383474271.cognitiveservices.azure.com",
    api_version='2023-05-15',
    dimensions=1536,
)

## Chunking Technique

In [None]:
def pages_to_lines(pages):
    """Flatten loaded pages into one record per line of text.

    Args:
        pages: iterable of objects exposing ``metadata`` and ``page_content``.

    Returns:
        A list of dicts, one per "\\n"-separated line of every page, in page
        order. Each dict holds the line text ("content"), a fixed
        "element_type" of "NarrativeText", the page's metadata object
        ("metadata"), the 1-based position of the page in ``pages``
        ("page_number") and "is_visual" set to False.
    """
    return [
        {
            "content": text_line,
            "element_type": "NarrativeText",
            "metadata": page.metadata,
            "page_number": page_number,  # page numbers are 1-indexed
            "is_visual": False,
        }
        for page_number, page in enumerate(pages, start=1)
        for text_line in page.page_content.split("\n")
    ]

In [None]:
s=pages_to_lines(documents)
s

In [None]:
document_lines_str = [line["content"] for line in s]
document_str = "\n".join(document_lines_str)

In [None]:
document_str

In [None]:
def get_sections_text(sections, document_lines):
    """
    Attach text and source metadata to a list of section boundaries.

    Args:
        sections: list of dicts with a "title" and a "start_index" (line number
            where the section begins, inclusive), ordered by start_index.
        document_lines: list of dicts with "content" (line text) and
            "metadata" (a dict containing a "source" entry).

    Returns:
        A list of dicts, one per section, with keys:
            title: the section title
            metadata: the distinct "source" values of the section's lines, in
                order of first appearance
            start / end: first and last line index of the section (inclusive)
            content: the section's lines joined with "\\n"
        A section runs up to the line before the next section's start_index;
        the last section runs to the end of the document.

    Raises:
        IndexError: if a section refers to a line index outside document_lines
            (the offending section is printed before re-raising).
    """
    from typing import TypedDict

    class Section(TypedDict):
        title: str
        metadata: list
        start: int
        end: int
        content: str

    section_dicts = []
    last_line_index = len(document_lines) - 1
    for i, s in enumerate(sections):
        if i == len(sections) - 1:
            end_index = last_line_index
        else:
            end_index = sections[i+1]["start_index"] - 1
        try:
            section_lines = [document_lines[j] for j in range(s["start_index"], end_index + 1)]
        except Exception as e:
            print ("error in get_sections_text", e)
            print ("section ", s)
            raise

        contents = [line["content"] for line in section_lines]
        # De-duplicate while keeping first-seen order. Going through a set made
        # the order (and therefore metadata[0], which callers use as the
        # section's source) depend on string hashing, i.e. vary between runs.
        unique = list(dict.fromkeys(line["metadata"]["source"] for line in section_lines))

        section_dicts.append(Section(
            title=s["title"],
            metadata=unique,
            content="\n".join(contents),
            start=s["start_index"],
            end=end_index
        ))
    return section_dicts


In [None]:
# System prompt for the sectioning LLM call. It contains a single placeholder,
# {start_line}, which get_structured_document fills in with str.format().
# This text is sent to the model verbatim, so wording changes alter model behavior.
SYSTEM_PROMPT = """
Read the document below and extract a StructuredDocument object from it where each section of the document is centered around a single concept/topic. Whenever possible, your sections (and section titles) should match up with the natural sections of the document (i.e. Introduction, Conclusion, References, etc.). Sections can vary in length, but should generally be anywhere from a few paragraphs to a few pages long.
Each line of the document is marked with its line number in square brackets (e.g. [1], [2], [3], etc). Use the line numbers to indicate section start.
The start line numbers will be treated as inclusive. For example, if the first line of a section is line 5, the start_index should be 5. Your goal is to find the starting line number of a given section, where a section is a group of lines that are thematically related.
The first section must start at the first line number of the document ({start_line} in this case). The sections MUST cover the entire document. 
Section titles should be descriptive enough such that a person who is just skimming over the section titles and not actually reading the document can get a clear idea of what each section is about.
Note: the document provided to you may just be an excerpt from a larger document, rather than a complete document. Therefore, you can't always assume, for example, that the first line of the document is the beginning of the Introduction section and the last line is the end of the Conclusion section (if those section are even present).
"""

# Extra instruction appended to the system prompt by get_structured_document
# when the requested language is not "en".
LANGUAGE_ADDENDUM = "For your section titles, YOU MUST use the same language as the document. If the document is in English, your section titles should be in English. If the document is in another language, your section titles should be in that language."


In [None]:
from pydantic import BaseModel, Field
from typing import List, Dict, Any
# One section boundary as reported by the LLM: a title plus the line number
# where the section starts. Only the start is stored; get_sections_text derives
# each section's end from the start of the following section.
# NOTE(review): documented with `#` comments rather than a docstring because
# class docstrings and Field descriptions presumably flow into the schema /
# format instructions shown to the model — confirm before rewording them.
class DocumentSection(BaseModel):
    title: str = Field(description="main topic of this section of the document (very descriptive)")
    start_index: int = Field(description="line number where the section begins (inclusive)")
    
# Top-level object requested from the LLM: the list of sections for one
# document excerpt. Used as response_model / pydantic_object in
# get_structured_document.
class StructuredDocument(BaseModel):
    """obtains meaningful sections, each centered around a single concept/topic"""
    sections: List[DocumentSection] = Field(description="a list of sections of the document")

In [None]:
# Add your custom path
import os
import sys

custom_path = '/home/TeamNLP/Trainer_Examiner/AITrainer/Client/British_Gas/DEV/Shweta/faq/'
# Extend the module search path BEFORE importing project modules; appending it
# after the import (as before) could never help resolve `config`.
if custom_path not in sys.path:
    sys.path.append(custom_path)

from config import configuration


In [None]:
from langchain_core.output_parsers import JsonOutputParser

def get_document_with_lines(document_lines, start_line: int, max_characters: int) :
    """
    Render a window of the document as text with a "[line number]" prefix per line.

    Lines are taken from ``start_line`` onwards until the accumulated length of
    their contents exceeds ``max_characters`` (the line that crosses the limit
    is still included) or the document ends. Only the line contents count
    towards the limit, not the "[n] " prefixes or newlines.

    Args:
        document_lines: list of dicts with a "content" entry (the line text).
        start_line: index of the first line to include.
        max_characters: character budget for the window.

    Returns:
        (document_with_line_numbers, end_line): the rendered text and the index
        of the last line included. When no line is available (empty document or
        ``start_line`` past the end) the text is "" and ``end_line`` is the
        index just before the window start, capped at the last valid index —
        previously this case raised UnboundLocalError.
    """
    numbered_lines = []
    character_count = 0
    # Value returned when the loop below has nothing to iterate over.
    end_line = min(start_line, len(document_lines)) - 1

    for i in range(start_line, len(document_lines)):
        line = document_lines[i]["content"]
        numbered_lines.append(f"[{i}] {line}\n")
        character_count += len(line)
        end_line = i
        if character_count > max_characters:
            break
    # Joining once avoids the quadratic cost of repeated string concatenation.
    return "".join(numbered_lines), end_line

def get_structured_document(document_with_line_numbers: str, start_line: int, llm_provider: str, model: str, language: str):
    """
    Ask an LLM to split a line-numbered document excerpt into titled sections.

    The system prompt is SYSTEM_PROMPT formatted with `start_line`; LANGUAGE_ADDENDUM
    is appended when `language` is not "en".

    Args:
        document_with_line_numbers: excerpt text whose lines carry a "[n]" prefix.
        start_line: line number substituted into the system prompt as the first line of the excerpt.
        llm_provider: "anthropic" or "openai".
        model: model name; only used by the "anthropic" branch (the "openai" branch takes its deployment from `configuration`).
        language: language code; any value other than "en" adds LANGUAGE_ADDENDUM to the prompt.

    Returns:
        "anthropic": the result of the Instructor client call made with response_model=StructuredDocument.
        "openai": the output of the prompt | llm | JsonOutputParser chain.
        NOTE(review): get_sections indexes the result with ["sections"], which presumably only
        works for the "openai" branch — confirm before using "anthropic".

    Raises:
        ValueError: if `llm_provider` is neither "anthropic" nor "openai".

    Note: The "anthropic" branch relies on Instructor, which only supports certain model providers. That's why this function doesn't use the LLM abstract base class that is used elsewhere in the project.
    """

    formatted_system_prompt = SYSTEM_PROMPT.format(start_line=start_line)
    if language != "en":
        formatted_system_prompt += "\n" + LANGUAGE_ADDENDUM

    if llm_provider == "anthropic":
        # NOTE(review): `instructor` and `Anthropic` are not imported anywhere in the
        # visible notebook; unless they are imported elsewhere this branch raises NameError.
        base_url = os.environ.get("DSRAG_ANTHROPIC_BASE_URL", None)
        if base_url is not None:
            client = instructor.from_anthropic(Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"], base_url=base_url))
        else:
            client = instructor.from_anthropic(Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"]))
        return client.chat.completions.create(
            model=model,
            response_model=StructuredDocument,
            max_tokens=4000,
            temperature=0.0,
            system=formatted_system_prompt,
            messages=[
                {
                    "role": "user",
                    "content": document_with_line_numbers,                    
                },
            ],
        )
    elif llm_provider == "openai":
        from langchain_openai import AzureChatOpenAI
        # Azure credentials are (re)exported from the project config on every call.
        os.environ["OPENAI_API_TYPE"] = configuration['open_ai_cred']['OPENAI_API_TYPE']
        os.environ["AZURE_OPENAI_ENDPOINT"] = configuration['open_ai_cred']['OPENAI_API_BASE']
        os.environ["OPENAI_API_KEY"] = configuration['open_ai_cred']['OPENAI_API_KEY']
        llm = AzureChatOpenAI(**{'deployment_name':configuration['open_ai_config']['deployment_name'], 'openai_api_version':configuration['open_ai_config']['openai_api_version']}, temperature=0.1)
        # llm=llm.with_structured_output(StructuredDocument)
        from langchain_core.prompts import PromptTemplate
        from langchain_core.output_parsers import StrOutputParser
        # The parser supplies the format instructions derived from StructuredDocument
        # and parses the model's reply.
        parser = JsonOutputParser(pydantic_object=StructuredDocument)
        # NOTE(review): `partial_variables` and `messages` below are assigned but never
        # used; the prompt is built from the PromptTemplate that follows.
        partial_variables={"format_instructions": parser.get_format_instructions()}
        messages = [("system",formatted_system_prompt),("human", document_with_line_numbers),]
        # ai_msg = llm.invoke(messages)
        # NOTE(review): "source" is declared as an input variable but does not appear in
        # the template and is not supplied to chain.invoke() below.
        strctured_prompt = PromptTemplate(
        input_variables=["formatted_system_prompt","document_with_line_numbers","source"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
        template="""
	 **System Instructions:**
	 {formatted_system_prompt}
	         
	**Format Instructions:**
	{format_instructions}  
	
	**document_with_line_numbers:**
	{document_with_line_numbers}
    
	"""
	)
        # Pipeline: fill the prompt -> call the chat model -> parse the reply as JSON.
        chain=strctured_prompt | llm | parser

        return(chain.invoke({"formatted_system_prompt":formatted_system_prompt,"document_with_line_numbers":document_with_line_numbers}))
    else:
        raise ValueError("Invalid provider. Must be either 'anthropic' or 'openai'.")

In [None]:
def get_sections(document_lines, max_characters: int = 20000, llm_provider: str = "openai", model: str = "gpt-4o-mini", language: str = "en"):
    """
    Split a document into LLM-identified sections, processing it window by window.

    Inputs
    - document_lines: list[dict] - the lines of the document; each dict has a "content" (line text) and a "metadata" entry
    - max_characters: int - the maximum number of characters to process in one call to the LLM
    - llm_provider: str - the LLM provider to use (either "anthropic" or "openai")
    - model: str - the name of the LLM model to use
    - language: str - language code of the document; values other than "en" ask the LLM for titles in the document's language

    Returns
    - sections: a list of dictionaries, each containing the following keys:
        - title: str - the main topic of this section of the document (very descriptive)
        - metadata: list - the distinct sources of the lines in the section
        - start: int - line number where the section begins (inclusive)
        - end: int - line number where the section ends (inclusive)
        - content: str - the text of the section
      An empty document yields an empty list.
    """
    if not document_lines:
        # Nothing to segment; avoids sending an empty prompt to the LLM.
        return []

    # Document size computed from the argument itself rather than from a notebook
    # global: equals len("\n".join(<all line contents>)).
    total_characters = sum(len(line["content"]) for line in document_lines) + len(document_lines) - 1
    # Upper bound on LLM calls, used as a safety measure against an infinite loop.
    max_iterations = 2*(total_characters // max_characters + 1)
    print(max_iterations)
    start_line = 0
    all_sections = []
    for _ in range(max_iterations):
        document_with_line_numbers, end_line = get_document_with_lines(document_lines, start_line, max_characters)
        structured_doc = get_structured_document(document_with_line_numbers,  start_line, llm_provider=llm_provider, model=model, language=language)
        new_sections = structured_doc["sections"]
        all_sections.extend(new_sections)

        if end_line >= len(document_lines) - 1:
            # reached the end of the document
            break
        if len(new_sections) > 1:
            # The last section of a window may have been cut off by the character
            # limit, so discard it and start the next window at its first line.
            start_line = all_sections[-1]["start_index"]
            all_sections.pop()
        else:
            start_line = end_line + 1

    # get the section text
    return get_sections_text(all_sections, document_lines)


In [None]:
import os

sections = get_sections(
            document_lines=s, 
        )

In [None]:
sections

In [None]:
docs=["title: " + section["title"]+"\n"+section["content"] for section in sections]

In [None]:
docs

In [None]:
len(docs)

In [None]:
metadata=[section["metadata"]for section in sections]

In [None]:
metadata

In [None]:
len(metadata)

In [None]:
metadatas=[{'source': metadata[i][0]} for i in range(len(metadata))]

In [None]:
metadatas

## Creating DataBase

In [None]:
from chromadb import Documents, EmbeddingFunction, Embeddings
import chromadb
from langchain_chroma import Chroma

class sEmbeddingFunction(EmbeddingFunction):
    """Adapter that serves the notebook-level `embeddings` client through
    chromadb's EmbeddingFunction call interface and an `embed_query` method."""

    def __call__(self, input: Documents) -> Embeddings:
        # One embed_query request per input text, results kept in input order.
        vectors = []
        for text in input:
            vectors.append(embeddings.embed_query(text))
        return vectors

    def embed_query(self, text: str) -> List[float]:
        # Single-text variant: embed one string with the shared client.
        vector = embeddings.embed_query(text)
        return vector
        

In [None]:
# Persist the section texts in a local Chroma collection.
custom_embeddings = sEmbeddingFunction()
persistent_client = chromadb.PersistentClient(path="./chroma1")
collection = persistent_client.get_or_create_collection(
    name="britishgas",
    embedding_function=custom_embeddings,
)

# ids are the section positions as strings; each section keeps the first of its sources.
section_ids = [str(position) for position in range(len(docs))]
section_sources = [{'source': sources[0]} for sources in metadata]
collection.add(ids=section_ids, documents=docs, metadatas=section_sources)

# LangChain wrapper over the same client and collection, used for retrieval below.
vector_store_from_client = Chroma(
    client=persistent_client,
    collection_name="britishgas",
    embedding_function=custom_embeddings,
)


In [None]:
vector_store_from_client.get()

In [None]:
results = vector_store_from_client.similarity_search_with_score("What is coinsurance?")
results

In [None]:
search_kwargs={"score_threshold": 0.3}#{"k":k}
retriever = vector_store_from_client.as_retriever(search_type="similarity_score_threshold",search_kwargs=search_kwargs)
initial_docs=retriever.get_relevant_documents("What is the first step in identifying customer vulnerability?")


In [None]:
initial_docs

In [None]:
results

In [None]:
from langchain_core.documents import Document
from chromadb import Documents, EmbeddingFunction, Embeddings
pairs = [Document(page_content=doc[0].page_content) for doc in results if doc[1]<=1]

In [None]:
pairs

In [None]:
# At this point `results` comes from similarity_search_with_score, whose items
# are (document, score) pairs; a plain similarity_search yields bare documents.
# Handle both so the cell does not fail with AttributeError on the tuples.
for result in results:
    document = result[0] if isinstance(result, tuple) else result
    print(f"Response: {document.page_content}")

## VectorStore Accessing

In [4]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="prudential",
    embedding_function=embeddings,
    persist_directory="./sampledb/prudential",  # Where to save data locally, remove if not neccesary
)

In [5]:
results = vector_store.similarity_search(
    "what is head-on collison?",
    k=4
)

In [6]:
for result in results:
    print(f"Response: {result.page_content}")

Response: title: Introduction to Head-On Collision
What is Head On collison?

A head-on collision occurs when two vehicles travelling in opposite directions (more or less) collide frontally with each other.

Response: title: Causes of Head-On Collision
How does Head-On Collision occur?

Head-on collisions are among the most devastating types of traffic accidents and are often the result of driver error or negligence. Understanding the key contributing factors can help promote awareness and prevention.

1. Distracted Driving
Engaging in activities like texting, using a GPS, or adjusting the radio diverts a driver’s attention from the road. These distractions can lead to lane drifting and significantly increase the risk of a head-on collision.

2. Impaired Driving
Operating a vehicle under the influence of alcohol or drugs impairs judgment, coordination, and reaction time. This makes it more likely for a driver to veer into oncoming traffic.

3. Fatigue
Drowsy driving can lead to lapses 

## Using the VectorDB with LLM

In [None]:
# {
#     "OPENAI_API_TYPE": "azure",
#     "AZURE_OPENAI_ENDPOINT": "https://cxaicoe-openai.openai.azure.com/",
#     "OPENAI_API_KEY": "<REDACTED - supply via environment variable, never commit keys>",
#     "azure_deployment": "cx-aicoe-gpt4",
#     "api_version": "2024-09-01-preview",
#     "model_version": "2024-08-06",
#     "temperature": 0.0,
#     "max_tokens": 1024,
#     "logprobs": false
# }

In [None]:
import os
from langchain_openai import AzureChatOpenAI,AzureOpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
# Point the Azure OpenAI client at the demo resource. The API key for that
# resource must be provided through the EXL_POC_OPENAI_API_KEY environment
# variable: secrets must never be committed inside the notebook (a missing
# variable raises KeyError here).
os.environ["OPENAI_API_TYPE"] = 'azure'
os.environ["AZURE_OPENAI_ENDPOINT"] = 'https://exl-poc-demo.openai.azure.com/'
os.environ["OPENAI_API_KEY"] = os.environ["EXL_POC_OPENAI_API_KEY"]

llm = AzureChatOpenAI(**{'deployment_name':'exl_gpt_4o','openai_api_version':'2024-02-15-preview'},temperature=0.1)

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

prompt =  "Provide a concise answer for the {question} from the retrived documents: {docs}"
prompt_template = PromptTemplate(template=prompt,input_variables=["question","docs"])
chain = prompt_template | llm | StrOutputParser()

# Convert loaded documents into strings by concatenating their content
# and ignoring metadata
def format_docs(docs):
    """Return the page_content of every document, separated by blank lines (metadata is ignored)."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


# chain = {"docs": format_docs} | prompt | llm | StrOutputParser()

question = "what is the difference between simple and traditional ira "

# Retrieve the most similar sections and answer from them. Only the variables the
# prompt declares ("question" and "docs") are passed; the previous extra
# "chat_history" entry referenced a name that is not defined at this point.
docs = vector_store_from_client.similarity_search(question, k=4)
res=chain.invoke({"question":question,"docs":format_docs(docs)})


In [None]:
# Re-run the chain with the prompt's own variables only ("question", "docs");
# `chat_history` is not defined yet at this point and the prompt does not use it.
res=chain.invoke({"question":question,"docs":format_docs(docs)})
print(res)

In [None]:
for doc in docs:
	print(doc.page_content)

## Test

In [None]:
gt = "Yes, there are some restrictions. Funds must be used for qualified education expenses to remain tax-free. Qualified expenses include tuition, fees, books, supplies, and room and board for students enrolled at least half-time. If you use the funds for non-qualified expenses, the earnings will be subject to federal income tax and a 10% penalty."

In [None]:
prompt2 = ChatPromptTemplate.from_template(
   '''Can you just provide a similarity score between these 2 strings, nothing else:
    string1: {gt}
    string2: {rag_output}
    output fromat: {{'score':numeric value for the similarity score}}'''
)

In [None]:
# `rag_output` was never assigned in the notebook, so this cell failed with
# NameError. Score the answer produced by the RAG chain (`res`) against `gt`.
rag_output = res
pr = prompt2.invoke({'gt':gt, 'rag_output':rag_output})

In [None]:
pr.messages[0].content

In [None]:
prompt2.messages

In [None]:
output = llm.invoke(pr.messages[0].content)
output

In [None]:
print(output.content)

## CCH

In [None]:
def get_document_title(document_text: str, document_title_guidance: str = "", max_words: int = None) -> str:
    """
    Extract the title of a document using a language model.

    Args:
        document_text (str): The text of the document.
        document_title_guidance (str, optional): Additional guidance for title extraction,
            inserted into the prompt. Defaults to "".
        max_words (int, optional): If given (and positive) and the document has more
            whitespace-separated words than this, only the text up to that many words is
            sent, and the prompt tells the model it is seeing a truncated document.
            Defaults to None (send the full text, no truncation note).

    Returns:
        str: The extracted document title.
    """
    import re  # local import: only needed for the optional truncation

    # Constants
    DOCUMENT_TITLE_PROMPT = """
	INSTRUCTIONS
	What is the title of the following document?
	
	Your response MUST be the title of the document, and nothing else. DO NOT respond with anything else.
	
	{document_title_guidance}
	
	{truncation_message}
	
	DOCUMENT
	{document_text}
	""".strip()

    TRUNCATION_MESSAGE = """
	Also note that the document text provided below is just the first ~{num_words} words of the document. That should be plenty for this task. Your response should still pertain to the entire document, not just the text provided below.
	""".strip()

    # The truncation note is only sent when the text really was truncated, and with
    # its {num_words} placeholder filled in. Previously it was always sent, with the
    # placeholder left as literal text, even though nothing was truncated.
    truncation_message = ""
    if max_words is not None and max_words > 0:
        word_matches = list(re.finditer(r"\S+", document_text))
        if len(word_matches) > max_words:
            # Cut right after the last kept word so the original whitespace and
            # line breaks of the kept part are preserved.
            document_text = document_text[:word_matches[max_words - 1].end()]
            truncation_message = TRUNCATION_MESSAGE.format(num_words=max_words)

    prompt_template = PromptTemplate(template=DOCUMENT_TITLE_PROMPT,input_variables=["document_text","truncation_message","document_title_guidance"])
    chain = prompt_template | llm | StrOutputParser()
    # Forward the caller's guidance (it used to be replaced by an empty string).
    return chain.invoke({"document_text":document_text,"truncation_message":truncation_message,"document_title_guidance":document_title_guidance})
    
    # return make_llm_call(chat_messages)

In [None]:
chunk = "\n".join(docs[0].page_content.split('\n\n')[2:])
chunk

In [None]:
           ########################testing with sections ##############################

In [None]:
title = get_document_title(chunk)
title

In [None]:
chunk_w_header = f"Document Title: {title}\n\n{chunk}"
print(chunk_w_header)

In [None]:
chunk=sections[3]["content"]

In [None]:
sections[3]

In [None]:
chunk

In [None]:
title=get_document_title(chunk)
title

In [None]:
sections

## Re-Ranking 

In [None]:
from langchain_core.retrievers import BaseRetriever
from sentence_transformers import CrossEncoder
from typing import List, Dict, Any, Tuple

cross_encoder = CrossEncoder('demo_api/model/ms-marco-MiniLM-L-6-v2')

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_into_chunks(text: str, chunk_size: int):
    """Split `text` into a list of strings with RecursiveCharacterTextSplitter.

    The splitter is configured with the given `chunk_size`, no overlap between
    chunks, and `len` as the length function.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0, length_function=len)
    return [piece.page_content for piece in splitter.create_documents([text])]


# vectorstore: Any = Field(description="Vector store for initial retrieval")
# cross_encoder: Any = Field(description="Cross-encoder model for reranking")
# k: int = Field(default=5, description="Number of documents to retrieve initially")
# rerank_top_k: int = Field(default=3, description="Number of documents to return after reranking")

def get_relevant_documents(query,vectorstore,cross_encoder,k,rerank_top_k):
    """
    Retrieve documents for a query and re-rank them with a cross-encoder.

    Args:
        query: the search text.
        vectorstore: object with ``similarity_search(query, k=...)`` returning
            documents that expose ``page_content``.
        cross_encoder: object with ``predict(pairs)`` returning one score per
            ``[query, text]`` pair.
        k: number of documents to fetch from the vector store.
        rerank_top_k: number of documents to keep after re-ranking.

    Returns:
        (documents, scores): the ``rerank_top_k`` highest-scoring documents in
        descending score order, and their scores in the same order. Both lists
        are empty when the vector store returns nothing.
    """
    # Initial retrieval
    initial_docs = vectorstore.similarity_search(query, k=k)
    if not initial_docs:
        # Nothing to score; skip the cross-encoder call on an empty batch.
        return [], []

    # Prepare pairs for cross-encoder
    pairs = [[query, doc.page_content] for doc in initial_docs]

    # Get cross-encoder scores
    scores = cross_encoder.predict(pairs)

    # Sort documents by score, best first, and keep the top of the ranking
    ranked = sorted(zip(initial_docs, scores), key=lambda x: x[1], reverse=True)
    top_ranked = ranked[:rerank_top_k]

    return [doc for doc, _ in top_ranked], [score for _, score in top_ranked]

In [None]:
query = "what is a monetary policy"
k = 7
rerank_top_k = 3

In [None]:
filtered_results,filtered_score = get_relevant_documents(query,vector_store,cross_encoder,k,rerank_top_k)

In [None]:
for result in filtered_results:
    print(f"Response: {result.page_content}")

In [None]:
filtered_score

In [None]:
from scipy.stats import beta
x=beta.cdf(filtered_score[0], 0.4, 0.4)
x

## RSE

In [None]:
def get_best_segments(relevance_values: list, max_length: int, overall_max_length: int, minimum_value: float):
    """
    Greedily pick non-overlapping runs of chunks with the highest total relevance.

    Each round scans every candidate segment and keeps the one with the largest
    sum of relevance values (a constrained variant of the maximum sum subarray
    problem); rounds repeat until the chunk budget is used up or no candidate
    reaches the minimum value.

    Note: this is a simplified implementation intended for demonstration purposes. A more sophisticated implementation would be needed for production use and is available in the dsRAG library.

    Args:
        relevance_values (list): relevance value of each chunk of a document
        max_length (int): maximum length of a single segment, in chunks
        overall_max_length (int): maximum combined length of all segments, in chunks
        minimum_value (float): minimum total value a segment needs to be selected

    Returns:
        best_segments (list): (start, end) index tuples of the selected segments, in
            selection order; the end index is non-inclusive
        scores (list): the total value of each selected segment, in the same order
    """
    chunk_count = len(relevance_values)
    best_segments = []
    scores = []
    used_length = 0

    def _overlaps_selected(lo, hi):
        # True when [lo, hi) intersects a segment that has already been picked.
        for taken_lo, taken_hi in best_segments:
            if lo < taken_hi and hi > taken_lo:
                return True
        return False

    while used_length < overall_max_length:
        # Search for the best segment that can still be added.
        top_segment, top_value = None, -1000
        for lo in range(chunk_count):
            # A segment never starts on a negative-valued chunk.
            if relevance_values[lo] < 0:
                continue
            furthest_end = min(lo + max_length, chunk_count)
            for hi in range(lo + 1, furthest_end + 1):
                # Skip candidates that end on a negative-valued chunk, collide with
                # an earlier pick, or would exceed the overall chunk budget.
                if (
                    relevance_values[hi - 1] < 0
                    or _overlaps_selected(lo, hi)
                    or used_length + hi - lo > overall_max_length
                ):
                    continue

                # A segment is worth the sum of its chunks' relevance values.
                candidate_value = sum(relevance_values[lo:hi])
                if candidate_value > top_value:
                    top_segment, top_value = (lo, hi), candidate_value

        # Stop once nothing valid (or nothing valuable enough) is left.
        if top_segment is None or top_value < minimum_value:
            break

        best_segments.append(top_segment)
        scores.append(top_value)
        used_length += top_segment[1] - top_segment[0]

    return best_segments, scores

In [None]:
# define some parameters and constraints for the optimization
irrelevant_chunk_penalty = 0.2 # empirically, something around 0.2 works well; lower values bias towards longer segments
max_length = 20
overall_max_length = 30
minimum_value = 0.7

# subtract constant threshold value from chunk relevance values
relevance_values = [v - irrelevant_chunk_penalty for v in chunk_values] 

# run the optimization
best_segments, scores = get_best_segments(relevance_values, max_length, overall_max_length, minimum_value)

# print results
print ("Best segment indices")
print (best_segments) # indices of the best segments, with the end index non-inclusive
print ()
print ("Best segment scores")
print (scores)
print ()

## Question Generation

In [None]:
import os
from langchain_openai import AzureChatOpenAI,AzureOpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
# Point the Azure OpenAI client at the demo resource. The API key for that
# resource must be provided through the EXL_POC_OPENAI_API_KEY environment
# variable: secrets must never be committed inside the notebook (a missing
# variable raises KeyError here).
os.environ["OPENAI_API_TYPE"] = 'azure'
os.environ["AZURE_OPENAI_ENDPOINT"] = 'https://exl-poc-demo.openai.azure.com/'
os.environ["OPENAI_API_KEY"] = os.environ["EXL_POC_OPENAI_API_KEY"]

llm = AzureChatOpenAI(**{'deployment_name':'exl_gpt_4o','openai_api_version':'2024-02-15-preview'},temperature=0.1)

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

prompt = """Construct 20 questions from the below provided documents: {docs}

DO NOT construct simple question that is already present in documents or is the title to a topic.
output should be a list of dictionary having keys question and answer.
Output Schema:
[{{'question':str,'answer':str}},....]
where question is the question and answer is its correct response."""
prompt_template = PromptTemplate(template=prompt,input_variables=["docs"])
chain = prompt_template | llm | StrOutputParser()

# Convert loaded documents into strings by concatenating their content
# and ignoring metadata
def format_docs(docs):
    """Join the page_content of the given documents with blank lines between them; metadata is dropped."""
    texts = []
    for doc in docs:
        texts.append(doc.page_content)
    return "\n\n".join(texts)


# chain = {"docs": format_docs} | prompt | llm | StrOutputParser()

# question = "what is the difference between simple and traditional ira "

# docs = vector_store_from_client.similarity_search(question, k=4)



In [None]:
documents

In [None]:
res=chain.invoke({"docs":format_docs(documents)})
print(res)

In [None]:
similarity_prompt_template= PromptTemplate(
    input_variables=["user_input","conversation_history"],
    template="""
	You are an AI assistant that checks if a question has already been asked in the conversation history. If the current question is semantically similar to any previous question or any previous answer, return `yes`. If not, return `no`. **If the answer is `yes`, give the response using the conversation history and rephrase the answer according to the question asked by the user, don't give the response as it is**.
    question:
    {user_input}

    conversation history:
    {conversation_history}
	
""")
llm_chain=similarity_prompt_template|llm|StrOutputParser()

chat_history = ['What is an IRA?', 'An IRA (Individual Retirement Account) is a tax-advantaged savings account designed to help you save for retirement. There are several types of IRAs, each with its own features and benefits.', 'What are the types of it?', "Commercial Banks:\n\nFocused on accepting deposits, making loans, and providing basic financial services to the public.\n\nInvestment Banks:\n\nSpecialize in helping businesses and governments raise capital, managing investments, and providing advisory services for mergers and acquisitions.\n\nCentral Banks:\n\nResponsible for managing a country's currency, money supply, and interest rates (e.g., Federal Reserve in the USA).\n\nRetail Banks:\n\nProvide consumer-oriented services like savings accounts, mortgages, and personal loans.\n\nCredit Unions:\n\nMember-owned financial cooperatives that provide similar services as retail banks but often at more favorable rates."]
query = "Do IRA have any kind of retirement benefit?"

is_similar=llm_chain.invoke({"conversation_history":chat_history,"user_input":query})
print(is_similar)

In [None]:
def check_similarity(user_input, conversation_history):
    """Ask the LLM whether `user_input` repeats something in `conversation_history`.

    The prompt tells the model to reply `yes` or `no` and, on `yes`, to answer
    from the history in rephrased form.

    Args:
        user_input: The current user question.
        conversation_history: Earlier turns; interpolated into the prompt as given.

    Returns:
        The output of the `prompt | llm | StrOutputParser()` pipeline.
    """
    prompt = PromptTemplate(
        template="""
        You are an AI assistant that checks if a question has already been asked in the conversation history. If the current question is semantically similar to any previous question or any previous answer, return `yes`. If not, return `no`. **If the answer is `yes`, give the response using the conversation history and rephrase the answer according to the question asked by the user, don't give the response as it is**.
        question:
        {user_input}

        conversation history:
        {conversation_history}
        """,
        input_variables=["user_input", "conversation_history"],
    )
    pipeline = prompt | llm | StrOutputParser()
    payload = {"conversation_history": conversation_history, "user_input": user_input}
    return pipeline.invoke(payload)

# Example usage: a flat list that alternates user questions and assistant answers.
chat_history = [
    'What is an IRA?', 
    'An IRA (Individual Retirement Account) is a tax-advantaged savings account designed to help you save for retirement. There are several types of IRAs, each with its own features and benefits.', 
    'What are the types of it?', 
    "Commercial Banks:\n\nFocused on accepting deposits, making loans, and providing basic financial services to the public.\n\nInvestment Banks:\n\nSpecialize in helping businesses and governments raise capital, managing investments, and providing advisory services for mergers and acquisitions.\n\nCentral Banks:\n\nResponsible for managing a country's currency, money supply, and interest rates (e.g., Federal Reserve in the USA).\n\nRetail Banks:\n\nProvide consumer-oriented services like savings accounts, mortgages, and personal loans.\n\nCredit Unions:\n\nMember-owned financial cooperatives that provide similar services as retail banks but often at more favorable rates."
]
# New question to compare against the history above.
query = "Do IRA have any kind of retirement benefit?"

# check_similarity takes the question first and the history second.
is_similar = check_similarity(query, chat_history)
print(is_similar)

In [None]:
# Prompt that turns a context-dependent follow-up into a standalone question.
# The template previously ended the history slot with a stray ")" that was
# sent to the model verbatim; it has been removed.
rephrase_question_prompt = PromptTemplate(
    input_variables=["user_input","conversation_history"],
    template="""You are an AI assistant that rephrases the current question based on the conversation history. If the current question references context from the conversation history, rephrase it to be a standalone question that includes the necessary context. Return the rephrased question.
   
	question:
    {user_input}

    conversation history:
    {conversation_history}
"""
)
# prompt -> model -> plain string
llm_chain = rephrase_question_prompt | llm | StrOutputParser()

# Hard-coded history used to exercise the prompt; the last entry is an
# unanswered user question.
chat_history = [
    'What is a bank?',
    "Commercial Banks:\n\nFocused on accepting deposits, making loans, and providing basic financial services to the public.\n\nInvestment Banks:\n\nSpecialize in helping businesses and governments raise capital, managing investments, and providing advisory services for mergers and acquisitions.\n\nCentral Banks:\n\nResponsible for managing a country's currency, money supply, and interest rates (e.g., Federal Reserve in the USA).\n\nRetail Banks:\n\nProvide consumer-oriented services like savings accounts, mortgages, and personal loans.\n\nCredit Unions:\n\nMember-owned financial cooperatives that provide similar services as retail banks but often at more favorable rates.",
    'What are its types?',
]
query = "What are its benefits?"
rephrased_query = llm_chain.invoke({"conversation_history": chat_history, "user_input": query})
print(rephrased_query)

In [None]:
def rephrase_question(user_input, conversation_history):
    """Rewrite `user_input` as a standalone question using `conversation_history`.

    Args:
        user_input: The latest user question, possibly referring to earlier turns.
        conversation_history: Earlier turns; interpolated into the prompt as given.

    Returns:
        The output of the `prompt | llm | StrOutputParser()` pipeline.
    """
    # The template used to close the history slot with a stray ")" that ended
    # up in the text sent to the model; it is removed here.
    rephrase_question_prompt = PromptTemplate(
        input_variables=["user_input", "conversation_history"],
        template="""You are an AI assistant that rephrases the current question based on the conversation history. If the current question references context from the conversation history, rephrase it to be a standalone question that includes the necessary context. Return the rephrased question.
        
        question:
        {user_input}

        conversation history:
        {conversation_history}
        """
    )
    llm_chain = rephrase_question_prompt | llm | StrOutputParser()
    return llm_chain.invoke({"conversation_history": conversation_history, "user_input": user_input})

# Example usage: history whose most recent entries are user questions, so the
# pronoun in `query` has more than one candidate referent.
chat_history = [
    'What is a bank?', 
    "Commercial Banks:\n\nFocused on accepting deposits, making loans, and providing basic financial services to the public.\n\nInvestment Banks:\n\nSpecialize in helping businesses and governments raise capital, managing investments, and providing advisory services for mergers and acquisitions.\n\nCentral Banks:\n\nResponsible for managing a country's currency, money supply, and interest rates (e.g., Federal Reserve in the USA).\n\nRetail Banks:\n\nProvide consumer-oriented services like savings accounts, mortgages, and personal loans.\n\nCredit Unions:\n\nMember-owned financial cooperatives that provide similar services as retail banks but often at more favorable rates.",
    'What are its types?',
	'What is a simple IRA'
]
# Follow-up question that depends on the history for its subject.
query = "What are its benefits"
rephrased_query = rephrase_question(query, chat_history)
print(rephrased_query)

In [None]:
# System instruction for the question-contextualisation step: rewrite the
# latest user question so it stands alone, without answering it.
# Adjacent literals are concatenated into a single line of text.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, formulate a standalone question "
    "which can be understood without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

In [None]:
# Chat prompt for contextualisation: system instruction, then the prior
# messages (filled from the "chat_history" input), then the new question
# (filled from the "input" input).
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

In [None]:
# Wrap the vector store as a retriever with default settings.
# NOTE(review): a later cell queries `vector_store_from_client` instead — confirm which store is intended.
retriever=vector_store.as_retriever()

In [None]:
# Combine the LLM, the retriever and the contextualisation prompt into a
# retriever that takes the chat history into account.
# The prompt above reads the question from "input" and the history from "chat_history".
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

In [None]:
def get_rephrased_query(query, chat_history):
    """Invoke the history-aware retriever for `query` given `chat_history`.

    Args:
        query: The latest user question.
        chat_history: Earlier messages; fills the "chat_history" placeholder
            of the contextualisation prompt.

    Returns:
        Whatever `history_aware_retriever.invoke` returns.
        NOTE(review): per the LangChain docs this retriever yields retrieved
        documents rather than the rewritten question text — confirm that the
        function name matches what callers expect.
    """
    # The contextualisation prompt takes the question from "{input}", so the
    # payload must use the "input" key; a "question" key is never read and the
    # call fails on the missing variable.
    input_data = {
        "input": query,
        "chat_history": chat_history,
    }
    return history_aware_retriever.invoke(input_data)

In [None]:
from langchain.chains import create_retrieval_chain  # Import the create_retrieval_chain function from the langchain.chains module
from langchain.chains.combine_documents import create_stuff_documents_chain

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# QA prompt. The question placeholder is named `input` because the retrieval
# chain built from this prompt supplies the user's question under the "input"
# key; a `{question}` placeholder would be left unfilled there and raise a
# missing-variable error.
qa_system_prompt = "Provide a concise answer for the {input} from the retrieved documents: {context}"
qa_prompt_template = PromptTemplate(template=qa_system_prompt, input_variables=["input", "context"])
chain = qa_prompt_template | llm | StrOutputParser()


def format_docs(docs):
    """Join the `page_content` of each document with blank lines, ignoring metadata."""
    return "\n\n".join(doc.page_content for doc in docs)


# chain = {"docs": format_docs} | prompt | llm | StrOutputParser()

question = "what is the difference between simple and traditional ira "

# Fetch the 4 nearest documents for the question and answer from them.
docs = vector_store_from_client.similarity_search(question, k=4)
# "chat_history" is passed along but the QA prompt has no placeholder for it.
res = chain.invoke({"input": question, "context": format_docs(docs), "chat_history": chat_history})
# print(res)

In [None]:
# Build the answer-generation chain from the LLM and the QA prompt.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt_template)

In [None]:
# Full RAG pipeline: history-aware retrieval followed by the answer chain.
# Invoked below with the keys "input" and "chat_history"; the reply is read from "answer".
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [None]:
from langchain_core.messages import AIMessage, HumanMessage  # message wrappers for both sides of the conversation

chat_history = []  # Running conversation, alternating human and AI messages

# Ask the first question.
first_question = "What is LLM?"
# Send `first_question` itself: the cell previously passed the unrelated
# notebook variable `question`, so the printed query did not match what was asked.
ai_response_1 = rag_chain.invoke({"input": first_question, "chat_history": chat_history})
print('user query:', first_question)
print('ai response:', ai_response_1["answer"])
# Store the answer as an AIMessage; a bare string in a messages placeholder is
# treated as a human turn, which mislabels the assistant's reply.
chat_history.extend([
    HumanMessage(content=first_question),
    AIMessage(content=ai_response_1["answer"]),
])

# Ask the follow-up, which depends on the first exchange for its subject.
second_question = "What are the different types of it?"
ai_response_2 = rag_chain.invoke({"input": second_question, "chat_history": chat_history})
chat_history.extend([
    HumanMessage(content=second_question),
    AIMessage(content=ai_response_2["answer"]),
])
print('user query:', (second_question))
print('ai response:', ai_response_2["answer"])

In [None]:
def rephrase_question(user_input, conversation_history):
    """Rewrite `user_input` as a standalone question using `conversation_history`.

    Args:
        user_input: The latest user question, possibly referring to earlier turns.
        conversation_history: Earlier turns; interpolated into the prompt as given.

    Returns:
        The rephrased question as a string (output of StrOutputParser).
    """
    # The template used to close the history slot with a stray ")" that was
    # sent to the model verbatim; it is removed here.
    rephrase_question_prompt = PromptTemplate(
        input_variables=["user_input", "conversation_history"],
        template="""You are an AI assistant that rephrases the current question based on the conversation history. If the current question references context from the conversation history, rephrase it to be a standalone question that includes the necessary context. Return the rephrased question.
        
        question:
        {user_input}

        conversation history:
        {conversation_history}
        """
    )
    # Use the same prompt | llm | parser pipeline as the rest of the notebook.
    # The legacy LLMChain wrapper returns a dict of inputs plus a "text" key
    # from invoke(), so callers received a dict instead of the question string.
    llm_chain = rephrase_question_prompt | llm | StrOutputParser()
    return llm_chain.invoke({"conversation_history": conversation_history, "user_input": user_input})

# Example usage: the history ends with an unanswered user question.
chat_history = [
    'What is a bank?', 
    "Commercial Banks:\n\nFocused on accepting deposits, making loans, and providing basic financial services to the public.\n\nInvestment Banks:\n\nSpecialize in helping businesses and governments raise capital, managing investments, and providing advisory services for mergers and acquisitions.\n\nCentral Banks:\n\nResponsible for managing a country's currency, money supply, and interest rates (e.g., Federal Reserve in the USA).\n\nRetail Banks:\n\nProvide consumer-oriented services like savings accounts, mortgages, and personal loans.\n\nCredit Unions:\n\nMember-owned financial cooperatives that provide similar services as retail banks but often at more favorable rates.",
    'What are its types?', 
]
# Follow-up question that depends on the history for its subject.
query = "What are its benefits?"
rephrased_query = rephrase_question(query, chat_history)
print(rephrased_query)

In [None]:
from pydantic import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser

In [None]:
# Schema for the similarity check's JSON reply; passed to JsonOutputParser below.
# NOTE(review): documented with `#` comments rather than a docstring because a
# model docstring may be copied into the generated schema — confirm before adding one.
class isimilar_output(BaseModel):
    # String flag; the description asks the model for "True" or "False".
    flag: str = Field(description="return string value True/False. if satisfy then True else False")
    # Answer taken from the conversation history, or empty when there is no match.
    response: str = Field(description="answer to the particular question present in conversation history if is_similar is True or return empty")
# Parser that turns the model's reply into a dict following `isimilar_output`.
parser = JsonOutputParser(pydantic_object=isimilar_output)

# Prompt for the JSON-returning similarity check.
# `format_instructions` was declared as a partial variable but never referenced
# in the template, so the parser's instructions were not sent to the model; the
# template now includes the placeholder. The heading typo "outpiy" is also fixed.
similarity_prompt_template = PromptTemplate(
    input_variables=["user_input", "conversation_history"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
    template="""You are an AI assistant that checks if answer to a question can be fetched from the provided conversation history. If the current question is semantically similar to any previous question or any previous answer, return "True", else return "False".
 If it returns "True", fetch the answer to that particular question **only using the conversation history** and rephrase it according to the question asked and give the response using conversational fillers to make it sound more natural.
    return output as a json having keys flag and response only. If it returns "False", return empty in response. 

    output schema:
    {{flag: str,
    response: str}}

    {format_instructions}
    
 question:
 {user_input}
 
 conversation history:
 {conversation_history}
 """
)
# NOTE: this rebinds the notebook-global `chain`, which an earlier cell uses
# for the QA pipeline.
chain = similarity_prompt_template | llm | parser

In [None]:
# Minimal history for the similarity check: one greeting and its reply.
conversation_history = ['hi', 'Hey there! Looks like you’ve got a question—what’s on your mind?',]

In [None]:
# Repeat the greeting already present in the history above.
user_input = 'hi'

In [None]:
# Run whichever pipeline is currently bound to `chain`; in top-to-bottom order
# that is the similarity chain ending in the JSON parser.
s = chain.invoke({"conversation_history": conversation_history, "user_input": user_input})

In [None]:
# Show the parsed result of the similarity check.
s

In [None]:
You are an AI assistant that checks whether the answer to a question can be fetched from the provided conversation history. If the current question is semantically similar to any previous question or any previous answer, return "True"; otherwise return "False".
 If the result is "True", fetch the answer to that particular question **only using the conversation history**, rephrase it according to the question asked, and give the response using conversational fillers to make it sound more natural.
         Return the output as JSON with only the keys flag and response.