In [None]:
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
import pandas as pd
from langchain.docstore.document import Document

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os
# Set OpenAI API Key
os.environ["OPENAI_API_KEY"] = ""
os.environ["HUGGINGFACEHUB_API_TOKEN"] = ""

In [None]:
# Defining the LLMs
llm_gpt = OpenAI(temperature=0.5, max_tokens=1024)




  llm_gpt = OpenAI(temperature=0.5, max_tokens=1024)


In [None]:
# Prompt template
template = """Answer the question using only the provided context. Be concise and provide estimates when requested. If the context is insufficient, state that you lack enough information:
{context}

Question: {question}
"""
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

In [None]:

dataframe1000 = pd.read_csv("D:\LLMs\FProject_2\insurance1000.csv")

  dataframe1000 = pd.read_csv("D:\LLMs\FProject_2\insurance1000.csv")


In [7]:
print(dataframe1000.columns)


Index(['category', 'question', 'answer'], dtype='object')


In [8]:
# Prepare documents for FAISS
documents = []
for index, row in dataframe1000.iterrows():
    documents.append(Document(page_content=row["answer"], metadata={"category": row["category"]}))

In [9]:
# Generate embeddings and create FAISS database
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)

  embeddings = OpenAIEmbeddings()


In [10]:
# Define the retriever
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})  # Top 3 relevant contexts



In [11]:
# Generate answers using the model
data = {"question": [], "ground_truth": [], "answer_gpt": [], "contexts": []}
for index, row in dataframe1000.iterrows():
    query = row["question"]  # Column name for questions
    ground_truth = row["answer"]  # Column name for ground truth answers

    data["question"].append(query)
    data["ground_truth"].append(ground_truth)

    # Retrieve relevant documents
    retrieved_docs = retriever.get_relevant_documents(query)
    context = " ".join([doc.page_content for doc in retrieved_docs])
    data["contexts"].append([doc.page_content for doc in retrieved_docs])

    # Run pipeline with GPT
    gpt_output = llm_gpt(prompt.format(context=context, question=query))
    data["answer_gpt"].append(gpt_output)


  retrieved_docs = retriever.get_relevant_documents(query)
  gpt_output = llm_gpt(prompt.format(context=context, question=query))


In [12]:
data["reference"] = [" ".join(contexts) for contexts in data["contexts"]]  # Combine list into a single string

In [13]:
# Convert data to a Dataset
dataset = Dataset.from_dict(data)

In [14]:
# Rename the ground_truth column to response for compatibility with RAGAs
dataset = dataset.rename_column("ground_truth", "response")


In [18]:
# Evaluate with RAGAs
result = evaluate(
    dataset=dataset,
    metrics=[
        answer_relevancy,
        faithfulness,
        context_precision,
        context_recall,
    ],
)


Evaluating: 100%|██████████| 4428/4428 [39:53<00:00,  1.85it/s] 


In [19]:
print("The evaluation result for the RAG system using OpenAI GPT on 1000 insurance Q&A pairs:")
result

The evaluation result for the RAG system using OpenAI GPT on 1000 insurance Q&A pairs:


{'answer_relevancy': 0.5464, 'faithfulness': 0.6061, 'context_precision': 0.9691, 'context_recall': 0.9069}

In [20]:
# Display the first row of stored data
print({key: value[0] for key, value in data.items()})

{'question': 'What is the average life insurance cost per month?', 'ground_truth': 'The cost of a life insurance policy depends on the type of policy you own. Term plans are generally cheaper in nature as it only offers death benefit with no profits or returns. Traditional plans and unit-linked plans tend to cost more as they offer a wide range of benefits. The cost also depends on the sum assured i.e. a higher sum assured will cost you more and vice versa.', 'answer_gpt': '\nUnfortunately, it is impossible to give an average cost for life insurance as it varies greatly depending on multiple factors such as age, coverage amount, risk classification, and type of policy. However, for a 60-year-old individual, a 10-year term insurance policy with a coverage amount of $100,000 can range from $28 to over $100 per month, depending on their risk classification. It is important to consult with an experienced life insurance broker to determine the best coverage and price for your specific needs

In [21]:
# Export data to CSV
output_df = pd.DataFrame(data)
output_df.to_csv("ins1000_gen.csv", index=False)

In [22]:
print(f"GPT Output: {gpt_output}")


GPT Output: 
Yes, a universal life insurance policy can expire if it is not maintained properly or if the policyholder chooses to cancel it. However, many universal life insurance policies come with a no-lapse guarantee, which ensures that the policy will remain in force as long as the premiums are paid. This is different from term life insurance policies, which have a set expiration date.


In [None]:

print("Keys (columns) in the data dictionary:")
print(data.keys())


Keys (columns) in the data dictionary:
dict_keys(['question', 'ground_truth', 'answer_gpt', 'contexts', 'reference'])
