In [None]:
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
import pandas as pd
from langchain.docstore.document import Document

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os
os.environ["OPENAI_API_KEY"] = ""
os.environ["HUGGINGFACEHUB_API_TOKEN"] = ""

In [None]:
llm_gpt = OpenAI(temperature=0.5, max_tokens=1024)

  llm_gpt = OpenAI(temperature=0.5, max_tokens=1024)


In [None]:
# Prompt template
template = """Answer the question using only the provided context. Be concise and provide estimates when requested. If the context is insufficient, state that you lack enough information:
{context}

Question: {question}
"""
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

In [None]:
dataframe500 = pd.read_csv("insurance500.csv")

  dataframe500 = pd.read_csv("D:\LLMs\FProject_2\insurance500.csv")


In [7]:
print(dataframe500.columns)


Index(['category', 'question', 'answer'], dtype='object')


In [None]:
# Preparing documents for FAISS
documents = []
for index, row in dataframe500.iterrows():
    documents.append(Document(page_content=row["answer"], metadata={"category": row["category"]}))

In [None]:
# Generating embeddings
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)

  embeddings = OpenAIEmbeddings()


In [None]:
# Retriever
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})  # Top 3 relevant contexts



In [15]:
# Generate answers using the model
data = {"question": [], "ground_truth": [], "answer_gpt": [], "contexts": []}
for index, row in dataframe500.iterrows():
    query = row["question"]  # Column name for questions
    ground_truth = row["answer"]  # Column name for ground truth answers

    data["question"].append(query)
    data["ground_truth"].append(ground_truth)

    # Retrieve relevant documents
    retrieved_docs = retriever.get_relevant_documents(query)
    context = " ".join([doc.page_content for doc in retrieved_docs])
    data["contexts"].append([doc.page_content for doc in retrieved_docs])

    # Run pipeline with GPT
    gpt_output = llm_gpt(prompt.format(context=context, question=query))
    data["answer_gpt"].append(gpt_output)


  retrieved_docs = retriever.get_relevant_documents(query)
  gpt_output = llm_gpt(prompt.format(context=context, question=query))


In [16]:
data["reference"] = [" ".join(contexts) for contexts in data["contexts"]]  # Combine list into a single string

In [None]:
# Converting to a Dataset
dataset = Dataset.from_dict(data)

In [None]:
# Renaming the ground_truth column to response for compatibility with RAGAs
dataset = dataset.rename_column("ground_truth", "response")


In [None]:
# Evaluation with RAGAs
result = evaluate(
    dataset=dataset,
    metrics=[
        answer_relevancy,
        faithfulness,
        context_precision,
        context_recall,
    ],
)


Evaluating: 100%|██████████| 2000/2000 [16:54<00:00,  1.97it/s]


In [20]:
print("The evaluation result for the RAG system using OpenAI GPT on 500 insurance Q&A pairs:")
result

The evaluation result for the RAG system using OpenAI GPT on 500 insurance Q&A pairs:


{'answer_relevancy': 0.6157, 'faithfulness': 0.9168, 'context_precision': 0.9698, 'context_recall': 0.9128}

In [None]:
# First row of stored data
print({key: value[0] for key, value in data.items()})

{'question': 'What is the average life insurance cost per month?', 'ground_truth': 'The cost of a life insurance policy depends on the type of policy you own. Term plans are generally cheaper in nature as it only offers death benefit with no profits or returns. Traditional plans and unit-linked plans tend to cost more as they offer a wide range of benefits. The cost also depends on the sum assured i.e. a higher sum assured will cost you more and vice versa.', 'answer_gpt': '\nThe average life insurance cost per month can vary greatly depending on various factors such as age, gender, income, smoking habits, type of policy, and sum assured. However, for a 26-year-old male applicant who smokes with an annual salary of Rs.7 lakh and a sum assured of Rs.1 crore, the average premium price for a term insurance plan is Rs.933 per month. It is important to note that the cost of a life insurance policy can increase with age and a higher sum assured will also result in a higher premium.', 'contex

In [None]:
# Exporting data to CSV
output_df = pd.DataFrame(data)
output_df.to_csv("ins500_gen.csv", index=False)