In [5]:
import pandas as pd
import numpy as np
from openai import OpenAI
from elasticsearch import Elasticsearch
from dotenv import load_dotenv
import json
from tqdm.auto import tqdm

In [2]:
load_dotenv()


True

In [3]:
client = OpenAI()

## Loading Data

In [6]:
with open('documents-with-ids.json', 'rt') as f:
    documents= json.load(f)

In [7]:
documents[0]

{'question': 'What services are covered under preventive care?',
 'text': 'Preventive care typically includes annual check-ups, vaccinations, and screenings. Coverage varies, so refer to your policy for specific services.',
 'policy': 'Health Insurance',
 'id': 'ac3af67e'}

## Setting up Elasticsearch

In [15]:
es_client = Elasticsearch('http://localhost:9200') 

In [16]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "question": {"type": "text"},
            "policy": {"type": "keyword"} 
        }
    }
}

index_name = "policy-questions"

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'policy-questions'})

In [18]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/201 [00:00<?, ?it/s]

In [23]:
def elastic_search(query, policy_filter):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "policy": policy_filter
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs


In [36]:
elastic_search(
    query="How do I file a home claim insurance",
    policy_filter="Homeowners Insurance"
)

[{'question': 'How do I file a claim for homeowners insurance?',
  'text': 'To file a claim, contact your insurance provider as soon as possible, provide details about the damage, and submit any necessary documentation, such as photos and repair estimates.',
  'policy': 'Homeowners Insurance',
  'id': '53e76f9b'},
 {'question': 'Do I need homeowners insurance if my home is paid off?',
  'text': "While it's not legally required, having homeowners insurance is highly recommended to protect your investment from potential risks and damages.",
  'policy': 'Homeowners Insurance',
  'id': '84eaa3b8'},
 {'question': 'What should I do if I experience water damage in my home?',
  'text': 'If you experience water damage, report it to your insurance provider immediately, document the damage with photos, and start necessary repairs to prevent further damage.',
  'policy': 'Homeowners Insurance',
  'id': 'cb5d1d56'},
 {'question': 'What is a home inventory, and should I have one?',
  'text': 'A home

## Evaluating Retrieval

In [38]:
df_ground_truth = pd.read_csv('data/ground-truth-data.csv')

In [39]:
df_ground_truth.head()

Unnamed: 0,id,question,policy
0,ac3af67e,What does preventive care usually include in y...,Health Insurance
1,ac3af67e,Are annual health assessments covered under yo...,Health Insurance
2,ac3af67e,Do you offer coverage for vaccinations in your...,Health Insurance
3,ac3af67e,Can you provide information on the screenings ...,Health Insurance
4,ac3af67e,How can I find out which specific preventive s...,Health Insurance


In [40]:
ground_truth = df_ground_truth.to_dict(orient='records')

In [54]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['id']
    results = elastic_search(query=q['question'], policy_filter=q['policy'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/980 [00:00<?, ?it/s]

Hit Rate: One if the original answer is in the Retrieved Documents

In [48]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

Mean reciprocal rank: 1/Rank of the Original ANswer if the Original ANswer is in the Retrieved Documents

In [49]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [55]:
hit_rate(relevance_total), mrr(relevance_total)

(0.8183673469387756, 0.6226530612244899)

## RAG Flow Implementation

In [28]:
def build_prompt(query, search_results):
    prompt_template = """
You are CIC's insurance policy assistant. CIC Insurance Group Limited, commonly referred to as CIC Group, 
is a Kenyan insurance and investment group that operates mainly in Kenya, Uganda, South Sudan and Malawi. 
The group's headquarters are located in CIC Plaza, Mara Road, Upperhill in Nairobi, Kenya. 
CIC Insurance Group is leading Micro and Co-operative insurance provider in Kenya with its subsidiaries involved in fund, 
Reits and Asset Management, general insurance, medical insurance and life Assurance, pension and annuities.
Answer the customer's question based on the provided context.
Rules:
1.If the Question is a greeting or an appreciation comment, respond accordingly in a polite manner but never step out of your context.
2. Only use information from the provided context
3. If information is not in context, acknowledge and suggest contacting customer service
4. Be clear and professional
5. If query indicates urgency, note that in response

CUSTOMER INQUIRY: {customer_inquiry}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context += f"Policy Type: {doc.get('policy', 'N/A')}\nFAQ Question: {doc.get('question', 'N/A')}\nAnswer: {doc.get('text', 'N/A')}\n\n"
    
    prompt = prompt_template.format(customer_inquiry=query, context=context).strip()
    return prompt

def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [29]:
def rag(query):
    search_results = elastic_search(query, "Health Insurance")
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [35]:
query = 'Tell me about Health insurance policy?'
rag(query)

"Health insurance policies generally cover a variety of healthcare services, including doctor visits, hospital stays, prescription medications, preventive care, and medical tests. Many plans also cover mental health services, such as therapy and counseling. However, dental coverage is typically not included in standard health insurance, though it can often be added as an optional benefit. Coverage for prescription medications and mental health services may vary, so it's important to review your specific policy details to understand the exact coverage and any limitations. If you have further questions or want to explore add-on options like dental coverage, it's recommended to contact your insurance provider directly."

In [31]:
def detect_policy_type(query):
    """Detect the insurance policy type from the query"""
    policy_keywords = {
        "Health Insurance": ["health", "medical", "doctor", "hospital", "prescription", "preventive"],
        "Auto Insurance": ["car", "vehicle", "auto", "accident", "collision", "comprehensive"],
        "Home Insurance": ["home", "house", "property", "damage", "theft", "flood"]
    }
    
    query_lower = query.lower()
    for policy_type, keywords in policy_keywords.items():
        if any(keyword in query_lower for keyword in keywords):
            return policy_type
    
    return "General"  

In [32]:
def enhanced_rag(query):
    """Enhanced RAG pipeline with policy detection and multiple context handling"""
    # Detect policy type
    policy_type = detect_policy_type(query)
    
    # Get relevant documents from Elasticsearch
    if policy_type == "General":
        # Search across all policy types if no specific type is detected
        search_results = []
        for policy in ["Health Insurance", "Auto Insurance", "Home Insurance"]:
            results = elastic_search(query, policy)
            search_results.extend(results[:2])  # Take top 2 from each policy type
    else:
        search_results = elastic_search(query, policy_type)
    
    # Build prompt with enhanced context
    prompt = build_enhanced_prompt(query, search_results, policy_type)
    
    # Get LLM response
    answer = llm(prompt)
    
    return {
        "answer": answer,
        "policy_type": policy_type,
        "context_used": search_results
    }

def build_enhanced_prompt(query, search_results, policy_type):
    """Build an enhanced prompt with better context structuring"""
    prompt_template = """
You are CIC's insurance policy assistant. Answer the customer's question based on the provided context.
Follow these rules:
1. Only use information from the provided context
2. If information is not in the context, acknowledge that and suggest contacting customer service
3. Be clear and concise, but maintain a helpful and professional tone
4. If the query indicates urgency or distress, note that in your response

POLICY TYPE: {policy_type}

CUSTOMER QUERY: {query}

RELEVANT CONTEXT:
{context}

RESPONSE GUIDELINES:
- Focus on addressing the specific question
- Include relevant policy details from the context
- Mention any important limitations or conditions
- If escalation is needed, suggest appropriate next steps

Please provide your response:
""".strip()

    # Format context from search results
    formatted_context = ""
    for i, doc in enumerate(search_results, 1):
        formatted_context += f"\nSource {i}:\n"
        formatted_context += f"Policy: {doc.get('policy', 'N/A')}\n"
        formatted_context += f"Q: {doc.get('question', 'N/A')}\n"
        formatted_context += f"A: {doc.get('text', 'N/A')}\n"

    return prompt_template.format(
        policy_type=policy_type,
        query=query,
        context=formatted_context
    )

In [33]:
def needs_escalation(query, answer):
    """Determine if the query needs escalation to human agent"""
    escalation_triggers = {
        'urgency_keywords': ['emergency', 'urgent', 'immediate', 'critical', 'asap'],
        'distress_keywords': ['help', 'worried', 'concerned', 'upset', 'angry'],
        'complex_keywords': ['claim dispute', 'policy cancellation', 'coverage denial'],
    }
    
    query_lower = query.lower()
    
    # Check for urgency or distress
    for keyword_type, keywords in escalation_triggers.items():
        if any(keyword in query_lower for keyword in keywords):
            return True, f"Escalated due to {keyword_type}"
    
    # Check if answer indicates uncertainty
    uncertainty_phrases = ['contact customer service', 'cannot provide', 'don\'t have information']
    if any(phrase in answer.lower() for phrase in uncertainty_phrases):
        return True, "Escalated due to incomplete information"
        
    return False, None

def process_query(query):
    """Main function to process queries with escalation handling"""
    # Get RAG response
    rag_response = enhanced_rag(query)
    answer = rag_response['answer']
    
    # Check if escalation is needed
    needs_escalate, reason = needs_escalation(query, answer)
    
    if needs_escalate:
        escalation_response = {
            "answer": answer,
            "escalated": True,
            "escalation_reason": reason,
            "next_steps": "This query will be escalated to our customer service team. "
                         "For urgent matters, please contact our 24/7 support line.",
            "context": rag_response
        }
        return escalation_response
    
    return {
        "answer": answer,
        "escalated": False,
        "context": rag_response
    }

In [34]:
# Test cases
test_queries = [
    "What does my health insurance cover?",
    "I had a car accident, need urgent help!",
    "How do I file a home insurance claim?",
    "Can you explain my policy deductible?"
]

for query in test_queries:
    print(f"\nQuery: {query}")
    response = process_query(query)
    print(f"Escalated: {response['escalated']}")
    print(f"Answer: {response['answer']}")
    if response['escalated']:
        print(f"Escalation Reason: {response['escalation_reason']}")
    print("-" * 50)


Query: What does my health insurance cover?
Escalated: False
Answer: Your health insurance covers general doctor visits, prescriptions, and emergency care. For detailed information regarding specific coverage, including any limitations or additional services like mental health or preventive services, please refer to your policy document. If you need further assistance or have any urgent concerns, I recommend contacting our customer service for more personalized support.
--------------------------------------------------

Query: I had a car accident, need urgent help!
Escalated: True
Answer: I'm sorry to hear about your accident and I understand that you're in urgent need of help. Here's what you should do immediately:

1. Ensure that everyone is safe and call emergency services if needed.
2. Exchange information with other drivers involved in the accident.
3. Document the accident details.
4. Report the accident to your insurance provider within 24 hours to start the claims process.

