## Imports

In [1]:
import pandas as pd
import numpy as np
import hashlib
import json
from collections import defaultdict
import os
from groq import Groq
from dotenv import load_dotenv
from tqdm.auto import tqdm
from openai import OpenAI


In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the API keys read below are kept — confirm.
load_dotenv()


True

In [49]:
# Read the Groq API key from the environment; the value is None when the
# variable is not set.
groq_api_key = os.environ.get("GROQ_API_KEY")

## Loading Data

In [3]:
# Load the FAQ records from disk. The encoding is passed explicitly so the
# result does not depend on the platform's default text encoding.
with open('data.json', 'rt', encoding='utf-8') as f:
    documents = json.load(f)

In [4]:
# Inspect one record to see which fields a document carries.
documents[10]

{'question': 'Does homeowners insurance cover mold damage?',
 'text': 'Mold damage is often covered if caused by a sudden event, like water damage from a burst pipe. Gradual mold growth is typically not covered.',
 'policy': 'Homeowners Insurance'}

## Generating unique ids for each document

In [5]:
def generate_document_id(doc):
    """Return a short content-derived identifier for a document.

    The policy, the question and the first 10 characters of the answer text
    are joined with dashes and MD5-hashed; the first 8 hex digits of the
    digest form the id. Documents that agree on all three parts therefore
    receive the same id.
    """
    key = "{}-{}-{}".format(doc['policy'], doc['question'], doc['text'][:10])
    digest = hashlib.md5(key.encode()).hexdigest()
    return digest[:8]

In [7]:
# Attach the generated id to every document in place.
for doc in documents:
    doc.update(id=generate_document_id(doc))

In [8]:
# Confirm that a document now carries an 'id' field.
documents[3]

{'question': 'Does health insurance cover prescription medications?',
 'text': 'Prescription medication coverage varies by plan. Many plans cover a range of medications, subject to copayments or deductibles.',
 'policy': 'Health Insurance',
 'id': '1dc03e8e'}

In [9]:
# Group documents by id so that ids shared by several documents can be spotted.
hashes = defaultdict(list)
for doc in documents:
    hashes[doc['id']].append(doc)

In [10]:
# Compare the number of distinct ids with the number of documents;
# a smaller first value means some documents share an id.
len(hashes), len(documents)

(199, 201)

In [34]:
# Print every id that is shared by more than one document, with its count.
for k, values in hashes.items():
    group_size = len(values)
    if group_size <= 1:
        continue
    print(k, group_size)

8ade9655 2
c1ec38d4 2


In [35]:
# Show the documents that collided on this id.
hashes['c1ec38d4']

[{'question': 'What does homeowners insurance cover?',
  'text': 'Homeowners insurance covers damages to your home and personal property due to events like fire, theft, and some natural disasters. It may also include liability protection.',
  'policy': 'Homeowners Insurance',
  'id': 'c1ec38d4'},
 {'question': 'What does homeowners insurance cover?',
  'text': 'Homeowners insurance covers your home and personal property against risks like fire, theft, and liability. Specifics vary by policy.',
  'policy': 'Homeowners Insurance',
  'id': 'c1ec38d4'}]

In [36]:
# Show the documents that collided on this id.
hashes['8ade9655']

[{'question': 'What is uninsured motorist coverage?',
  'text': "Uninsured motorist coverage helps pay for damages if you're hit by a driver without insurance or insufficient coverage.",
  'policy': 'Auto Insurance',
  'id': '8ade9655'},
 {'question': 'What is uninsured motorist coverage?',
  'text': "Uninsured motorist coverage protects you if you're in an accident with a driver who doesn't have insurance. It may also cover underinsured drivers.",
  'policy': 'Auto Insurance',
  'id': '8ade9655'}]

In [37]:
# Persist the documents, now including their ids, as indented JSON.
serialized = json.dumps(documents, indent=2)
with open('documents-with-ids.json', 'wt') as f_out:
    f_out.write(serialized)

In [11]:
# Prompt sent to the model for each FAQ record. The {policy}, {question} and
# {text} placeholders are filled from a document via str.format; .strip()
# removes the leading and trailing newlines of the triple-quoted literal.
prompt_template = """
You are emulating a customer interested in CIC's insurance policies. 
Formulate 5 questions this customer might ask based on an FAQ record. 
Each question should be complete, concise, and avoid using too many words directly from the record.

The record:

policy: {policy}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()


In [12]:
# Render the template for the first document to preview the final prompt.
first_record = documents[0]
prompt = prompt_template.format(**first_record)
print(prompt)

You are emulating a customer interested in CIC's insurance policies. 
Formulate 5 questions this customer might ask based on an FAQ record. 
Each question should be complete, concise, and avoid using too many words directly from the record.

The record:

policy: Health Insurance
question: What services are covered under preventive care?
answer: Preventive care typically includes annual check-ups, vaccinations, and screenings. Coverage varies, so refer to your policy for specific services.

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]


In [13]:
# Client used for the chat-completion requests below.
# NOTE(review): no key is passed here, so credentials presumably come from the
# environment — confirm.
client = OpenAI()

In [14]:
def llm(prompt):
    """Send ``prompt`` as a single user message to gpt-4o and return the reply text."""
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model='gpt-4o', messages=messages)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [15]:
# Try the prompt once on the sample record before running it over all documents.
quiz=llm(prompt)


In [16]:
# Show the raw reply text returned by the model.
print(quiz)


[
  "What is included in the preventive care benefits of your health insurance?",
  "Are vaccinations part of the services covered under preventive care?",
  "Do annual health check-ups fall under preventive care coverage?",
  "Can you explain what screenings are covered as preventive care services?",
  "How can I find detailed information about preventive care services in my policy?"
]


In [18]:
def generate_questions(doc):
    """Ask the model for customer-style questions about one FAQ record.

    Args:
        doc: mapping that provides the ``policy``, ``question`` and ``text``
            fields referenced by ``prompt_template``; any extra keys are
            ignored by ``str.format``.

    Returns:
        The raw reply text as returned by ``llm``. The prompt asks for a JSON
        array of strings, but the reply is not parsed or validated here.
    """
    # Reuse the shared request helper instead of repeating the same API call.
    return llm(prompt_template.format(**doc))


In [17]:
from tqdm.auto import tqdm

In [31]:
# Generate questions for every document and flatten them into one record per
# question: {"id", "question", "policy"}.
results = []

# Ids that already have at least one question in `results`. A set gives a
# constant-time check instead of rescanning `results` for every document.
# An id is only recorded once questions were actually stored, so a later
# document sharing the id of a failed one is still attempted.
processed_ids = set()

for doc in tqdm(documents):
    doc_id = doc['id']
    policy = doc['policy']
    if doc_id in processed_ids:
        continue

    questions_raw = generate_questions(doc)

    # Nothing came back from the model for this document.
    if not questions_raw:
        print(f"Error: Empty response for doc_id {doc_id}")
        continue

    # The reply is expected to be a JSON array of question strings.
    try:
        questions = json.loads(questions_raw)
    except json.JSONDecodeError as e:
        print(f"JSON decoding failed for doc_id {doc_id}: {e}")
        continue

    # Valid JSON that is not a list is reported and skipped.
    if not isinstance(questions, list):
        print(f"Unexpected data format for doc_id {doc_id}")
        continue

    for question in questions:
        results.append({"id": doc_id, "question": question, "policy": policy})
    if questions:
        processed_ids.add(doc_id)

  0%|          | 0/201 [00:00<?, ?it/s]

Unexpected data format for doc_id 9171895c
Unexpected data format for doc_id cb2f95ef
Unexpected data format for doc_id afdeb119


In [95]:
# `results` is a flat list of {"id", "question", "policy"} records, so it cannot
# be indexed by a document id; collect the questions generated for one document.
[r['question'] for r in results if r['id'] == '7fc6cf18']

['Can I include dependents in my health insurance policy?',
 'When can I add dependents to my health insurance plan?',
 'Are there specific events that allow me to add dependents to my health insurance?',
 'What are the qualifying life events for adding dependents to my health insurance?',
 'How does the open enrollment period impact adding dependents to my health insurance plan?']

In [23]:
# Look up documents by id; when several documents share an id the last one wins.
doc_index = dict((d['id'], d) for d in documents)


In [32]:
# One row per generated question, built from the list of result records.
df = pd.DataFrame(results)

In [33]:
# Preview the first rows of the ground-truth table.
df.head()

Unnamed: 0,id,question,policy
0,ac3af67e,What does preventive care usually include in y...,Health Insurance
1,ac3af67e,Are annual health assessments covered under yo...,Health Insurance
2,ac3af67e,Do you offer coverage for vaccinations in your...,Health Insurance
3,ac3af67e,Can you provide information on the screenings ...,Health Insurance
4,ac3af67e,How can I find out which specific preventive s...,Health Insurance


In [34]:
# Write the ground-truth table to CSV without the row index column.
# NOTE(review): the preview shown after this cell has columns
# question,course,document, which do not match df's id,question,policy —
# it looks like stale output from another dataset; confirm and clear.
df.to_csv('ground-truth-data.csv', index=False)

question,course,document
When does the course begin?,data-engineering-zoomcamp,c02e79ef
How can I get the course schedule?,data-engineering-zoomcamp,c02e79ef
What is the link for course registration?,data-engineering-zoomcamp,c02e79ef
How can I receive course announcements?,data-engineering-zoomcamp,c02e79ef
Where do I join the Slack channel?,data-engineering-zoomcamp,c02e79ef
Where can I find the prerequisites for this course?,data-engineering-zoomcamp,1f6520ca
How do I check the prerequisites for this course?,data-engineering-zoomcamp,1f6520ca
Where are the course prerequisites listed?,data-engineering-zoomcamp,1f6520ca
What are the requirements for joining this course?,data-engineering-zoomcamp,1f6520ca
