In [1]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# !pip install --upgrade --upgrade-strategy eager transformers accelerate

# !pip install --upgrade --force-reinstall numpy scikit-learn

# !pip install --upgrade --force-reinstall torch torchvision --index-url https://download.pytorch.org/whl/cu121

# !pip install transformers==4.41.2 accelerate==0.30.1 peft==0.10.0

# !pip install numpy==1.26.4

# !pip install -q detoxify
# !pip install -q gradio

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import logging

# Set up basic logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def build_prompt_text(tokenizer, instruction: str, input_text: str) -> str:
    """Prefer chat template when available; fallback to a basic instruction format."""
    try:
        # Phi-3 models use a specific chat template format
        messages = [
            {"role": "user", "content": input_text if input_text else instruction},
        ]
        # The system prompt can be added if desired, but for simple questions, user role is sufficient
        # messages.insert(0, {"role": "system", "content": instruction})
        
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        return prompt
    except Exception as e:
        logger.debug(f"Chat template build failed, falling back to basic prompt: {e}")
        # Fallback for any other model type
        if input_text:
            return f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n"
        else:
            return f"### Instruction:\n{instruction}\n\n### Response:\n"

print("✅ Helper functions defined.")

✅ Helper functions defined.


In [4]:
# Define the base model and the path to your fine-tuned adapter
BASE_MODEL = "microsoft/Phi-3-mini-4k-instruct"
ADAPTER_PATH = "/kaggle/input/insurance-assistant-gpu/pytorch/default/1/insurance-assistant-gpu/" # Corrected path from our discussion

# Configure bitsandbytes for 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the tokenizer from the base model
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Load the base model with the specified quantization configuration
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Load the LoRA adapter and merge it with the base model
model = PeftModel.from_pretrained(model, ADAPTER_PATH)

# Set the padding token if it's not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

print("✅ Model and adapter loaded successfully!")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Model and adapter loaded successfully!


In [5]:
def generate(
    model,
    tokenizer,
    prompt: str,
    device: torch.device,
    max_new_tokens: int = 128,
    temperature: float = 0.7,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.1,
    do_sample: bool = True,
) -> str:
    
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs.get("attention_mask").to(device)

    # Generation settings
    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        repetition_penalty=repetition_penalty,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    if do_sample:
        gen_kwargs.update(dict(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
        ))
    
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **gen_kwargs,
        )

    # Decode only the newly generated tokens
    new_tokens = outputs[0, input_ids.shape[1]:]
    response_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
    
    return response_text.strip()

print("✅ Advanced generation function defined.")

✅ Advanced generation function defined.


In [6]:
# --- Inference Parameters (customize these) ---
instruction = "You are a helpful insurance assistant. Answer the user's question clearly."
input_text = "What is the main difference between term life insurance and whole life insurance?"
max_new_tokens = 320
temperature = 0.7
top_p = 0.95
top_k = 50
repetition_penalty = 1.1
# ---

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the prompt
prompt = build_prompt_text(tokenizer, instruction, input_text)
print("--- PROMPT ---")
print(prompt)
print("\n--- RESPONSE ---")

# Generate and print the response
response = generate(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    device=device,
    max_new_tokens=max_new_tokens,
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
    repetition_penalty=repetition_penalty,
)

print(response)

--- PROMPT ---
<|user|>
What is the main difference between term life insurance and whole life insurance?<|end|>
<|assistant|>


--- RESPONSE ---


2025-09-03 23:08:43.407281: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756940923.429921    2058 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756940923.436881    2058 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Term life insurance provides coverage for a specific period of time, usually 10-30 years while paying low premium cost. If death occur within that specified length then there will be an amount which can go to your beneficiary tax free if I am correct on this answer it may not cover as long but at very cheap price than Whole Life Insurance policy who provide permanent lifelong protection in most case you have no out put from those type until retirement or early into our senior year they tend to last longer with higher premium rate thankfully their cash value accumulate overtime so you don’t necessarily need someone else take care of them when older like some people want do thanks again hope i help!


In [7]:
import torch

# --- Inference Parameters ---
instruction = "You are an expert insurance assistant. Provide a clear and concise explanation suitable for someone learning about life insurance. Compare term life insurance and whole life insurance, focusing on coverage duration, premiums, and cash value. Avoid grammar mistakes and keep the answer professional."
input_text = "What is the main difference between term life insurance and whole life insurance?"
max_new_tokens = 200
temperature = 0.55
top_p = 0.9
top_k = 30
repetition_penalty = 1.2
# -----------------------------

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the prompt
prompt = build_prompt_text(tokenizer, instruction, input_text)
print("--- PROMPT ---")
print(prompt)
print("\n--- RESPONSE ---")

# Generate and print the response
response = generate(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    device=device,
    max_new_tokens=max_new_tokens,
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
    repetition_penalty=repetition_penalty,
)

print(response)


--- PROMPT ---
<|user|>
What is the main difference between term life insurance and whole life insurance?<|end|>
<|assistant|>


--- RESPONSE ---
The primary distinction lies in their duration, cash value accumulation feature. Term Life Insurance provide coverage for a specific time period (either 10 or even as long at up to age85), while Whole-Life policy offer lifetime protection with an investment component that can grow tax deferred overtime if you pay premium into it regularly until your death OR reach some specified point of maturity such like when I turn67 year old my $239 monthly payment will stop but keep me alive till be die from natural cause thereafter any income/cash remaining go back underwrite company who issue original contract then distribute proceeds according how they have been set aside by them either through trustee agent etc... what kind want consider depend upon individual situation financial need health condition family status among other factors so please speak

In [8]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from detoxify import Detoxify

# --- Load Evaluation Models ---
similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
toxicity_detector = Detoxify('original')

# --- Define Metric Functions ---

def calculate_toxicity(texts: list) -> dict:
    """Calculates the mean toxicity score for a list of texts."""
    try:
        results = toxicity_detector.predict(texts)
        return np.mean(results['toxicity'])
    except Exception as e:
        print(f"Toxicity calculation failed: {e}")
        return 0.0

def calculate_semantic_similarity(questions: list, generated_answers: list) -> float:
    """Calculates the cosine similarity between question and answer embeddings."""
    question_embeddings = similarity_model.encode(questions)
    answer_embeddings = similarity_model.encode(generated_answers)
    
    similarities = []
    for q_emb, a_emb in zip(question_embeddings, answer_embeddings):
        similarity = cosine_similarity([q_emb], [a_emb])[0][0]
        similarities.append(similarity)
        
    return np.mean(similarities)

def calculate_domain_relevance(texts: list) -> float:
    """Scores relevance based on the presence of insurance-related keywords."""
    insurance_keywords = [
        'insurance', 'policy', 'premium', 'deductible', 'coverage', 'claim', 
        'liability', 'risk', 'beneficiary', 'insurer', 'auto', 'health', 'life', 'property'
    ]
    relevance_scores = []
    for text in texts:
        text_lower = text.lower()
        keyword_count = sum(1 for keyword in insurance_keywords if keyword in text_lower)
        word_count = len(text.split())
        relevance = keyword_count / max(word_count, 1)
        relevance_scores.append(relevance)
        
    return np.mean(relevance_scores) * 10 # Scale for better readability

print("✅ Evaluation functions and models are ready.")



✅ Evaluation functions and models are ready.


In [9]:
# --- 1. Define a small test dataset ---
test_data = [
    {
        "question": "What is comprehensive auto insurance and what does it cover?",
    },
    {
        "question": "How do insurance deductibles work?",
    },
    {
        "question": "Explain the concept of an insurance premium.",
    }
]

# --- 2. Generate model responses for the test data ---
print("Generating model responses for evaluation...")
generated_responses = []
for item in test_data:
    prompt = build_prompt_text(tokenizer, instruction, item["question"])
    response = generate(
        model=model,
        tokenizer=tokenizer,
        prompt=prompt,
        device=device,
        max_new_tokens=100
    )
    generated_responses.append(response)
    print(f"Q: {item['question']}\nA: {response}\n")

# --- 3. Calculate metrics ---
print("\nCalculating evaluation metrics...")
questions = [item['question'] for item in test_data]

toxicity_score = calculate_toxicity(generated_responses)
relevance_score = calculate_domain_relevance(generated_responses)
similarity_score = calculate_semantic_similarity(questions, generated_responses)

# --- 4. Display Report ---
print("\n" + "="*50)
print("EVALUATION REPORT")
print("="*50)
print(f"Mean Toxicity Score:      {toxicity_score:.4f} (Lower is better)")
print(f"Domain Relevance Score:   {relevance_score:.4f} (Higher is better)")
print(f"Semantic Similarity:      {similarity_score:.4f} (Higher is better)")
print("="*50)

Generating model responses for evaluation...
Q: What is comprehensive auto insurance and what does it cover?
A: Comprehensive coverage provide a broad form of protection for your vehicle that include damage to the car from things such as vandalism, hail storms or falling tree limbs. It also protect you against loss due to animal strike (such as hit by an elk), glass breakage, tire blowout etc...Comprehensive coverage will pay all damages up to $100K including collision which be more specific type of property/casualty claim involving another

Q: How do insurance deductibles work?
A: An insurance deductible is a set amount that you pay for each claim before your coverage begin. The purpose of the deductible be to share risk between yourself and an insurer, as well as help control what cost on average per policyholder in any given year or quarter it also encourage people buy more affordable plan with higher out-of house payment than premium

Q: Explain the concept of an insurance premium.

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


EVALUATION REPORT
Mean Toxicity Score:      0.0009 (Lower is better)
Domain Relevance Score:   0.7965 (Higher is better)
Semantic Similarity:      0.7853 (Higher is better)


In [None]:
import gradio as gr
import time
import threading
import queue
from typing import List

# --- Avatars for the Chatbot ---
user_avatar = "https://cdn-icons-png.flaticon.com/512/3135/3135715.png"  # A generic user icon
ai_avatar = "https://www.shutterstock.com/image-vector/cute-chat-bot-smiling-flat-260nw-2175518705.jpg"  

# --- Helper to format the chat for copying ---
def format_conversation(history: List[List[str]]) -> str:
    if not history:
        return ""
    formatted = []
    for user_msg, bot_msg in history:
        formatted.append(f"👤 User: {user_msg}")
        formatted.append(f"🤖 Assistant: {bot_msg}")
    return "\n\n---\n\n".join(formatted)

# --- The backend generator function that streams the response ---
def respond(
    message: str,
    history: List[List[str]],
    max_new_tokens: int,
    temperature: float,
    top_p: float,
):
    history.append([message, ""])
    yield "", history

    q = queue.Queue()

    def generate_in_thread():
        # --- CORRECTED PROMPT LOGIC ---
        # 1. Build a list of message dictionaries from the history
        instruction = "You are a helpful insurance assistant. Answer clearly and stay on topic."
        messages = [{"role": "system", "content": instruction}]
        for user_msg, bot_msg in history[:-1]: # up to the current turn
            messages.append({"role": "user", "content": user_msg})
            if bot_msg: # Only add assistant message if it exists
                messages.append({"role": "assistant", "content": bot_msg})
        messages.append({"role": "user", "content": message}) # Add the current user message

        # 2. Use the tokenizer's chat template for perfect formatting
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # --- END OF CORRECTION ---

        device = model.device
        output = generate(
            model=model, tokenizer=tokenizer, prompt=prompt, device=device,
            max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p,
            top_k=50, repetition_penalty=1.1, do_sample=True,
        )
        q.put(output)

    thread = threading.Thread(target=generate_in_thread)
    thread.start()

    thinking_animation = ["Thinking.", "Thinking..", "Thinking..."]
    frame_index = 0
    while thread.is_alive():
        history[-1][1] = thinking_animation[frame_index % len(thinking_animation)]
        yield "", history
        time.sleep(0.3)
        frame_index += 1

    final_output = q.get()
    history[-1][1] = final_output
    yield "", history

# --- Custom White + Blue Theme ---
custom_theme = gr.themes.Soft(
    primary_hue="blue", secondary_hue="slate",
).set(
    body_background_fill="white",
    block_background_fill="white",
    block_shadow="0 2px 8px rgba(0,0,0,0.05)",
    block_radius="0.75rem",
    button_primary_background_fill="#2563eb",
    button_primary_background_fill_hover="#1e40af",
    button_primary_text_color="white",
    button_secondary_background_fill="#f1f5f9",
    button_secondary_background_fill_hover="#e2e8f0",
    button_secondary_text_color="#1e293b",
    input_background_fill="white",
    input_border_color="#cbd5e1",
    input_radius="0.5rem",
)

# --- Custom Theme ---
with gr.Blocks(title="Insurance AI Assistant", theme=custom_theme) as demo:
    gr.Markdown(
        "<h1 style='text-align:center; color:#1e3a8a;'>🛡️ Insurance AI Assistant</h1>"
    )
    gr.Markdown(
        "<p style='text-align:center; color:#475569;'>Your modern, professional, and helpful insurance companion</p>"
    )

    chatbot = gr.Chatbot(
        [], elem_id="chatbot", label="Chat Window", bubble_full_width=False,
        height=550, avatar_images=(user_avatar, ai_avatar),
        show_copy_button=False
    )

    # --- Custom CSS to force full-page white background & fix message colors ---
    gr.HTML(
    """
    <style>
        body, .gradio-container {
            background-color: #ffffff !important;
            color: #1e293b !important;
            font-family: 'Inter', 'Segoe UI', Roboto, sans-serif !important;
        }

        /* Chat window */
        #chatbot {
            border: 1px solid #e2e8f0 !important;
            border-radius: 14px !important;
            padding: 8px !important;
            background-color: #ffffff !important;
        }

        /* User messages (soft modern blue) */
        #chatbot .message.user {
            background: linear-gradient(135deg, #3b82f6, #2563eb) !important;
            color: #ffffff !important;
            border-radius: 14px !important;
            padding: 10px 14px !important;
            box-shadow: 0px 2px 6px rgba(59, 130, 246, 0.25) !important;
        }

        /* Bot messages (light neutral contrast) */
        #chatbot .message.bot {
            background-color: #f1f5f9 !important;
            color: #1e293b !important;
            border-radius: 14px !important;
            padding: 10px 14px !important;
            box-shadow: 0px 1px 4px rgba(0,0,0,0.06) !important;
        }

        /* Buttons */
        button {
            border-radius: 12px !important;
            font-weight: 600 !important;
        }
        button.primary {
            background: linear-gradient(135deg, #1e3a8a, #0ea5e9) !important;
            color: white !important;
        }
        button.secondary {
            background-color: #f8fafc !important;
            color: #334155 !important;
            border: 1px solid #cbd5e1 !important;
        }

        /* Accordion + sliders */
        .gr-accordion, .gr-slider {
            border-radius: 12px !important;
        }
    </style>
    """
)

    with gr.Row():
        msg = gr.Textbox(
            scale=4, show_label=False, placeholder="💬 Ask me anything about insurance...",
            container=True,
        )
        submit = gr.Button("✉️ Send", variant="primary", scale=1, min_width=150)
    with gr.Accordion("⚙️ Advanced Generation Settings", open=False):
        max_new_tokens = gr.Slider(32, 512, value=250, step=8, label="Max New Tokens")
        temperature = gr.Slider(0.1, 1.5, value=0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-P")

    with gr.Row():
        clear = gr.Button("🗑️ Clear Chat", variant="secondary")
        copy_btn = gr.Button("📋 Copy Conversation", variant="secondary")
    
    formatted_chat = gr.Textbox(
        label="Formatted Conversation for Copying", visible=False, 
        lines=10, interactive=False,
    )

    inputs = [msg, chatbot, max_new_tokens, temperature, top_p]
    outputs = [msg, chatbot]

    submit.click(respond, inputs=inputs, outputs=outputs)
    msg.submit(respond, inputs=inputs, outputs=outputs)
    clear.click(lambda: (None, None), outputs=[msg, chatbot], queue=False)

    def copy_chat(history):
        formatted_text = format_conversation(history)
        return gr.update(value=formatted_text, visible=True)

    copy_btn.click(copy_chat, inputs=[chatbot], outputs=[formatted_chat])

# --- Launch the app ---
demo.launch(share=True, debug=True)

  chatbot = gr.Chatbot(
  chatbot = gr.Chatbot(


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://569e5c4ae4b0c710b0.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
