In [3]:
# Environment Setup for Local Use

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import random
import json
import re
from dotenv import load_dotenv
import openai

# Load the .env file
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Plotting Defaults
plt.style.use("ggplot")
sns.set(style="whitegrid")

# Version Info
print(f"Python version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

# Local Project Directories
PROJECT_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(PROJECT_DIR, "data")
os.makedirs(DATA_DIR, exist_ok=True)

print(f"Project directory: {PROJECT_DIR}")
print(f"Data directory: {DATA_DIR}")

# Optional: Test OpenAI API connection
def test_openai_connection():
    if not openai.api_key:
        print("API key not found.")
        return False
    try:
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": "Hello, are you working?"}],
            max_tokens=10,
        )
        print("OpenAI API connection successful with GPT-4o")
        return True
    except Exception as e:
        print(f"OpenAI API connection failed: {e}")
        return False

# Uncomment to test connection
# test_openai_connection()

Python version: 3.13.2 (main, Feb  4 2025, 14:51:09) [Clang 16.0.0 (clang-1600.0.26.6)]
Pandas version: 2.2.3
NumPy version: 2.2.4
Project directory: /Users/tom/insurance-sentiment-dashboard
Data directory: /Users/tom/insurance-sentiment-dashboard/data


In [4]:
print(f"Project directory: {PROJECT_DIR}")
print(f"Data directory: {DATA_DIR}")

Project directory: /Users/tom/insurance-sentiment-dashboard
Data directory: /Users/tom/insurance-sentiment-dashboard/data


In [5]:
def classify_sentiment(text: str) -> str:
    try:
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": f"Classify the sentiment of the following insurance-related feedback as positive, negative, or neutral:\n\n{text}"
                }
            ],
            max_tokens=10,
        )

        # Extract the response text
        result = response.choices[0].message.content.strip().lower()
        return result

    except Exception as e:
        print(f"Sentiment classification failed: {e}")
        return "error"

In [6]:
sample_text = "The claims process was super slow and frustrating."
print("Sentiment:", classify_sentiment(sample_text))

Sentiment: negative


In [8]:
import pandas as pd

def validate_sentiments(df, sample_size=10):
    """
    Randomly sample and display sentiment-labeled insurance comments for manual validation.

    Parameters:
        df (pd.DataFrame): The DataFrame with 'comment' and 'sentiment' columns.
        sample_size (int): Number of rows to sample for inspection.

    Returns:
        pd.DataFrame: Sampled rows for review.
    """
    if 'comment' not in df.columns or 'sentiment' not in df.columns:
        raise ValueError("DataFrame must contain 'comment' and 'sentiment' columns.")
    
    sample = df.sample(n=sample_size, random_state=42).reset_index(drop=True)
    return sample[['comment', 'sentiment']]

In [14]:
def validate_sentiments_from_list(comments, sample_size=5):
    import pandas as pd
    sample = comments[:sample_size]
    results = [{"comment": text, "sentiment": classify_sentiment(text)} for text in sample]
    return pd.DataFrame(results)

In [15]:
test_comments = [
    "I’m really happy with how quickly they resolved my issue.",
    "The customer service rep was rude.",
    "I don’t feel strongly either way about this company.",
    # Add more if you have them
]

In [16]:
validate_sentiments_from_list(test_comments)

Unnamed: 0,comment,sentiment
0,I’m really happy with how quickly they resolve...,the sentiment of the feedback is positive.
1,The customer service rep was rude.,negative
2,I don’t feel strongly either way about this co...,neutral


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def retrieve_relevant_comments(query: str, comments: list, top_n: int = 5) -> pd.DataFrame:
    """
    Retrieve top N relevant comments using TF-IDF and cosine similarity.

    Parameters:
        query (str): The user's search query.
        comments (list): A list of insurance-related text comments.
        top_n (int): Number of top relevant results to return.

    Returns:
        pd.DataFrame: Top N matching comments with similarity scores.
    """
    # Combine query and comments into one list
    all_texts = [query] + comments

    # Vectorize using TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(all_texts)

    # Compute cosine similarity between the query and all comments
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    # Build results DataFrame
    results_df = pd.DataFrame({
        'comment': comments,
        'similarity': similarities
    })

    # Return top N results sorted by similarity
    return results_df.sort_values(by='similarity', ascending=False).head(top_n).reset_index(drop=True)

In [18]:
retrieve_relevant_comments("Was the claims process smooth?", test_comments)

Unnamed: 0,comment,similarity
0,The customer service rep was rude.,0.263568
1,I’m really happy with how quickly they resolve...,0.0
2,I don’t feel strongly either way about this co...,0.0


In [19]:
retrieve_relevant_comments("How was your experience with the insurance claims process?", test_comments)

Unnamed: 0,comment,similarity
0,The customer service rep was rude.,0.198428
1,I’m really happy with how quickly they resolve...,0.158253
2,I don’t feel strongly either way about this co...,0.0


In [20]:
import openai

def answer_with_context(query: str, comments: list, top_n: int = 5) -> str:
    """
    Generate an answer to a query using top N relevant insurance comments as context.

    Parameters:
        query (str): The user's question.
        comments (list): A list of insurance-related text comments.
        top_n (int): Number of relevant comments to retrieve.

    Returns:
        str: GPT-generated answer using retrieved context.
    """
    # Retrieve relevant comments
    top_comments_df = retrieve_relevant_comments(query, comments, top_n=top_n)
    context = "\n- ".join(top_comments_df['comment'].tolist())

    prompt = (
        f"You are analyzing insurance customer feedback.\n"
        f"Here are relevant comments:\n- {context}\n\n"
        f"Based on these, answer the following question:\n\n{query}"
    )

    # Call GPT-4o to generate answer
    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=150
    )

    return response.choices[0].message.content.strip()

In [21]:
answer_with_context("How did customers feel about the claims process?", test_comments)

'Based on the comments provided, there is limited specific feedback regarding the claims process itself. However, one customer mentioned being "really happy with how quickly they resolved my issue," which suggests a positive experience related to the resolution of a claim. The other comments do not directly address the claims process but instead focus on neutral and negative experiences with the company and customer service. Therefore, while one customer may have had a positive experience with the claims process, the overall sentiment cannot be fully determined from the limited feedback provided.'