In [1]:
import os
import warnings

# Suppress tqdm IProgress warning in Jupyter (optional: pip install ipywidgets for widget progress bars)
try:
    from tqdm import TqdmWarning
    warnings.filterwarnings("ignore", category=TqdmWarning)
except ImportError:
    pass

import pandas as pd
from openai import OpenAI

In [3]:
# Read the cleaned support-case export from CSV.
case_data = pd.read_csv('data/dummy_data_clean.csv')

# Later cells index every row by "case_id" and "case_text_lower"; fail fast here
# with a readable message instead of a bare KeyError during conversation building.
_missing_columns = {"case_id", "case_text_lower"} - set(case_data.columns)
if _missing_columns:
    raise KeyError(
        f"data/dummy_data_clean.csv is missing required column(s): {sorted(_missing_columns)}"
    )

In [None]:
# convert case_data.case_text_lower to Kura Conversation object

from kura.types import Message, Conversation
from datetime import datetime
from rich import print


def process_case_row(row) -> Conversation:
    """Build a single-message Kura Conversation from one ``case_data`` row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) exposing ``case_id`` and
            ``case_text_lower``.

    Returns:
        A Conversation whose ``chat_id`` is the stringified case id and whose only
        message is a ``user`` message holding the case text (an empty string when
        the text is missing/NaN). The case id is also stored under
        ``metadata["case_id"]``.
    """
    case_id = row["case_id"]
    # Rows taken from a DataFrame can carry numpy scalars (e.g. numpy.int64).
    # Unwrap 0-d numpy values so the metadata holds plain Python objects.
    if getattr(case_id, "ndim", None) == 0 and hasattr(case_id, "item"):
        case_id = case_id.item()

    raw_text = row["case_text_lower"]
    # Missing text becomes ""; anything else is coerced to str for the message body.
    case_text = str(raw_text) if pd.notna(raw_text) else ""

    # Take the timestamp once so the conversation and its message agree exactly.
    # TODO: replace with the case's real creation time (case_data.created_at).
    created_at = datetime.now()

    return Conversation(
        chat_id=str(case_id),
        created_at=created_at,
        messages=[
            Message(
                created_at=created_at,
                # product = case_data.product
                role="user",
                content=case_text,
            )
        ],
        metadata={"case_id": case_id},
    )


# Turn every row of case_data into a Conversation (positional access, row by row),
# then display the first result as a quick sanity check.
conversations = list(
    map(
        process_case_row,
        (case_data.iloc[position] for position in range(len(case_data))),
    )
)
print(conversations[0])


In [None]:
from kura.checkpoints import JSONLCheckpointManager
from kura.summarisation import summarise_conversations, SummaryModel
from kura.cluster import generate_base_clusters_from_conversation_summaries, ClusterDescriptionModel
from kura.meta_cluster import reduce_clusters_from_base_clusters, MetaClusterModel
from kura.dimensionality import reduce_dimensionality_from_clusters, HDBUMAP




In [None]:
# Model setup from config: switch between OpenAI / Bedrock (summary) and OpenAI / local (embeddings)
import config
from config import (
    SUMMARIZATION_BACKEND,
    SUMMARIZATION_MODEL,
    EMBEDDING_BACKEND,
    EMBEDDING_MODEL,
    MAX_CLUSTERS,
    get_checkpoint_dir,
)
from kura.embedding import OpenAIEmbeddingModel, SentenceTransformerEmbeddingModel

# Resolve the checkpoint location once via config.get_checkpoint_dir() and keep it as a
# plain string; the JSONLCheckpointManager instances created in later cells receive it.
CHECKPOINT_DIRECTORY = str(get_checkpoint_dir())


def get_summary_model():
    """Create the SummaryModel selected by the notebook config.

    The model id follows the Instructor "provider/model-id" convention, e.g.
    "openai/gpt-4o-mini" or
    "bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0" (Bedrock additionally
    needs AWS credentials / region to be set).

    Any SUMMARIZATION_BACKEND value other than "bedrock" selects the OpenAI provider.
    """
    provider = "bedrock" if SUMMARIZATION_BACKEND == "bedrock" else "openai"
    return SummaryModel(
        model=f"{provider}/{SUMMARIZATION_MODEL}",
        max_concurrent_requests=10,
    )


def get_embedding_model():
    """Create the embedding model selected by the notebook config.

    EMBEDDING_BACKEND == "local" selects a SentenceTransformer model
    (pip install sentence-transformers; e.g. "all-mpnet-base-v2", "all-MiniLM-L6-v2").
    Every other value selects the OpenAI embedding API
    (e.g. "text-embedding-3-small", "text-embedding-3-large").
    """
    if EMBEDDING_BACKEND != "local":
        # Remote embeddings through the OpenAI API.
        return OpenAIEmbeddingModel(
            model_name=EMBEDDING_MODEL,
            model_batch_size=50,
            n_concurrent_jobs=5,
        )

    # Local embeddings; switch device to "cuda" when a GPU is available.
    return SentenceTransformerEmbeddingModel(
        model_name=EMBEDDING_MODEL,
        model_batch_size=128,
        device="cpu",
    )


# Optional: echo the active configuration so the run is self-describing.
active_settings = (
    f"Summary: {SUMMARIZATION_BACKEND} / {SUMMARIZATION_MODEL}",
    f"Embedding: {EMBEDDING_BACKEND} / {EMBEDDING_MODEL}",
    f"Checkpoints: {CHECKPOINT_DIRECTORY}",
)
for setting_line in active_settings:
    print(setting_line)

In [32]:
# Custom summary prompt (domain-specific for support cases).
# Ref: https://usekura.xyz/notebooks/how-to-look-at-data/02_summaries_task/#defining-our-new-summary-model
#
# Usage: pass to analyze_conversations(..., prompt=CASE_SUMMARY_PROMPT); leave prompt=None
# to fall back to the Kura default.
# To extend the default instead of replacing it:
#   from kura.summarisation import DEFAULT_SUMMARY_PROMPT
#   prompt = DEFAULT_SUMMARY_PROMPT + " Your extra instructions."
#
# The <messages> section uses Jinja-style template tags over `conversation.messages`;
# presumably Kura renders them once per conversation — TODO confirm against the Kura docs.
# NOTE(review): every conversation built in this notebook carries a single "user" message,
# so the "how it was resolved" and "assistant_errors" asks depend on the case text alone.

CASE_SUMMARY_PROMPT = """
Analyze the support case conversation to provide a focused summary for clustering.

In your response:

1. Identify the product area and case type, such as:
   - Payroll (submission, deadlines, earning codes, direct deposit)
   - Tax (notices, rate changes, SUIs, account maintenance)
   - HR & payroll (transfers, employee setup, supervisor visibility, time & labor)
   - Billing (invoices, fees, discrepancies)

2. State the user's concrete need in 20–30 words: what they asked for and how it was resolved (or not).

Format:
"User needed [product area]: [specific request or issue]. [Resolution type: e.g. guided to self-resolve, escalated, how-to.]"

Reference the conversation below:
<messages>
{% for message in conversation.messages %}
<message>{{message.role}}: {{message.content}}</message>
{% endfor %}
</messages>

You must also return these structured fields (they are captured on each summary):

- **summary**: (Use the format above; concise, no PII.)
- **request**: What is the user's overall request? Start with "The user's overall request for the assistant is to"
- **task**: What task is being performed? Start with "The task is to"
- **topic**: Optional short topic label (e.g. "payroll", "tax", "hr").
- **languages**: List of main languages (human and/or programming). Use lowercase full names.
- **concerning_score**: Integer 1–5 (1=benign, 5=immediate review).
- **user_frustration**: Integer 1–5 (1=happy, 5=extremely frustrated).
- **assistant_errors**: What did the assistant do wrong, if anything? (e.g. "Misunderstood intent", "Outdated information".) Use null if none.
"""

In [33]:
async def analyze_conversations(
    conversations,
    checkpoint_manager,
    *,
    response_schema=None,
    prompt=None,
):
    """Run the Kura pipeline: summarise, cluster, merge clusters, project.

    Args:
        conversations: Conversations to analyse.
        checkpoint_manager: Checkpoint manager handed to every pipeline step.
        response_schema: Optional summary schema; forwarded to the summary step
            only when it is not None.
        prompt: Optional summary prompt; forwarded to the summary step only when
            it is not None.

    Returns:
        The result of the final dimensionality-reduction step.
    """
    # All models are created before any step runs. The summary and embedding
    # models come from the config helpers; the rest use their constructors' defaults.
    # NOTE(review): MAX_CLUSTERS is imported from config in this notebook but is not
    # passed to any model here — confirm whether MetaClusterModel should receive it.
    summariser = get_summary_model()
    embedder = get_embedding_model()
    cluster_describer = ClusterDescriptionModel()
    meta_clusterer = MetaClusterModel()
    projector = HDBUMAP()

    # Step 1: an LLM summarises each conversation. Optional overrides are only
    # passed along when the caller actually supplied them.
    optional_overrides = {"response_schema": response_schema, "prompt": prompt}
    summaries = await summarise_conversations(
        conversations=conversations,
        model=summariser,
        checkpoint_manager=checkpoint_manager,
        **{
            option: value
            for option, value in optional_overrides.items()
            if value is not None
        },
    )

    # Step 2: build base clusters from the conversation summaries.
    base_clusters = await generate_base_clusters_from_conversation_summaries(
        summaries,
        embedding_model=embedder,
        clustering_model=cluster_describer,
        checkpoint_manager=checkpoint_manager,
    )

    # Step 3: progressively combine similar clusters.
    merged_clusters = await reduce_clusters_from_base_clusters(
        base_clusters,
        model=meta_clusterer,
        checkpoint_manager=checkpoint_manager,
    )

    # Step 4: project the clusters for visualisation and hand the result back.
    return await reduce_dimensionality_from_clusters(
        merged_clusters,
        model=projector,
        checkpoint_manager=checkpoint_manager,
    )




In [34]:
# run the pipeline
# NOTE(review): this manager is created with enabled=False, yet a later cell builds a new
# manager with enabled=True and loads the "summaries" / "conversations" checkpoints from
# CHECKPOINT_DIRECTORY. Presumably a disabled manager skips writes — if so, that later
# cell reads files left behind by an earlier run rather than by this one. TODO confirm.
checkpoint_manager = JSONLCheckpointManager(CHECKPOINT_DIRECTORY, enabled=False)
checkpoint_manager.save_checkpoint("conversations", conversations)
# Top-level await: relies on the notebook kernel's running event loop.
clusters = await analyze_conversations(
    conversations,
    checkpoint_manager=checkpoint_manager,
    prompt=CASE_SUMMARY_PROMPT,
)

Summarising 48 conversations: 100%|██████████| 48/48 [00:12<00:00,  3.91it/s]


In [35]:
# Top-level clusters are the ones that have no parent.
parent_clusters = [candidate for candidate in clusters if candidate.parent_id is None]

# One line per cluster: bold name, description, and how many chats it contains.
formatted_clusters = [
    f"[bold]{top_level.name}[/bold] : {top_level.description} : {len(top_level.chat_ids)}"
    for top_level in parent_clusters
]

# Separate the entries with a blank line and print them in one go.
print("\n\n".join(formatted_clusters))

In [36]:
# Analysing Our Summaries
# user_frustration, concerning_score, assistant_errors are on *summaries* (from the summary step), not on conversations.
# Conversation is imported here as well so this cell does not depend on an earlier cell's import.
from kura.types import Conversation, ConversationSummary
from kura.checkpoints import JSONLCheckpointManager

checkpoint_manager = JSONLCheckpointManager(CHECKPOINT_DIRECTORY, enabled=True)

# Summaries only exist on disk, so a missing checkpoint has to be reported clearly
# instead of failing later with an opaque error on a None value.
summaries = checkpoint_manager.load_checkpoint("summaries", ConversationSummary)
if summaries is None:
    raise RuntimeError(
        f"No 'summaries' checkpoint could be loaded from {CHECKPOINT_DIRECTORY}. "
        "Run the pipeline with JSONLCheckpointManager(..., enabled=True) so the "
        "summaries are saved before analysing them."
    )

# Prefer the checkpointed conversations, but keep the in-memory list from the earlier
# cell when no checkpoint is available rather than overwriting it with None.
loaded_conversations = checkpoint_manager.load_checkpoint("conversations", Conversation)
if loaded_conversations is not None:
    conversations = loaded_conversations


# List of dict with all summary fields (user_frustration, concerning_score, assistant_errors, ...)
summaries_as_dicts = [s.model_dump() for s in summaries]

id_to_conversation = {
    conversation.chat_id: conversation for conversation in conversations
}


# Preview at most the first PREVIEW_COUNT summaries; slicing copes with shorter lists.
PREVIEW_COUNT = 10
for summary in summaries[:PREVIEW_COUNT]:
    print(summary.summary)
    print("  user_frustration:", summary.user_frustration, "| concerning_score:", summary.concerning_score)
    # Summaries and conversations may come from different runs; tolerate unmatched ids.
    matched_conversation = id_to_conversation.get(summary.chat_id)
    if matched_conversation is None or not matched_conversation.messages:
        print(f"  (no conversation text found for chat_id={summary.chat_id})")
    else:
        print(matched_conversation.messages[0].content)

In [37]:
# No visible cell imports `visualise_pipeline_results`, so import it here; otherwise this
# cell raises NameError after Restart Kernel -> Run All.
from kura.visualization import visualise_pipeline_results

# Minimal tree
visualise_pipeline_results(clusters, style="basic")

# Default: tree + percentages + progress bars + stats
#visualise_pipeline_results(clusters, style="enhanced")

# Colored + tables (needs: pip install rich)
#visualise_pipeline_results(clusters, style="rich")

Clusters (48 conversations)
╠══ Assist with HR and payroll inquiries (30 conversations)
╠══ Assist with tax rate changes and account maintenance (6 conversations)
╠══ Resolve employee timeclock access issues (1 conversations)
╠══ Assist with holiday application for time and labor (3 conversations)
╚══ Assist with tax notice resolutions and changes (8 conversations)



In [None]:
# 