In [1]:
import os
import warnings

# Suppress tqdm IProgress warning in Jupyter (optional: pip install ipywidgets for widget progress bars).
# If TqdmWarning cannot be imported there is nothing to filter, so the ImportError is deliberately ignored.
try:
    from tqdm import TqdmWarning
    warnings.filterwarnings("ignore", category=TqdmWarning)
except ImportError:
    pass

import pandas as pd
from openai import OpenAI

In [2]:
# Read case data from the CSV file (the path is relative to the current working directory).
# Later cells read the `case_id` and `case_text_lower` columns from this frame.
case_data = pd.read_csv('data/dummy_data_clean.csv')

In [3]:
# convert case_data.case_text_lower to Kura Conversation object

from kura.types import Message, Conversation
from datetime import datetime
from rich import print


def process_case_row(row) -> Conversation:
    """Build a Kura Conversation from one row of ``case_data``.

    Args:
        row: Mapping-like row (e.g. a pandas Series) exposing the keys
            ``case_id`` and ``case_text_lower``.

    Returns:
        A Conversation whose ``chat_id`` is ``str(case_id)`` and which holds a
        single ``user`` message containing the case text (``""`` when the
        text is missing/NaN). The untouched ``case_id`` is kept in
        ``metadata``.
    """
    case_id = row["case_id"]
    raw_text = row["case_text_lower"]
    case_text = raw_text if pd.notna(raw_text) else ""

    # Take a single timestamp so the conversation and its only message agree;
    # two separate datetime.now() calls would differ by a few microseconds.
    # TODO: replace with the real creation time (case_data.created_at).
    created_at = datetime.now()

    return Conversation(
        chat_id=str(case_id),
        created_at=created_at,
        messages=[
            Message(
                created_at=created_at,
                # TODO: product = case_data.product
                role="user",
                content=case_text,
            )
        ],
        metadata={"case_id": case_id},
    )


# Turn every row of case_data into a Conversation (positional access, in row order),
# then display the first one as a quick sanity check.
conversations = list(
    map(
        process_case_row,
        (case_data.iloc[position] for position in range(len(case_data))),
    )
)
print(conversations[0])


In [14]:
from kura.checkpoints import JSONLCheckpointManager
from kura.summarisation import summarise_conversations, SummaryModel, DEFAULT_SUMMARY_PROMPT
from kura.types.summarisation import GeneratedSummary
from kura.cluster import generate_base_clusters_from_conversation_summaries, ClusterDescriptionModel
from kura.meta_cluster import reduce_clusters_from_base_clusters, MetaClusterModel
from kura.dimensionality import reduce_dimensionality_from_clusters, HDBUMAP
from kura.visualization import visualise_pipeline_results




In [6]:
# Model setup from config: switch between OpenAI / Bedrock (summary) and OpenAI / local (embeddings)
import config
from config import (
    SUMMARIZATION_BACKEND,
    SUMMARIZATION_MODEL,
    EMBEDDING_BACKEND,
    EMBEDDING_MODEL,
    get_checkpoint_dir,
)
from kura.embedding import OpenAIEmbeddingModel, SentenceTransformerEmbeddingModel

# Checkpoint location from config, as a string; used when constructing JSONLCheckpointManager.
CHECKPOINT_DIRECTORY = str(get_checkpoint_dir())


def get_summary_model():
    """Create the SummaryModel selected by the config module.

    The model id follows the Instructor "provider/model-id" convention:
      * OpenAI:  "openai/gpt-4o-mini", "openai/gpt-4o"
      * Bedrock: "bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0"
        (set AWS credentials / region)

    SUMMARIZATION_BACKEND == "bedrock" selects the Bedrock prefix; any other
    value selects the OpenAI prefix.
    """
    model_id = (
        f"bedrock/{SUMMARIZATION_MODEL}"
        if SUMMARIZATION_BACKEND == "bedrock"
        else f"openai/{SUMMARIZATION_MODEL}"
    )
    return SummaryModel(model=model_id, max_concurrent_requests=10)


def get_embedding_model():
    """Create the embedding model selected by the config module.

    EMBEDDING_BACKEND == "local" yields a SentenceTransformerEmbeddingModel
    (pip install sentence-transformers); any other value yields an
    OpenAIEmbeddingModel. EMBEDDING_MODEL supplies the model name either way.
    """
    if EMBEDDING_BACKEND != "local":
        # OpenAI API, e.g. "text-embedding-3-small", "text-embedding-3-large"
        openai_settings = {
            "model_name": EMBEDDING_MODEL,
            "model_batch_size": 50,
            "n_concurrent_jobs": 5,
        }
        return OpenAIEmbeddingModel(**openai_settings)

    # Local sentence-transformers, e.g. "all-mpnet-base-v2", "all-MiniLM-L6-v2"
    local_settings = {
        "model_name": EMBEDDING_MODEL,
        "model_batch_size": 128,
        "device": "cpu",  # or "cuda" if GPU
    }
    return SentenceTransformerEmbeddingModel(**local_settings)


# Optional: echo the active configuration (backend / model for summaries and
# embeddings, plus the checkpoint directory) so the run records what was used.
print(f"Summary: {SUMMARIZATION_BACKEND} / {SUMMARIZATION_MODEL}")
print(f"Embedding: {EMBEDDING_BACKEND} / {EMBEDDING_MODEL}")
print(f"Checkpoints: {CHECKPOINT_DIRECTORY}")

In [7]:
# --- Summarization: what's extracted & how to customize ---
# Default (CLIO) extracted fields: summary, request, task, topic, languages,
# concerning_score (1-5), user_frustration (1-5), assistant_errors.
# You can: (1) override the prompt, (2) add fields by extending GeneratedSummary
# (custom fields end up in summary.metadata).

from pydantic import Field

# Optional: custom schema = default CLIO fields + your own (e.g. for support cases)
class CaseSummary(GeneratedSummary):
    """Add fields; they appear in ConversationSummary.metadata."""
    # Three extra free-text (str) fields on top of whatever GeneratedSummary declares.
    # NOTE(review): the Field descriptions (and presumably the class docstring) are
    # likely part of the schema handed to the LLM — treat wording changes here as
    # prompt changes. TODO confirm.
    product_area: str = Field(description="Product area: payroll, tax, hr, billing, etc.")
    resolution_type: str = Field(description="How resolved: self_resolved, escalated, how_to, etc.")
    sentiment: str = Field(description="User sentiment: positive, neutral, frustrated")

# Optional: custom prompt (or use DEFAULT_SUMMARY_PROMPT, or DEFAULT_SUMMARY_PROMPT + " ...")
CUSTOM_SUMMARY_PROMPT = None  # set to a string to override; None = use Kura default

# To use custom extraction, run the pipeline with:
#   await analyze_conversations(conversations, checkpoint_manager,
#       response_schema=CaseSummary,
#       prompt=CUSTOM_SUMMARY_PROMPT)  # prompt=None (or omitting prompt=) keeps the default
# analyze_conversations does not return the summaries; load them from the
# "summaries" checkpoint, then read e.g. summaries[0].metadata["product_area"]

In [8]:
async def analyze_conversations(
    conversations,
    checkpoint_manager,
    *,
    response_schema=None,
    prompt=None,
):
    """Run the full pipeline: summarise, cluster, merge clusters, project.

    Args:
        conversations: Conversations handed to ``summarise_conversations``.
        checkpoint_manager: Forwarded to every pipeline step.
        response_schema: Optional summary schema; forwarded to the
            summarisation step only when it is not None.
        prompt: Optional summarisation prompt; forwarded to the
            summarisation step only when it is not None.

    Returns:
        Whatever ``reduce_dimensionality_from_clusters`` returns for the
        merged clusters.
    """
    # All models are created up front: summary and embedding models come from
    # the config-driven factories, the rest use their default constructors.
    summariser = get_summary_model()
    embedder = get_embedding_model()
    describer = ClusterDescriptionModel()
    merger = MetaClusterModel()
    projector = HDBUMAP()

    # Step 1 - each conversation is summarized by an LLM. Optional overrides
    # are passed through only when the caller actually supplied them.
    overrides = {"response_schema": response_schema, "prompt": prompt}
    summaries = await summarise_conversations(
        conversations=conversations,
        model=summariser,
        checkpoint_manager=checkpoint_manager,
        **{key: value for key, value in overrides.items() if value is not None},
    )

    # Step 2 - base clusters are generated from the conversation summaries.
    base_clusters = await generate_base_clusters_from_conversation_summaries(
        summaries,
        embedding_model=embedder,
        clustering_model=describer,
        checkpoint_manager=checkpoint_manager,
    )

    # Step 3 - similar clusters are progressively combined.
    merged_clusters = await reduce_clusters_from_base_clusters(
        base_clusters,
        model=merger,
        checkpoint_manager=checkpoint_manager,
    )

    # Step 4 - clusters are projected for visualization.
    return await reduce_dimensionality_from_clusters(
        merged_clusters,
        model=projector,
        checkpoint_manager=checkpoint_manager,
    )




In [9]:
# Run the pipeline.
# Checkpoints are written under CHECKPOINT_DIRECTORY; the input conversations are
# saved under the "conversations" key so a later cell can reload them.
checkpoint_manager = JSONLCheckpointManager(CHECKPOINT_DIRECTORY, enabled=True)
checkpoint_manager.save_checkpoint("conversations", conversations)
# Top-level `await` relies on the notebook's (IPython) async cell support.
# `clusters` holds the value returned by analyze_conversations, i.e. the projected clusters.
clusters = await analyze_conversations(
    conversations, checkpoint_manager=checkpoint_manager
)

Summarising 48 conversations: 100%|██████████| 48/48 [00:11<00:00,  4.29it/s]


In [10]:
# Top-level clusters are the ones that have no parent
parent_clusters = list(filter(lambda candidate: candidate.parent_id is None, clusters))

# One line per cluster: bold name, description and number of chats
formatted_clusters = [
    f"[bold]{cluster.name}[/bold] : {cluster.description} : {len(cluster.chat_ids)}"
    for cluster in parent_clusters
]

# Separate the entries with a blank line and print them
print("\n\n".join(formatted_clusters))

In [12]:
# Analysing Our Summaries
from kura.types import ConversationSummary
from kura.checkpoints import JSONLCheckpointManager

# Reload the summaries and the original conversations from the checkpoints
checkpoint_manager = JSONLCheckpointManager(CHECKPOINT_DIRECTORY, enabled=True)
summaries = checkpoint_manager.load_checkpoint("summaries", ConversationSummary)
conversations = checkpoint_manager.load_checkpoint("conversations", Conversation)

# Look-up table so each summary can be shown next to its source conversation
id_to_conversation = {
    conversation.chat_id: conversation for conversation in conversations
}

# Preview at most the first 10 summaries with the text they were generated from.
# Slicing (instead of indexing 0..9) avoids an IndexError when fewer than 10
# summaries exist.
for summary in summaries[:10]:
    print(summary.summary)
    print(id_to_conversation[summary.chat_id].messages[0].content)

In [16]:
from kura.visualization import visualise_pipeline_results

# Minimal tree
visualise_pipeline_results(clusters, style="basic")

# Alternative styles (uncomment one to use it instead of, or in addition to, the call above):
# Default: tree + percentages + progress bars + stats
#visualise_pipeline_results(clusters, style="enhanced")

# Colored + tables (needs: pip install rich)
#visualise_pipeline_results(clusters, style="rich")

Clusters (48 conversations)
╠══ Assist with payroll and payment processing issues (2 conversations)
╠══ Assist with time and labor support tasks (6 conversations)
╠══ Assist with payroll and HR inquiries (18 conversations)
╠══ Assist with tax rate change inquiries (13 conversations)
╚══ Assist with HR and payroll system issues (9 conversations)



In [None]:
# 