# Tales RAG and PowerPoint Agent Evaluation

This notebook demonstrates how to evaluate the RAG agent and PowerPoint generation capabilities of the Tales system.

In [1]:
# Import the necessary modules
import asyncio
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Markdown, display
from langchain_core.messages import HumanMessage

from tales.evaluation import RAGEvaluator, PowerPointEvaluator, run_batch_evaluation
from tales.agent import agent
from tales.db_handler import ChromaDBHandler
from tales.config import DB_PATH
from tales.utils import get_available_docs

Compiling RAG Agent...


## Check Available Documents

First, let's check what documents are available in our vector store.

In [2]:
# Open the vector store persisted at DB_PATH.
db_handler = ChromaDBHandler(persist_directory=DB_PATH)

# Ask the handler which documents it holds, then list them one per line.
stored_docs = db_handler.get_stored_documents()
doc_total = len(stored_docs)
print(f"Found {doc_total} documents in the vector store:")
for stored_doc in stored_docs:
    print(f" - {stored_doc}")

Database Initialized Successfully...
Found 2 documents in the vector store:
 - data/How Much Information.pdf
 - data/bugra.pdf


## Single Query Evaluation

Let's evaluate the RAG agent on a single query.

In [6]:
# Build the evaluator used to score a single RAG query.
rag_evaluator = RAGEvaluator()

# The question to evaluate.
query = "What are the 3 hypothesis mentioned?"

# evaluate_rag_query returns the metrics object together with the conversation
# messages; later cells reuse `messages` and `query`.
rag_metrics, messages = rag_evaluator.evaluate_rag_query(query)

# (line template, metrics attribute) pairs, printed in this order.
rag_report_rows = (
    ("Response Time: {:.2f} seconds", "response_time"),
    ("Context Relevance: {:.1f}/10", "context_relevance_score"),
    ("Answer Correctness: {:.1f}/10", "answer_correctness_score"),
    ("Answer Completeness: {:.1f}/10", "answer_completeness_score"),
    ("Hallucination Score: {:.1f}/10 (lower is better)", "hallucination_score"),
    ("Documents Retrieved: {}", "num_documents_retrieved"),
    ("Research Iterations: {}", "num_research_iterations"),
)

# Attributes are read one at a time inside the loop, so each line is printed
# before the next metric is looked up.
print("=== RAG Evaluation Results ===")
for rag_template, rag_attr in rag_report_rows:
    print(rag_template.format(getattr(rag_metrics, rag_attr)))

Database Initialized Successfully...
Analyzing query...
Retrieving documents...
Generating answer...
Reflecting on answer...
=== RAG Evaluation Results ===
Response Time: 3.96 seconds
Context Relevance: 0.0/10
Answer Correctness: 5.0/10
Answer Completeness: 10.0/10
Hallucination Score: 5.0/10 (lower is better)
Documents Retrieved: 0
Research Iterations: 0


## View the RAG Response

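Let's look at the response the RAG agent provided. Note that in the saved run above the evaluator reported 0 documents retrieved and a context relevance of 0.0/10; read those two metrics with caution, since this notebook does not establish whether they reflect the agent's retrieval or how the evaluator counts retrieved documents.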

In [7]:
# Show the agent's reply as rendered Markdown.
# `Markdown` and `display` are imported explicitly in the notebook's import
# cell rather than relying on IPython injecting `display` into the namespace.

# Walk the conversation from newest to oldest and take the content of the
# first message that exposes a `content` attribute; fall back to a
# placeholder when no message has one.
response = next(
    (msg.content for msg in reversed(messages) if hasattr(msg, 'content')),
    "No response found",
)
display(Markdown(f"**Query:** {query}\n\n**Response:**\n{response}"))

**Query:** What are the 3 hypothesis mentioned?

**Response:**
The three hypotheses mentioned in the text are:

*   **H1.** Trust is lower if expectations are violated.
*   **H2.** Changes in interface transparency affect trust depending on whether expectations are violated.
*   **H3.** If expectations are violated, procedural transparency increases trust, but additional information about outcomes erodes this trust.

## PowerPoint Generation Evaluation

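Now, let's evaluate the PowerPoint generation capabilities, using the conversation (`messages`) produced by the single-query evaluation above. Run that cell first.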

In [None]:
# Initialize the PowerPoint evaluator
ppt_evaluator = PowerPointEvaluator()

# Generate and evaluate a PowerPoint
try:
    ppt_metrics = await ppt_evaluator.evaluate_ppt_generation(messages)
    
    # Print PowerPoint metrics
    print("=== PowerPoint Evaluation Results ===")
    print(f"Generation Time: {ppt_metrics.generation_time:.2f} seconds")
    print(f"Number of Slides: {ppt_metrics.slides_count}")
    print(f"Avg Content Per Slide: {ppt_metrics.avg_content_per_slide:.1f} characters")
    print(f"Content Coverage: {ppt_metrics.content_coverage_score:.1f}/10")
    print(f"Design Quality: {ppt_metrics.design_quality_score:.1f}/10")
    print(f"Organization: {ppt_metrics.organization_score:.1f}/10")
    
except Exception as e:
    print(f"Error evaluating PowerPoint: {e}")

## Batch Evaluation

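Let's run a batch evaluation on multiple queries. The results are also saved to `notebook_evaluation_results.json`, and the `results` list is reused by the visualization section below.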

In [None]:
# Questions to send through the batch evaluation.
queries = [
    "What are the key concepts in information theory?",
    "Explain the challenges of data overload in modern society",
    "What are the main differences between structured and unstructured data?",
]

# Evaluate all queries in one call (this may take some time). `save_path` is
# passed so the results are also written to that file; the visualization cell
# reads `results`.
results = run_batch_evaluation(queries, save_path="notebook_evaluation_results.json")

# Per-query summary, numbered from 1.
print("\nBatch Evaluation Summary:")
for query_number, result in enumerate(results, start=1):
    print(f"\nQuery {query_number}: {result['query']}")
    rag_scores = result['rag_metrics']
    print(f"RAG Correctness: {rag_scores['answer_correctness_score']:.1f}/10")
    print(f"RAG Completeness: {rag_scores['answer_completeness_score']:.1f}/10")
    ppt_scores = result['ppt_metrics']
    print(f"PowerPoint Slides: {ppt_scores['slides_count']}")
    print(f"PowerPoint Content Coverage: {ppt_scores['content_coverage_score']:.1f}/10")

## Visualization of Results

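Let's create a simple visualization of the evaluation results. This section reads the `results` list produced by the batch evaluation above, so run that cell first. The slide count is collected in `df_ppt` but not charted, because it is not on the 0-10 scale.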

In [None]:
# Plot the batch-evaluation scores as grouped bar charts.
# `plt`, `np` and `pd` are imported in the notebook's import cell.

# Score columns for each chart, listed in left-to-right bar order.
RAG_SCORE_COLUMNS = [
    'Context Relevance',
    'Answer Correctness',
    'Answer Completeness',
    'Hallucination Score',
]
PPT_SCORE_COLUMNS = ['Content Coverage', 'Design Quality', 'Organization']


def plot_grouped_scores(scores, score_columns, title, bar_width=0.2):
    """Draw a grouped bar chart with one cluster of bars per query.

    Args:
        scores: DataFrame with one row per query and a numeric column for
            every name in ``score_columns``.
        score_columns: Column names to plot; each one becomes one bar per
            cluster and one legend entry, in the given order.
        title: Chart title.
        bar_width: Width of a single bar, in x-axis units.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    positions = np.arange(len(scores))

    # Centre each cluster on its tick: with n series the offsets run from
    # -(n - 1) / 2 to +(n - 1) / 2 bar widths (e.g. -1.5..1.5 for 4 series).
    centre = (len(score_columns) - 1) / 2
    for series_index, column in enumerate(score_columns):
        ax.bar(
            positions + (series_index - centre) * bar_width,
            scores[column],
            bar_width,
            label=column,
        )

    ax.set_xlabel('Queries')
    ax.set_ylabel('Score (0-10)')
    ax.set_title(title)
    ax.set_xticks(positions)
    ax.set_xticklabels([f"Query {i+1}" for i in range(len(scores))])
    ax.legend()
    ax.set_ylim(0, 10)
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    fig.tight_layout()
    plt.show()


# One row per evaluated query, built from the batch `results`.
df_rag = pd.DataFrame([
    {
        'Query': r['query'],
        'Context Relevance': r['rag_metrics']['context_relevance_score'],
        'Answer Correctness': r['rag_metrics']['answer_correctness_score'],
        'Answer Completeness': r['rag_metrics']['answer_completeness_score'],
        'Hallucination Score': r['rag_metrics']['hallucination_score']
    } for r in results
])

# 'Slides' is kept in the frame but is not one of the plotted score columns.
df_ppt = pd.DataFrame([
    {
        'Query': r['query'],
        'Slides': r['ppt_metrics']['slides_count'],
        'Content Coverage': r['ppt_metrics']['content_coverage_score'],
        'Design Quality': r['ppt_metrics']['design_quality_score'],
        'Organization': r['ppt_metrics']['organization_score']
    } for r in results
])

# Plot RAG metrics
plot_grouped_scores(df_rag, RAG_SCORE_COLUMNS, 'RAG Agent Evaluation Metrics')

# Plot PPT metrics
plot_grouped_scores(df_ppt, PPT_SCORE_COLUMNS, 'PowerPoint Generation Evaluation Metrics')