In [None]:
# Cell 1: Setup and Imports
import os
import sys
from collections import Counter

import pandas as pd
from dotenv import load_dotenv

sys.path.append('src')  # Add src directory to path

from vector_emb import (
    discover_workshops, 
    process_all_workshops, 
    get_collection_status,
    answer_question,
    llm_answer_question,
    get_openai_client,
    retrieve_relevant_chunks,
    get_context_for_question,
    combine_chunks,
    format_sources
)

# Load environment variables
# load_dotenv() is called with no arguments here, so the lookup of the .env file
# is left entirely to python-dotenv's defaults.
# NOTE(review): presumably this provides the API key that get_openai_client()
# relies on — confirm against vector_emb.
load_dotenv()

# Reaching this line means every import above resolved, including the
# project-local vector_emb module.
print("✅ Imports successful")

In [None]:
# Cell 2: Check Workshop Discovery
# Lists every workshop file reported by discover_workshops() together with a
# check that the reported path is present on disk.
print("🔍 Discovering Workshop Files...")
workshops = discover_workshops()

if not workshops:
    # Nothing discovered: say so and skip the listing.
    print("❌ No workshop files found in data directory")
else:
    print(f"\nFound {len(workshops)} workshop files:")
    for workshop_id, info in workshops.items():
        # One record per workshop, followed by a blank separator line.
        print(f"  📄 {workshop_id}: {info['filename']}")
        print(f"     Path: {info['path']}")
        print(f"     Exists: {os.path.exists(info['path'])}")
        print()

In [None]:
# Cell 3: Process Workshops into Vector Database
# Runs the ingestion entry point imported from vector_emb and then reports the
# collection status so the effect of the run is visible right below the cell.
print("🚀 Processing workshops into vector database...")
print("This may take a few minutes for the first run...\n")

# Process all workshops
# The return value is kept in `processed_workshops`; none of the cells visible
# in this notebook read it afterwards.
processed_workshops = process_all_workshops()

# Check collection status
print("\n" + "="*50)
print("📊 Collection Status After Processing:")
# This call is the last expression of the cell, so besides whatever it prints,
# Jupyter will also echo its return value if that is not None.
get_collection_status()

In [None]:
# Cell 4: Interactive Question Answering with Detailed Retrieval
def detailed_question_answer(question, workshop_filter=None, show_chunks=True, max_chunks=5):
    """Answer a question and print a step-by-step trace of the retrieval.

    Args:
        question: The natural-language question to answer.
        workshop_filter: Forwarded unchanged to ``get_context_for_question``
            (other cells pass a workshop id such as ``"WS1"``). A falsy value
            is reported as "All workshops".
        show_chunks: When true, print the metadata and a 200-character text
            preview of every retrieved chunk.
        max_chunks: Number of chunks requested from
            ``get_context_for_question``. Defaults to 5, the value that used
            to be hard-coded, so existing callers are unaffected.

    Returns:
        dict with the keys ``question``, ``answer``, ``context``, ``sources``,
        ``chunks`` and ``context_info``.
    """

    print(f"❓ Question: {question}")
    print(f"🎯 Workshop Filter: {workshop_filter or 'All workshops'}")
    print("-" * 80)

    # Step 1: Get context and sources
    print("🔍 Step 1: Retrieving relevant chunks...")
    context, sources, chunks = get_context_for_question(
        question=question,
        workshop_filter=workshop_filter,
        max_chunks=max_chunks
    )

    # Whitespace word count, used as a cheap stand-in for a tokenizer count.
    # Computed once because both Step 1 and Step 3 report it.
    approx_tokens = len(context.split())

    print(f"   📦 Retrieved {len(chunks)} chunks")
    print(f"   🔢 Context tokens: {approx_tokens}")

    # Step 2: Show retrieved chunks
    if show_chunks:
        print("\n📋 Step 2: Retrieved Chunks (ordered by relevance):")
        for i, chunk in enumerate(chunks):
            metadata = chunk['metadata']
            print(f"\n   Chunk {i+1}:")
            print(f"   └── Workshop: {metadata.get('workshop_id', 'Unknown')}")
            print(f"   └── Position: {metadata.get('position', 'Unknown')}")
            print(f"   └── Speaker: {metadata.get('speaker', 'Unknown')}")
            print(f"   └── Timestamp: {metadata.get('timestamp', 'Unknown')}")
            print(f"   └── Tokens: {metadata.get('token_count', 'Unknown')}")
            print(f"   └── Text Preview: {chunk['text'][:200]}...")

    # Step 3: Show combined context
    print("\n📝 Step 3: Combined Context:")
    print(f"   📏 Total context length: {len(context)} characters")
    print(f"   🔢 Estimated tokens: {approx_tokens}")

    # Step 4: Generate answer
    print("\n🤖 Step 4: Generating answer...")
    client = get_openai_client()
    answer, context_info = llm_answer_question(client, context, sources, chunks, question)

    print("\n💬 Answer:")
    print(f"   {answer}")

    print("\n📊 Context Info:")
    for key, value in context_info.items():
        if key != 'chunks':  # Don't print the full chunks again
            print(f"   └── {key}: {value}")

    return {
        'question': question,
        'answer': answer,
        'context': context,
        'sources': sources,
        'chunks': chunks,
        'context_info': context_info
    }

# Example usage: one question, searched across every workshop, with the
# per-chunk trace switched on.
result = detailed_question_answer(
    "What is Modal and how is it used?",
    workshop_filter=None,  # Search all workshops
    show_chunks=True,
)

In [None]:
# Cell 5: Compare Different Workshop Filters
# For each question, answer it once against every workshop and, when a WS1
# workshop is available, once more restricted to WS1. Both outcomes are
# collected in `results` for the analysis and export cells.
questions = [
    "What is Modal?",
    "How do you deploy applications?", 
    "What are the main benefits discussed?"
]

# The set of discovered workshops does not depend on the question, so look WS1
# up once instead of twice per loop iteration. `or {}` keeps the membership
# test valid if discovery returns None.
ws1_available = 'WS1' in (discover_workshops() or {})

results = []

for question in questions:
    print(f"\n{'='*80}")
    print(f"🔄 Testing: {question}")
    print(f"{'='*80}")
    
    # Test with all workshops
    print("\n🌍 Searching ALL workshops:")
    result_all = detailed_question_answer(question, workshop_filter=None, show_chunks=False)
    
    # Test with specific workshop (if WS1 exists)
    # Always bind result_ws1 so the record below never depends on a value left
    # over from a previous iteration or a previous run of this cell.
    result_ws1 = None
    if ws1_available:
        print("\n🎯 Searching WS1 only:")
        result_ws1 = detailed_question_answer(question, workshop_filter="WS1", show_chunks=False)
    
    results.append({
        'question': question,
        'all_workshops': result_all,
        'ws1_only': result_ws1
    })

In [None]:
# Cell 6: Analyze Retrieval Patterns
def analyze_retrieval_patterns(results):
    """Summarise which workshops, speakers and chunk positions were retrieved.

    Only the ``'all_workshops'`` entry of each result is analysed; results
    where that entry is missing or empty/None are skipped.

    Args:
        results: Iterable of dicts. Each may carry an ``'all_workshops'`` dict
            whose ``'sources'`` value is an iterable of per-source dicts with
            optional ``'workshop_id'``, ``'position'`` and ``'speaker'`` keys.

    Returns:
        pandas.DataFrame with the columns ``Metric`` and ``Data`` and three
        rows: workshop usage counts, speaker counts, and the chunk position
        range formatted as ``"Range: <min>-<max>"`` (``"Range: 0-0"`` when no
        usable position was seen).
    """

    workshop_usage = {}
    chunk_positions = []
    speakers_mentioned = {}

    for result in results:
        qa_result = result.get('all_workshops')
        if not qa_result:
            # Missing or None entry: nothing to analyse for this question.
            continue

        for source in qa_result['sources']:
            # Track workshop usage
            workshop_id = source.get('workshop_id', 'Unknown')
            workshop_usage[workshop_id] = workshop_usage.get(workshop_id, 0) + 1

            # Track chunk positions. No default here: a source without a
            # position must not be counted as position 0, which would drag
            # the reported minimum down.
            position = source.get('position')
            if isinstance(position, (int, str)) and str(position).isdigit():
                chunk_positions.append(int(position))

            # Track speakers
            speaker = source.get('speaker', 'Unknown')
            speakers_mentioned[speaker] = speakers_mentioned.get(speaker, 0) + 1

    # Compute the range once; it is used in both the DataFrame and the printout.
    if chunk_positions:
        position_range = f"{min(chunk_positions)}-{max(chunk_positions)}"
    else:
        position_range = "0-0"

    # Create analysis DataFrame
    analysis_df = pd.DataFrame([
        {'Metric': 'Workshop Usage', 'Data': workshop_usage},
        {'Metric': 'Speaker Frequency', 'Data': speakers_mentioned},
        {'Metric': 'Chunk Positions', 'Data': f"Range: {position_range}"}
    ])

    print("📈 Retrieval Pattern Analysis:")
    print(f"   🏢 Workshop Usage: {workshop_usage}")
    print(f"   🎤 Speaker Frequency: {speakers_mentioned}")
    print(f"   📍 Chunk Position Range: {position_range}")

    return analysis_df

# Run the analysis only when the comparison cell has already populated `results`
# (at cell top level the module namespace is the local namespace).
if 'results' in globals():
    analysis = analyze_retrieval_patterns(results)

In [None]:
# Cell 7: Detailed Chunk Inspection
def inspect_chunks_in_detail(question, max_chunks=10):
    """Tabulate the chunks retrieved for a question.

    Args:
        question: Query passed to ``retrieve_relevant_chunks``.
        max_chunks: Forwarded as ``n_results``; how many chunks to request.

    Returns:
        pandas.DataFrame with one row per retrieved chunk and the columns
        Rank, Workshop, Position, Speaker, Timestamp, Token_Count,
        Text_Length and Text_Preview. When nothing is retrieved the frame is
        empty but still has these columns.
    """

    print(f"🔬 Detailed Chunk Inspection for: '{question}'")
    print("="*80)

    # Get more chunks than usual for analysis
    chunks = retrieve_relevant_chunks(question, n_results=max_chunks) or []

    columns = ['Rank', 'Workshop', 'Position', 'Speaker', 'Timestamp',
               'Token_Count', 'Text_Length', 'Text_Preview']

    chunk_data = []
    for rank, chunk in enumerate(chunks, start=1):
        metadata = chunk['metadata']
        text = chunk['text']

        chunk_data.append({
            'Rank': rank,
            'Workshop': metadata.get('workshop_id', 'Unknown'),
            'Position': metadata.get('position', 'Unknown'),
            'Speaker': metadata.get('speaker', 'Unknown'),
            'Timestamp': metadata.get('timestamp', 'Unknown'),
            'Token_Count': metadata.get('token_count', 'Unknown'),
            'Text_Length': len(text),
            # Only mark the preview as truncated when it actually is.
            'Text_Preview': text[:150] + "..." if len(text) > 150 else text
        })

    # Convert to DataFrame for better display. Passing the columns explicitly
    # keeps the schema stable for an empty retrieval; without it the column
    # selection below raises KeyError on a column-less frame.
    chunks_df = pd.DataFrame(chunk_data, columns=columns)

    print(f"📊 Found {len(chunks)} chunks:")
    if chunks_df.empty:
        # Nothing to tabulate or preview.
        return chunks_df

    print(chunks_df[['Rank', 'Workshop', 'Position', 'Speaker', 'Token_Count', 'Text_Length']].to_string(index=False))

    print("\n📝 Text Previews:")
    for i, chunk_info in enumerate(chunk_data[:5]):  # Show first 5
        print(f"\n{i+1}. [{chunk_info['Workshop']}] Position {chunk_info['Position']}:")
        print(f"   {chunk_info['Text_Preview']}")

    return chunks_df

# Example inspection: request ten chunks for a short factual question.
chunks_analysis = inspect_chunks_in_detail(
    "What is Modal?",
    max_chunks=10,
)

In [None]:
# Cell 8: Test Different Retrieval Parameters
def test_retrieval_parameters(question):
    """Test how different parameters affect retrieval"""
    
    print(f"🧪 Testing Retrieval Parameters for: '{question}'")
    print("="*80)
    
    # Test different chunk counts
    chunk_counts = [3, 5, 10]
    
    for count in chunk_counts:
        print(f"\n📦 Retrieving {count} chunks:")
        chunks = retrieve_relevant_chunks(question, n_results=count)
        context = combine_chunks(chunks)
        
        print(f"   └── Retrieved: {len(chunks)} chunks")
        print(f"   └── Context tokens: ~{len(context.split())}")
        print(f"   └── Context length: {len(context)} chars")
        
        # Show workshop distribution
        workshops = [chunk['metadata'].get('workshop_id', 'Unknown') for chunk in chunks]
        workshop_dist = {ws: workshops.count(ws) for ws in set(workshops)}
        print(f"   └── Workshop distribution: {workshop_dist}")

# Run the parameter sweep for a deployment-oriented question.
test_retrieval_parameters(
    "How do you use Modal for deployment?"
)

In [None]:
# Cell 9: Export Results for Analysis
def export_qa_results(results, filename="qa_analysis.csv"):
    """Write one CSV row per answered question.

    Only the ``'all_workshops'`` answer of each result is exported; results
    without one (missing key or falsy value) are skipped.

    Args:
        results: Iterable of dicts with a ``'question'`` key and, optionally,
            an ``'all_workshops'`` dict holding ``answer``, ``chunks``,
            ``context`` and ``context_info``.
        filename: Destination path handed to ``DataFrame.to_csv``.

    Returns:
        The exported pandas.DataFrame, or None when there was nothing to
        export (in which case no file is written).
    """

    rows = []

    for entry in results:
        asked = entry['question']

        qa = entry.get('all_workshops')
        if not qa:
            continue

        # Build the row in column order; context_info supplies the token
        # counts and the list of workshops that contributed.
        row = {
            'question': asked,
            'answer': qa['answer'],
            'num_chunks': len(qa['chunks']),
        }
        info = qa['context_info']
        row['context_tokens'] = info.get('context_tokens', 0)
        row['completion_tokens'] = info.get('completion_tokens', 0)
        row['workshops_used'] = ', '.join(info.get('workshops_used', []))
        row['context_length'] = len(qa['context'])
        rows.append(row)

    if not rows:
        print("❌ No results to export")
        return None

    export_df = pd.DataFrame(rows)
    export_df.to_csv(filename, index=False)
    print(f"📄 Exported {len(rows)} Q&A results to {filename}")
    print(export_df.head())
    return export_df

# Export only when the comparison cell has already populated `results`
# (at cell top level the module namespace is the local namespace).
if 'results' in globals():
    qa_df = export_qa_results(results)

In [None]:
# Cell 10: Interactive Q&A Session
def interactive_qa():
    """Run a prompt loop that answers questions until the user quits.

    Commands (case-insensitive): ``quit`` ends the session and ``status``
    calls ``get_collection_status``. Empty input is ignored. Anything else is
    answered with ``detailed_question_answer`` (chunk trace off), after which
    the user may ask for a five-chunk inspection by replying ``y`` or ``yes``.

    The session also ends cleanly when the question prompt hits end-of-input
    or is interrupted. Errors raised while answering are printed and the loop
    continues.

    Returns:
        None.
    """

    print("🎯 Interactive Q&A Session")
    print("Ask questions about your workshop transcripts!")
    print("Type 'quit' to exit, 'status' to check collection status")
    print("-" * 50)

    while True:
        try:
            question = input("\n❓ Your question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted prompt means "leave", not "crash".
            print("\n👋 Goodbye!")
            break

        command = question.lower()
        if command == 'quit':
            print("👋 Goodbye!")
            break
        elif command == 'status':
            get_collection_status()
            continue
        elif not question:
            continue

        try:
            # Get detailed answer
            result = detailed_question_answer(question, show_chunks=False)

            # Ask if user wants to see chunks. The reply is stripped like the
            # question above, so " y" or "Y " is not silently read as "no".
            show_detail = input("\n🔍 Show detailed chunk analysis? (y/n): ").strip().lower()
            if show_detail in ('y', 'yes'):
                inspect_chunks_in_detail(question, max_chunks=5)

        except Exception as e:
            print(f"❌ Error: {e}")

# Uncomment to run interactive session
# interactive_qa()