In [100]:
import os
import time
import chromadb
from chromadb.utils import embedding_functions
from openai import OpenAI
import json
import glob
from process_transcript import chunk_workshop_transcript, count_tokens, robust_chunk_workshop
from dotenv import load_dotenv
from typing import List, Dict, Any
import numpy as np
import re
import uuid

# Support both local and Modal paths - notebook compatible version
# Check for Modal environment first, then use relative paths for local notebook
if os.path.exists("/root/data"):
    DATA_DIR = "/root/data"
    CHROMA_DB_PATH = "/root/chroma_db"
else:
    # For notebook environment, go up one directory from src to project root
    current_dir = os.getcwd()
    if current_dir.endswith('/src'):
        project_root = os.path.dirname(current_dir)
    else:
        project_root = current_dir
    DATA_DIR = os.path.join(project_root, "data")
    CHROMA_DB_PATH = os.path.join(project_root, "chroma_db")

COLLECTION_NAME = "workshop_chunks_all"
EMBEDDING_MODEL = "text-embedding-3-small"
DEFAULT_MAX_TOKENS = 12000
DEFAULT_MAX_CHUNKS = 5
COMPLETION_MODEL = "gpt-4o-mini"
EMBEDDING_MAX_TOKENS = 7000

SYSTEM_PROMPT = """You are a helpful workshop assistant.
Answer questions based only on the workshop transcript sections provided.
If you don't know the answer or can't find it in the provided sections, say so.
When referencing information, mention which workshop(s) the information comes from."""

def discover_workshops(data_dir=DATA_DIR):
    """Discover all workshop VTT files in the data directory"""
    try:
        pattern = os.path.join(data_dir, "*.vtt")
        vtt_files = glob.glob(pattern)
        
        workshops = {}
        for vtt_file in vtt_files:
            filename = os.path.basename(vtt_file)
            workshop_id = filename.split('-')[0] if '-' in filename else filename.split('.')[0]
            
            workshops[workshop_id] = {
                'id': workshop_id,
                'filename': filename,
                'path': vtt_file
            }
        
        return workshops
        
    except Exception as e:
        print(f"Error discovering workshops: {e}")
        return {}

In [101]:
files_vtt = discover_workshops(data_dir=DATA_DIR)

In [102]:
files_vtt['WS5']['path']

'/home/pastor/projects/discord-chat-bot/data/WS5-C2.vtt'

In [103]:
def load_vtt_content(file_path):
    """Load VTT file and extract clean text content"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return ""
    
    lines = content.split('\n')
    content_lines = []
    
    for line in lines:
        line = line.strip()
        if (not line or 
            line == 'WEBVTT' or 
            '-->' in line or 
            re.match(r'^\d+:\d+:\d+', line) or
            re.match(r'^[A-Z]+(\s*:.*)?$', line)):
            continue
        content_lines.append(line)
    
    return " ".join(content_lines)

In [104]:
def count_tokens(text: str) -> int:
    """Count tokens using tiktoken"""
    try:
        encoding = tiktoken.encoding_for_model("gpt-4")
        return len(encoding.encode(text))
    except:
        encoding = tiktoken.get_encoding("cl100k_base")
        return len(encoding.encode(text))

In [105]:
def split_large_chunk(text: str, max_tokens: int = EMBEDDING_MAX_TOKENS) -> List[str]:
    """Split a large chunk into smaller pieces that fit within token limits"""
    if count_tokens(text) <= max_tokens:
        return [text]
    
    sentences = re.split(r'[.!?]+\s+', text)
    
    chunks = []
    current_chunk = ""
    current_tokens = 0
    
    for sentence in sentences:
        sentence_tokens = count_tokens(sentence)
        
        if sentence_tokens > max_tokens:
            words = sentence.split()
            word_chunk = ""
            word_tokens = 0
            
            for word in words:
                word_token_count = count_tokens(word)
                if word_tokens + word_token_count > max_tokens and word_chunk:
                    chunks.append(word_chunk.strip())
                    word_chunk = word
                    word_tokens = word_token_count
                else:
                    word_chunk += " " + word if word_chunk else word
                    word_tokens += word_token_count
            
            if word_chunk:
                chunks.append(word_chunk.strip())
            continue
        
        if current_tokens + sentence_tokens > max_tokens and current_chunk:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
            current_tokens = sentence_tokens
        else:
            current_chunk += ". " + sentence if current_chunk else sentence
            current_tokens += sentence_tokens
    
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks


In [106]:
def generate_embedding(text: str) -> List[float]:
    """Generate embedding for text with safe token splitting"""
    client = get_openai_client()
    
    token_count = count_tokens(text)
    
    if token_count <= EMBEDDING_MAX_TOKENS:
        response = client.embeddings.create(
            input=text,
            model=EMBEDDING_MODEL
        )
        return response.data[0].embedding
    else:
        split_texts = split_large_chunk(text, EMBEDDING_MAX_TOKENS)
        
        embeddings = []
        for split_text in split_texts:
            response = client.embeddings.create(
                input=split_text,
                model=EMBEDDING_MODEL
            )
            embeddings.append(response.data[0].embedding)
        
        avg_embedding = np.mean(embeddings, axis=0).tolist()
        return avg_embedding

In [107]:
def get_openai_client():
    """Initialize OpenAI client with API key"""
    load_dotenv()
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY not found in environment variables")
    return OpenAI(api_key=api_key)

In [108]:
EMBEDDING_MODEL = "text-embedding-3-small"
DEFAULT_MAX_TOKENS = 12000
DEFAULT_MAX_CHUNKS = 5
COMPLETION_MODEL = "gpt-4o-mini"
EMBEDDING_MAX_TOKENS = 7000

In [109]:
import tiktoken

for i in range(1, 7):
    transcript_vtt = load_vtt_content(files_vtt[f'WS{i}']['path'])
    print(len(transcript_vtt))
    print(count_tokens(transcript_vtt))
    chunk = split_large_chunk(transcript_vtt)
    print(chunk[-1][-100::])
    print(f"transcript {i} {transcript_vtt[-100:]}")
    print("embedding")
    emb_trans = generate_embedding(transcript_vtt)
    print(emb_trans[0])

121771
30201
e you in 46 h for the next workshop as well. So thanks everyone, and thanks for a great 1st session.
transcript 1 e you in 46 h for the next workshop as well. So thanks everyone, and thanks for a great 1st session.
embedding
0.003587765106931329
59521
13381
d build a session soon. And see you in the next workshop next week. Thanks, everyone. Thank you. Bye
transcript 2 d build a session soon. And see you in the next workshop next week. Thanks, everyone. Thank you. Bye
embedding
0.01878167036920786
106880
23917
nk you for all the great engagement and great questions. All right. Thanks, everyone. Thank you, Hik
transcript 3 nk you for all the great engagement and great questions. All right. Thanks, everyone. Thank you, Hik
embedding
0.008148182008881122
111936
25005
all in the next call and see you on Discord. So thanks once again, Eddie. Appreciate you, man. Yeah.
transcript 4 all in the next call and see you on Discord. So thanks once again, Eddie. Appreciate you, man. Yea

In [110]:
def get_chroma_client():
    """Initialize and return a ChromaDB client with persistence"""
    os.makedirs(CHROMA_DB_PATH, exist_ok=True)
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    return client

def get_or_create_collection(client, collection_name=COLLECTION_NAME):
    """Get or create a collection in ChromaDB"""
    try:
        collection = client.get_collection(name=collection_name)
    except:
        collection = client.create_collection(
            name=collection_name,
            metadata={"description": "Workshop transcript chunks"}
        )
    return collection

In [111]:
def add_chunks_to_collection(collection, chunks, workshop_id):
    """Add multiple chunks to the collection with workshop metadata"""
    ids = []
    documents = []
    embeddings = []
    metadatas = []
    
    for i, chunk in enumerate(chunks):
        try:
            embedding = generate_embedding(chunk['text'])
            chunk_id = f"{workshop_id}_{chunk['chunk_id']}"
            
            ids.append(chunk_id)
            documents.append(chunk['text'])
            embeddings.append(embedding)
            
            metadata = {
                'workshop_id': workshop_id,
                'position': chunk['position'],
                'token_count': chunk['token_count'],
                'source': chunk['source'],
                'timestamp': chunk.get('timestamp', 'Unknown'),
                'speaker': chunk.get('speaker', 'Unknown'),
                'original_chunk_id': chunk['chunk_id']
            }
            metadatas.append(metadata)
            
        except Exception as e:
            continue
    if ids:
        try:
            collection.add(
                ids=ids,
                documents=documents,
                embeddings=embeddings,
                metadatas=metadatas
            )
            return len(ids)
        except Exception as e:
            return 0
    else:
        return 0

In [114]:
def process_workshop(workshop_data: Dict[str, str], collection_name: str = COLLECTION_NAME) -> int:
    """Process a single workshop"""
    try:
        workshop_id = workshop_data['id']
        workshop_path = workshop_data['path']
        
        client = get_chroma_client()
        collection = get_or_create_collection(client, collection_name)
        
        # Check if already processed
        try:
            existing_results = collection.query(
                query_embeddings=[[0.0] * 1536],
                n_results=1,
                where={"workshop_id": workshop_id}
            )
            if existing_results and existing_results['ids'] and len(existing_results['ids'][0]) > 0:
                return 0
        except:
            pass
        
        chunks = robust_chunk_workshop(workshop_path, workshop_id)
        
        if not chunks:
            return 0
        
        num_added = add_chunks_to_collection(collection, chunks, workshop_id)
        return num_added
        
    except Exception as e:
        print(f"Error processing workshop {workshop_data.get('id', 'Unknown')}: {str(e)}")
        return 0

def process_all_workshops(collection_name=COLLECTION_NAME):
    """Process all discovered workshops"""
    workshops = discover_workshops()
    
    if not workshops:
        return []
    
    processed_workshops = []
    
    for workshop_id, workshop_info in workshops.items():
        print(workshop_id, workshop_info)
        try:
            num_chunks = process_workshop(workshop_info, collection_name)
            print(num_chunks)
            if num_chunks > 0:
                processed_workshops.append(workshop_id)
        except Exception as e:
            continue
    
    return processed_workshops

In [113]:
all_wsh = process_all_workshops()

WS5 {'id': 'WS5', 'filename': 'WS5-C2.vtt', 'path': '/home/pastor/projects/discord-chat-bot/data/WS5-C2.vtt'}
0
WS2 {'id': 'WS2', 'filename': 'WS2-C2.vtt', 'path': '/home/pastor/projects/discord-chat-bot/data/WS2-C2.vtt'}
0
WS1 {'id': 'WS1', 'filename': 'WS1-C2.vtt', 'path': '/home/pastor/projects/discord-chat-bot/data/WS1-C2.vtt'}
0
WS3 {'id': 'WS3', 'filename': 'WS3-C2.vtt', 'path': '/home/pastor/projects/discord-chat-bot/data/WS3-C2.vtt'}
0
WS4 {'id': 'WS4', 'filename': 'WS4-C2.vtt', 'path': '/home/pastor/projects/discord-chat-bot/data/WS4-C2.vtt'}
0
WS6 {'id': 'WS6', 'filename': 'WS6-C2.vtt', 'path': '/home/pastor/projects/discord-chat-bot/data/WS6-C2.vtt'}
0


In [115]:
def query_collection(collection, query_text, n_results=DEFAULT_MAX_CHUNKS, workshop_filter=None):
    """Query the collection for relevant documents"""
    query_embedding = generate_embedding(query_text)
    
    query_params = {
        "query_embeddings": [query_embedding],
        "n_results": n_results
    }
    
    if workshop_filter:
        if isinstance(workshop_filter, str):
            query_params["where"] = {"workshop_id": workshop_filter}
        elif isinstance(workshop_filter, list):
            query_params["where"] = {"workshop_id": {"$in": workshop_filter}}
    
    results = collection.query(**query_params)
    return results

def retrieve_relevant_chunks(question, collection_name=COLLECTION_NAME, n_results=DEFAULT_MAX_CHUNKS, workshop_filter=None):
    """Retrieve chunks from vector database for a given question"""
    client = get_chroma_client()
    collection = get_or_create_collection(client, collection_name)
    
    results = query_collection(collection, question, n_results=n_results, workshop_filter=workshop_filter)
    print(results)
    
    chunks = []
    if results and 'documents' in results and results['documents'] and len(results['documents'][0]) > 0:
        for i in range(len(results['documents'][0])):
            chunk = {
                'text': results['documents'][0][i],
                'metadata': results['metadatas'][0][i],
                'id': results['ids'][0][i],
                'relevance': 1.0
            }
            chunks.append(chunk)
    
    return chunks

In [116]:
def combine_chunks(chunks, max_tokens=DEFAULT_MAX_TOKENS):
    """Combine multiple chunks into a single context"""
    if not chunks:
        return ""
    
    sorted_chunks = sorted(chunks, key=lambda x: int(x['metadata'].get('position', 0)))
    
    combined_text = ""
    total_tokens = 0
    
    for chunk in sorted_chunks:
        chunk_text = chunk['text']
        chunk_tokens = int(chunk['metadata'].get('token_count', 0))
        
        if chunk_tokens == 0:
            chunk_tokens = count_tokens(chunk_text)
        
        if total_tokens + chunk_tokens > max_tokens:
            break
        
        if combined_text:
            combined_text += "\n\n--- Next Section ---\n\n"
        
        combined_text += chunk_text
        total_tokens += chunk_tokens
    
    return combined_text

def get_context_for_question(question, collection_name=COLLECTION_NAME, max_chunks=DEFAULT_MAX_CHUNKS, workshop_filter=None):
    """Get relevant context from the vector database for a question"""
    chunks = retrieve_relevant_chunks(question, collection_name, max_chunks, workshop_filter)
    print(chunks)
    
    sources = []
    for chunk in chunks:
        metadata = chunk['metadata']
        text = chunk['text']
        
        source = {
            'position': metadata.get('position', 'Unknown'),
            'timestamp': metadata.get('timestamp', "Unknown"),
            'speaker': metadata.get('speaker', "Unknown"),
            'workshop_id': metadata.get('workshop_id', "Unknown"),
            'text': text,
            'relevance': chunk.get('relevance')
        }
        sources.append(source)
    
    context = combine_chunks(chunks)
    return context, sources, chunks

In [130]:
def answer_question(question, workshop_filter=None):
    """Answer a question based on workshop transcripts"""
    client = get_chroma_client()
    collection = get_or_create_collection(client, COLLECTION_NAME)
    
    # Check if collection is empty and populate if needed
    count = collection.count()
    if count == 0:
        process_all_workshops(COLLECTION_NAME)
    
    context, sources, chunks = get_context_for_question(
        question=question,
        collection_name=COLLECTION_NAME,
        max_chunks=DEFAULT_MAX_CHUNKS,
        workshop_filter=workshop_filter
    )
    
    return context, sources, chunks

def llm_answer_question(client, context, sources, chunk, question):
    """Generate LLM answer with workshop awareness"""
    num_chunks = len(sources)
    workshops_used = list(set([source.get('workshop_id', 'Unknown') for source in sources]))
    
    client = get_openai_client()
    try:
        enhanced_system_prompt = SYSTEM_PROMPT + f"\n\nThe information provided comes from workshops: {', '.join(workshops_used)}."
        
        response = client.chat.completions.create(
            model=COMPLETION_MODEL,
            messages=[
                {"role": "system", "content": enhanced_system_prompt},
                {"role": "user", "content": f"Workshop Transcript Sections:\n{context}\n\nQuestion: {question}"}
            ],
            temperature=0
        )
        
        message = response.choices[0].message.content
        
        completion_tokens = response.usage.completion_tokens if hasattr(response, 'usage') else 0
        prompt_tokens = response.usage.prompt_tokens if hasattr(response, 'usage') else count_tokens(context)
        
        context_info = {
             "num_chunks": num_chunks,
             "context_tokens": prompt_tokens,
             "completion_tokens": completion_tokens,
             "embedding_tokens": num_chunks * 1536,
             "workshops_used": workshops_used,
             "chunks": chunks
         }
        
        return message, context_info

    except Exception as e:
        error_message = f"Sorry, an error occurred: {str(e)}"
        return error_message, {"error": str(e)}

In [131]:
llm_answer_question(client, context, source, chunk, question)

("Sorry, an error occurred: name 'chunks' is not defined",
 {'error': "name 'chunks' is not defined"})

In [119]:
client = get_chroma_client()
collection = get_or_create_collection(client, COLLECTION_NAME)

In [120]:
collection.count()

183

In [None]:
client = get_openai_client()

token_count = count_tokens(text)

if token_count <= EMBEDDING_MAX_TOKENS:
    response = client.embeddings.create(
        input=text,
        model=EMBEDDING_MODEL
    )
    return response.data[0].embedding
else:
    split_texts = split_large_chunk(text, EMBEDDING_MAX_TOKENS)
    
    embeddings = []
    for split_text in split_texts:
        response = client.embeddings.create(
            input=split_text,
            model=EMBEDDING_MODEL
        )
        embeddings.append(response.data[0].embedding)
    
    avg_embedding = np.mean(embeddings, axis=0).tolist()

In [None]:
    query_embedding = generate_embedding(query_text)
    
    query_params = {
        "query_embeddings": [query_embedding],
        "n_results": n_results
    }
    
    if workshop_filter:
        if isinstance(workshop_filter, str):
            query_params["where"] = {"workshop_id": workshop_filter}
        elif isinstance(workshop_filter, list):
            query_params["where"] = {"workshop_id": {"$in": workshop_filter}}
    
    results = collection.query(**query_params)

In [None]:
client = get_chroma_client()
collection = get_or_create_collection(client, collection_name)

results = query_collection(collection, question, n_results=n_results, workshop_filter=workshop_filter)

chunks = []
if results and 'documents' in results and results['documents'] and len(results['documents'][0]) > 0:
    for i in range(len(results['documents'][0])):
        chunk = {
            'text': results['documents'][0][i],
            'metadata': results['metadatas'][0][i],
            'id': results['ids'][0][i],
            'relevance': 1.0
        }
        chunks.append(chunk)

In [None]:
    chunks = retrieve_relevant_chunks(question, collection_name, max_chunks, workshop_filter)
    
    sources = []
    for chunk in chunks:
        metadata = chunk['metadata']
        text = chunk['text']
        
        source = {
            'position': metadata.get('position', 'Unknown'),
            'timestamp': metadata.get('timestamp', "Unknown"),
            'speaker': metadata.get('speaker', "Unknown"),
            'workshop_id': metadata.get('workshop_id', "Unknown"),
            'text': text,
            'relevance': chunk.get('relevance')
        }
        sources.append(source)
    
    context = combine_chunks(chunks)

In [None]:
    client = get_chroma_client()
    collection = get_or_create_collection(client, COLLECTION_NAME)
    
    # Check if collection is empty and populate if needed
    count = collection.count()
    if count == 0:
        process_all_workshops(COLLECTION_NAME)
    
    context, sources, chunks = get_context_for_question(
        question=question,
        collection_name=COLLECTION_NAME,
        max_chunks=DEFAULT_MAX_CHUNKS,
        workshop_filter=workshop_filter
    )

In [None]:
answer_question(question)

In [None]:
!pip install langchain

In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter

character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=0
)
character_split_texts = character_splitter.split_text('\n\n'.join(transcript_vtt))


In [20]:
def word_wrap(string, n_chars=72):
    # Wrap a string at the next space after n_chars
    if len(string) < n_chars:
        return string
    else:
        return string[:n_chars].rsplit(' ', 1)[0] + '\n' + word_wrap(string[len(string[:n_chars].rsplit(' ', 1)[0])+1:], n_chars)

In [None]:
print(word_wrap(character_split_texts[10]))
print(f"\nTotal chunks: {len(character_split_texts)}")

In [None]:
!pip install sentence-transformers

In [27]:
token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)

token_split_texts = []
for text in character_split_texts:
    token_split_texts += token_splitter.split_text(text)

print(word_wrap(token_split_texts[10]))
print(f"\nTotal chunks: {len(token_split_texts)}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

i d o t h i n k i n i n t h e s h o r t t e r m s e t t i n g s o m e t
h i n g u p c o u l d b e s u p e r c o o l. s o m e o n e e l s e m e
n t i o n e d t h a t 9 h u g o b o w n e - a n d e r s o n : w i t h r
e s p e c t t o t h e t o o l o v e r w h e l m, i t ' d b e g r e a t.
t h e y d o n ' t t h i n k t h a t n e c e s s a r i l y t h e y ' l l
b e a b l e t o u s e a l l o f t h e m b y t h e e n d o f t h e c o u
r s e, a n d t h a t ' s n o t a n e x p e c t a t i o n i t ' s m o r
e f o

Total chunks: 679


In [None]:
# Cell 1: Import and Setup
import sys
import os

# Add src directory to path if needed
if 'src' not in sys.path:
    sys.path.append('src')

from vector_emb import (
    answer_question, 
    llm_answer_question, 
    get_openai_client,
    get_workshop_info,
    get_collection_status,
    format_sources
)
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

print("✅ Imports and setup complete!")

In [2]:
# Cell 2: Check Workshop Status
# See what workshops are available and if the collection is populated
workshop_info = get_workshop_info()
print(f"📚 Available workshops: {workshop_info['workshop_ids']}")
print(f"📊 Total workshops: {workshop_info['total_workshops']}")

collection_status = get_collection_status()
print(f"\n📈 Collection status: {collection_status}")

Found 6 workshops: ['WS5', 'WS2', 'WS1', 'WS3', 'WS4', 'WS6']
📚 Available workshops: ['WS5', 'WS2', 'WS1', 'WS3', 'WS4', 'WS6']
📊 Total workshops: 6
Retrieved existing collection 'workshop_chunks_all'
Collection 'workshop_chunks_all' contains 193 total chunks
Found 6 workshops: ['WS5', 'WS2', 'WS1', 'WS3', 'WS4', 'WS6']
Workshop breakdown:
  - WS5: 60 chunks ✓ Processed
  - WS2: 1 chunks ✓ Processed
  - WS1: 65 chunks ✓ Processed
  - WS3: 1 chunks ✓ Processed
  - WS4: 1 chunks ✓ Processed
  - WS6: 65 chunks ✓ Processed

📈 Collection status: {'total_chunks': 193, 'workshop_counts': {'WS5': 60, 'WS2': 1, 'WS1': 65, 'WS3': 1, 'WS4': 1, 'WS6': 65}}


In [None]:
# Cell 3: Simple Q&A Function
def ask_question(question, workshop_filter=None, show_sources=True):
    """
    Ask a question and get an answer from the workshop transcripts
    
    Args:
        question (str): Your question
        workshop_filter (str or list): Filter by specific workshop(s), e.g., "WS1" or ["WS1", "WS2"]
        show_sources (bool): Whether to display source information
    """
    print(f"🤔 Question: {question}")
    if workshop_filter:
        print(f"🎯 Filtering by workshop(s): {workshop_filter}")
    print("=" * 50)
    
    try:
        # Get relevant context and sources
        context, sources, chunks = answer_question(question, workshop_filter=workshop_filter)
        
        if not context:
            print("❌ No relevant context found for your question.")
            return
        
        # Generate LLM answer
        client = get_openai_client()
        answer, context_info = llm_answer_question(client, context, sources, chunks, question)
        
        # Display results
        print("🤖 Answer:")
        print("-" * 30)
        print(answer)
        print("\n")
        
        if show_sources:
            print("📚 Sources:")
            print("-" * 30)
            print(format_sources([{
                'workshop_id': source['workshop_id'],
                'position': source['position'],
                'speaker': source['speaker'],
                'text': source['text'][:200] + "..." if len(source['text']) > 200 else source['text']
            } for source in sources]))
        
        print("\n📊 Context Info:")
        print(f"- Chunks used: {context_info['num_chunks']}")
        print(f"- Workshops referenced: {', '.join(context_info['workshops_used'])}")
        print(f"- Context tokens: {context_info['context_tokens']}")
        print(f"- Completion tokens: {context_info['completion_tokens']}")
        
    except Exception as e:
        print(f"❌ Error: {str(e)}")

# Test the function
ask_question("What is the main topic covered in the workshops?")

In [None]:
# Cell 4: Interactive Q&A Loop
def interactive_qa():
    """Run an interactive Q&A session"""
    print("🎓 Workshop Q&A Session Started!")
    print("Type 'quit' to exit, 'workshops' to see available workshops")
    print("=" * 50)
    
    while True:
        question = input("\n💭 Your question: ").strip()
        
        if question.lower() in ['quit', 'exit', 'q']:
            print("👋 Goodbye!")
            break
        elif question.lower() == 'workshops':
            info = get_workshop_info()
            print(f"Available workshops: {', '.join(info['workshop_ids'])}")
            continue
        elif not question:
            continue
        
        # Check if user wants to filter by workshop
        workshop_filter = None
        if question.startswith('@'):
            parts = question.split(' ', 1)
            if len(parts) == 2:
                workshop_filter = parts[0][1:]  # Remove @ symbol
                question = parts[1]
                print(f"🎯 Filtering by workshop: {workshop_filter}")
        
        ask_question(question, workshop_filter=workshop_filter, show_sources=False)

# Uncomment the line below to start interactive mode
# interactive_qa()

In [None]:
# Cell 5: Specific Workshop Questions
# Example of asking questions about specific workshops

# Ask about a specific workshop
ask_question("What are the key concepts covered?", workshop_filter="WS1")

# Ask about multiple workshops
ask_question("What are the differences between the approaches?", workshop_filter=["WS1", "WS2"])

# General question across all workshops
ask_question("What are the most important takeaways?")