In [5]:
import os
import time
import chromadb
from chromadb.utils import embedding_functions
from openai import OpenAI
import json
import glob
from process_transcript import chunk_workshop_transcript, count_tokens, robust_chunk_workshop
from dotenv import load_dotenv
from typing import List, Dict, Any
import numpy as np
import re
import uuid

# Support both local and Modal paths - notebook compatible version
# Check for Modal environment first, then use relative paths for local notebook
if os.path.exists("/root/data"):
    DATA_DIR = "/root/data"
    CHROMA_DB_PATH = "/root/chroma_db"
else:
    # For notebook environment, go up one directory from src to project root
    current_dir = os.getcwd()
    if current_dir.endswith('/src'):
        project_root = os.path.dirname(current_dir)
    else:
        project_root = current_dir
    DATA_DIR = os.path.join(project_root, "data")
    CHROMA_DB_PATH = os.path.join(project_root, "chroma_db")

COLLECTION_NAME = "workshop_chunks_all"
EMBEDDING_MODEL = "text-embedding-3-small"
DEFAULT_MAX_TOKENS = 12000
DEFAULT_MAX_CHUNKS = 5
COMPLETION_MODEL = "gpt-4o-mini"
EMBEDDING_MAX_TOKENS = 8000

SYSTEM_PROMPT = """You are a helpful workshop assistant.
Answer questions based only on the workshop transcript sections provided.
If you don't know the answer or can't find it in the provided sections, say so.
When referencing information, mention which workshop(s) the information comes from."""

def discover_workshops(data_dir=DATA_DIR):
    """Discover all workshop VTT files in the data directory"""
    try:
        pattern = os.path.join(data_dir, "*.vtt")
        vtt_files = glob.glob(pattern)
        
        workshops = {}
        for vtt_file in vtt_files:
            filename = os.path.basename(vtt_file)
            workshop_id = filename.split('-')[0] if '-' in filename else filename.split('.')[0]
            
            workshops[workshop_id] = {
                'id': workshop_id,
                'filename': filename,
                'path': vtt_file
            }
        
        return workshops
        
    except Exception as e:
        print(f"Error discovering workshops: {e}")
        return {}

In [6]:
files_vtt = discover_workshops(data_dir=DATA_DIR)

In [11]:
files_vtt['WS5']['path']

'/home/pastor/projects/discord-chat-bot/data/WS5-C2.vtt'

In [12]:
def load_vtt_content(file_path):
    """Load VTT file and extract clean text content"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return ""
    
    lines = content.split('\n')
    content_lines = []
    
    for line in lines:
        line = line.strip()
        if (not line or 
            line == 'WEBVTT' or 
            '-->' in line or 
            re.match(r'^\d+:\d+:\d+', line) or
            re.match(r'^[A-Z]+(\s*:.*)?$', line)):
            continue
        content_lines.append(line)
    
    return " ".join(content_lines)

In [14]:
transcript_vtt = load_vtt_content(files_vtt['WS5']['path'])

In [None]:
!pip install langchain

In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter

character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=0
)
character_split_texts = character_splitter.split_text('\n\n'.join(transcript_vtt))


In [20]:
def word_wrap(string, n_chars=72):
    # Wrap a string at the next space after n_chars
    if len(string) < n_chars:
        return string
    else:
        return string[:n_chars].rsplit(' ', 1)[0] + '\n' + word_wrap(string[len(string[:n_chars].rsplit(' ', 1)[0])+1:], n_chars)

In [None]:
print(word_wrap(character_split_texts[10]))
print(f"\nTotal chunks: {len(character_split_texts)}")

In [None]:
!pip install sentence-transformers

In [27]:
token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)

token_split_texts = []
for text in character_split_texts:
    token_split_texts += token_splitter.split_text(text)

print(word_wrap(token_split_texts[10]))
print(f"\nTotal chunks: {len(token_split_texts)}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

i d o t h i n k i n i n t h e s h o r t t e r m s e t t i n g s o m e t
h i n g u p c o u l d b e s u p e r c o o l. s o m e o n e e l s e m e
n t i o n e d t h a t 9 h u g o b o w n e - a n d e r s o n : w i t h r
e s p e c t t o t h e t o o l o v e r w h e l m, i t ' d b e g r e a t.
t h e y d o n ' t t h i n k t h a t n e c e s s a r i l y t h e y ' l l
b e a b l e t o u s e a l l o f t h e m b y t h e e n d o f t h e c o u
r s e, a n d t h a t ' s n o t a n e x p e c t a t i o n i t ' s m o r
e f o

Total chunks: 679


In [None]:
# Cell 1: Import and Setup
import sys
import os

# Add src directory to path if needed
if 'src' not in sys.path:
    sys.path.append('src')

from vector_emb import (
    answer_question, 
    llm_answer_question, 
    get_openai_client,
    get_workshop_info,
    get_collection_status,
    format_sources
)
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

print("✅ Imports and setup complete!")

In [2]:
# Cell 2: Check Workshop Status
# See what workshops are available and if the collection is populated
workshop_info = get_workshop_info()
print(f"📚 Available workshops: {workshop_info['workshop_ids']}")
print(f"📊 Total workshops: {workshop_info['total_workshops']}")

collection_status = get_collection_status()
print(f"\n📈 Collection status: {collection_status}")

Found 6 workshops: ['WS5', 'WS2', 'WS1', 'WS3', 'WS4', 'WS6']
📚 Available workshops: ['WS5', 'WS2', 'WS1', 'WS3', 'WS4', 'WS6']
📊 Total workshops: 6
Retrieved existing collection 'workshop_chunks_all'
Collection 'workshop_chunks_all' contains 193 total chunks
Found 6 workshops: ['WS5', 'WS2', 'WS1', 'WS3', 'WS4', 'WS6']
Workshop breakdown:
  - WS5: 60 chunks ✓ Processed
  - WS2: 1 chunks ✓ Processed
  - WS1: 65 chunks ✓ Processed
  - WS3: 1 chunks ✓ Processed
  - WS4: 1 chunks ✓ Processed
  - WS6: 65 chunks ✓ Processed

📈 Collection status: {'total_chunks': 193, 'workshop_counts': {'WS5': 60, 'WS2': 1, 'WS1': 65, 'WS3': 1, 'WS4': 1, 'WS6': 65}}


In [None]:
# Cell 3: Simple Q&A Function
def ask_question(question, workshop_filter=None, show_sources=True):
    """
    Ask a question and get an answer from the workshop transcripts
    
    Args:
        question (str): Your question
        workshop_filter (str or list): Filter by specific workshop(s), e.g., "WS1" or ["WS1", "WS2"]
        show_sources (bool): Whether to display source information
    """
    print(f"🤔 Question: {question}")
    if workshop_filter:
        print(f"🎯 Filtering by workshop(s): {workshop_filter}")
    print("=" * 50)
    
    try:
        # Get relevant context and sources
        context, sources, chunks = answer_question(question, workshop_filter=workshop_filter)
        
        if not context:
            print("❌ No relevant context found for your question.")
            return
        
        # Generate LLM answer
        client = get_openai_client()
        answer, context_info = llm_answer_question(client, context, sources, chunks, question)
        
        # Display results
        print("🤖 Answer:")
        print("-" * 30)
        print(answer)
        print("\n")
        
        if show_sources:
            print("📚 Sources:")
            print("-" * 30)
            print(format_sources([{
                'workshop_id': source['workshop_id'],
                'position': source['position'],
                'speaker': source['speaker'],
                'text': source['text'][:200] + "..." if len(source['text']) > 200 else source['text']
            } for source in sources]))
        
        print("\n📊 Context Info:")
        print(f"- Chunks used: {context_info['num_chunks']}")
        print(f"- Workshops referenced: {', '.join(context_info['workshops_used'])}")
        print(f"- Context tokens: {context_info['context_tokens']}")
        print(f"- Completion tokens: {context_info['completion_tokens']}")
        
    except Exception as e:
        print(f"❌ Error: {str(e)}")

# Test the function
ask_question("What is the main topic covered in the workshops?")

In [None]:
# Cell 4: Interactive Q&A Loop
def interactive_qa():
    """Run an interactive Q&A session"""
    print("🎓 Workshop Q&A Session Started!")
    print("Type 'quit' to exit, 'workshops' to see available workshops")
    print("=" * 50)
    
    while True:
        question = input("\n💭 Your question: ").strip()
        
        if question.lower() in ['quit', 'exit', 'q']:
            print("👋 Goodbye!")
            break
        elif question.lower() == 'workshops':
            info = get_workshop_info()
            print(f"Available workshops: {', '.join(info['workshop_ids'])}")
            continue
        elif not question:
            continue
        
        # Check if user wants to filter by workshop
        workshop_filter = None
        if question.startswith('@'):
            parts = question.split(' ', 1)
            if len(parts) == 2:
                workshop_filter = parts[0][1:]  # Remove @ symbol
                question = parts[1]
                print(f"🎯 Filtering by workshop: {workshop_filter}")
        
        ask_question(question, workshop_filter=workshop_filter, show_sources=False)

# Uncomment the line below to start interactive mode
# interactive_qa()

In [None]:
# Cell 5: Specific Workshop Questions
# Example of asking questions about specific workshops

# Ask about a specific workshop
ask_question("What are the key concepts covered?", workshop_filter="WS1")

# Ask about multiple workshops
ask_question("What are the differences between the approaches?", workshop_filter=["WS1", "WS2"])

# General question across all workshops
ask_question("What are the most important takeaways?")