In [None]:
# This is pre-summary generation notebook v1.1. The previous version worked, but it generated 129k summaries. Additionally, this version generates summaries of 1000 words instead of
# 500 words, so we can inspect the difference in quality and nuance of ideas.
# V2: changed the vector dimensions to 1536 to match requirements later in the pipeline

In [5]:
import os
import pickle
import json
import boto3
import logging
import yaml
import pandas as pd
import numpy as np
from datetime import datetime
from typing import Dict, List, Any, Union
from tqdm import tqdm
from openai import OpenAI
from pinecone import Pinecone
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# ---------- Logging setup ----------
# Records at INFO and above go to two handlers: the file
# topic_summarization.log and a stream handler (console / notebook output).
logging.basicConfig(
    handlers=[
        logging.FileHandler('topic_summarization.log'),
        logging.StreamHandler()
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by everything defined below.
logger = logging.getLogger(__name__)

# ---------- Config ----------
class Config:
    """Fixed pipeline settings plus the API keys read from a YAML file.

    File paths, the S3 bucket/prefix, the Pinecone index name, model names
    and processing limits are hard-coded here; only ``openai_api_key`` and
    ``pinecone_api_key`` come from ``config_file``.

    Args:
        config_file: Path of the YAML file that holds the API keys.
        summary_words: Target summary length in words, stored as
            ``MAX_SUMMARY_WORDS``.

    Raises:
        FileNotFoundError: ``config_file`` does not exist.
        yaml.YAMLError: ``config_file`` cannot be parsed as YAML.
        ValueError: The YAML top level is not a mapping, or a required API
            key is missing or empty.
    """

    def __init__(self, config_file="config.yaml", summary_words=500):
        # Load configuration from YAML file
        self.config_data = self.load_config(config_file)

        # File paths
        self.CLUSTERED_VECTORS_PATH = "rizzbot_data/cleaned_clustered_vectors.pkl"
        self.TOPIC_MODEL_PATH = "rizzbot_data/bertopic_model"

        # S3 settings
        self.S3_BUCKET = "rizzbot-temp-storage"
        self.S3_PREFIX = "rizzbot/Summaries/"

        # Pinecone settings
        self.PINECONE_INDEX = "rizzbot-summaries"

        # Model settings
        self.EMBEDDING_MODEL = "text-embedding-3-small"
        self.SUMMARY_MODEL = "gpt-4o-mini"
        self.EMBEDDING_DIMENSIONS = 1536

        # Processing settings
        self.MAX_SUMMARY_WORDS = summary_words  # Target length requested in the prompt
        self.MAX_DOCS_PER_TOPIC = 50            # Documents per topic fed to the prompt
        self.CHUNK_SIZE = 500                   # Characters per Pinecone chunk

        # API keys from the YAML file
        self.OPENAI_API_KEY = self.config_data.get('openai_api_key')
        self.PINECONE_API_KEY = self.config_data.get('pinecone_api_key')

        # Fail fast if a key is missing
        self.validate_config()

    def load_config(self, config_file: str) -> dict:
        """Read ``config_file`` and return its top-level mapping.

        An empty YAML document is returned as ``{}`` so that the caller
        reports the missing keys instead of failing on ``None``.

        Args:
            config_file: Path of the YAML file to read.

        Returns:
            The parsed top-level mapping (possibly empty).

        Raises:
            FileNotFoundError: The file does not exist.
            yaml.YAMLError: The file is not valid YAML.
            ValueError: The top-level YAML value is not a mapping.
        """
        try:
            with open(config_file, 'r', encoding='utf-8') as f:
                config = yaml.safe_load(f)
        except FileNotFoundError:
            logger.error(f"Configuration file {config_file} not found")
            raise
        except yaml.YAMLError as e:
            logger.error(f"Error parsing YAML file: {e}")
            raise

        # yaml.safe_load yields None for an empty document; without this the
        # later .get() calls would die with an unhelpful AttributeError.
        if config is None:
            config = {}
        if not isinstance(config, dict):
            logger.error(f"Configuration file {config_file} must contain a mapping at the top level")
            raise ValueError(
                f"Configuration file {config_file} must contain a mapping, got {type(config).__name__}"
            )

        logger.info(f"Loaded configuration from {config_file}")
        return config

    def validate_config(self):
        """Check that both API keys are present and non-empty.

        Raises:
            ValueError: One or both of ``openai_api_key`` /
                ``pinecone_api_key`` are missing or empty.
        """
        missing_keys = []

        if not self.OPENAI_API_KEY:
            missing_keys.append('openai_api_key')

        if not self.PINECONE_API_KEY:
            missing_keys.append('pinecone_api_key')

        if missing_keys:
            logger.error(f"Missing required API keys in config.yaml: {missing_keys}")
            raise ValueError(f"Missing required API keys: {', '.join(missing_keys)}")

        logger.info("All required API keys found in configuration")

class TopicSummarizer:
    """Generate one LLM summary per topic and persist it.

    For every topic in the clustered-vectors pickle the pipeline joins up to
    ``MAX_DOCS_PER_TOPIC`` documents into a prompt, asks the configured chat
    model for a summary, then writes the result to S3, to Pinecone (as
    embedded character chunks) and to a local backup directory.

    Args:
        config_file: YAML file with the API keys; passed to ``Config``.
        summary_words: Target length of each summary in words.
        run_name: Label used in S3 keys, Pinecone vector ids and the local
            backup directory. Defaults to ``"<words>word_run_<timestamp>"``.
    """

    def __init__(self, config_file="config.yaml", summary_words=500, run_name=None):
        self.config = Config(config_file, summary_words)
        self.run_name = run_name or f"{summary_words}word_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        # Clients and data stay None until initialize_clients() / load_data() run.
        self.s3_client = None
        self.openai_client = None
        self.pinecone_client = None
        self.pinecone_index = None
        self.embedder = None  # not assigned anywhere in this class; embeddings come from generate_embedding()
        self.topic_model = None
        self.clustered_vectors = None

        logger.info(f"Initialized TopicSummarizer for {summary_words}-word summaries, run: {self.run_name}")

    def initialize_clients(self):
        """Create the S3, OpenAI and Pinecone clients and smoke-test OpenAI.

        Raises:
            Exception: Any SDK error; it is logged and re-raised unchanged.
        """
        try:
            logger.info("Initializing clients...")

            # Initialize S3 client
            self.s3_client = boto3.client("s3")
            logger.info("S3 client initialized")

            # Initialize OpenAI client
            self.openai_client = OpenAI(api_key=self.config.OPENAI_API_KEY)
            logger.info("OpenAI client initialized")

            # Smoke-test with the model the run actually uses, so a key that
            # cannot reach SUMMARY_MODEL fails here rather than on topic one.
            try:
                self.openai_client.chat.completions.create(
                    model=self.config.SUMMARY_MODEL,
                    messages=[{"role": "user", "content": "Hello"}],
                    max_tokens=5
                )
                logger.info("OpenAI API connection test successful")
            except Exception as e:
                logger.error(f"OpenAI API connection test failed: {e}")
                raise

            # Initialize Pinecone client
            self.pinecone_client = Pinecone(api_key=self.config.PINECONE_API_KEY)
            self.pinecone_index = self.pinecone_client.Index(self.config.PINECONE_INDEX)
            logger.info("Pinecone client initialized")

            # No sentence-transformer is created: embeddings are requested
            # from OpenAI in generate_embedding().

            logger.info("All clients initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize clients: {e}")
            raise

    def generate_embedding(self, text: str) -> List[float]:
        """Embed ``text`` with the configured OpenAI embedding model.

        Args:
            text: Text to embed.

        Returns:
            The embedding vector, ``EMBEDDING_DIMENSIONS`` floats long.

        Raises:
            ValueError: The returned vector has an unexpected length.
            Exception: Any OpenAI SDK error; logged and re-raised.
        """
        try:
            response = self.openai_client.embeddings.create(
                model=self.config.EMBEDDING_MODEL,
                input=text,
                dimensions=self.config.EMBEDDING_DIMENSIONS
            )
            embedding = response.data[0].embedding

            # Verify dimensions before anything is written to the index
            if len(embedding) != self.config.EMBEDDING_DIMENSIONS:
                raise ValueError(f"Expected {self.config.EMBEDDING_DIMENSIONS} dimensions, got {len(embedding)}")

            return embedding

        except Exception as e:
            logger.error(f"Failed to generate embedding: {e}")
            raise

    def load_data(self):
        """Load the clustered-vectors pickle, then the BERTopic model.

        Raises:
            Exception: Any I/O, unpickling or model-loading error; logged
                and re-raised.
        """
        try:
            logger.info("Loading clustered vectors...")
            # SECURITY: pickle.load executes code embedded in the file.
            # Only point CLUSTERED_VECTORS_PATH at files this pipeline produced.
            with open(self.config.CLUSTERED_VECTORS_PATH, "rb") as f:
                self.clustered_vectors = pickle.load(f)
            logger.info(f"Loaded clustered vectors: {len(self.clustered_vectors)} items")

            if isinstance(self.clustered_vectors, pd.DataFrame):
                logger.info(f"DataFrame shape: {self.clustered_vectors.shape}, columns: {list(self.clustered_vectors.columns)}")
            else:
                logger.info(f"Data type: {type(self.clustered_vectors)}")

            self.load_topic_model()
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            raise

    def load_topic_model(self):
        """Load the BERTopic model from ``TOPIC_MODEL_PATH``.

        The model is loaded without passing an embedding model. A warning is
        always logged because summary embeddings use ``EMBEDDING_DIMENSIONS``
        while the topic model may have been trained with a different size.

        Raises:
            Exception: Any loading error; logged and re-raised.
        """
        try:
            logger.info(f"Loading BERTopic model from: {self.config.TOPIC_MODEL_PATH}")

            # NOTE: if the BERTopic model was trained with all-MiniLM-L6-v2
            # (384 dims) it may need retraining or explicit handling of the
            # dimension mismatch; it is loaded as-is here.
            self.topic_model = BERTopic.load(self.config.TOPIC_MODEL_PATH)
            topics = self.topic_model.get_topics()
            logger.info(f"Loaded BERTopic model with {len(topics)} topics")

            # Log warning about potential embedding model mismatch
            logger.warning("BERTopic model may have been trained with different embedding dimensions. "
                         "Summary embeddings will use 1536 dimensions, but topic model may use different dimensions.")

        except Exception as e:
            logger.error(f"Error loading BERTopic model: {e}")
            raise

    @staticmethod
    def _normalize_topic_id(topic_id):
        """Return integral numeric ids as built-in ``int``; pass others through."""
        if isinstance(topic_id, (int, np.integer)):
            return int(topic_id)
        if isinstance(topic_id, (float, np.floating)) and float(topic_id).is_integer():
            return int(topic_id)
        return topic_id

    def group_documents_by_topic(self) -> Dict[int, List[str]]:
        """Group document texts by topic id, skipping the noise topic ``-1``.

        Accepts either a DataFrame with ``topic_id`` and ``text`` columns or
        an iterable of dicts with those keys. Non-dict items, items without
        a topic id and items with empty text are ignored. Numeric topic ids
        are returned as built-in ``int`` in both cases so they can be used as
        JSON values/keys by the save methods.

        Returns:
            Mapping of topic id to the list of its document texts.

        Raises:
            RuntimeError: No data has been loaded yet.
            ValueError: A DataFrame lacks the ``topic_id`` or ``text`` column.
        """
        logger.info("Grouping documents by topic...")
        if self.clustered_vectors is None:
            raise RuntimeError("clustered_vectors is not loaded; call load_data() first")

        topic_to_docs = {}

        if isinstance(self.clustered_vectors, pd.DataFrame):
            df = self.clustered_vectors
            if 'topic_id' not in df.columns or 'text' not in df.columns:
                raise ValueError("clustered_vectors must include 'topic_id' and 'text'")
            for topic_id, group in df.dropna(subset=['topic_id', 'text']).groupby('topic_id'):
                if topic_id == -1:  # Skip noise cluster
                    continue
                topic_to_docs[int(topic_id)] = group['text'].tolist()
        else:
            for item in self.clustered_vectors:
                if not isinstance(item, dict):
                    continue
                topic_id = item.get("topic_id")
                text = item.get("text")
                if topic_id is None or not text:
                    continue
                # numpy scalars are not JSON-serializable; mirror the
                # DataFrame branch and hand back plain ints.
                topic_id = self._normalize_topic_id(topic_id)
                if topic_id == -1:  # Skip noise cluster
                    continue
                topic_to_docs.setdefault(topic_id, []).append(text)

        logger.info(f"Grouped documents into {len(topic_to_docs)} topics")
        return topic_to_docs

    def generate_summary(self, topic_id: int, docs: List[str]) -> str:
        """Ask the chat model for a summary of one topic's documents.

        Only the first ``MAX_DOCS_PER_TOPIC`` documents are put in the prompt.

        Args:
            topic_id: Topic identifier, used for logging.
            docs: Document texts belonging to the topic.

        Returns:
            The summary text with surrounding whitespace stripped.

        Raises:
            ValueError: The model returned no text.
            Exception: Any OpenAI SDK error; logged and re-raised.
        """
        try:
            combined_text = "\n\n".join(docs[:self.config.MAX_DOCS_PER_TOPIC])

            logger.info(f"Generating {self.config.MAX_SUMMARY_WORDS}-word summary for topic {topic_id} with {len(docs)} documents")
            logger.info(f"Combined text length: {len(combined_text)} characters")

            prompt = (
                f"You are a helpful assistant. Write a comprehensive and detailed {self.config.MAX_SUMMARY_WORDS}-word summary "
                f"of the key themes, insights, and patterns found in the following documents. "
                f"Focus on capturing the main ideas, important details, and any nuanced perspectives present in the text.\n\n"
                f"Documents:\n{combined_text}\n\n"
                f"Please provide a {self.config.MAX_SUMMARY_WORDS}-word summary:"
            )

            # Token budget: 1.5 tokens per requested word leaves some buffer
            max_tokens = int(self.config.MAX_SUMMARY_WORDS * 1.5)

            response = self.openai_client.chat.completions.create(
                model=self.config.SUMMARY_MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.5,
                max_tokens=max_tokens
            )

            choice = response.choices[0]
            # "length" means the completion stopped at max_tokens, i.e. the
            # summary is cut off mid-text; surface that instead of hiding it.
            if getattr(choice, "finish_reason", None) == "length":
                logger.warning(f"Summary for topic {topic_id} was truncated at {max_tokens} tokens")

            # content can be None (no text returned); avoid AttributeError on .strip()
            summary = (choice.message.content or "").strip()
            if not summary:
                raise ValueError(f"Model returned an empty summary for topic {topic_id}")

            logger.info(f"Successfully generated summary for topic {topic_id}: {len(summary)} characters")
            return summary

        except Exception as e:
            logger.error(f"Failed to generate summary for topic {topic_id}: {e}")
            raise

    def save_to_s3(self, topic_id: int, summary_text: str):
        """Write one summary as a JSON object to S3.

        The object key is ``<S3_PREFIX><run_name>/topic_<topic_id>.json``.

        Args:
            topic_id: Topic identifier.
            summary_text: Summary to store.

        Raises:
            Exception: Any serialization or S3 error; logged and re-raised.
        """
        try:
            s3_key = f"{self.config.S3_PREFIX}{self.run_name}/topic_{topic_id}.json"

            summary_data = {
                "topic_id": topic_id,
                "summary_text": summary_text,
                "source": "BERTopic",
                "run_name": self.run_name,
                "summary_word_target": self.config.MAX_SUMMARY_WORDS,
                "timestamp": datetime.now().isoformat(),
                "model_used": self.config.SUMMARY_MODEL
            }

            self.s3_client.put_object(
                Bucket=self.config.S3_BUCKET,
                Key=s3_key,
                Body=json.dumps(summary_data, indent=2),
                ContentType="application/json"
            )
            logger.info(f"Saved topic {topic_id} to S3: {s3_key}")

        except Exception as e:
            logger.error(f"Failed to save topic {topic_id} to S3: {e}")
            raise

    def save_to_pinecone(self, topic_id: int, summary_text: str):
        """Embed a summary in ``CHUNK_SIZE``-character chunks and upsert them.

        Vector ids are ``summary-<run_name>-<topic_id>-<chunk index>``. An
        empty or whitespace-only summary is skipped with a warning, because
        there is nothing to embed and an empty upsert would be pointless.

        Args:
            topic_id: Topic identifier.
            summary_text: Summary to chunk, embed and store.

        Raises:
            Exception: Any embedding or Pinecone error; logged and re-raised.
        """
        try:
            if not summary_text or not summary_text.strip():
                logger.warning(f"Topic {topic_id} has an empty summary; nothing to store in Pinecone")
                return

            # Fixed-width character windows; the last chunk may be shorter
            chunks = [summary_text[i:i+self.config.CHUNK_SIZE]
                     for i in range(0, len(summary_text), self.config.CHUNK_SIZE)]

            vectors_to_upsert = []
            for i, chunk in enumerate(chunks):
                embedding = self.generate_embedding(chunk)
                vector_id = f"summary-{self.run_name}-{topic_id}-{i}"

                metadata = {
                    "type": "summary",
                    "topic_id": str(topic_id),
                    "chunk_id": i,
                    "source": "BERTopic",
                    "summary_quality": "v1.0",
                    "run_name": self.run_name,
                    "summary_word_target": self.config.MAX_SUMMARY_WORDS,
                    "timestamp": datetime.now().isoformat(),
                    "model_used": self.config.SUMMARY_MODEL,
                    "embedding_model": self.config.EMBEDDING_MODEL,
                    "embedding_dimensions": self.config.EMBEDDING_DIMENSIONS
                }

                vectors_to_upsert.append((vector_id, embedding, metadata))

            # One upsert call for all chunks of this topic
            self.pinecone_index.upsert(vectors=vectors_to_upsert)
            logger.info(f"Saved {len(chunks)} chunks for topic {topic_id} to Pinecone with {self.config.EMBEDDING_DIMENSIONS} dimensions")

        except Exception as e:
            logger.error(f"Failed to save topic {topic_id} to Pinecone: {e}")
            raise

    def save_summaries_locally(self, summaries: Dict[int, str]):
        """Write a local backup of all summaries.

        Creates ``rizzbot_data/summaries_<run_name>/`` containing one
        ``topic_<id>.txt`` per summary (header lines followed by the text)
        and a consolidated ``all_summaries.json``.

        Args:
            summaries: Mapping of topic id to summary text.

        Raises:
            Exception: Any file-system or serialization error; logged and
                re-raised.
        """
        try:
            local_dir = f"rizzbot_data/summaries_{self.run_name}"
            os.makedirs(local_dir, exist_ok=True)

            # Save individual summaries
            for topic_id, summary_text in summaries.items():
                filename = os.path.join(local_dir, f"topic_{topic_id}.txt")
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(f"Topic ID: {topic_id}\n")
                    f.write(f"Run Name: {self.run_name}\n")
                    f.write(f"Target Words: {self.config.MAX_SUMMARY_WORDS}\n")
                    f.write(f"Timestamp: {datetime.now().isoformat()}\n")
                    f.write(f"Model: {self.config.SUMMARY_MODEL}\n")
                    f.write("-" * 50 + "\n")
                    f.write(summary_text)

            # Save consolidated file
            consolidated_file = os.path.join(local_dir, "all_summaries.json")
            with open(consolidated_file, 'w', encoding='utf-8') as f:
                summary_data = {
                    "run_name": self.run_name,
                    "summary_word_target": self.config.MAX_SUMMARY_WORDS,
                    "timestamp": datetime.now().isoformat(),
                    "model_used": self.config.SUMMARY_MODEL,
                    "total_topics": len(summaries),
                    "summaries": summaries
                }
                json.dump(summary_data, f, indent=2, ensure_ascii=False)

            logger.info(f"Saved {len(summaries)} summaries locally in {local_dir}")

        except Exception as e:
            logger.error(f"Failed to save summaries locally: {e}")
            raise

    def run_summarization(self) -> Dict[int, str]:
        """Summarize every topic, store the results and write a local backup.

        A failure on one topic is logged and does not stop the loop. A topic
        counts as successful only when generation, the S3 write and the
        Pinecone write all succeeded. The returned mapping (and the local
        backup) contains every summary that was generated, including those
        whose remote storage failed, so generated text is never discarded.

        Returns:
            Mapping of topic id to generated summary text.

        Raises:
            Exception: Errors from grouping or from the local backup.
        """
        topic_to_docs = self.group_documents_by_topic()
        summaries = {}
        succeeded = []
        failed_topics = []

        logger.info(f"Starting {self.config.MAX_SUMMARY_WORDS}-word summarization for {len(topic_to_docs)} topics...")

        for topic_id in tqdm(topic_to_docs, desc=f"Generating {self.config.MAX_SUMMARY_WORDS}-word summaries"):
            try:
                docs = topic_to_docs[topic_id]
                summary = self.generate_summary(topic_id, docs)
                # Record the text immediately so a storage failure below
                # still leaves it available for the local backup.
                summaries[topic_id] = summary

                # Save to external services
                self.save_to_s3(topic_id, summary)
                self.save_to_pinecone(topic_id, summary)
                succeeded.append(topic_id)

            except Exception as e:
                logger.error(f"Topic {topic_id} failed: {e}")
                failed_topics.append(topic_id)

        # Save local backup
        self.save_summaries_locally(summaries)

        logger.info(f"Summarization completed for {self.run_name}")
        logger.info(f"Successful: {len(succeeded)} | Failed: {len(failed_topics)}")

        not_persisted = len(summaries) - len(succeeded)
        if not_persisted:
            logger.warning(f"{not_persisted} summaries were generated but not fully stored remotely; "
                           "they are included in the local backup")

        if failed_topics:
            logger.warning(f"Failed topics: {failed_topics}")

        return summaries

    def run(self) -> Dict[int, str]:
        """Run the whole pipeline: clients, data loading, summarization.

        Returns:
            Mapping of topic id to generated summary text.

        Raises:
            Exception: Any fatal error; logged and re-raised.
        """
        try:
            logger.info(f"Starting TopicSummarizer run: {self.run_name}")
            self.initialize_clients()
            self.load_data()
            summaries = self.run_summarization()
            logger.info(f"Run {self.run_name} completed successfully with {len(summaries)} summaries")
            return summaries
        except Exception as e:
            logger.error(f"Fatal error during execution: {e}")
            raise


def main():
    """Run the 500-word pass, then the 1000-word pass, and print a tally.

    The passes use the fixed run names "500word_summaries" and
    "1000word_summaries" and both read "config.yaml".

    Raises:
        Exception: Any failure is logged as "Script failed" and re-raised.
    """
    try:
        tally = {}
        passes = (
            (500, "=== Running 500-word summaries ==="),
            (1000, "\n=== Running 1000-word summaries ==="),
        )
        for words, banner in passes:
            print(banner)
            job = TopicSummarizer("config.yaml", summary_words=words, run_name=f"{words}word_summaries")
            tally[words] = len(job.run())
            print(f"{words}-word summaries completed: {tally[words]} summaries generated")

        # Closing report across both passes
        print("\n=== FINAL RESULTS ===")
        print(f"500-word summaries: {tally[500]}")
        print(f"1000-word summaries: {tally[1000]}")
        print(f"Total summaries generated: {tally[500] + tally[1000]}")

    except Exception as e:
        logger.error(f"Script failed: {e}")
        raise

def run_single_batch(word_count=500):
    """Run a single summarization pass with a timestamped run name.

    Args:
        word_count: Target summary length in words.

    Returns:
        The mapping returned by ``TopicSummarizer.run()``.

    Raises:
        Exception: Any failure is logged as "Script failed" and re-raised.
    """
    try:
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        job = TopicSummarizer(
            "config.yaml",
            summary_words=word_count,
            run_name=f"{word_count}word_summaries_{stamp}",
        )
        batch = job.run()

        print("\n=== RESULTS ===")
        print(f"Generated {len(batch)} summaries of {word_count} words each")
        return batch

    except Exception as e:
        logger.error(f"Script failed: {e}")
        raise

if __name__ == "__main__":
    # Entry point. Exactly one of the options below should be uncommented.
    
    # Option 1: run both the 500-word and the 1000-word passes
    # main()
    
    # Option 2 (active): run only the 1000-word pass
    run_single_batch(word_count=1000)
    
    # Option 3: run only the 500-word pass
    # run_single_batch(word_count=500)

2025-07-01 18:18:29,953 - INFO - Loaded configuration from config.yaml
2025-07-01 18:18:29,954 - INFO - All required API keys found in configuration
2025-07-01 18:18:29,955 - INFO - Initialized TopicSummarizer for 1000-word summaries, run: 1000word_summaries_20250701_181829
2025-07-01 18:18:29,956 - INFO - Starting TopicSummarizer run: 1000word_summaries_20250701_181829
2025-07-01 18:18:29,957 - INFO - Initializing clients...
2025-07-01 18:18:29,963 - INFO - S3 client initialized
2025-07-01 18:18:30,263 - INFO - OpenAI client initialized
2025-07-01 18:18:31,222 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-01 18:18:31,227 - INFO - OpenAI API connection test successful
2025-07-01 18:18:31,229 - INFO - Pinecone client initialized
2025-07-01 18:18:31,230 - INFO - All clients initialized successfully
2025-07-01 18:18:31,231 - INFO - Loading clustered vectors...
2025-07-01 18:18:31,274 - INFO - Loaded clustered vectors: 470 items
2025-07-01


=== RESULTS ===
Generated 42 summaries of 1000 words each
