# Document Ingestion, Topic Extraction, and Subreddit Discovery

This notebook handles the complete workflow from document ingestion to subreddit discovery using the Reddit Marketing AI Agent.

**Workflow:**
1. Ingest document content
2. Extract relevant topics from the document
3. Discover and rank subreddits based on topics

## Cell 1: Setup and Document Ingestion

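Initialize the storage and ingestion services, then ingest a document whose content is supplied inline (not fetched from a URL). The outcome is saved to `ingested_docs.json` for the next cell.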

In [None]:
import asyncio
import json
import os
import sys

# Add the parent directory to the system path to allow importing from src
sys.path.insert(0, os.path.abspath('../'))

from src.config.settings import settings
from src.storage.json_storage import JsonStorage
from src.storage.vector_storage import VectorStorage
from src.services.ingestion_service import IngestionService
from src.models.common import generate_id

# Initialize services: the ingestion service is constructed from the JSON
# storage and the vector storage instances created here.
json_storage = JsonStorage()
vector_storage = VectorStorage()
ingestion_service = IngestionService(json_storage, vector_storage)

# Define organization ID (consistent across all notebooks).
# It is written to ingested_docs.json below and re-read by the later cells.
organization_id = "my_marketing_org"

# Define document content and title.
# The text is passed to the ingestion service as-is (is_url=False below), and
# len(content) is recorded as "content_length", so the literal's exact
# whitespace (including the leading/trailing newlines) is significant.
content = """
We are a cutting-edge AI-powered marketing automation platform that helps businesses 
optimize their social media presence and engagement. Our platform uses advanced 
machine learning algorithms to analyze customer behavior, predict trends, and 
automatically generate personalized content for various social media channels.

Key features include:
- Automated content generation using GPT models
- Real-time sentiment analysis and engagement tracking
- Multi-platform social media management (Twitter, LinkedIn, Reddit, Facebook)
- Advanced analytics and ROI tracking
- AI-driven audience segmentation and targeting
- Automated A/B testing for content optimization

Our target audience includes:
- Small to medium businesses looking to scale their marketing efforts
- Marketing agencies seeking automation tools
- E-commerce businesses wanting to increase online presence
- SaaS companies looking to improve customer acquisition
- Content creators and influencers seeking efficiency tools

We specialize in helping businesses increase their organic reach, improve engagement 
rates, and convert social media followers into paying customers through intelligent 
automation and data-driven insights.
"""

# Title passed to the ingestion service and stored alongside the document ID.
title = "AI Marketing Automation Platform - Company Overview"

# Ingest the document
print("Starting document ingestion...")
success, message, document_id = await ingestion_service.ingest_document(
    content=content,
    title=title,
    organization_id=organization_id,
    is_url=False
)

if success:
    print(f"✅ Document ingestion successful!")
    print(f"Document ID: {document_id}")
    print(f"Message: {message}")
    
    # Save document info to JSON file
    ingested_docs_data = {
        "document_id": document_id,
        "organization_id": organization_id,
        "title": title,
        "content_length": len(content),
        "ingestion_success": True
    }
    
    with open('ingested_docs.json', 'w') as f:
        json.dump(ingested_docs_data, f, indent=2)
    
    print(f"📁 Saved document info to ingested_docs.json")
else:
    print(f"❌ Document ingestion failed: {message}")
    # Save failure info
    ingested_docs_data = {
        "document_id": None,
        "organization_id": organization_id,
        "title": title,
        "ingestion_success": False,
        "error_message": message
    }
    
    with open('ingested_docs.json', 'w') as f:
        json.dump(ingested_docs_data, f, indent=2)

## Cell 2: Extract Topics

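Extract relevant topics from the ingested document. The `document_id` and `organization_id` are read from `ingested_docs.json` (written by Cell 1), and the result is saved to `extracted_topics_output.json`.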

In [None]:
import asyncio
import json
import os
import sys

# Add the parent directory to the system path to allow importing from src
sys.path.insert(0, os.path.abspath('../'))

from src.config.settings import settings
from src.storage.json_storage import JsonStorage
from src.storage.vector_storage import VectorStorage
from src.clients.llm_client import LLMClient
from src.clients.reddit_client import RedditClient
from src.services.subreddit_service import SubredditService

# Initialize services
json_storage = JsonStorage()
vector_storage = VectorStorage()
llm_client = LLMClient()
reddit_client = RedditClient(
    client_id=settings.REDDIT_CLIENT_ID,
    client_secret=settings.REDDIT_CLIENT_SECRET,
    username=settings.REDDIT_USERNAME,
    password=settings.REDDIT_PASSWORD
)
subreddit_service = SubredditService(reddit_client, llm_client, vector_storage, json_storage)

# Load document info from previous cell
print("Loading document information...")
try:
    with open('ingested_docs.json', 'r') as f:
        ingested_data = json.load(f)
    
    document_id = ingested_data['document_id']
    organization_id = ingested_data['organization_id']
    
    if not ingested_data['ingestion_success']:
        print(f"❌ Cannot proceed: Document ingestion failed in previous cell")
        print(f"Error: {ingested_data.get('error_message', 'Unknown error')}")
    else:
        print(f"📄 Document ID: {document_id}")
        print(f"🏢 Organization ID: {organization_id}")
        
        # Extract topics from the document
        print("\nExtracting topics from document...")
        success, message, topics = await subreddit_service.extract_topics_from_documents(
            organization_id=organization_id,
            document_ids=[document_id]
        )
        
        if success:
            print(f"✅ Topic extraction successful!")
            print(f"Extracted {len(topics)} topics:")
            for i, topic in enumerate(topics, 1):
                print(f"  {i}. {topic}")
            
            # Save topics to JSON file
            extracted_topics_data = {
                "topics": topics,
                "organization_id": organization_id,
                "document_id": document_id,
                "extraction_success": True,
                "topics_count": len(topics)
            }
            
            with open('extracted_topics_output.json', 'w') as f:
                json.dump(extracted_topics_data, f, indent=2)
            
            print(f"\n📁 Saved topics to extracted_topics_output.json")
        else:
            print(f"❌ Topic extraction failed: {message}")
            # Save failure info
            extracted_topics_data = {
                "topics": [],
                "organization_id": organization_id,
                "document_id": document_id,
                "extraction_success": False,
                "error_message": message
            }
            
            with open('extracted_topics_output.json', 'w') as f:
                json.dump(extracted_topics_data, f, indent=2)

except FileNotFoundError:
    print("❌ Error: ingested_docs.json not found. Please run Cell 1 first.")
except Exception as e:
    print(f"❌ Error loading document info: {str(e)}")

## Cell 3: Discover Subreddits

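Discover and rank subreddits for the topics stored in `extracted_topics_output.json` (written by Cell 2). The ranked list is saved to `discovered_subreddits_output.json`.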

In [None]:
import asyncio
import json
import os
import sys

# Add the parent directory to the system path to allow importing from src
sys.path.insert(0, os.path.abspath('../'))

from src.config.settings import settings
from src.storage.json_storage import JsonStorage
from src.storage.vector_storage import VectorStorage
from src.clients.llm_client import LLMClient
from src.clients.reddit_client import RedditClient
from src.services.subreddit_service import SubredditService

# Initialize services
json_storage = JsonStorage()
vector_storage = VectorStorage()
llm_client = LLMClient()
reddit_client = RedditClient(
    client_id=settings.REDDIT_CLIENT_ID,
    client_secret=settings.REDDIT_CLIENT_SECRET,
    username=settings.REDDIT_USERNAME,
    password=settings.REDDIT_PASSWORD
)
subreddit_service = SubredditService(reddit_client, llm_client, vector_storage, json_storage)

# Load topics from previous cell
print("Loading extracted topics...")
try:
    with open('extracted_topics_output.json', 'r') as f:
        topics_data = json.load(f)
    
    topics = topics_data['topics']
    organization_id = topics_data['organization_id']
    
    if not topics_data['extraction_success']:
        print(f"❌ Cannot proceed: Topic extraction failed in previous cell")
        print(f"Error: {topics_data.get('error_message', 'Unknown error')}")
    else:
        print(f"📋 Loaded {len(topics)} topics:")
        for i, topic in enumerate(topics, 1):
            print(f"  {i}. {topic}")
        print(f"🏢 Organization ID: {organization_id}")
        
        # Discover and rank subreddits
        print("\nDiscovering subreddits based on topics...")
        success, message, ranked_subreddits = await subreddit_service.discover_and_rank_subreddits(
            topics=topics,
            organization_id=organization_id,
            use_rag_context=True
        )
        
        if success:
            print(f"✅ Subreddit discovery successful!")
            print(f"Found and ranked {len(ranked_subreddits)} subreddits:")
            for i, subreddit in enumerate(ranked_subreddits, 1):
                print(f"  {i}. r/{subreddit}")
            
            # Save subreddits to JSON file
            discovered_subreddits_data = {
                "ranked_subreddits": ranked_subreddits,
                "organization_id": organization_id,
                "source_topics": topics,
                "discovery_success": True,
                "subreddits_count": len(ranked_subreddits),
                "discovery_message": message
            }
            
            with open('discovered_subreddits_output.json', 'w') as f:
                json.dump(discovered_subreddits_data, f, indent=2)
            
            print(f"\n📁 Saved subreddits to discovered_subreddits_output.json")
        else:
            print(f"❌ Subreddit discovery failed: {message}")
            # Save failure info
            discovered_subreddits_data = {
                "ranked_subreddits": [],
                "organization_id": organization_id,
                "source_topics": topics,
                "discovery_success": False,
                "error_message": message
            }
            
            with open('discovered_subreddits_output.json', 'w') as f:
                json.dump(discovered_subreddits_data, f, indent=2)

except FileNotFoundError:
    print("❌ Error: extracted_topics_output.json not found. Please run Cell 2 first.")
except Exception as e:
    print(f"❌ Error loading topics: {str(e)}")