This notebook sets up the agentic question-answering system (Rizzbot).
The simplified flow of the system is as follows:
The user submits a question as a prompt -> the question is preprocessed via prompt tuning -> a semantic search query is run over the summary database -> one of the two branches below is taken:

(Branch 1) The answer is in a summary (or spread over multiple summaries) -> use the summary/summaries to generate the answer -> output the answer to the user.

(Branch 2) The answer is not in any summary -> run a semantic search query over the entire dataset -> retrieve the relevant chunks -> generate a summary of the relevant results -> output the answer to the user.

In [7]:
# This cell sets up Rizzbot, a personal development and charisma expert chatbot.
# It uses LangChain, Pinecone, and OpenAI to answer questions from a curated database of personal development content.
# The database serves as the retrieval store of a Retrieval-Augmented Generation (RAG) system.
import os
from typing import Any, Dict, List, Optional
from dataclasses import dataclass

from dotenv import load_dotenv, find_dotenv

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_pinecone import PineconeVectorStore
from langchain.schema import Document

from langsmith import Client
import langsmith

from pinecone import Pinecone


# Environment setup: set the LangSmith tracing flags first, then load any
# variables declared in a discoverable .env file.
os.environ.update({
    "LANGCHAIN_TRACING_V2": "true",
    "LANGCHAIN_PROJECT": "rizzbot",
})
_ = load_dotenv(find_dotenv())

# API keys are read from the environment; each is None when the variable is unset.
openai_api_key, langsmith_api_key, pinecone_api_key = (
    os.getenv(var_name)
    for var_name in ("OPENAI_API_KEY", "LANGSMITH_API_KEY", "PINECONE_API_KEY")
)

# Module-level Pinecone client
pc = Pinecone(api_key=pinecone_api_key)

@dataclass
class SearchResult:
    """A single retrieved passage together with its similarity score.

    Attributes:
        content: Text of the retrieved document.
        score: Similarity score reported by the vector store for this hit.
        source: Label naming where the hit came from (e.g. "full_dataset").
        metadata: Metadata attached to the document, or None when absent.
    """
    content: str
    score: float
    source: str
    # Annotated Optional because the default is None, not an empty dict.
    metadata: Optional[Dict] = None

class Rizzbot:
    """Two-stage retrieval-augmented chatbot for charisma / personal development questions.

    A question is first matched against the Pinecone summaries index. If any
    summary scores at or above ``similarity_threshold`` the answer is generated
    from those summaries; otherwise (and only when no topic filter was given)
    the main vector store is searched and the answer is generated from the
    retrieved documents. When nothing is found a canned apology is returned.
    """

    def __init__(self, similarity_threshold: float = 0.85, top_k: int = 3):
        """Create the LLM, embedding, LangSmith and Pinecone clients.

        Args:
            similarity_threshold: Minimum match score a summary needs in order
                to be used by ``search_summaries``.
                NOTE(review): the recorded demo run found 0 summary matches at
                0.85 — confirm this default suits the embedding model in use.
            top_k: Number of hits requested from each vector search.
        """
        self.similarity_threshold = similarity_threshold
        self.top_k = top_k

        self.langsmith_client = Client(api_key=langsmith_api_key)

        self.llm = ChatOpenAI(
            model="gpt-4o",
            temperature=0.25,
            api_key=openai_api_key,
            tags=["rizzbot_v1"]
        )
        self.embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

        self.pc = Pinecone(api_key=pinecone_api_key)

        self._init_vector_stores()
        self._init_prompts()

    def _init_vector_stores(self):
        """Attach to the three Pinecone indexes used by the bot."""
        self.main_vectorstore = PineconeVectorStore(
            index_name="rizzbot",
            embedding=self.embeddings,
            text_key="full_text"
        )
        self.clusters_vectorstore = PineconeVectorStore(
            index_name="rizzbot-clusters", 
            embedding=self.embeddings
        )
        # Queried directly through the Pinecone client rather than via LangChain.
        self.summaries_index = self.pc.Index("rizzbot-summaries-full-text")

    def _init_prompts(self):
        """Build the two answer prompts and the canned no-answer message."""
        self.summary_answer_prompt = ChatPromptTemplate.from_template("""
        You are a charisma and personal development expert. Based on the following summary/summaries, 
        provide a comprehensive and helpful answer to the user's question.

        User Question: {question}

        Relevant Summary/Summaries:
        {summaries}

        Instructions:
        - Provide a clear, actionable answer based on the summaries
        - Be conversational and engaging
        - If the summaries don't fully address the question, mention this limitation
        - Keep your response focused and practical

        Answer:
        """)

        self.full_search_answer_prompt = ChatPromptTemplate.from_template("""
        You are a charisma and personal development expert. Based on the following relevant content 
        from the database, provide a comprehensive and helpful answer to the user's question.

        User Question: {question}

        Relevant Content:
        {content}

        Instructions:
        - Synthesize the information to provide a clear, actionable answer
        - Be conversational and engaging
        - Focus on practical advice and insights
        - If the content doesn't fully address the question, mention this limitation

        Answer:
        """)

        self.no_answer_response = (
            "My apologies, but I don't have a good answer for you based on the information in my database. "
            "My database is limited up to the past 5 years. For further research, we recommend you visit "
            "the source of my knowledge: https://www.youtube.com/user/Charismaoncommand/"
        )

    @langsmith.traceable(tags=["rizzbot_v1"])
    def search_summaries(self, question: str, topic_id_filter: Optional[int] = None) -> List[str]:
        """Return the summary texts that match ``question`` closely enough.

        Args:
            question: The user's question; it is embedded and used as the query vector.
            topic_id_filter: When given, only summaries whose ``topic_id``
                metadata equals this value are considered.

        Returns:
            The ``full_text`` metadata of every match scoring at least
            ``self.similarity_threshold``. Empty when nothing qualifies or
            when the query raises.
        """
        try:
            embedded_query = self.embeddings.embed_query(question)

            query_kwargs = {
                "vector": embedded_query,
                "top_k": self.top_k,
                "include_metadata": True,
            }
            # Only send a filter when one was requested, so an unfiltered
            # search never passes an empty filter dict to Pinecone.
            if topic_id_filter is not None:
                query_kwargs["filter"] = {"topic_id": {"$eq": topic_id_filter}}

            print(f"Querying Pinecone with: top_k={self.top_k}, filter={query_kwargs.get('filter', 'none')}")

            query_response = self.summaries_index.query(**query_kwargs)

            summaries = []
            for match in query_response.get("matches", []):
                if match.get("score", 0.0) < self.similarity_threshold:
                    continue
                # Metadata may be missing or explicitly None on a match.
                metadata = match.get("metadata") or {}
                summary_text = metadata.get("full_text")
                if summary_text:
                    summaries.append(summary_text)
                else:
                    print(f"Match {match['id']} has no 'full_text' in metadata.")

            print(f"Found {len(summaries)} summary match(es).")
            return summaries

        except Exception as e:
            # Best effort: a failed summary lookup lets the caller fall back.
            print(f"Error querying Pinecone summaries full text: {e}")
            return []

    @langsmith.traceable(tags=["rizzbot_v1"])
    def search_full_dataset(self, question: str) -> List[SearchResult]:
        """Search the main vector store and wrap each hit in a SearchResult.

        No score threshold is applied here; all ``self.top_k`` hits are returned.
        Returns an empty list if the search raises.
        """
        try:
            results = self.main_vectorstore.similarity_search_with_score(question, k=self.top_k)

            return [
                SearchResult(
                    content=doc.page_content,
                    score=score,
                    source="full_dataset",
                    metadata=doc.metadata
                )
                for doc, score in results
            ]
        except Exception as e:
            print(f"Error searching full dataset: {e}")
            return []

    @langsmith.traceable(tags=["rizzbot_v1"])
    def generate_summary_answer(self, question: str, summaries: List[str]) -> str:
        """Generate an answer from the given summaries using the summary prompt."""
        combined_summaries = "\n\n".join(summaries)
        chain = self.summary_answer_prompt | self.llm | StrOutputParser()
        return chain.invoke({"question": question, "summaries": combined_summaries})

    @langsmith.traceable(tags=["rizzbot_v1"])
    def generate_full_search_answer(self, question: str, search_results: List[SearchResult]) -> str:
        """Generate an answer from full-dataset hits using the full-search prompt."""
        combined_content = "\n\n".join([r.content for r in search_results])
        chain = self.full_search_answer_prompt | self.llm | StrOutputParser()
        return chain.invoke({"question": question, "content": combined_content})

    @langsmith.traceable(tags=["rizzbot_v1"])
    def answer_question(self, question: str, topic_id_filter: Optional[int] = None) -> Dict[str, Any]:
        """Answer ``question`` from summaries first, then from the full dataset.

        Args:
            question: The user's question.
            topic_id_filter: Optional topic id passed to the summary search.
                When set, the full-dataset fallback is skipped.

        Returns:
            A dict with keys ``answer``, ``source`` ("summaries",
            "full_dataset", "none" or "error"), ``num_sources`` and
            ``confidence_scores``; an ``error`` key with the exception text is
            added when an unexpected exception was caught.
        """
        try:
            print("Searching summaries...")
            summaries = self.search_summaries(question, topic_id_filter)
            if summaries:
                print(f"Found {len(summaries)} relevant summaries from Pinecone")
                answer = self.generate_summary_answer(question, summaries)
                return {
                    "answer": answer,
                    "source": "summaries",
                    "num_sources": len(summaries),
                    # search_summaries returns text only, so no scores are available here.
                    "confidence_scores": []
                }

            if topic_id_filter is None:
                print("Falling back to full dataset...")
                full_results = self.search_full_dataset(question)
                if full_results:
                    answer = self.generate_full_search_answer(question, full_results)
                    return {
                        "answer": answer,
                        "source": "full_dataset",
                        "num_sources": len(full_results),
                        "confidence_scores": [r.score for r in full_results]
                    }

            print("No relevant results found.")
            return {
                "answer": self.no_answer_response,
                "source": "none",
                "num_sources": 0,
                "confidence_scores": []
            }

        except Exception as e:
            print(f"Error in answer_question: {e}")
            return {
                "answer": self.no_answer_response,
                "source": "error",
                "num_sources": 0,
                "confidence_scores": [],
                "error": str(e)
            }

def create_rizzbot():
    """Construct a Rizzbot with no constructor arguments and return it."""
    bot_instance = Rizzbot()
    return bot_instance

@langsmith.traceable(tags=["rizzbot_v1"])
def ask_charisma_question(bot: Rizzbot, question: str, topic_id_filter: Optional[int] = None, verbose: bool = True):
    """Ask ``bot`` a question and optionally print a formatted report.

    Args:
        bot: The Rizzbot instance that answers the question.
        question: The question text.
        topic_id_filter: Optional topic id forwarded to ``bot.answer_question``.
        verbose: When true, print the question, answer and result metadata.

    Returns:
        The result dict produced by ``bot.answer_question``.
    """
    outcome = bot.answer_question(question, topic_id_filter)

    # Quiet mode: hand back the result without printing anything.
    if not verbose:
        return outcome

    divider = "=" * 50

    print(f"\n{divider}")
    print(f"Question: {question}")
    if topic_id_filter is not None:
        print(f"Topic Filter: {topic_id_filter}")
    print(divider)
    print(f"Answer: {outcome['answer']}")
    print("\nMetadata:")
    print(f"  Source: {outcome['source']}")
    print(f"  Number of sources: {outcome['num_sources']}")
    scores = outcome['confidence_scores']
    if scores:
        # Scores are shown as strings rounded to three decimals.
        formatted_scores = [format(value, ".3f") for value in scores]
        print(f"  Confidence scores: {formatted_scores}")
    print(f"{divider}\n")

    return outcome



In [9]:
# Demo cell: how to use the Rizzbot system.
# Run this cell after the cell that defines Rizzbot, create_rizzbot and
# ask_charisma_question.

# Initialize the bot (this will take a moment as it connects to all services)
print("Initializing Rizzbot System...")
bot = create_rizzbot()
print("System ready!")

# Questions to send to the bot; add more strings to try other prompts.
test_questions = [ 
    "Can you describe what makes David Dobrik charismatic?"
]

# Ask each question; verbose=True prints the question, answer and metadata.
# `result` holds the dict returned for the most recent question.
for question in test_questions:
    result = ask_charisma_question(bot, question, verbose=True)
    
    # Individual fields of the result can also be read directly:
    # print(f"Just the answer: {result['answer']}")
    # print(f"Source type: {result['source']}")

# The bot can also be called directly, without the printed report:
# custom_result = bot.answer_question("Your custom question here")
# print(custom_result)

Initializing Rizzbot System...
System ready!
Searching summaries...
Querying Pinecone with filter: none
Querying Pinecone with: top_k=3, filter={}
Found 0 summary match(es).
Falling back to full dataset...


KeyboardInterrupt: 