This notebook sets up the Agentic system.
The simplified flow of the system is as follows:
User gives input question as a prompt -> Preprocess the question via prompt tuning -> do a semantic search query over the summary database  -> 

(branch 1): is the answer in a summary (or spread over multiple)? -> use the summary/summaries to generate summary answer -> output answer to user

(branch 2): is the answer not in a summary? -> semantic search query the entire dataset for the answer -> retrieve the relevant chunks -> generate summary of the relevant results -> output answer to user

In [45]:
# rizzbot_agentic.py with logging

import os
import numpy as np
from typing import List, Dict, Optional, Tuple
from langchain.schema.runnable import RunnableLambda, RunnableBranch
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.agents import AgentExecutor, Tool, initialize_agent, AgentType
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
from langsmith import Client
from langchain.retrievers.multi_query import MultiQueryRetriever


class Rizzbot:
    def __init__(self):
        print("[INIT] Starting Rizzbot initialization...")
        _ = self._load_env()
        self.similarity_threshold = 0.3
        self.top_k = 3
        self.summary_threshold = 2  # Stop after finding this many docs in summaries
        self.min_docs_threshold = 2  # Minimum docs required to attempt answer generation

        os.environ["LANGCHAIN_TRACING_V2"] = "true"
        os.environ["LANGCHAIN_PROJECT"] = "rizzbot"
        print("[ENV] Environment variables set.")

        self.client = Client()

        self.main_llm = ChatOpenAI(model="gpt-4o", temperature=0.25)
        print("[LLM] Main LLM (gpt-4o) initialized.")

        self.expand_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.6)
        print("[LLM] Expansion LLM (gpt-3.5-turbo) initialized.")

        self.embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        print("[Embeddings] OpenAI embeddings initialized.")

        self.pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
        print("[Pinecone] Pinecone client initialized.")

        self.summaries_vectorstore = PineconeVectorStore(
            index_name="rizzbot-summaries-full-text",
            embedding=self.embeddings,
            text_key="full_text"
        )
        print("[VectorStore] Summaries vector store initialized.")

        self.full_vectorstore = PineconeVectorStore(
            index_name="rizzbot", embedding=self.embeddings, text_key="full_text"
        )
        print("[VectorStore] Full vector store initialized.")

        self.no_answer_response = "Sorry bro, I couldn't find enough info to answer that confidently."

        self.base_prompt_template = ChatPromptTemplate.from_template("""
        You are a charisma and personal development expert helping someone improve their social skills.

        Context: {content}
        Question: {question}

        Instructions:
        1. Analyze the question and context. Check the vectorstores for an answer. If the answer can not be found in the vectorstore, answer: "Sorry bro, I couldn't find enough info in my database to answer that confidently.""
        2. If the question is not clear, ask for clarification.
        3. If the question is clear, provide actionable, specific advice based on the context.
        4. Use examples when possible
        5. Keep the tone encouraging and supportive
        6. If information is insufficient, explain what you'd need to give a better answer
        7. At the end of your response, include a "Sources:" section listing the document sources used

        Response:
        """)

        self._build_agent_chain()
        print("[INIT] Rizzbot initialized and ready.")

    def _load_env(self):
        from dotenv import load_dotenv, find_dotenv
        print("[ENV] Loading environment variables from .env file...")
        return load_dotenv(find_dotenv())

    def _embed_question(self, question: str) -> List[float]:
        print(f"[Embed] Embedding question: {question}")
        result = self.embeddings.embed_query(question)
        print(f"[Embed] Embedding result length: {len(result)}")
        return result

    def _cosine_similarity(self, vec1, vec2):
        vec1, vec2 = np.array(vec1), np.array(vec2)
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

    def _filter_by_similarity(self, query_embedding, docs, threshold):
        filtered = []
        sources = []

        for doc in docs:
            try:
                doc_embedding = self.embeddings.embed_query(doc.page_content)
                sim = self._cosine_similarity(query_embedding, doc_embedding)
                print(f"[Similarity] Score: {sim:.4f} | Text: {doc.page_content[:80]}...")

                if sim >= threshold:
                    filtered.append(doc)
                    # Extract source information from document metadata
                    source_info = self._extract_source_info(doc)
                    sources.append(source_info)
            except Exception as e:
                print(f"[Similarity] Failed to embed doc: {e}")

        return filtered, sources

    def _extract_source_info(self, doc) -> str:
        """Extract source information from document metadata"""
        if hasattr(doc, 'metadata') and doc.metadata:
            # Try to get source information from metadata
            source = doc.metadata.get('source', 'Unknown source')
            title = doc.metadata.get('title', '')
            if title:
                return f"{title} ({source})"
            else:
                return source
        else:
            # Fallback to truncated content as identifier
            return f"Document: {doc.page_content[:50]}..."

    def _hybrid_query_search(self, question: str) -> Tuple[List[str], List[str]]:
        print(f"[Search:Hybrid] Embedding question...")
        question_embedding = self._embed_question(question)
        combined_results = []
        all_sources = []

        # First, try summaries vectorstore
        print(f"[Search:Hybrid] Trying summaries vectorstore...")
        try:
            retriever = MultiQueryRetriever.from_llm(
                retriever=self.summaries_vectorstore.as_retriever(search_kwargs={"k": self.top_k}),
                llm=self.expand_llm
            )
            docs = retriever.invoke(question)
            filtered, sources = self._filter_by_similarity(question_embedding, docs, self.similarity_threshold)
            print(f"[Search:Hybrid] {len(filtered)} docs passed threshold in summaries.")
            
            if len(filtered) > self.summary_threshold:
                print(f"[Search:Hybrid] Found {len(filtered)} docs in summaries (>{self.summary_threshold}), skipping full search.")
                combined_results.extend([doc.page_content for doc in filtered])
                all_sources.extend(sources)
                return combined_results, all_sources
            else:
                combined_results.extend([doc.page_content for doc in filtered])
                all_sources.extend(sources)
        except Exception as e:
            print(f"[Search:Hybrid] Retrieval failed for summaries: {e}")

        # If we didn't find enough in summaries, search full vectorstore
        print(f"[Search:Hybrid] Trying full vectorstore...")
        try:
            retriever = MultiQueryRetriever.from_llm(
                retriever=self.full_vectorstore.as_retriever(search_kwargs={"k": self.top_k}),
                llm=self.expand_llm
            )
            docs = retriever.invoke(question)
            filtered, sources = self._filter_by_similarity(question_embedding, docs, self.similarity_threshold)
            print(f"[Search:Hybrid] {len(filtered)} docs passed threshold in full.")
            combined_results.extend([doc.page_content for doc in filtered])
            all_sources.extend(sources)
        except Exception as e:
            print(f"[Search:Hybrid] Retrieval failed for full: {e}")

        return combined_results, all_sources

    def _build_agent_chain(self):
        print("[Chain] Building agent chain...")

        def format_content_with_sources(content_and_sources):
            """Format content with sources for the LLM"""
            content, sources = content_and_sources
            if content:
                formatted_content = content
                if sources:
                    formatted_content += f"\n\nSources: {', '.join(set(sources))}"
                return formatted_content
            else:
                return self.no_answer_response
    
        self.agent_chain = (
            {
                "question": lambda q: q,
                "content": format_content_with_sources,
            }
            | self.base_prompt_template
            | self.main_llm
            | StrOutputParser()
        )

        print("[Chain] Agent chain constructed.")

    def answer_question(self, question: str) -> str:
        print(f"[Answer] Received question: {question}")
        try:
            # Perform search once
            context, sources = self._hybrid_query_search(question)
            
            # Check if we have enough documents
            if not context or len(context) < self.min_docs_threshold:
                print(f"[Answer] Insufficient documents found ({len(context) if context else 0} docs, need {self.min_docs_threshold}). Returning fallback response.")
                return self.no_answer_response

            print(f"[Answer] Found {len(context)} relevant documents. Generating response with LLM...")
            
            # Format content with sources
            content_text = "\n\n".join(context)
            if sources:
                content_text += f"\n\nSources: {', '.join(set(sources))}"
            
            # Pass the pre-searched content to the chain
            answer = self.agent_chain.invoke({
                "question": question,
                "content": (content_text, sources)
            })
            
            print(f"[Answer] Answer generated successfully.")
            return answer
        except Exception as e:
            print(f"[Answer] Agentic pipeline failed: {e}")
            return self.no_answer_response
if __name__ == "__main__":
    print("[Test] Starting test run for Rizzbot...\n")
    
    bot = Rizzbot()
    sample_questions = ["I have a presentation tomorrow, how can I improve my presence on stage?", "How can I make a good first impression on a date?" , "What are some examples of behaviours that celebrities show that make them charismatic?", 
                        "What can I do to my body language to feel more confident?", "How can I connect with people better?"]

    
    for sample_question in sample_questions:
        print(f"\n[Test] Asking: {sample_question}\n")
        print(f"[Test] Question: {sample_question}")
        answer = bot.answer_question(sample_question)
        # save the answer to a .txt file
        with open("test_answers.txt", "a") as f:
            f.write(f"Question: {sample_question}\nAnswer: {answer}\n\n")
        print(f"[Test] Answer: {answer}\n")
    print("[Test] All questions answered. Check test_answers.txt for results.\n")

    print("\n[Test] Final Answer:")
    print(answer)


[Test] Starting test run for Rizzbot...

[INIT] Starting Rizzbot initialization...
[ENV] Loading environment variables from .env file...
[ENV] Environment variables set.
[LLM] Main LLM (gpt-4o) initialized.
[LLM] Expansion LLM (gpt-3.5-turbo) initialized.
[Embeddings] OpenAI embeddings initialized.
[Pinecone] Pinecone client initialized.
[VectorStore] Summaries vector store initialized.
[VectorStore] Full vector store initialized.
[Chain] Building agent chain...
[Chain] Agent chain constructed.
[INIT] Rizzbot initialized and ready.

[Test] Asking: I have a presentation tomorrow, how can I improve my presence on stage?

[Test] Question: I have a presentation tomorrow, how can I improve my presence on stage?
[Answer] Received question: I have a presentation tomorrow, how can I improve my presence on stage?
[Search:Hybrid] Embedding question...
[Embed] Embedding question: I have a presentation tomorrow, how can I improve my presence on stage?
[Embed] Embedding result length: 1536
[Search:

In [48]:
# Convert test_answers.txt to .csv file
import pandas as pd
def convert_txt_to_csv(txt_file: str, csv_file: str):
    print(f"[Convert] Converting {txt_file} to {csv_file}...")
    with open(txt_file, "r") as f:
        lines = f.readlines()

    data = []
    for i in range(0, len(lines), 3):
        question = lines[i].strip().replace("Question: ", "")
        answer = lines[i + 1].strip().replace("Answer: ", "")
        data.append({"Question": question, "Answer": answer})

    df = pd.DataFrame(data)
    df.to_csv(csv_file, index=False)
    print(f"[Convert] Conversion complete. Saved to {csv_file}.")
convert_txt_to_csv("test_answers.txt", "test_answers.csv")

[Convert] Converting test_answers.txt to test_answers.csv...
[Convert] Conversion complete. Saved to test_answers.csv.


In [50]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
import json
from dotenv import load_dotenv, find_dotenv
import re
import pandas as pd

_ = load_dotenv(find_dotenv())

open_ai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize evaluator LLM (use a more critical model like gpt-4 if available)
evaluator_llm = ChatOpenAI(model_name="gpt-4", temperature=0, api_key=open_ai_api_key)

# Evaluation criteria prompt
eval_prompt_template = PromptTemplate.from_template("""
You are an expert evaluator of communication advice.

Evaluate the following AI-generated answer based on these criteria (rate each 1-5):
- Helpfulness: Does it help the user improve charisma or social skill?
- Specificity: Is the advice concrete and not generic?
- Tone: Is it encouraging, empathetic, and natural?
- Groundedness: Does it appear based on the context provided?

Return your answer in JSON format like:
{{
  "helpfulness": 4,
  "specificity": 3,
  "tone": 5,
  "groundedness": 2,
  "comment": "Good tone and practical advice, but lacks reference to context."
}}

---

Question: {question}

Context: {context}

AI Answer:
{answer}
""")
def evaluate_responses_from_csv(csv_file: str):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)
    results = []

    for idx, row in df.iterrows():
        question = row["Question"]
        answer = row["Answer"]
        context = ""  # Adjust if you have context

        prompt = eval_prompt_template.format(question=question, context=context, answer=answer)
        response = evaluator_llm.predict(prompt)
        try:
            eval_result = json.loads(response)
        except json.JSONDecodeError:
            print("⚠️ Failed to parse response. Raw output:")
            print(response)
            eval_result = {}
        results.append({
            "question": question,
            "answer": answer,
            **eval_result
        })
    return results

# Example usage:
if __name__ == "__main__":
    eval_results = evaluate_responses_from_csv("test_answers.csv")
    df_eval = pd.DataFrame(eval_results)
    print(df_eval)
    df_eval.to_csv("evaluation_results.csv", index=False)


                                             question  \
0   I have a presentation tomorrow, how can I impr...   
1                     1. **Body Language and Space**:   
2                     2. **Storytelling Techniques**:   
3            3. **Confidence Through Vulnerability**:   
4                 4. **Handling Awkward Situations**:   
5                           5. **Active Engagement**:   
6   6. **Preparation and Anticipation of Questions**:   
7   Remember, the key to a successful presentation...   
8   - Document discussing body language and storyt...   
9   How can I make a good first impression on a date?   
10  1. **Be Authentic**: Authenticity is key to ma...   
11                                                NaN   
12  4. **Compliment Without Overdoing It**: Compli...   
13                                                NaN   
14  7. **Prepare Conversation Starters**: Having a...   
15                                                NaN   
16  - Document on authenticity 