This notebook sets up the Agentic system.
The simplified flow of the system is as follows:
User gives input question as a prompt -> Preprocess the question via prompt tuning -> do a semantic search query over the summary database  -> 

(branch 1): is the answer in a summary (or spread over multiple)? -> use the summary/summaries to generate summary answer -> output answer to user

(branch 2): is the answer not in a summary? -> semantic search query the entire dataset for the answer -> retrieve the relevant chunks -> generate summary of the relevant results -> output answer to user

In [2]:
# rizzbot_agentic.py with logging and conversation memory

import os
import numpy as np
from typing import List, Dict, Optional, Tuple
from datetime import datetime
from langchain.schema.runnable import RunnableLambda, RunnableBranch
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.agents import AgentExecutor, Tool, initialize_agent, AgentType
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
from langsmith import Client
from langchain.retrievers.multi_query import MultiQueryRetriever


class ConversationMemory:
    """Manages conversation history and user context"""
    
    def __init__(self, max_history_length: int = 10):
        self.max_history_length = max_history_length
        self.conversation_history: List[Dict] = []
        self.user_profile: Dict = {
            'social_goals': [],
            'challenges_mentioned': [],
            'advice_given': [],
            'preferences': {},
            'success_stories': [],
            'first_interaction': None
        }
    
    def add_interaction(self, question: str, response: str, sources: List[str] = None):
        """Add a new interaction to the conversation history"""
        interaction = {
            'timestamp': datetime.now().isoformat(),
            'question': question,
            'response': response,
            'sources': sources or []
        }
        
        self.conversation_history.append(interaction)
        
        # Keep only the last N interactions
        if len(self.conversation_history) > self.max_history_length:
            self.conversation_history = self.conversation_history[-self.max_history_length:]
        
        # Update user profile
        self._update_user_profile(question, response)
        
        # Set first interaction timestamp
        if self.user_profile['first_interaction'] is None:
            self.user_profile['first_interaction'] = interaction['timestamp']
    
    def _update_user_profile(self, question: str, response: str):
        """Extract insights from the interaction to update user profile"""
        question_lower = question.lower()
        
        # Identify social goals
        goal_keywords = {
            'dating': ['dating', 'date', 'romantic', 'girlfriend', 'boyfriend'],
            'networking': ['networking', 'professional', 'work', 'career', 'colleague'],
            'public_speaking': ['presentation', 'public speaking', 'speech', 'audience'],
            'confidence': ['confidence', 'shy', 'nervous', 'anxious'],
            'conversation': ['conversation', 'small talk', 'chat', 'talking']
        }
        
        for goal, keywords in goal_keywords.items():
            if any(keyword in question_lower for keyword in keywords):
                if goal not in self.user_profile['social_goals']:
                    self.user_profile['social_goals'].append(goal)
        
        # Identify challenges mentioned
        challenge_keywords = ['struggle', 'difficult', 'hard', 'problem', 'issue', 'challenge']
        if any(keyword in question_lower for keyword in challenge_keywords):
            # Extract the context around challenge keywords
            for keyword in challenge_keywords:
                if keyword in question_lower:
                    challenge_context = question[:100]  # Keep first 100 chars as context
                    if challenge_context not in self.user_profile['challenges_mentioned']:
                        self.user_profile['challenges_mentioned'].append(challenge_context)
        
        # Track advice topics given
        advice_keywords = ['advice', 'tip', 'suggestion', 'recommend', 'try']
        if any(keyword in response.lower() for keyword in advice_keywords):
            # Simple extraction - in a real implementation, you might use NLP to extract key topics
            self.user_profile['advice_given'].append({
                'timestamp': datetime.now().isoformat(),
                'topic': question[:50] + "...",  # Truncated question as topic
                'response_snippet': response[:100] + "..."
            })
    
    def get_conversation_context(self) -> str:
        """Get formatted conversation context for the prompt"""
        if not self.conversation_history:
            return ""
        
        context_parts = []
        
        # Add user profile summary
        if self.user_profile['social_goals']:
            context_parts.append(f"User's social goals: {', '.join(self.user_profile['social_goals'])}")
        
        if self.user_profile['challenges_mentioned']:
            recent_challenges = self.user_profile['challenges_mentioned'][-2:]  # Last 2 challenges
            context_parts.append(f"Recent challenges mentioned: {'; '.join(recent_challenges)}")
        
        # Add recent conversation history
        if len(self.conversation_history) > 1:
            context_parts.append("Recent conversation:")
            for interaction in self.conversation_history[-3:]:  # Last 3 interactions
                context_parts.append(f"Q: {interaction['question']}")
                context_parts.append(f"A: {interaction['response'][:100]}...")
        
        return "\n".join(context_parts)
    
    def get_user_profile_summary(self) -> str:
        """Get a summary of the user's profile"""
        profile_parts = []
        
        if self.user_profile['social_goals']:
            profile_parts.append(f"Goals: {', '.join(self.user_profile['social_goals'])}")
        
        if self.user_profile['challenges_mentioned']:
            profile_parts.append(f"Challenges: {len(self.user_profile['challenges_mentioned'])} mentioned")
        
        if self.user_profile['advice_given']:
            profile_parts.append(f"Advice given: {len(self.user_profile['advice_given'])} topics")
        
        if self.user_profile['first_interaction']:
            profile_parts.append(f"First interaction: {self.user_profile['first_interaction']}")
        
        return "; ".join(profile_parts) if profile_parts else "New user"
    
    def is_follow_up_question(self, question: str) -> bool:
        """Check if the current question is a follow-up to previous conversation"""
        if not self.conversation_history:
            return False
        
        follow_up_indicators = [
            'what about', 'how about', 'and if', 'but what', 'also', 'additionally',
            'follow up', 'more on', 'expand on', 'tell me more', 'what else'
        ]
        
        question_lower = question.lower()
        return any(indicator in question_lower for indicator in follow_up_indicators)


class Rizzbot:
    def __init__(self, session_id: str = None):
        print("[INIT] Starting Rizzbot initialization...")
        self.session_id = session_id or f"session_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        self.memory = ConversationMemory()
        print(f"[INIT] Session ID: {self.session_id}")
        
        _ = self._load_env()
        self.similarity_threshold = 0.3
        self.top_k = 3
        self.summary_threshold = 3  # Stop after finding this many docs in summaries

        os.environ["LANGCHAIN_TRACING_V2"] = "true"
        os.environ["LANGCHAIN_PROJECT"] = "rizzbot"
        print("[ENV] Environment variables set.")

        self.client = Client()

        self.main_llm = ChatOpenAI(model="gpt-4o", temperature=0.25)
        print("[LLM] Main LLM (gpt-4o) initialized.")

        self.expand_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.6)
        print("[LLM] Expansion LLM (gpt-3.5-turbo) initialized.")

        self.embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        print("[Embeddings] OpenAI embeddings initialized.")

        self.pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
        print("[Pinecone] Pinecone client initialized.")

        self.summaries_vectorstore = PineconeVectorStore(
            index_name="rizzbot-summaries-full-text",
            embedding=self.embeddings,
            text_key="full_text"
        )
        print("[VectorStore] Summaries vector store initialized.")

        self.full_vectorstore = PineconeVectorStore(
            index_name="rizzbot", embedding=self.embeddings, text_key="full_text"
        )
        print("[VectorStore] Full vector store initialized.")

        self.no_answer_response = "Sorry bro, I couldn't find enough info to answer that confidently."

        # Updated prompt template to include conversation context
        self.base_prompt_template = ChatPromptTemplate.from_template("""
        You are a charisma and personal development expert helping someone improve their social skills.

        User Profile: {user_profile}
        Conversation Context: {conversation_context}
        
        Current Question: {question}
        Relevant Content: {content}

        Instructions:
        1. Consider the user's profile and conversation history when answering
        2. If this is a follow-up question, reference previous interactions appropriately
        3. Provide actionable, specific advice based on the context and user's goals
        4. Use examples when possible, tailored to the user's specific situation
        5. Keep the tone encouraging and supportive
        6. If the question cannot be answered from the vectorstore content, respond: "Sorry bro, I couldn't find enough info in my database to answer that confidently."
        7. Build on previous advice when relevant
        8. At the end of your response, include a "Sources:" section listing the document sources used

        Response:
        """)

        self._build_agent_chain()
        print("[INIT] Rizzbot initialized and ready.")

    def _load_env(self):
        from dotenv import load_dotenv, find_dotenv
        print("[ENV] Loading environment variables from .env file...")
        return load_dotenv(find_dotenv())

    def _embed_question(self, question: str) -> List[float]:
        print(f"[Embed] Embedding question: {question}")
        result = self.embeddings.embed_query(question)
        print(f"[Embed] Embedding result length: {len(result)}")
        return result

    def _cosine_similarity(self, vec1, vec2):
        vec1, vec2 = np.array(vec1), np.array(vec2)
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

    def _filter_by_similarity(self, query_embedding, docs, threshold):
        filtered = []
        sources = []

        for doc in docs:
            try:
                doc_embedding = self.embeddings.embed_query(doc.page_content)
                sim = self._cosine_similarity(query_embedding, doc_embedding)
                print(f"[Similarity] Score: {sim:.4f} | Text: {doc.page_content[:80]}...")

                if sim >= threshold:
                    filtered.append(doc)
                    # Extract source information from document metadata
                    source_info = self._extract_source_info(doc)
                    sources.append(source_info)
            except Exception as e:
                print(f"[Similarity] Failed to embed doc: {e}")

        return filtered, sources

    def _extract_source_info(self, doc) -> str:
        """Extract source information from document metadata"""
        if hasattr(doc, 'metadata') and doc.metadata:
            # Try to get source information from metadata
            source = doc.metadata.get('source', 'Unknown source')
            title = doc.metadata.get('title', '')
            if title:
                return f"{title} ({source})"
            else:
                return source
        else:
            # Fallback to truncated content as identifier
            return f"Document: {doc.page_content[:50]}..."

    def _hybrid_query_search(self, question: str) -> Tuple[List[str], List[str]]:
        print(f"[Search:Hybrid] Embedding question...")
        question_embedding = self._embed_question(question)
        combined_results = []
        all_sources = []

        # First, try summaries vectorstore
        print(f"[Search:Hybrid] Trying summaries vectorstore...")
        try:
            retriever = MultiQueryRetriever.from_llm(
                retriever=self.summaries_vectorstore.as_retriever(search_kwargs={"k": self.top_k}),
                llm=self.expand_llm
            )
            docs = retriever.invoke(question)
            filtered, sources = self._filter_by_similarity(question_embedding, docs, self.similarity_threshold)
            print(f"[Search:Hybrid] {len(filtered)} docs passed threshold in summaries.")
            
            if len(filtered) > self.summary_threshold:
                print(f"[Search:Hybrid] Found {len(filtered)} docs in summaries (>{self.summary_threshold}), skipping full search.")
                combined_results.extend([doc.page_content for doc in filtered])
                all_sources.extend(sources)
                return combined_results, all_sources
            else:
                combined_results.extend([doc.page_content for doc in filtered])
                all_sources.extend(sources)
        except Exception as e:
            print(f"[Search:Hybrid] Retrieval failed for summaries: {e}")

        # If we didn't find enough in summaries, search full vectorstore
        print(f"[Search:Hybrid] Trying full vectorstore...")
        try:
            retriever = MultiQueryRetriever.from_llm(
                retriever=self.full_vectorstore.as_retriever(search_kwargs={"k": self.top_k}),
                llm=self.expand_llm
            )
            docs = retriever.invoke(question)
            filtered, sources = self._filter_by_similarity(question_embedding, docs, self.similarity_threshold)
            print(f"[Search:Hybrid] {len(filtered)} docs passed threshold in full.")
            combined_results.extend([doc.page_content for doc in filtered])
            all_sources.extend(sources)
        except Exception as e:
            print(f"[Search:Hybrid] Retrieval failed for full: {e}")

        return combined_results, all_sources

    def _build_agent_chain(self):
        print("[Chain] Building agent chain...")

        def hybrid_search_with_sources(question):
            # Handle both string and dict inputs for robustness
            if isinstance(question, dict):
                question = question.get("question", "")
            results, sources = self._hybrid_query_search(question)
            if results:
                content = "\n\n".join(results)
                return content
            else:
                return self.no_answer_response
    
        self.agent_chain = (
            {
                "question": lambda q: q,
                "content": hybrid_search_with_sources,
                "conversation_context": lambda q: self.memory.get_conversation_context(),
                "user_profile": lambda q: self.memory.get_user_profile_summary(),
            }
            | self.base_prompt_template
            | self.main_llm
            | StrOutputParser()
        )

        print("[Chain] Agent chain constructed.")

    def answer_question(self, question: str) -> str:
        print(f"[Answer] Received question: {question}")
        print(f"[Memory] Is follow-up: {self.memory.is_follow_up_question(question)}")
        
        try:
            context, sources = self._hybrid_query_search(question)
            if not context:
                print("[Answer] No relevant documents found. Returning fallback response.")
                response = self.no_answer_response
            else:
                print("[Answer] Relevant context found. Generating response with LLM...")
                response = self.agent_chain.invoke(question)
                print(f"[Answer] Answer generated successfully.")
            
            # Add interaction to memory
            self.memory.add_interaction(question, response, sources)
            print(f"[Memory] Interaction added to memory. Total interactions: {len(self.memory.conversation_history)}")
            
            return response
        except Exception as e:
            print(f"[Answer] Agentic pipeline failed: {e}")
            error_response = self.no_answer_response
            self.memory.add_interaction(question, error_response, [])
            return error_response
    
    def get_conversation_summary(self) -> str:
        """Get a summary of the conversation so far"""
        return self.memory.get_conversation_context()
    
    def get_user_insights(self) -> Dict:
        """Get insights about the user based on conversation history"""
        return self.memory.user_profile
    
    def reset_conversation(self):
        """Reset the conversation memory"""
        self.memory = ConversationMemory()
        print("[Memory] Conversation memory reset.")


# Example usage:
if __name__ == "__main__":
    # Initialize the bot with a session ID
    bot = Rizzbot(session_id="test_session_001")
    
    # Example conversation
    questions = [
        "How do I start a conversation with someone I find attractive?",
        "What if they seem disinterested?",
        "How about at a networking event instead?",
        "I'm really nervous about public speaking. Any tips?"
    ]
    
    for question in questions:
        print(f"\n{'='*50}")
        print(f"User: {question}")
        response = bot.answer_question(question)
        print(f"Bot: {response}")
        print(f"User Profile: {bot.get_user_insights()}")

[INIT] Starting Rizzbot initialization...
[INIT] Session ID: test_session_001
[ENV] Loading environment variables from .env file...
[ENV] Environment variables set.
[LLM] Main LLM (gpt-4o) initialized.
[LLM] Expansion LLM (gpt-3.5-turbo) initialized.
[Embeddings] OpenAI embeddings initialized.
[Pinecone] Pinecone client initialized.
[VectorStore] Summaries vector store initialized.
[VectorStore] Full vector store initialized.
[Chain] Building agent chain...
[Chain] Agent chain constructed.
[INIT] Rizzbot initialized and ready.

User: How do I start a conversation with someone I find attractive?
[Answer] Received question: How do I start a conversation with someone I find attractive?
[Memory] Is follow-up: False
[Search:Hybrid] Embedding question...
[Embed] Embedding question: How do I start a conversation with someone I find attractive?
[Embed] Embedding result length: 1536
[Search:Hybrid] Trying summaries vectorstore...
[Similarity] Score: 0.3390 | Text: Topic ID: 44
Run Name: 1000wor

In [48]:
# Convert test_answers.txt to .csv file
import pandas as pd
def convert_txt_to_csv(txt_file: str, csv_file: str):
    print(f"[Convert] Converting {txt_file} to {csv_file}...")
    with open(txt_file, "r") as f:
        lines = f.readlines()

    data = []
    for i in range(0, len(lines), 3):
        question = lines[i].strip().replace("Question: ", "")
        answer = lines[i + 1].strip().replace("Answer: ", "")
        data.append({"Question": question, "Answer": answer})

    df = pd.DataFrame(data)
    df.to_csv(csv_file, index=False)
    print(f"[Convert] Conversion complete. Saved to {csv_file}.")
convert_txt_to_csv("test_answers.txt", "test_answers.csv")

[Convert] Converting test_answers.txt to test_answers.csv...
[Convert] Conversion complete. Saved to test_answers.csv.


In [50]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
import json
from dotenv import load_dotenv, find_dotenv
import re
import pandas as pd

_ = load_dotenv(find_dotenv())

open_ai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize evaluator LLM (use a more critical model like gpt-4 if available)
evaluator_llm = ChatOpenAI(model_name="gpt-4", temperature=0, api_key=open_ai_api_key)

# Evaluation criteria prompt
eval_prompt_template = PromptTemplate.from_template("""
You are an expert evaluator of communication advice.

Evaluate the following AI-generated answer based on these criteria (rate each 1-5):
- Helpfulness: Does it help the user improve charisma or social skill?
- Specificity: Is the advice concrete and not generic?
- Tone: Is it encouraging, empathetic, and natural?
- Groundedness: Does it appear based on the context provided?

Return your answer in JSON format like:
{{
  "helpfulness": 4,
  "specificity": 3,
  "tone": 5,
  "groundedness": 2,
  "comment": "Good tone and practical advice, but lacks reference to context."
}}

---

Question: {question}

Context: {context}

AI Answer:
{answer}
""")
def evaluate_responses_from_csv(csv_file: str):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)
    results = []

    for idx, row in df.iterrows():
        question = row["Question"]
        answer = row["Answer"]
        context = ""  # Adjust if you have context

        prompt = eval_prompt_template.format(question=question, context=context, answer=answer)
        response = evaluator_llm.predict(prompt)
        try:
            eval_result = json.loads(response)
        except json.JSONDecodeError:
            print("⚠️ Failed to parse response. Raw output:")
            print(response)
            eval_result = {}
        results.append({
            "question": question,
            "answer": answer,
            **eval_result
        })
    return results

# Example usage:
if __name__ == "__main__":
    eval_results = evaluate_responses_from_csv("test_answers.csv")
    df_eval = pd.DataFrame(eval_results)
    print(df_eval)
    df_eval.to_csv("evaluation_results.csv", index=False)


                                             question  \
0   I have a presentation tomorrow, how can I impr...   
1                     1. **Body Language and Space**:   
2                     2. **Storytelling Techniques**:   
3            3. **Confidence Through Vulnerability**:   
4                 4. **Handling Awkward Situations**:   
5                           5. **Active Engagement**:   
6   6. **Preparation and Anticipation of Questions**:   
7   Remember, the key to a successful presentation...   
8   - Document discussing body language and storyt...   
9   How can I make a good first impression on a date?   
10  1. **Be Authentic**: Authenticity is key to ma...   
11                                                NaN   
12  4. **Compliment Without Overdoing It**: Compli...   
13                                                NaN   
14  7. **Prepare Conversation Starters**: Having a...   
15                                                NaN   
16  - Document on authenticity 