In [None]:
import os
from typing import List, Tuple
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.chat_models import ChatOpenAI
from langchain_classic.chains import create_retrieval_chains
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


In [None]:

# Load variables from a local .env file into the process environment.
load_dotenv()

# This project stores the OpenAI key under "paid_api"; also accept the
# conventional OPENAI_API_KEY name so either spelling works.
api_key = os.getenv("paid_api") or os.getenv("OPENAI_API_KEY")

if not api_key:
    # Name the variables that were actually looked up so the message is actionable.
    raise ValueError(
        "OpenAI API key not found: set 'paid_api' (or 'OPENAI_API_KEY') in the .env file"
    )

print("API key loaded")

In [None]:



class LinkedInRAGAssistant:
    """RAG Assistant for answering questions about a LinkedIn profile.

    The assistant loads ``.txt`` documents, splits them into chunks, embeds
    the chunks into a Chroma vector store, and answers questions through an
    LCEL chain whose prompt receives the retrieved chunks plus the last few
    question/answer exchanges kept in ``chat_history``.
    """

    def __init__(self, persist_directory: str = "./chroma_db",
                 documents_directory: str = "./documents"):
        """
        Initialize the RAG Assistant

        Args:
            persist_directory: Directory to store ChromaDB
            documents_directory: Directory containing LinkedIn profile documents
        """
        self.persist_directory = persist_directory
        self.documents_directory = documents_directory
        # Each entry is a (question, answer) pair, oldest first.
        self.chat_history: List[Tuple[str, str]] = []

        # Keep the module-level key so create_rag_chain() can reuse it when
        # no explicit key is passed.
        self._api_key = api_key

        # Initialize embeddings (OpenAI embeddings, authenticated with the key above)
        self.embeddings = OpenAIEmbeddings(openai_api_key=self._api_key)

        # Populated later by the create/load/setup methods
        self.vector_store = None
        self.retriever = None
        self.chain = None

    def load_and_process_documents(self, chunk_size: int = 1000,
                                   chunk_overlap: int = 200) -> List:
        """
        Load and process documents from the documents directory

        Args:
            chunk_size: Size of each text chunk
            chunk_overlap: Overlap between chunks

        Returns:
            List of processed document chunks
        """
        print(f"Loading documents from {self.documents_directory}...")

        # Load all text files from directory (recursively)
        loader = DirectoryLoader(
            self.documents_directory,
            glob="**/*.txt",
            loader_cls=TextLoader
        )
        documents = loader.load()

        print(f"Loaded {len(documents)} documents")

        # Split documents into chunks, preferring paragraph and line breaks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

        chunks = text_splitter.split_documents(documents)
        print(f"Split into {len(chunks)} chunks")

        return chunks

    def create_vector_store(self, chunks: List) -> None:
        """
        Create and persist vector store with document embeddings

        Args:
            chunks: List of document chunks to embed

        Raises:
            ValueError: If ``chunks`` is empty, since an empty store cannot
                answer any question.
        """
        if not chunks:
            raise ValueError(
                f"No document chunks to embed. Add .txt files to "
                f"{self.documents_directory} before creating the vector store."
            )

        print("Creating vector store with embeddings...")

        # Create ChromaDB vector store
        self.vector_store = Chroma.from_documents(
            documents=chunks,
            embedding=self.embeddings,
            persist_directory=self.persist_directory
        )

        print(f"Vector store created and persisted to {self.persist_directory}")

    def load_existing_vector_store(self) -> bool:
        """
        Load existing vector store from disk

        Returns:
            True if successful, False otherwise
        """
        # Without this check a missing directory would be reported as a
        # successful load of a brand-new, empty store.
        if not os.path.isdir(self.persist_directory):
            print(
                "Could not load existing vector store: "
                f"{self.persist_directory} does not exist"
            )
            return False

        try:
            self.vector_store = Chroma(
                persist_directory=self.persist_directory,
                embedding_function=self.embeddings
            )
            print("Loaded existing vector store")
            return True
        except Exception as e:
            # Best-effort load: report the problem and let the caller rebuild.
            print(f"Could not load existing vector store: {e}")
            return False

    def setup_retriever(self, k: int = 4) -> None:
        """
        Setup retriever from vector store

        Args:
            k: Number of documents to retrieve

        Raises:
            ValueError: If no vector store has been created or loaded yet.
        """
        if self.vector_store is None:
            raise ValueError("Vector store not initialized. Call create_vector_store first.")

        self.retriever = self.vector_store.as_retriever(
            search_kwargs={"k": k}
        )
        print(f"Retriever configured to fetch top {k} documents")

    def create_rag_chain(self, api_key: str = None) -> None:
        """
        Create RAG chain using LCEL with conversational memory

        Args:
            api_key: OpenAI API key. If None, the key captured when the
                assistant was constructed is used.

        Raises:
            ValueError: If the retriever has not been set up yet.
        """
        if self.retriever is None:
            raise ValueError("Retriever not initialized. Call setup_retriever first.")

        # The parameter shadows the module-level key, so fall back explicitly
        # to the key stored on the instance.
        if api_key is None:
            api_key = getattr(self, "_api_key", None)

        # Initialize LLM
        llm = ChatOpenAI(
            model="gpt-3.5-turbo",
            temperature=0,
            api_key=api_key
        )

        # Create custom prompt with memory
        system_prompt = """You are an AI assistant helping to answer questions about Esther Kudoro's LinkedIn profile and professional background.

Use the following pieces of context from the profile to answer the question. If you cannot find the answer in the context, say so - don't make up information.

Always cite your sources by mentioning which part of the profile the information comes from (e.g., "According to the experience section..." or "Based on the skills listed...").

Context from LinkedIn profile:
{context}

Chat History:
{chat_history}

Answer the question conversationally, maintaining context from previous exchanges."""

        prompt = ChatPromptTemplate.from_messages([
            ("system", system_prompt),
            ("human", "{input}")
        ])

        def format_docs(docs):
            """Format retrieved documents with source information"""
            return "\n\n".join(
                f"[Source {i}: {doc.metadata.get('source', 'Unknown')}]\n{doc.page_content}"
                for i, doc in enumerate(docs, 1)
            )

        def format_chat_history(history):
            """Format chat history for the prompt"""
            if not history:
                return "No previous conversation"

            # Keep last 3 exchanges
            return "\n\n".join(
                f"Human: {human}\nAssistant: {ai}" for human, ai in history[-3:]
            )

        # Build the chain with LCEL. The history lambda reads self.chat_history
        # at invoke time, so each call sees the latest exchanges.
        self.chain = (
            {
                "context": self.retriever | format_docs,
                "chat_history": lambda _: format_chat_history(self.chat_history),
                "input": RunnablePassthrough()
            }
            | prompt
            | llm
            | StrOutputParser()
        )

        print("RAG chain created with conversational memory")

    def ask(self, question: str) -> str:
        """
        Ask a question to the RAG assistant

        Args:
            question: The question to ask

        Returns:
            The assistant's answer with sources

        Raises:
            ValueError: If the RAG chain has not been created yet.
        """
        if self.chain is None:
            raise ValueError("RAG chain not initialized. Call create_rag_chain first.")

        print(f"\nQuestion: {question}")

        # Get answer from chain
        answer = self.chain.invoke(question)

        # Update chat history so follow-up questions see this exchange
        self.chat_history.append((question, answer))

        print(f"Answer: {answer}\n")

        return answer

    def clear_history(self) -> None:
        """Clear chat history"""
        self.chat_history = []
        print("Chat history cleared")

    def initialize_from_scratch(self, chunk_size: int = 1000,
                               chunk_overlap: int = 200) -> None:
        """
        Complete initialization pipeline from scratch

        Args:
            chunk_size: Size of text chunks
            chunk_overlap: Overlap between chunks
        """
        # Load and process documents
        chunks = self.load_and_process_documents(chunk_size, chunk_overlap)

        # Create vector store
        self.create_vector_store(chunks)

        # Setup retriever
        self.setup_retriever()

        # Create RAG chain
        self.create_rag_chain()

        print("\n✅ RAG Assistant fully initialized!")

    def initialize_from_existing(self) -> None:
        """Initialize from existing vector store

        Raises:
            ValueError: If the persisted vector store cannot be loaded.
        """
        # Load existing vector store
        if not self.load_existing_vector_store():
            raise ValueError("No existing vector store found. Use initialize_from_scratch instead.")

        # Setup retriever
        self.setup_retriever()

        # Create RAG chain
        self.create_rag_chain()

        print("\nRAG Assistant initialized from existing data!")

    



In [None]:
def main():
    """Build or reload the RAG assistant and run a short smoke test.

    If the assistant's persist directory exists, the stored vector store is
    loaded; when that fails, or when the directory is missing, the store is
    rebuilt from the documents directory. The first two example questions
    are then asked to exercise the chain.

    Returns:
        The initialized LinkedInRAGAssistant instance.
    """

    print("LinkedIn Profile RAG Assistant")
    print("="*60)

    # Initialize assistant
    assistant = LinkedInRAGAssistant()

    # Check if vector store already exists
    if os.path.exists(assistant.persist_directory):
        print("\n Found existing vector store. Loading...")
        try:
            assistant.initialize_from_existing()
        except Exception as e:
            # Top-level boundary: report the failure and rebuild from documents.
            print(f"Error loading existing store: {e}")
            print("Creating new vector store...")
            assistant.initialize_from_scratch()
    else:
        print("\nNo existing vector store found. Creating new one...")
        # Report the directory the assistant actually reads from.
        print(
            f"Make sure you have populated the {assistant.documents_directory} "
            "directory with content!"
        )
        assistant.initialize_from_scratch()

    # Example questions to test the system
    test_questions = [
        "What is Esther Kudoro's professional background?",
        "What are her key skills?",
        "What experience does she have?",
        "Can you tell me about her education?",
        "What projects has she worked on?"
    ]

    print("\nTesting with Example Questions")
    print("="*60)

    for question in test_questions[:2]:  # Test with first 2 questions
        # ask() prints the question and answer itself; the return value is not needed here.
        assistant.ask(question)
        print()

    print("\nRAG Assistant ready for interactive use!")
    print("Use assistant.ask('your question') to query the system")

    return assistant


if __name__ == "__main__":
    # Build (or reload) the assistant and run the example questions.
    assistant = main()

    # Banner announcing the interactive session.
    banner_rule = "=" * 60
    print("\n" + banner_rule)
    print("Starting interactive session...")
    print(banner_rule)
