In [1]:
from dotenv import load_dotenv
import gradio as gr
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
import os, base64, re, glob
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from bs4 import BeautifulSoup
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from langchain.prompts import PromptTemplate

In [2]:
MODEL = "gpt-4o-mini"  # Using the lower cost model
DB_NAME = "vector_db_emails"
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
MAX_EMAILS = 100

In [3]:
load_dotenv(override=True)
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [None]:
def gmail_authenticate():
    """Authenticate with Gmail API and return service"""
    creds = None
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)

        # Save the credentials
        with open('token.json', 'w') as token:
            token.write(creds.to_json())

    service = build('gmail', 'v1', credentials=creds)
    return service

def get_emails(service, label_ids=['INBOX'], max_results=MAX_EMAILS):
    """Get a list of email IDs from Gmail"""
    result = service.users().messages().list(userId='me', labelIds=label_ids, maxResults=max_results).execute()
    messages = result.get('messages', [])
    return messages

def clean_text(text):
    """Clean the text content of emails"""
    if not text:
        return ""
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove common email footer markers
    text = re.sub(r'Sent from my iPhone|Sent from Gmail|Get Outlook for|This email and any files.*?confidential|CONFIDENTIALITY NOTICE.*?$', '', text, flags=re.DOTALL|re.IGNORECASE)
    return text

def get_message_detail(service, msg_id):
    """Get detailed information about an email"""
    try:
        msg = service.users().messages().get(userId='me', id=msg_id, format='full').execute()
        payload = msg['payload']
        headers = payload['headers']

        subject = sender = date = "Unknown"
        for header in headers:
            if header['name'].lower() == 'subject':
                subject = header['value']
            elif header['name'].lower() == 'from':
                sender = header['value']
            elif header['name'].lower() == 'date':
                date = header['value']

        # Extract body with better handling of multipart messages
        body = extract_body(payload)
        
        # Clean the body text
        body = clean_text(body)
        
        return subject, sender, date, body
    except Exception as e:
        print(f"Error processing message {msg_id}: {str(e)}")
        return "Error", "Error", "Error", f"Error processing this message: {str(e)}"

def extract_body(payload):
    """Extract the email body with better MIME part handling"""
    body = ""
    
    if 'parts' in payload:
        # Multipart message
        for part in payload['parts']:
            if part['mimeType'] == 'text/plain':
                data = part['body'].get('data')
                if data:
                    body = base64.urlsafe_b64decode(data).decode('utf-8', errors='ignore')
                    break
            elif part['mimeType'] == 'text/html':
                data = part['body'].get('data')
                if data:
                    html = base64.urlsafe_b64decode(data).decode('utf-8', errors='ignore')
                    soup = BeautifulSoup(html, 'html.parser')
                    body = soup.get_text(separator=' ', strip=True)
                    break
            elif 'parts' in part:
                # Handle nested multipart messages
                nested_body = extract_body(part)
                if nested_body:
                    body = nested_body
                    break
    elif 'body' in payload and 'data' in payload['body']:
        # Single part message
        data = payload['body'].get('data')
        if data:
            if payload['mimeType'] == 'text/html':
                html = base64.urlsafe_b64decode(data).decode('utf-8', errors='ignore')
                soup = BeautifulSoup(html, 'html.parser')
                body = soup.get_text(separator=' ', strip=True)
            else:
                body = base64.urlsafe_b64decode(data).decode('utf-8', errors='ignore')
    
    return body

def save_email_as_md(subject, sender, date, body, index):
    """Save email content as markdown file"""
    folder_path = "knowledge-base/emails"
    os.makedirs(folder_path, exist_ok=True)

    # Create a safe filename
    safe_subject = re.sub(r'[^a-zA-Z0-9_\- ]', '', subject)[:50]  # limit length
    filename = f"{index:03d}_{safe_subject}.md"
    filepath = os.path.join(folder_path, filename)

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(f"# Subject: {subject}\n\n")
        f.write(f"**From:** {sender}\n\n")
        f.write(f"**Date:** {date}\n\n")
        f.write(f"**Content:**\n\n{body}")
    
    return filepath

def fetch_and_save_emails():
    """Fetch emails from Gmail and save them as markdown files"""
    print("Authenticating with Gmail...")
    service = gmail_authenticate()
    
    print(f"Fetching up to {MAX_EMAILS} emails...")
    messages = get_emails(service, max_results=MAX_EMAILS)
    
    saved_files = []
    print(f"Processing {len(messages)} emails...")
    for idx, msg in enumerate(messages):
        msg_id = msg['id']
        subject, sender, date, body = get_message_detail(service, msg_id)
        if body and len(body) > 10:  # Only save emails with meaningful content
            filepath = save_email_as_md(subject, sender, date, body, idx)
            saved_files.append(filepath)
            if idx % 10 == 0 and idx > 0:
                print(f"Processed {idx} emails...")
    
    print(f"Saved {len(saved_files)} emails as markdown files.")
    return saved_files

def build_vectorstore():
    """Build a vector store from the saved email markdown files"""
    # Check if knowledge base directory exists
    if not os.path.exists("knowledge-base/emails"):
        print("No email data found. Fetching emails first...")
        fetch_and_save_emails()
    
    print("Loading documents...")
    # Load the documents
    text_loader_kwargs = {"encoding": "utf-8"}
    loader = DirectoryLoader("knowledge-base/emails", glob="**/*.md", 
                          loader_cls=TextLoader, 
                          loader_kwargs=text_loader_kwargs)
    documents = loader.load()
    
    # Add metadata
    for doc in documents:
        doc.metadata["doc_type"] = "email"
    
    print(f"Loaded {len(documents)} documents.")
    
    # Use a more advanced text splitter with better chunk handling
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n# ", "\n## ", "\n### ", "\n\n", "\n", " ", ""]
    )
    
    print("Splitting documents into chunks...")
    chunks = text_splitter.split_documents(documents)
    print(f"Created {len(chunks)} chunks.")
    
    # Create embeddings
    print("Creating vector embeddings...")
    embeddings = OpenAIEmbeddings()
    
    # Delete existing vector store if it exists
    if os.path.exists(DB_NAME):
        print("Removing existing vector store...")
        Chroma(persist_directory=DB_NAME, embedding_function=embeddings).delete_collection()
    
    # Create vector store
    print("Building new vector store...")
    vectorstore = Chroma.from_documents(
        documents=chunks, 
        embedding=embeddings, 
        persist_directory=DB_NAME
    )
    
    print(f"Vector store created with {vectorstore._collection.count()} chunks.")
    return vectorstore

def setup_rag_chain():
    """Set up the RAG chain with the vector store"""
    # Load or create vector store
    if not os.path.exists(DB_NAME):
        print("Building vector store...")
        vectorstore = build_vectorstore()
    else:
        print("Loading existing vector store...")
        embeddings = OpenAIEmbeddings()
        vectorstore = Chroma(persist_directory=DB_NAME, embedding_function=embeddings)
    
    # Create LLM
    llm = ChatOpenAI(temperature=0.7, model_name=MODEL)
    
    # Create memory with explicit output_key to fix the error
    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
        output_key='answer'  # This tells the memory which key to store from the chain output
    )
    
    # Create retriever with search parameters
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5}  # Retrieve more documents for better context
    )
    
    # Create custom prompt template for better RAG responses
    custom_template = """You are a helpful assistant that can answer questions about emails.Do not answer anthing other then related to my emails
    
    Chat History:
    {chat_history}
    
    Context from relevant emails:
    {context}
    
    Human: {question}
    AI Assistant:"""
    
    QA_PROMPT = PromptTemplate(
        input_variables=["chat_history", "context", "question"],
        template=custom_template,
    )
    
    # Create ConversationalRetrievalChain with custom prompt and properly configured
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory,
        combine_docs_chain_kwargs={"prompt": QA_PROMPT},
        return_source_documents=True,
        verbose=True
    )
    
    return conversation_chain

def chat(question, history):
    """Process a chat question with the RAG system"""
    # Ensure the RAG chain is set up
    global conversation_chain
    if 'conversation_chain' not in globals():
        conversation_chain = setup_rag_chain()
    
    # Process the question
    try:
        result = conversation_chain.invoke({"question": question})
        answer = result.get("answer", "No answer found")
        
        # Add source information if available
        source_docs = result.get("source_documents", [])
        if source_docs:
            email_sources = []
            for i, doc in enumerate(source_docs[:3]):  # Limit to top 3 sources
                if "source" in doc.metadata:
                    email_file = os.path.basename(doc.metadata["source"])
                    # Extract email subject from filename
                    match = re.search(r'\d+_(.+)\.md', email_file)
                    if match:
                        subject = match.group(1).replace('_', ' ')
                        email_sources.append(subject)
            
            if email_sources:
                answer += "\n\n*Sources: Emails about " + ", ".join(email_sources) + "*"
        
        return answer
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error in chat: {error_details}")
        return f"I encountered an error: {str(e)}. Please try rephrasing your question."

# Main execution
if __name__ == "__main__":
    # Check if we need to fetch emails
    if not os.path.exists("knowledge-base/emails") or len(glob.glob("knowledge-base/emails/*.md")) < 5:
        print("Fetching emails...")
        fetch_and_save_emails()
    
    # Set up the RAG system
    conversation_chain = setup_rag_chain()
    print("Starting chat interface...")
    demo = gr.ChatInterface(
    
    # Create Gradio interface
        chat,
        title="Email Assistant",
        description="Ask questions about your emails",
        theme="hard",
        examples=[
            "What emails do I have about travel?",
            "Do I have any security alerts?",
            "What did the last email from Google say?",
            "Summarize my recent emails"
        ]
    )
    
    # Launch the interface
    demo.launch(debug=True)

Loading existing vector store...


  memory = ConversationBufferMemory(


Starting chat interface...



Sorry, we can't find the page you are looking for.
  self.chatbot = Chatbot(


* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a helpful assistant that can answer questions about emails.Do not answer anthing other then related to my emails

    Chat History:
    

    Context from relevant emails:
    z3Qe2m%252FHYkNYDAsadbAlUO6Jex8d2LBFXsWbG6XsTweUcFI3%252FNFIY79KVtw0VUd%252FE%26utm_campaign%3Dhfdigestpins%26e_t%3D49ecc2ff801c47d29f3a1aebc4edc6c7%26e_t_s%3Dfooter%26utm_source%3D31%26utm_medium%3D2004

Like a master navigator, you’ve traversed word by word, never wavering, displaying a level of focus and determination that would humble even the most steadfast of scholars. We are truly honored to have such an intrepid reader. Bravo to you, the indefatigable champion of curiosity! Student holding smartphone in classroom with labeled objects in English and Spanish like chalkboard, desk, and projector screen.

Co1tLKRkcXYo2aZtMyMSo7TL5MtNJhSJumcXh8i8ygiheEMO3G7QXzhtGqH3LiLs

In [None]:
print("HI")