In [32]:
import mailbox
import os
from dotenv import load_dotenv
import gradio as gr
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.docstore.document import Document
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings


In [28]:
# Notebook configuration: mailbox location, chat model and vector-store directory.
# MODEL is set to OpenAI's low cost model.

mbox_loc = "PUT THE PATH TO THE .mbox FILES HERE"
MODEL = "gpt-4o-mini"
db_name = "vector_db"

# Load environment variables via python-dotenv, then make sure OPENAI_API_KEY
# is populated: an existing value is kept, otherwise the placeholder is used.
load_dotenv()
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [22]:
#function that extracts the emails data

def _header_text(raw_value):
    """Return a header value as a plain, readable ``str``.

    RFC 2047 encoded-words (``=?UTF-8?B?...?=``) are decoded, and non-``str``
    header objects are converted to ``str`` so the value is safe to use as
    simple string metadata. If decoding fails, the raw value is returned
    unchanged (as ``str``) rather than raising.
    """
    from email.errors import MessageError
    from email.header import decode_header, make_header

    if raw_value is None:
        return ""
    try:
        return str(make_header(decode_header(raw_value)))
    except (MessageError, LookupError, UnicodeError, ValueError):
        # Malformed encoded-word or unknown charset: keep the raw text.
        return str(raw_value)


def extract_emails(mbox_file):
    """Read every message from an mbox file into a list of dicts.

    Args:
        mbox_file: Path to the ``.mbox`` file.

    Returns:
        A list with one dict per message, each holding the keys
        ``"subject"``, ``"from"``, ``"to"``, ``"date"`` (decoded header
        text, ``""`` when the header is missing) and ``"body"`` (the result
        of ``get_email_body``).

    Raises:
        mailbox.NoSuchMailboxError: If ``mbox_file`` does not exist.
    """
    # create=False: the default (create=True) would silently create an empty
    # file for a wrong path and return no emails instead of failing loudly.
    mbox = mailbox.mbox(mbox_file, create=False)
    try:
        # Messages are read lazily, so build the dicts while the file is open.
        return [
            {
                "subject": _header_text(message.get("subject", "")),
                "from": _header_text(message.get("from", "")),
                "to": _header_text(message.get("to", "")),
                "date": _header_text(message.get("date", "")),
                "body": get_email_body(message),
            }
            for message in mbox
        ]
    finally:
        # Release the underlying file handle even if parsing fails.
        mbox.close()

In [23]:
#used during the extraction

def get_email_body(message):
    """Return the text body of an email message as a ``str``.

    For a multipart message the first ``text/plain`` part that has a
    decodable payload is used; for a single-part message the payload itself
    is used. Bytes are decoded with the charset declared on the part
    (falling back to UTF-8 when none is declared or the name is unknown),
    with undecodable bytes replaced.

    Args:
        message: An ``email.message.Message`` (or subclass) instance.

    Returns:
        The decoded body text, or ``""`` when no usable body is found.
    """
    if message.is_multipart():
        candidates = (
            part for part in message.walk()
            if part.get_content_type() == "text/plain"
        )
    else:
        candidates = (message,)

    for part in candidates:
        payload = part.get_payload(decode=True)
        if payload is None:
            # Nothing decodable in this part; try the next candidate.
            continue
        charset = part.get_content_charset() or "utf-8"
        try:
            return payload.decode(charset, errors="replace")
        except LookupError:
            # Declared charset is not a codec Python knows about.
            return payload.decode("utf-8", errors="replace")
    return ""

In [29]:
# Run the extraction: parse the mbox file at `mbox_loc` into a list of
# per-email dicts (subject / from / to / date / body).

sentmail=extract_emails(mbox_loc)

In [30]:
# Quick sanity check: print the second extracted email (index 1), so the
# mailbox must contain at least two messages.
# NOTE: the output is real message content - clear it before sharing the notebook.

print(sentmail[1])

{'subject': 'Re: Enjoy your ...', 'from': 'David Stitt <dave.stitt@gmail.com>', 'to': 'John OBrien <john3obrien@gmail.com>', 'date': 'Thu, 30 Mar 2023 22:03:14 -0400', 'body': 'Thank you sir - hope to see you all next time you are in town!\n\nOn Thu, Mar 30, 2023 at 2:44\u202fPM John OBrien <john3obrien@gmail.com> wrote:\n\n> .... day, Young Man!!!\n>\n> Hope you and yours are well.\n>\n> Miss you, Brother!\n>\n> Much Love,\n> Joy, John, Jan Braxton, Ian Joaquin, Ivan Xavier, John Stone, and\n> Josephine Simone\n>\n\n\n-- \n*Dave*\n*---------------------------------------------------------------------*\n*Fingerprint:*\n*16AE **24E3 **7B41 **69D7 **DA86 **6566 **823C **9475 **09F6 **4670*\n'}


In [51]:
# Wrap each extracted email in a LangChain Document: the body becomes the
# searchable text, the header fields travel along as metadata.

documents = [
    Document(
        page_content=email["body"],
        metadata={field: email[field] for field in ("subject", "from", "to", "date")},
    )
    for email in sentmail
]

In [None]:
# Optional: split long emails into overlapping chunks. Only worth doing when
# your emails are really long; otherwise this cell can be commented out.
# NOTE(review): the vector store cell builds from `documents`, not `chunks`,
# so as written the chunks are not what gets indexed.

text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

In [None]:
# Quick check that the chunks look the way you want: print the text and
# metadata of the first five. Comment this out if you are not using chunking.

for chunk in chunks[:5]:  # Print the first 5 chunks
    print(chunk.page_content)
    print("Metadata:", chunk.metadata)
    print("=" * 50)

In [54]:
# Embed the email Documents and store them in a Chroma vector store persisted
# under `db_name`. Note that whole emails (`documents`) are stored here, not
# the optional `chunks` produced by the text splitter.
# Chroma is a popular open source Vector Database based on SQLite

embeddings = OpenAIEmbeddings()

# If you would rather use the free Vector Embeddings from HuggingFace sentence-transformers
# Then uncomment this line instead
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# If the persistence directory already exists, drop its collection first
# (presumably so a re-run starts from an empty collection).

if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create vectorstore from the Documents and report how many entries it holds

vectorstore = Chroma.from_documents(documents=documents, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 1747 documents


In [55]:
# Let's investigate the vectors: how many are stored and how long each one is.
# `_collection` is a private attribute of the vector store wrapper.

collection = vectorstore._collection
count = collection.count()

# One record is enough to measure the embedding length.
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

There are 1,747 vectors with 1,536 dimensions in the vector store


In [56]:
# Pull every embedding, stored text and metadata record back out of the collection.
# The texts go into `doc_texts` rather than `documents`, so the list of Document
# objects built earlier is not overwritten by plain strings (which would break a
# re-run of the vector store cell).
stored = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(stored['embeddings'])
doc_texts = stored['documents']
metadatas = stored['metadatas']

# TODO: decide whether to categorize the emails (e.g. to colour a plot) and how.
# The metadata currently holds only subject / from / to / date, so a category
# field would have to be added when the Documents are built.

In [57]:
# Conversation memory: keeps the running chat history under the 'chat_history' key
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

# Chat model; MODEL is set in the configuration cell
llm = ChatOpenAI(model_name=MODEL, temperature=0.7)

# The retriever is an abstraction over the VectorStore used during RAG;
# k is how many stored documents are fetched for each question
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})

# Putting it together: one conversational chain over the LLM, the retriever and the memory
conversation_chain = ConversationalRetrievalChain.from_llm(
    memory=memory,
    retriever=retriever,
    llm=llm,
)

In [None]:
# Smoke test: send one question through the chain and print the answer.
# Replace the placeholder text with a real question about your emails.
# This goes through the same `conversation_chain` that the chat function uses.

query = (
    "Ask a specific question about one of your emails. "
    "Example: how many emails are there from person@person.xom?"
)
result = conversation_chain.invoke({"question": query})
print(result["answer"])

In [59]:
# Chat callback for the UI. The conversation history used by the model is the
# chain's own memory, not the `history` argument.

def chat(question, history):
    """Answer one user message through the shared conversational chain.

    Args:
        question: The user's latest message.
        history: The conversation so far, as supplied by the chat UI. It is
            only used to detect the start of a conversation.

    Returns:
        The chain's answer text.
    """
    # An empty history means a new or cleared conversation in the UI. Reset the
    # chain's memory so earlier turns do not leak into this conversation.
    if not history:
        chain_memory = getattr(conversation_chain, "memory", None)
        if chain_memory is not None:
            chain_memory.clear()
    result = conversation_chain.invoke({"question": question})
    return result["answer"]

In [60]:
# Serve the chat function through a Gradio chat interface and open it in the browser.
# NOTE: `view` is bound to whatever `.launch()` returns, not to the ChatInterface object.

view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7868

To create a public link, set `share=True` in `launch()`.
