In [4]:
import chromadb
# Persistent Chroma client; "vectordb" is a relative path, so it resolves against
# the notebook's current working directory.
chroma_client = chromadb.PersistentClient(path="vectordb")

In [70]:
# get_or_create_collection keeps this cell re-runnable: the client is persistent,
# so "email_data" survives kernel restarts and a plain create_collection call
# fails once the collection already exists.
collection = chroma_client.get_or_create_collection(name="email_data")

In [71]:
import re

def clean_text(text):
    """Return a normalised copy of ``text`` for indexing.

    Steps, in order:
      1. drop URL-like tokens (anything starting with ``http``/``https``/``www``
         up to the next whitespace) -- done before lower-casing, so the match
         is case-sensitive;
      2. lower-case the remainder;
      3. delete every character that is neither a word character nor
         whitespace, then any non-breaking space, carriage return and
         zero-width non-joiner that is left.

    Characters are deleted, not replaced by a space.
    """
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    cleaned = without_urls.lower()

    # Applied one after another, each pattern simply removes its matches.
    for pattern in (r'[^\w\s]', r'\xa0', r'\r', r'\u200c'):
        cleaned = re.sub(pattern, '', cleaned)

    return cleaned

In [72]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_text(text, chunk_size=700, chunk_overlap=20):
    """Split ``text`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: ``chunk_size`` handed to the splitter (default 700, the
            value this notebook has always used).
        chunk_overlap: ``chunk_overlap`` handed to the splitter (default 20).

    Returns:
        The result of the splitter's ``split_text(text)`` call.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

In [73]:
from server import service

def fetch_emails(service, user_id='me', max_results=100):
    """Download up to ``max_results`` Gmail messages for ``user_id``.

    The message list is read page by page (following ``nextPageToken``), so a
    ``max_results`` larger than a single result page is honoured instead of
    being silently truncated to the first page.

    Args:
        service: Gmail API client exposing ``users().messages().list(...)`` and
            ``users().messages().get(...)``, each returning an object with
            ``execute()``.
        user_id: Mailbox to read; ``'me'`` by default.
        max_results: Upper bound on the number of messages returned.

    Returns:
        A list of message resources as returned by ``messages().get``. If a
        request fails, the error is printed and the messages fetched so far
        are returned (an empty list when nothing had been fetched yet).
    """
    # Largest page the request asks for; the Gmail API documents 500 as the
    # maximum accepted ``maxResults`` value.
    max_page_size = 500

    email_data = []
    try:
        page_token = None
        while len(email_data) < max_results:
            remaining = max_results - len(email_data)
            list_kwargs = {
                'userId': user_id,
                'maxResults': min(remaining, max_page_size),
            }
            # Only send pageToken when continuing, so the first request is
            # identical to a plain, unpaginated list call.
            if page_token:
                list_kwargs['pageToken'] = page_token

            results = service.users().messages().list(**list_kwargs).execute()

            # Slice defensively in case a page holds more than was asked for.
            for message in results.get('messages', [])[:remaining]:
                msg = service.users().messages().get(userId=user_id, id=message['id']).execute()
                email_data.append(msg)

            page_token = results.get('nextPageToken')
            if not page_token:
                break  # no further pages
    except Exception as e:
        # Best effort: report the failure but keep what was already downloaded.
        print(f'An error occurred: {e}')
    return email_data

In [74]:
import base64

def _header_value(headers, name, default=''):
    """Return the value of the first header named ``name`` (case-insensitive), else ``default``."""
    wanted = name.lower()
    return next(
        (
            header.get('value', default)
            for header in headers
            if header.get('name', '').lower() == wanted
        ),
        default,
    )


def add_emails_to_collection(collection, emails):
    """Index each message's snippet in ``collection``.

    For every message the snippet is cleaned with ``clean_text``, split with
    ``split_text`` and stored with one record per chunk:

    * id: ``"<message id>_<chunk index>"``
    * document: the chunk text
    * metadata: cleaned ``subject``, cleaned ``sender`` and ``chunk_index``

    Args:
        collection: Object exposing ``add(ids=..., metadatas=..., documents=...)``.
        emails: Iterable of message dicts with an ``'id'`` key and, optionally,
            ``'snippet'`` and ``'payload'['headers']`` (a list of
            ``{'name': ..., 'value': ...}`` dicts).

    Messages without a Subject or From header are indexed with an empty string
    for that field instead of aborting the whole run; messages whose cleaned
    snippet yields no chunks are skipped.
    """
    for email in emails:
        payload = email.get('payload') or {}
        headers = payload.get('headers') or []

        # Header names are compared case-insensitively and a missing header
        # falls back to '' (a bare next() would raise StopIteration here).
        clean_subject = clean_text(_header_value(headers, 'Subject'))
        clean_sender = clean_text(_header_value(headers, 'From'))
        mssg_id = email['id']

        # Only the text snippet is indexed; attachments are deliberately not
        # decoded because nothing downstream consumes them.
        snippet_chunks = split_text(clean_text(email.get('snippet', '')))
        if not snippet_chunks:
            continue  # nothing to store for this message

        # One batched add per message instead of one call per chunk.
        chunk_indices = range(len(snippet_chunks))
        collection.add(
            ids=[f"{mssg_id}_{i}" for i in chunk_indices],
            metadatas=[
                {
                    'subject': clean_subject,
                    'sender': clean_sender,
                    'chunk_index': i
                }
                for i in chunk_indices
            ],
            documents=list(snippet_chunks)
        )

In [75]:
# Fetch messages using fetch_emails' defaults (user 'me', at most 100), print how
# many came back, then index them into the collection.
emails = fetch_emails(service)
print(len(emails))
add_emails_to_collection(collection, emails)

100


In [76]:
# Sanity check: ask the freshly filled collection for the five chunks that best
# match a sample query and show the matched documents.
results = collection.query(query_texts=["dropbox features"], n_results=5)
print(results["documents"])

[['what are the new dropbox features', 'hello dropbox has recently introduced some exciting new features to help you better manage your digital content here are a few key updates 1 automated folders create folders that automatically', 'new features quality enhancements and much more', 'get creative cloud all apps one plan endless possibilities bring any idea to life with the creative cloud all apps plan get photoshop illustrator adobe express and the latest generative ai', 'create quickly and easily with templates from adobe express kick off the holiday spirit with a spectacular party invite making holidaythemed party invites is easy with adobe express browse from']]


# **Collection Exists!**

The cells below reopen the persisted `email_data` collection by name instead of creating it again.

In [2]:
# Look up the already-persisted "email_data" collection by name rather than
# creating it; relies on chroma_client having been defined in this kernel.
old_collection = chroma_client.get_collection(name="email_data")

In [3]:
# Run the same sample query against the reopened collection and show the five
# matched documents.
results = old_collection.query(query_texts=["dropbox features"], n_results=5)
print(results["documents"])

[['what are the new dropbox features', 'hello dropbox has recently introduced some exciting new features to help you better manage your digital content here are a few key updates 1 automated folders create folders that automatically', 'new features quality enhancements and much more', 'get creative cloud all apps one plan endless possibilities bring any idea to life with the creative cloud all apps plan get photoshop illustrator adobe express and the latest generative ai', 'create quickly and easily with templates from adobe express kick off the holiday spirit with a spectacular party invite making holidaythemed party invites is easy with adobe express browse from']]
