In [1]:
import email.utils
import os
import re
import time
from collections import Counter

from dotenv import load_dotenv
from imapclient import IMAPClient
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [2]:
# Load environment variables from .env file
load_dotenv()

# iCloud IMAP server details
imap_host = 'imap.mail.me.com'
imap_port = 993

# User login details from environment variables
icloud_email = os.getenv('ICLOUD_USERNAME')
app_specific_password = os.getenv('ICLOUD_PASSWORD')

In [3]:
# Connect to iCloud IMAP server
imap_server = IMAPClient(imap_host, use_uid=True)
imap_server.login(icloud_email, app_specific_password)

b'user boothchin logged in'

In [4]:
# Select the mailbox you want to search
imap_server.select_folder('INBOX')
messages = imap_server.search(['ALL'])

batch_size = 2000
chunk_size = 10000
from_addresses = []
all_from_addresses = []
unexpected_header_ids = []  # List to collect message IDs with unexpected headers

total_messages = len(messages)
processed_messages = 0

email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')

with tqdm(total=total_messages, desc="Processing emails", unit="email") as pbar:
    for i in range(0, total_messages, batch_size):
        batch = messages[i:i + batch_size]
        
        # Retry logic for fetch command
        retries = 3
        while retries > 0:
            try:
                fetch_data = imap_server.fetch(batch, ['BODY[HEADER]'])
                break  # Exit the retry loop if successful
            except Exception as e:
                print(f"Error fetching batch {i}-{i + batch_size}: {e}")
                retries -= 1
                if retries > 0:
                    print("Retrying...")
                    time.sleep(5)  # Wait for 5 seconds before retrying
                else:
                    print("Skipping this batch after multiple failed attempts.")
                    fetch_data = {}  # Skip this batch by setting an empty result
        
        for msg_id, data in fetch_data.items():
            if b'BODY[HEADER]' in data:
                header_data = data[b'BODY[HEADER]'].decode('utf-8', errors='ignore')
                from_match = re.search(r'From:\s*(.*?)\r\n', header_data, re.IGNORECASE)
                if from_match:
                    from_header = from_match.group(1)
                    email_address = email.utils.parseaddr(from_header)[1].lower()
                    if email_pattern.match(email_address):
                        from_addresses.append(email_address)
                else:
                    # Collect message ID if 'From' header is not found
                    unexpected_header_ids.append(msg_id)
            else:
                # Collect message ID if BODY[HEADER] is not present
                unexpected_header_ids.append(msg_id)

            processed_messages += 1
            pbar.update(1)

        # Process in chunks and write intermediate results
        if len(from_addresses) >= chunk_size:
            email_counts = Counter(from_addresses)

            # Write intermediate results
            with open(f'../data/results_{i}.txt', 'w') as f:
                for email_address, count in email_counts.most_common():
                    f.write(f"{email_address}: {count}\n")

            # Clear from_addresses to free up memory
            all_from_addresses.append(from_addresses)
            from_addresses.clear()

        # Update progress bar description
        pbar.set_description(f"Processed {processed_messages}/{total_messages} emails")

# Process any remaining addresses
if from_addresses:
    all_from_addresses.append(from_addresses)
    email_counts = Counter(from_addresses)
    with open('../data/results_final.txt', 'w') as f:
        for email_address, count in email_counts.most_common():
            f.write(f"{email_address}: {count}\n")


# Print or log unexpected header IDs
#if unexpected_header_ids:
#    print("Message IDs with unexpected headers:", unexpected_header_ids)


Processing emails:   0%|          | 0/68964 [00:00<?, ?email/s]

Error fetching batch 44000-46000: fetch failed: [UNAVAILABLE] Internal server error (took 39604 ms)
Retrying...
Error fetching batch 44000-46000: fetch failed: [UNAVAILABLE] Internal server error (took 20469 ms)
Retrying...
Error fetching batch 44000-46000: fetch failed: [UNAVAILABLE] Internal server error (took 37266 ms)
Skipping this batch after multiple failed attempts.
Error fetching batch 60000-62000: fetch failed: [UNAVAILABLE] Internal server error (took 7472 ms)
Retrying...
Error fetching batch 60000-62000: fetch failed: [UNAVAILABLE] Internal server error (took 7114 ms)
Retrying...
Error fetching batch 60000-62000: fetch failed: [UNAVAILABLE] Internal server error (took 7802 ms)
Skipping this batch after multiple failed attempts.
Error fetching batch 62000-64000: fetch failed: [UNAVAILABLE] Internal server error (took 9254 ms)
Retrying...
Error fetching batch 62000-64000: fetch failed: [UNAVAILABLE] Internal server error (took 26348 ms)
Retrying...
Error fetching batch 62000-6

In [6]:
all_from_addresses

[[], []]

In [5]:
# Count occurrences of each email address
email_counts = Counter(from_addresses)

# Print results
for email_address, count in email_counts.most_common():
    print(f"{email_address}: {count}")

imap_server.logout()


b''