# <b>Email Data ETL</b>

In [1]:
import imaplib
import email
from email.header import decode_header
import webbrowser
import os
from bs4 import BeautifulSoup
import sqlite3

## <b>Extract Data</b>


In [None]:
# Set email credentials and server details.
# The placeholder literals remain the defaults; environment variables override
# them so real credentials do not have to be written into the notebook.
username = os.environ.get('EMAIL_USERNAME', 'your_email@example.com')
password = os.environ.get('EMAIL_PASSWORD', 'your_password')
imap_server = os.environ.get('IMAP_SERVER', 'imap.example.com')


def _decode_mime_header(raw_value):
    """Return a header value as text, decoding every RFC 2047 fragment.

    A missing header (``None``) yields an empty string. Each fragment is
    decoded with the charset reported by ``decode_header``; unknown charsets
    fall back to UTF-8, and undecodable bytes are replaced instead of raising.
    """
    if raw_value is None:
        return ""
    fragments = []
    for chunk, charset in decode_header(raw_value):
        if isinstance(chunk, bytes):
            try:
                chunk = chunk.decode(charset or "utf-8", errors="replace")
            except LookupError:
                # The header named a codec Python does not know.
                chunk = chunk.decode("utf-8", errors="replace")
        fragments.append(chunk)
    return "".join(fragments)


def _iter_text_bodies(message):
    """Yield the decoded text of every non-attachment ``text/*`` part.

    ``walk()`` also yields the message itself, so single-part messages are
    handled the same way as multipart ones.
    """
    for part in message.walk():
        if part.get_content_maintype() != "text":
            continue
        if part.get_content_disposition() == "attachment":
            continue
        payload = part.get_payload(decode=True)
        if payload is None:
            continue
        charset = part.get_content_charset() or "utf-8"
        try:
            yield payload.decode(charset, errors="replace")
        except LookupError:
            # The part declared a codec Python does not know.
            yield payload.decode("utf-8", errors="replace")


# Connect to the server and go to its inbox.
# The connection is left open on purpose: later cells reuse `mail`.
mail = imaplib.IMAP4_SSL(imap_server)
mail.login(username, password)
mail.select("inbox")

# Search for all emails in the inbox
status, messages = mail.search(None, "ALL")

# Convert messages to a list of email IDs (empty when the search returned
# nothing usable, instead of failing on a missing first element).
if status == "OK" and messages and messages[0]:
    email_ids = messages[0].split()
else:
    email_ids = []

for email_id in email_ids:
    # Fetch the email by ID
    status, msg_data = mail.fetch(email_id, "(RFC822)")
    if status != "OK":
        continue

    # Only the tuple items of the response carry the raw message bytes.
    for response_part in msg_data:
        if not isinstance(response_part, tuple):
            continue
        msg = email.message_from_bytes(response_part[1])
        subject = _decode_mime_header(msg["Subject"])
        from_ = msg.get("From")
        print("Subject:", subject)
        print("From:", from_)

        for body in _iter_text_bodies(msg):
            print("Body:", body)

## <b>Transform Data</b>

In [None]:
def clean_email_body(body):
    """Return the text content of *body* with its HTML markup removed.

    The body is parsed with BeautifulSoup's ``"html.parser"`` backend and the
    text of the resulting tree is returned.
    """
    parsed = BeautifulSoup(body, features="html.parser")
    text_only = parsed.get_text()
    return text_only

def _decode_mime_header(raw_value):
    """Return a header value as text, decoding every RFC 2047 fragment.

    A missing header (``None``) yields an empty string. Each fragment is
    decoded with the charset reported by ``decode_header``; unknown charsets
    fall back to UTF-8, and undecodable bytes are replaced instead of raising.
    """
    if raw_value is None:
        return ""
    fragments = []
    for chunk, charset in decode_header(raw_value):
        if isinstance(chunk, bytes):
            try:
                chunk = chunk.decode(charset or "utf-8", errors="replace")
            except LookupError:
                # The header named a codec Python does not know.
                chunk = chunk.decode("utf-8", errors="replace")
        fragments.append(chunk)
    return "".join(fragments)


def _iter_text_bodies(message):
    """Yield the decoded text of every non-attachment ``text/*`` part.

    ``walk()`` also yields the message itself, so single-part messages are
    handled the same way as multipart ones.
    """
    for part in message.walk():
        if part.get_content_maintype() != "text":
            continue
        if part.get_content_disposition() == "attachment":
            continue
        payload = part.get_payload(decode=True)
        if payload is None:
            continue
        charset = part.get_content_charset() or "utf-8"
        try:
            yield payload.decode(charset, errors="replace")
        except LookupError:
            # The part declared a codec Python does not know.
            yield payload.decode("utf-8", errors="replace")


# Re-fetch every message over the open `mail` connection and print the
# cleaned (markup-free) text of each body part.
for email_id in email_ids:
    status, msg_data = mail.fetch(email_id, "(RFC822)")
    if status != "OK":
        continue
    # Only the tuple items of the response carry the raw message bytes.
    for response_part in msg_data:
        if not isinstance(response_part, tuple):
            continue
        msg = email.message_from_bytes(response_part[1])
        subject = _decode_mime_header(msg["Subject"])
        from_ = msg.get("From")

        for body in _iter_text_bodies(msg):
            try:
                clean_body = clean_email_body(body)
            except Exception as exc:
                # Keep the run best-effort, but report the failure instead of
                # silently discarding the part.
                print("Skipping body part of message", email_id, "-", exc)
                continue
            print("Clean Body:", clean_body)

## <b>Load Data</b>

In [None]:
# Open emails.db with SQLite; the file is created on first use.
conn = sqlite3.connect('emails.db')
cursor = conn.cursor()

# Schema of the table that receives the email data. IF NOT EXISTS keeps the
# statement safe to run again against an existing database.
CREATE_EMAILS_TABLE_SQL = '''
CREATE TABLE IF NOT EXISTS emails (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    subject TEXT,
    sender TEXT,
    body TEXT
)
'''
cursor.execute(CREATE_EMAILS_TABLE_SQL)

# Insert email data into the table
def insert_email(subject, sender, body):
    """Store one email as a new row of the ``emails`` table and commit it.

    Args:
        subject: Value for the ``subject`` column.
        sender: Value for the ``sender`` column.
        body: Value for the ``body`` column.

    The values are passed as bound parameters through the module-level
    ``cursor``, and the insert is committed on ``conn`` before returning.
    """
    row = (subject, sender, body)
    cursor.execute('''
    INSERT INTO emails (subject, sender, body) VALUES (?, ?, ?)
    ''', row)
    conn.commit()

def _decode_mime_header(raw_value):
    """Return a header value as text, decoding every RFC 2047 fragment.

    A missing header (``None``) yields an empty string. Each fragment is
    decoded with the charset reported by ``decode_header``; unknown charsets
    fall back to UTF-8, and undecodable bytes are replaced instead of raising.
    """
    if raw_value is None:
        return ""
    fragments = []
    for chunk, charset in decode_header(raw_value):
        if isinstance(chunk, bytes):
            try:
                chunk = chunk.decode(charset or "utf-8", errors="replace")
            except LookupError:
                # The header named a codec Python does not know.
                chunk = chunk.decode("utf-8", errors="replace")
        fragments.append(chunk)
    return "".join(fragments)


def _iter_text_bodies(message):
    """Yield the decoded text of every non-attachment ``text/*`` part.

    ``walk()`` also yields the message itself, so single-part messages are
    handled the same way as multipart ones.
    """
    for part in message.walk():
        if part.get_content_maintype() != "text":
            continue
        if part.get_content_disposition() == "attachment":
            continue
        payload = part.get_payload(decode=True)
        if payload is None:
            continue
        charset = part.get_content_charset() or "utf-8"
        try:
            yield payload.decode(charset, errors="replace")
        except LookupError:
            # The part declared a codec Python does not know.
            yield payload.decode("utf-8", errors="replace")


# Re-fetch every message, clean each text body and store it as one row.
# The try/finally guarantees that the database and IMAP connections are
# released even when fetching or parsing fails part-way through.
try:
    for email_id in email_ids:
        status, msg_data = mail.fetch(email_id, "(RFC822)")
        if status != "OK":
            continue
        # Only the tuple items of the response carry the raw message bytes.
        for response_part in msg_data:
            if not isinstance(response_part, tuple):
                continue
            msg = email.message_from_bytes(response_part[1])
            subject = _decode_mime_header(msg["Subject"])
            from_ = msg.get("From")

            for body in _iter_text_bodies(msg):
                try:
                    clean_body = clean_email_body(body)
                    insert_email(subject, from_, clean_body)
                except Exception as exc:
                    # Keep loading the remaining parts, but report the
                    # failure instead of silently losing the row.
                    print("Failed to store part of message", email_id, "-", exc)
finally:
    # Close the connection
    conn.close()
    mail.logout()