# <b>Email Data ETL</b>

In [1]:
import imaplib
import email
from email.header import decode_header
import webbrowser
import os
from bs4 import BeautifulSoup
import sqlite3

## <b>Extract Data</b>


In [None]:
# Setting email credentials and server details.
# NOTE: these are placeholders -- never commit real credentials in the notebook.
username = 'your_email@example.com'
password = 'your_password'
imap_server = 'imap.example.com'

# Connecting to the server and go to its inbox
mail = imaplib.IMAP4_SSL(imap_server)
mail.login(username, password)
mail.select("inbox")

# Searching for all emails in the inbox
status, messages = mail.search(None, "ALL")

# Converting messages to a list of email IDs
email_ids = messages[0].split()

for email_id in email_ids:
    # Fetching the email by ID
    status, msg_data = mail.fetch(email_id, "(RFC822)")

    for response_part in msg_data:
        # Only tuple entries of the fetch response are parsed as message bytes.
        if not isinstance(response_part, tuple):
            continue

        msg = email.message_from_bytes(response_part[1])

        # A Subject header may be missing, and may consist of several
        # fragments, each with its own charset; decode and join all of them.
        subject_fragments = []
        for fragment, fragment_charset in decode_header(msg.get("Subject", "")):
            if isinstance(fragment, bytes):
                try:
                    fragment = fragment.decode(fragment_charset or "utf-8", errors="replace")
                except LookupError:
                    # Charset name unknown to Python: fall back to UTF-8.
                    fragment = fragment.decode("utf-8", errors="replace")
            subject_fragments.append(fragment)
        subject = "".join(subject_fragments)

        from_ = msg.get("From")
        print("Subject:", subject)
        print("From:", from_)

        # walk() also yields the message itself, so single-part emails are
        # handled by the same loop as multipart ones.
        for part in msg.walk():
            if part.is_multipart():
                continue  # container part: it has no decodable payload of its own

            content_disposition = str(part.get("Content-Disposition")).lower()
            if part.get_content_maintype() != "text" or "attachment" in content_disposition:
                continue  # only inline text parts are treated as the body

            payload = part.get_payload(decode=True)
            if payload is None:
                continue

            # Decode with the charset declared by the part, defaulting to UTF-8.
            body_charset = part.get_content_charset() or "utf-8"
            try:
                body = payload.decode(body_charset, errors="replace")
            except LookupError:
                body = payload.decode("utf-8", errors="replace")
            print("Body:", body)

## <b>Transform Data</b>

In [None]:
def clean_email_body(body):
    """Return the text content of *body* with HTML markup removed.

    The body is parsed with BeautifulSoup using the ``html.parser`` backend
    and the text extracted from the parsed document is returned.
    """
    return BeautifulSoup(body, "html.parser").get_text()

# Iterating through email IDs to fetch and process each email
for email_id in email_ids:
    status, msg_data = mail.fetch(email_id, "(RFC822)")  # Fetching email message data
    for response_part in msg_data:
        # Only tuple entries of the fetch response are parsed as message bytes.
        if not isinstance(response_part, tuple):
            continue

        # Only the `email` module itself is imported, so the parser must be
        # reached through it (a bare message_from_bytes raises NameError).
        msg = email.message_from_bytes(response_part[1])

        # A Subject header may be missing, and may consist of several
        # fragments, each with its own charset; decode and join all of them.
        subject_fragments = []
        for fragment, fragment_charset in decode_header(msg.get("Subject", "")):
            if isinstance(fragment, bytes):
                try:
                    fragment = fragment.decode(fragment_charset or "utf-8", errors="replace")
                except LookupError:
                    # Charset name unknown to Python: fall back to UTF-8.
                    fragment = fragment.decode("utf-8", errors="replace")
            subject_fragments.append(fragment)
        subject = "".join(subject_fragments)

        from_ = msg.get("From")  # Getting sender information

        # walk() also yields the message itself, so single-part emails are
        # handled by the same loop as multipart ones.
        for part in msg.walk():
            if part.is_multipart():
                continue  # container part: it has no decodable payload of its own

            content_disposition = str(part.get("Content-Disposition")).lower()
            if part.get_content_maintype() != "text" or "attachment" in content_disposition:
                continue  # only inline text parts are treated as the body

            payload = part.get_payload(decode=True)
            if payload is None:
                continue

            # Decode with the charset declared by the part, defaulting to UTF-8.
            body_charset = part.get_content_charset() or "utf-8"
            try:
                body = payload.decode(body_charset, errors="replace")
            except LookupError:
                body = payload.decode("utf-8", errors="replace")

            try:
                clean_body = clean_email_body(body)  # Cleaning HTML content from email body
                print("Clean Body:", clean_body)  # Printing cleaned body text
            except Exception as e:
                # Best effort: report the failing part and keep processing the rest.
                print(f"Error processing email part: {str(e)}")

## <b>Load Data</b>

In [None]:
# Open the SQLite database file 'emails.db' and keep one cursor that the
# rest of the notebook uses for its statements.
conn = sqlite3.connect('emails.db')
cursor = conn.cursor()

# Schema for the stored emails: an auto-incrementing integer id plus text
# columns for subject, sender and body. IF NOT EXISTS makes re-running safe.
create_emails_table = '''
CREATE TABLE IF NOT EXISTS emails (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    subject TEXT,
    sender TEXT,
    body TEXT
)
'''
cursor.execute(create_emails_table)

# Inserting email data into the table
def insert_email(subject, sender, body):
    """Insert one row into the ``emails`` table and commit it.

    Args:
        subject: Value stored in the ``subject`` column.
        sender: Value stored in the ``sender`` column.
        body: Value stored in the ``body`` column.

    Uses the module-level ``cursor`` and ``conn``; values are bound through
    ``?`` placeholders rather than being formatted into the SQL text.
    """
    row = (subject, sender, body)
    insert_sql = '''
    INSERT INTO emails (subject, sender, body) VALUES (?, ?, ?)
    '''
    cursor.execute(insert_sql, row)
    conn.commit()

# Fetch every email again, clean each inline text part and store it as one
# row. The try/finally guarantees both connections are released even when
# fetching or parsing fails part-way through.
try:
    for email_id in email_ids:
        status, msg_data = mail.fetch(email_id, "(RFC822)")
        for response_part in msg_data:
            # Only tuple entries of the fetch response are parsed as message bytes.
            if not isinstance(response_part, tuple):
                continue

            msg = email.message_from_bytes(response_part[1])

            # A Subject header may be missing, and may consist of several
            # fragments, each with its own charset; decode and join all of them.
            subject_fragments = []
            for fragment, fragment_charset in decode_header(msg.get("Subject", "")):
                if isinstance(fragment, bytes):
                    try:
                        fragment = fragment.decode(fragment_charset or "utf-8", errors="replace")
                    except LookupError:
                        # Charset name unknown to Python: fall back to UTF-8.
                        fragment = fragment.decode("utf-8", errors="replace")
                subject_fragments.append(fragment)
            subject = "".join(subject_fragments)

            from_ = msg.get("From")

            # walk() also yields the message itself, so single-part emails
            # are stored too, not only multipart ones.
            for part in msg.walk():
                if part.is_multipart():
                    continue  # container part: it has no decodable payload of its own

                content_disposition = str(part.get("Content-Disposition")).lower()
                if part.get_content_maintype() != "text" or "attachment" in content_disposition:
                    continue  # only inline text parts are treated as the body

                payload = part.get_payload(decode=True)
                if payload is None:
                    continue

                # Decode with the charset declared by the part, defaulting to UTF-8.
                body_charset = part.get_content_charset() or "utf-8"
                try:
                    body = payload.decode(body_charset, errors="replace")
                except LookupError:
                    body = payload.decode("utf-8", errors="replace")

                try:
                    clean_body = clean_email_body(body)
                    insert_email(subject, from_, clean_body)
                except Exception as e:
                    # Best effort: report the part that could not be stored
                    # instead of dropping it silently, then carry on.
                    print(f"Error storing email part: {e}")
finally:
    # Closing the connection
    conn.close()
    mail.logout()