In [21]:
import re
import tldextract

# Domains of free/consumer mailbox providers. An address at one of these
# domains does not identify the sender's company, so the website lookup
# checks membership here before deriving a URL from the e-mail domain.
# Entries must be lowercase: lookups are done on lowercased domains.
FREE_EMAIL_PROVIDERS = {
    "gmail.com", "yahoo.com", "outlook.com", "hotmail.com",
    "aol.com", "icloud.com", "protonmail.com", "zoho.com",
    "gmx.com", "yandex.com", "mail.com", "googlemail.com",
}

def extract_email(raw_email: str) -> str:
    """
    Extracts the email address from formats like:
    - orderadmin@asml.com
    - orderadmin <orderadmin@asml.com>
    """
    match = re.search(r'[\w\.-]+@[\w\.-]+', raw_email)
    if match:
        return match.group(0).lower()
    return ""

def extract_company_website(raw_email: str, email_body: str = "") -> str | None:
    """
    Guesses the sender's company website from the sender address and,
    for free-mail senders, from the message body.

    Strategy:
    1. If the address domain is not a free email provider, return
       ``https://www.<registered domain>`` as computed by ``tldextract``.
    2. Otherwise look in ``email_body`` for the first ``http(s)://`` URL,
       then for the first bare domain that is not a free email provider.

    Args:
        raw_email: Raw sender text; the address is pulled out with
            ``extract_email``.
        email_body: Plain-text body to search when the sender uses a free
            provider. Falsy values (``""``/``None``) skip the body search.

    Returns:
        A URL string, or ``None`` when no address is found, the address
        domain has no public suffix, or the body yields no candidate.
    """
    email = extract_email(raw_email)
    if not email:
        return None

    domain = email.split("@")[-1].strip()

    # Case 1: Company email
    if domain not in FREE_EMAIL_PROVIDERS:
        extracted = tldextract.extract(domain)
        # Hosts without a registrable domain (e.g. "localhost") would
        # otherwise produce a malformed URL such as "https://www.localhost.".
        if not (extracted.domain and extracted.suffix):
            return None
        registered = f"{extracted.domain}.{extracted.suffix}"
        # A subdomain of a free provider is still a free mailbox, so only
        # return here when the registered domain is not in the list.
        if registered not in FREE_EMAIL_PROVIDERS:
            return "https://www." + registered

    # Case 2: Free provider, search in email body
    if email_body:
        # Stop at whitespace, angle brackets and quotes so that
        # "<https://x.com>" or "'https://x.com'" do not leak delimiters.
        url_candidates = re.findall(
            r'https?://[^\s<>"\']+', email_body, flags=re.IGNORECASE
        )
        for candidate in url_candidates:
            # Drop sentence punctuation / closing brackets glued to the URL.
            cleaned = candidate.rstrip('.,;:!?)]}')
            # Skip matches that are nothing but the scheme after cleaning.
            if cleaned.partition("://")[2]:
                return cleaned

        # search for bare domains like "company.com"
        # - IGNORECASE: results are lowercased anyway, so "Acme.com" must match.
        # - The "@" lookarounds keep pieces of e-mail addresses
        #   ("jane.smith" in "jane.smith@gmail.com") from being taken as domains.
        domains = re.findall(
            r'(?<!@)\b(?:[a-z0-9-]+\.)+[a-z]{2,}\b(?!@)',
            email_body,
            flags=re.IGNORECASE,
        )
        for d in domains:
            if d.lower() not in FREE_EMAIL_PROVIDERS:
                return "https://" + d.lower()

    return None

# ---- Examples ----
# Each entry is (raw sender string, message body). The first two use a
# company domain; the third uses a free provider, so the body is searched.
emails = [
    ("orderadmin@asml.com", ""),
    ("OrderAdmin <orderadmin@asml.com>", ""),
    ("jane.smith@gmail.com", "Please visit https://openai.com for details."),
]

for sender, text in emails:
    website = extract_company_website(sender, text)
    print(sender, "→", website)


orderadmin@asml.com → https://www.asml.com
OrderAdmin <orderadmin@asml.com> → https://www.asml.com
jane.smith@gmail.com → https://openai.com


In [None]:
import os
import json
import csv
from email import policy
from email.parser import BytesParser

def get_body(msg):
    """Extracts the text body from an email message object.

    For a multipart message, the first ``text/plain`` part that is not
    marked as an attachment is returned. For a single-part message, the
    content is returned only when its main type is ``text``.

    Args:
        msg: A message object that supports ``get_content()``, such as one
            parsed with ``policy.default``.

    Returns:
        The body text as ``str``, or ``None`` when the message has no
        usable text body.
    """
    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_type() != "text/plain":
                continue
            # A text/plain *attachment* (e.g. notes.txt) is not the body.
            if part.get_content_disposition() == "attachment":
                continue
            return part.get_content()
        # No inline text/plain part was found.
        return None

    # get_content() returns bytes for non-text content; callers run str
    # regexes over the body, so only hand back textual payloads.
    if msg.get_content_maintype() != "text":
        return None
    return msg.get_content()
    
def eml_to_json(path_to_eml: str) -> dict:
    """Parses an EML file into a dictionary of its main fields.

    Args:
        path_to_eml: Path of the ``.eml`` file to read (opened in binary mode).

    Returns:
        A dict with the keys ``"from"``, ``"to"``, ``"subject"``, ``"date"``
        (header values as returned by the parsed message; ``None`` when a
        header is absent) and ``"body"`` (result of ``get_body``).
    """
    parser = BytesParser(policy=policy.default)
    with open(path_to_eml, 'rb') as eml_file:
        message = parser.parse(eml_file)

    # Copy the headers first, then append the body, so the key order is
    # from, to, subject, date, body.
    header_names = ("from", "to", "subject", "date")
    record = {name: message[name] for name in header_names}
    record["body"] = get_body(message)
    return record

if __name__ == "__main__":
    # NOTE: extract_company_website is defined in an earlier cell of this
    # notebook. That cell must have been run in the current kernel,
    # otherwise the call below raises NameError.
    eml_folder = "../src/tests/MailsRubitherm"
    from_mails = []
    urls = []

    # Pass 1: parse every .eml file in the folder into (sender, body) pairs.
    for filename in os.listdir(eml_folder):
        if not filename.endswith(".eml"):
            continue
        filepath = os.path.join(eml_folder, filename)
        mail_json = eml_to_json(filepath)
        from_mails.append((mail_json['from'], mail_json['body']))

    # Pass 2: keep only the senders for which a website could be derived.
    for mail, body in from_mails:
        url = extract_company_website(mail, body)
        if not url:
            continue
        urls.append((mail, url))

    print(urls)

NameError: name 'extract_company_website' is not defined