In [3]:
import pandas as pd
from email.parser import BytesParser
from email import policy
import os
import io
import re

In [25]:
def mail_parser(mail_file, attachements_dir):
    """Parse an e-mail file and return its main headers and HTML body.

    Args:
        mail_file: Path of the message file; it is opened in binary mode.
        attachements_dir: Not used by this function at the moment (no
            attachments are written); kept so existing callers keep working.

    Returns:
        A 5-tuple ``(sender, receiver, subject, date, body_content)`` holding
        the "From", "To", "Subject" and "Date" headers followed by the content
        of the HTML body part, or ``""`` when no HTML body part is found (a
        notice is printed in that case).
    """
    with open(mail_file, "rb") as handle:
        message = BytesParser(policy=policy.default).parse(handle)

    # Header values in the order callers unpack them.
    headers = tuple(message[name] for name in ("From", "To", "Subject", "Date"))

    html_part = message.get_body(preferencelist=('html',))
    if not html_part:
        print("Geen HTML body gevonden in deze mail.")
        return headers + ("",)

    return headers + (html_part.get_content(),)

def read_table_mail_pd(html_content):
    """Extract container / BE-number rows from the tables of an HTML string.

    The HTML is parsed with ``pd.read_html``. Tables are scanned in document
    order; a table is only considered when at least one of its cells contains
    a container number (four capital letters followed by seven digits). Within
    such a table, every row that contains a BE number produces one record.
    The first table that produces at least one record determines the result.

    Args:
        html_content: HTML markup as a string. ``None``, an empty string or a
            whitespace-only string is treated as "no tables".

    Returns:
        A ``pandas.DataFrame`` with the columns ``Container``, ``BE_Nummer``
        and ``BEI_Nummer`` (``Container`` is ``"Onbekend"`` when the row has
        no container number, ``BEI_Nummer`` is ``None`` when the row has no
        BEI number), or ``None`` when nothing could be extracted.
    """
    # mail_parser returns "" for mails without an HTML part. Short-circuit
    # here: handing an empty document to pd.read_html may raise a
    # parser-specific error that the ValueError handler below does not catch.
    if not html_content or not html_content.strip():
        print("Geen tabellen gevonden in de HTML.")
        return None

    try:
        all_tables = pd.read_html(io.StringIO(html_content))
    except ValueError:
        # pd.read_html raises ValueError when the document has no tables.
        print("Geen tabellen gevonden in de HTML.")
        return None

    container_pattern = r'[A-Z]{4}\d{7}'
    # NOTE(review): this pattern also matches BEI numbers (e.g. "25BEI..."),
    # so BE_Nummer holds whichever BE/BEI number comes first in the row --
    # confirm that this is intended.
    pattern_t1 = r'(\d{2}BE\w*)'
    pattern_bei = r'(\d{2}BEI\w*)'

    for df in all_tables:
        cells = df.astype(str).stack()
        if not cells.str.contains(container_pattern, regex=True).any():
            continue

        extracted_data = []
        for _, row in df.iterrows():
            # Search the whole row at once so the column layout does not matter.
            row_string = " ".join(row.astype(str))

            be_match = re.search(pattern_t1, row_string)
            if not be_match:
                continue

            cont_match = re.search(container_pattern, row_string)
            bei_match = re.search(pattern_bei, row_string)
            extracted_data.append({
                'Container': cont_match.group(0) if cont_match else "Onbekend",
                'BE_Nummer': be_match.group(1),
                'BEI_Nummer': bei_match.group(1) if bei_match else None
            })

        # Keep scanning when this table had a container number but no BE
        # number; a later table may still hold the data.
        if extracted_data:
            return pd.DataFrame(extracted_data)

    return None

In [26]:
# Both paths are relative to the current working directory.
attachements_dir = "./work/files"
# NOTE: despite its name, file_dir holds the path of a single .eml file.
file_dir = "./work/files/mail_4_mrn.eml"
# Only the last element of the returned tuple (the body) is kept; the four
# header values are discarded.
_, _, _, _, body = mail_parser(file_dir, attachements_dir)

In [27]:
# Extract the container / BE-number records from the mail body.
# read_table_mail_pd can return None, so df is not guaranteed to be a DataFrame.
df =read_table_mail_pd(body)

In [28]:
# Show the extraction result as the cell output.
df

Unnamed: 0,Container,BE_Nummer,BEI_Nummer
0,HAMU4524994,25BE000001165776J0,
