## Imports

In [12]:
from typing import Optional, Dict, Union
from lxml.html import HtmlElement
from pygrok import Grok
import pandas as pd

from mailxtract.html import LXMLExtractor, field, FieldContext
from mailxtract.binary import SESMessage, BinaryExtractor

In [13]:
# Relative path of the raw email file that the cells below load and parse.
raw_email_file = "./2vm2kl34l34tnlrqeklsfp0jnpfu6ondn2g303g1"

## Extraction

### Open email

In [14]:
# Load the stored message from disk, then wrap it in a BinaryExtractor,
# which the following cells use to read the headers and the HTML body.
reader = SESMessage.from_file(raw_email_file)
raw_email = BinaryExtractor(reader)

In [17]:
# Dump the message exposed by the extractor for a quick visual check.
# NOTE(review): the recorded output contains delivery headers (recipient
# address, SES receipt token) - clear it before sharing the notebook.
print(raw_email.message)

Return-Path: <forwarding-noreply@google.com>
Received: from mail-yb1-f170.google.com (mail-yb1-f170.google.com [209.85.219.170])
 by inbound-smtp.eu-west-1.amazonaws.com with SMTP id 2vm2kl34l34tnlrqeklsfp0jnpfu6ondn2g303g1
 for linkedin-jobs-9s53lwpvs88lw2gv@datalake.joffreybvn.be;
 Sat, 09 Apr 2022 12:19:56 +0000 (UTC)
X-SES-Spam-Verdict: PASS
X-SES-Virus-Verdict: PASS
Received-SPF: pass (spfCheck: domain of google.com designates 209.85.219.170 as permitted sender) client-ip=209.85.219.170; envelope-from=forwarding-noreply@google.com; helo=mail-yb1-f170.google.com;
Authentication-Results: amazonses.com;
 spf=pass (spfCheck: domain of google.com designates 209.85.219.170 as permitted sender) client-ip=209.85.219.170; envelope-from=forwarding-noreply@google.com; helo=mail-yb1-f170.google.com;
 dkim=pass header.i=@google.com;
 dmarc=pass header.from=google.com;
X-SES-RECEIPT: AEFBQUFBQUFBQUFIdVNvRzIvOWZHalZjZXlCc3FzaHdXdGxLL09UZFhGdW9VUXVSYmpyKzBjaXl0eWFINUd4QUhzTDRsMmZmZmFLdVFiKzZxT1g3

### Read headers

In [15]:
# Pull the header metadata and the HTML body out of the raw message.
# NOTE(review): the recorded run of this cell ended in a TypeError
# ("argument should be bytes, buffer or ASCII string, not 'NoneType'"); the
# output does not show which of the two calls raised - re-check on a fresh
# kernel, since the cells below depend on `html_email`.
metadata = raw_email.get_header()
html_email = raw_email.get_html()

TypeError: argument should be bytes, buffer or ASCII string, not 'NoneType'

In [None]:
# Write the extracted HTML body to Output.html.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding (non-ASCII characters in the email would
# otherwise fail or be mangled on systems whose default is not UTF-8).
with open("Output.html", "w", encoding="utf-8") as text_file:
    text_file.write(html_email)

### Get email data

In [7]:
class Job(LXMLExtractor):
    """Extractor for one job entry of the email.

    The instance wraps the HTML element of a single job and exposes its
    attributes as ``@field`` methods. Each decorated method has an empty body;
    the decorator names the callable (``missing_func``) and arguments used to
    obtain the value.
    NOTE(review): the exact caching/merging semantics of ``@field`` live in
    ``mailxtract`` and are not visible here - confirm against the library.
    """

    def __init__(self, html_element: Union[str, HtmlElement], position_category: str):
        """Initialise the extractor and pre-seed the position category.

        Args:
            html_element: HTML of the job entry, as a string or parsed element.
            position_category: Category label shared by all jobs of the email;
                stored directly in ``self.data``.
        """
        super().__init__(html_element)
        self.data['position_category'] = position_category

    def extract_description(self) -> dict:
        """Parse the location line of the job into its components.

        The first ``<p>`` text of the entry is read, the company name is
        stripped from it, and the remainder is matched against a Grok pattern
        of the form ``<city>, <region>, <country> (<remote policy>)``.

        Returns:
            A dict with the keys ``company_city``, ``company_region``,
            ``company_country`` and ``remote_working_policy``. All values are
            ``None`` when the description is missing or does not match.
        """
        # Setup default values
        fields = ['company_city', 'company_region', 'company_country', 'remote_working_policy']
        default_values: Dict[str, None] = {key: None for key in fields}

        # Extract the description; without it there is nothing to parse.
        description: Optional[str] = self.get_xpath(xpath='.//p/text()', first=True)
        if description is None:
            return default_values

        # Remove the company name from the description. The name is Optional,
        # and str.replace() rejects None, so only strip it when present.
        company_name = self.company_name()
        if company_name:
            description = description.replace(company_name, '')

        # Parse the city, region, country and remote working policy.
        # Raw string: "\(" is not a valid Python escape sequence, so the
        # backslashes must reach the Grok/regex engine untouched.
        pattern = r'%{DATA:company_city}, %{DATA:company_region}, %{WORD:company_country} \(%{DATA:remote_working_policy}\)'
        data: Optional[dict] = Grok(pattern).match(description)

        # Grok.match() yields a falsy value when the text does not match.
        return data or default_values

    @field()
    def position_category(self) -> str:
        """Category of the position, as stored in ``self.data`` by ``__init__``."""
        pass

    @field(
        missing_func='get_xpath',
        args={'xpath': './/td[@style="padding-bottom:4px"]/a/text()', 'first': True}
    )
    def position_name(self) -> Optional[str]:
        """Job title: link text inside the ``padding-bottom:4px`` table cell."""
        pass

    @field(
        missing_func='get_xpath',
        args={'xpath': './/img[@width="48"]/@alt', 'first': True}
    )
    def company_name(self) -> Optional[str]:
        """Company name: ``alt`` attribute of the 48px-wide image."""
        pass

    @field(
        missing_func='get_xpath',
        args={'xpath': './/img[@width="48"]/@src', 'first': True}
    )
    def company_logo(self) -> Optional[str]:
        """Company logo URL: ``src`` attribute of the 48px-wide image."""
        pass

    @field(missing_func='extract_description')
    def company_city(self) -> Optional[str]:
        """City of the job, obtained through ``extract_description``."""
        pass

    @field(missing_func='extract_description')
    def company_region(self) -> Optional[str]:
        """Region of the job, obtained through ``extract_description``."""
        pass

    @field(missing_func='extract_description')
    def company_country(self) -> Optional[str]:
        """Country of the job, obtained through ``extract_description``."""
        pass

    @field(missing_func='extract_description')
    def remote_working_policy(self) -> Optional[str]:
        """Remote working policy, obtained through ``extract_description``."""
        pass

In [8]:
class Email(LXMLExtractor):
    """Extractor for the whole email: the searched position and its job entries."""

    def __init__(self, html_element: Union[str, HtmlElement]):
        """Hand the email HTML (string or parsed element) to ``LXMLExtractor``."""
        super().__init__(html_element)

    @field(
        missing_func='get_xpath',
        args={'xpath': '//h2/a/span/text()', 'first': True}
    )
    def position(self) -> str:
        """Position label: text of the first ``h2 > a > span`` node."""
        pass

    @field(
        missing_func='get_xpath',
        args={'xpath': '//table[@style="padding:10px 24px"]'}
    )
    def jobs(self, context: FieldContext):
        """Yield one dict per job table found in the email.

        The elements matched by the XPath above are read from ``self.data``
        under this field's name; each one is wrapped in a ``Job`` tagged with
        the email's position and emitted through ``as_dict()``.
        """
        job_tables = self.data[context.name]
        for job_table in job_tables:
            entry = Job(
                html_element=job_table,
                position_category=self.position(),
            )
            yield entry.as_dict()

In [9]:
# Build the email-level extractor from the HTML body read earlier.
email = Email(html_email)

In [10]:
# Email.jobs() yields the as_dict() result of each Job; collect them as rows.
df = pd.DataFrame(email.jobs())

In [11]:
# Display the extracted jobs table.
df

Unnamed: 0,company_city,company_country,company_logo,company_name,company_region,position_category,position_name,remote_working_policy
0,Gand,Belgique,https://media-exp1.licdn.com/dms/image/C4E0BAQ...,Bricsys,Region flamande,Data Engineer,Data Engineer,Hybride
1,Laakdal,Belgique,https://media-exp1.licdn.com/dms/image/C4E0BAQ...,Apollo Solutions,Region flamande,Data Engineer,Data Engineer,Hybride
2,Anvers,Belgique,https://media-exp1.licdn.com/dms/image/C4E0BAQ...,Process Automation Solutions,Region flamande,Data Engineer,MES & Data Historian Engineer,Sur site
3,,,https://media-exp1.licdn.com/dms/image/C4D0BAQ...,Ossia Belgium,,Data Engineer,Data Engineer,
4,Malines,Belgique,https://media-exp1.licdn.com/dms/image/C4D0BAQ...,Ordina Belgium,Region flamande,Data Engineer,Senior (Big) Data Engineer,Sur site
5,Malines,Belgique,https://media-exp1.licdn.com/dms/image/C560BAQ...,Exellys,Region flamande,Data Engineer,Data Warehouse Engineer and BI Expert,Sur site
