## Imports

In [263]:
import email
import quopri
import unicodedata
import functools
from email.message import Message
from typing import Optional, List, Dict, Callable, Any, Union
import lxml.html
from lxml.html import HtmlElement
from pygrok import Grok
import pandas as pd

In [194]:
# Path of the raw email file that is read (in binary mode) further down
raw_email_file = "./bp2p1gint3o7nnnr7k7m3tnoitr3ah0v7tgf1jg1"

## Extraction

### Open email

In [195]:
# Read the whole file as raw bytes (no text decoding happens at this stage)
with open(raw_email_file, mode='rb') as file:
    raw_email: bytes = file.read()

In [196]:
# Parse directly from the raw bytes: a blanket decode('utf-8') of the whole
# file raises UnicodeDecodeError as soon as the email contains a non-UTF-8 byte.
message: Message = email.message_from_bytes(raw_email)

### Read headers

In [197]:
# Look the headers up by name; Message indexing yields None for a missing header
sender: Optional[str] = message['from']
date: Optional[str] = message['date']

In [198]:
print(sender)
print(date)

Joffrey Bienvenu <joffreybvn@gmail.com>
Wed, 6 Apr 2022 13:08:43 +0200


### Get HTML email

Process: Raw to raw

In [199]:
def decode_email(html_email: str) -> str:
    """Decode a quoted-printable HTML payload into a single-line ASCII string.

    The payload is quoted-printable decoded, interpreted as UTF-8, NFKD
    normalised (so accented letters split into a base letter plus a combining
    mark), stripped of every non-ASCII character, and finally flattened onto
    one line.

    Args:
        html_email: Quoted-printable encoded HTML. ``bytes`` is accepted too.

    Returns:
        The decoded HTML on a single line, containing ASCII characters only.
        An empty payload yields an empty string.

    Raises:
        UnicodeDecodeError: If the decoded payload is not valid UTF-8.
    """
    # quopri only accepts ASCII-only str; encoding first keeps the call working
    # when the payload already contains raw non-ASCII characters.
    raw: bytes = html_email.encode("utf-8") if isinstance(html_email, str) else html_email
    utf8_string: str = quopri.decodestring(raw).decode("utf-8")  # Decode quoted-printable
    normalized: str = unicodedata.normalize("NFKD", utf8_string)  # Split accents from letters
    ascii_only: str = normalized.encode("ascii", "ignore").decode("ascii")  # Drop non-ASCII characters

    # Join every line instead of keeping only the first one. A space (rather
    # than nothing) keeps markup such as "<td\nstyle=..." well-formed.
    return " ".join(ascii_only.splitlines())

In [200]:
# Decoded payload of the text/html part; stays None when the email has none.
# When several text/html parts exist, the last one visited wins.
html_email: Optional[str] = None

for part in message.walk():
    if part.get_content_type() != 'text/html':
        continue
    html_email = decode_email(part.get_payload())

In [201]:
# Dump the decoded HTML for inspection in a browser. The encoding is explicit
# so the written file does not depend on the platform's default encoding.
with open("Output.html", "w", encoding="utf-8") as text_file:
    text_file.write(html_email)

### Get email data

In [202]:
# Parse the decoded HTML so it can be queried with XPath
document: HtmlElement = lxml.html.fromstring(html_email)

In [203]:
document

<Element div at 0x11d9226b0>

In [204]:
# @dataclass
# class Job:
#     position_category: str
#     position_name: str
#     company_name: str
#     company_logo: Optional[str] = None
#     company_city: Optional[str] = None
#     company_region: Optional[str] = None
#     company_country: Optional[str] = None
#     remote_working_policy: Optional[str] = None

In [340]:
from abc import ABC, abstractmethod

class HTMLExtractor(ABC):
    """Base class for objects that pull named fields out of an HTML fragment.

    Extracted values are cached in ``self._data``, keyed by field name.
    Subclasses must implement :meth:`as_dict`.
    """

    def __init__(self, document: Union[str, bytes, 'HtmlElement']):
        """Store the HTML that the XPath queries run against.

        Args:
            document: Either markup (``str`` or ``bytes``), which is parsed
                with ``lxml.html.fromstring``, or an already-parsed element,
                which is used as-is.
        """
        # fromstring() only accepts markup; an element that was already parsed
        # (e.g. a node returned by a previous XPath query) must bypass it.
        if isinstance(document, (str, bytes)):
            document = lxml.html.fromstring(document)

        self._document: HtmlElement = document
        self._data: Dict[str, Any] = {}

    def _return_field(self, *args, **kwargs) -> Optional[Any]:
        """Return the cached value of a field from the ``_data`` dictionary.

        Works with the ``field`` decorator, which passes the field name as the
        first positional argument.

        Raises:
            KeyError: If the field has not been stored in ``_data``.
        """
        return self._data[args[0]]

    def get_xpath(self, xpath: str) -> Optional[Any]:
        """Return the first result of an XPath query, or None if there is none.

        Args:
            xpath: XPath expression evaluated against the stored document.
        """
        matches = self._document.xpath(xpath)
        try:
            return matches[0]
        except IndexError:
            return None

    @abstractmethod
    def as_dict(self, fields: List[str]) -> Dict[str, Optional[str]]:
        """Map each name in ``fields`` to the attribute of the same name.

        Abstract so that every subclass declares its own field list; the body
        is a shared implementation meant to be reached through ``super()``.
        """
        return {name: getattr(self, name) for name in fields}

def field(
        missing_func: Optional[str] = None,
        args: Optional[dict] = None
):
    """Build a decorator that fills ``self._data`` before a getter runs.

    The decorated method's name is used as the field name. On each call, if
    that name is not yet a key of ``self._data``, the value is computed and
    stored; the decorated method is then called with the field name inserted
    as its first positional argument after ``self``.

    Args:
        missing_func: Name of a method on the instance that computes the
            missing value. When omitted, the field is stored as ``None``.
        args: Keyword arguments passed to ``missing_func``.

    Returns:
        A decorator for methods of objects exposing a ``_data`` dictionary.
    """
    # Copy so that later mutation of the caller's dict cannot alter the lookup
    options: Dict[str, Any] = dict(args) if args is not None else {}

    def inner(func):
        # The field name is derived from the decorated function's name
        field_name: str = func.__name__

        @functools.wraps(func)
        def wrapper(self: Union['HTMLExtractor', 'Job'], *call_args, **call_kwargs):
            if field_name not in self._data:
                # Missing data defaults to None unless a loader is configured
                field_value: Any = None
                if missing_func:
                    missing_function: Callable = getattr(self, missing_func)
                    field_value = missing_function(**options)

                if isinstance(field_value, dict):
                    # A dict result carries several fields at once
                    self._data.update(field_value)
                    # The loader may not have produced this very field; keep
                    # the "missing means None" contract instead of a KeyError
                    # in the getter.
                    self._data.setdefault(field_name, None)
                else:
                    self._data[field_name] = field_value

            # Run the decorated getter with the field name prepended
            return func(self, field_name, *call_args, **call_kwargs)

        return wrapper
    return inner


class Job(HTMLExtractor):
    """A job posting whose fields are read lazily from an HTML element.

    Every public property is resolved on first access through the ``field``
    decorator and then served from the ``_data`` cache.
    """

    # Fields that extract_description() produces together
    _DESCRIPTION_FIELDS = ('company_city', 'company_region', 'company_country', 'remote_working_policy')

    # Raw string: "\(" is a regex escape, not a Python string escape
    _DESCRIPTION_PATTERN = r'%{DATA:company_city}, %{DATA:company_region}, %{WORD:company_country} \(%{DATA:remote_working_policy}\)'

    def __init__(self, html_element: HtmlElement, position_category: str):
        """Create a job from its HTML element.

        Args:
            html_element: Element holding the job; forwarded unchanged to
                ``HTMLExtractor.__init__``.
            position_category: Category stored as-is in the field cache.
        """
        super().__init__(html_element)
        self._data['position_category'] = position_category

    def extract_description(self) -> dict:
        """Parse city, region, country and remote policy from the description.

        The description is the first ``.//p/text()`` match, with the company
        name removed before it is matched against the Grok pattern.

        Returns:
            A dict with the four description fields. Every value is ``None``
            when the description is absent or does not match the pattern.
        """
        default_values: Dict[str, None] = dict.fromkeys(self._DESCRIPTION_FIELDS)

        description: Optional[str] = self.get_xpath(xpath='.//p/text()')
        if description is None:
            return default_values

        # Go through the property rather than _data so the company name is
        # resolved on demand, whatever order the fields are accessed in.
        company_name: Optional[str] = self.company_name
        if company_name:
            description = description.replace(company_name, '')

        # Grok.match() gives None when the pattern does not match
        data: Optional[dict] = Grok(self._DESCRIPTION_PATTERN).match(description)
        return data or default_values

    @property
    @field()
    def position_category(self, *args, **kwargs) -> str:
        """Category given to the constructor."""
        return self._return_field(*args, **kwargs)

    @property
    @field(missing_func='get_xpath', args={'xpath': './/td[@style="padding-bottom:4px"]/a/text()'})
    def position_name(self, *args, **kwargs) -> Optional[str]:
        """Text of the link inside the ``padding-bottom:4px`` cell, if any."""
        return self._return_field(*args, **kwargs)

    @property
    @field(missing_func='get_xpath', args={'xpath': './/img[@width="48"]/@alt'})
    def company_name(self, *args, **kwargs) -> Optional[str]:
        """``alt`` attribute of the 48px-wide image, if any."""
        return self._return_field(*args, **kwargs)

    @property
    @field(missing_func='get_xpath', args={'xpath': './/img[@width="48"]/@src'})
    def company_logo(self, *args, **kwargs) -> Optional[str]:
        """``src`` attribute of the 48px-wide image, if any."""
        return self._return_field(*args, **kwargs)

    @property
    @field(missing_func='extract_description')
    def company_city(self, *args, **kwargs) -> Optional[str]:
        """City parsed from the description, if it matched."""
        return self._return_field(*args, **kwargs)

    @property
    @field(missing_func='extract_description')
    def company_region(self, *args, **kwargs) -> Optional[str]:
        """Region parsed from the description, if it matched."""
        return self._return_field(*args, **kwargs)

    @property
    @field(missing_func='extract_description')
    def company_country(self, *args, **kwargs) -> Optional[str]:
        """Country parsed from the description, if it matched."""
        return self._return_field(*args, **kwargs)

    @property
    @field(missing_func='extract_description')
    def remote_working_policy(self, *args, **kwargs) -> Optional[str]:
        """Parenthesised trailing part of the description, if it matched."""
        return self._return_field(*args, **kwargs)

    def as_dict(self, **kwargs) -> Dict[str, Optional[str]]:
        """Return every field of the job as a dictionary.

        ``kwargs`` is accepted but ignored; the field list is fixed.
        """
        return super().as_dict([
            "position_category",
            "position_name",
            "company_name",
            "company_logo",
            "company_city",
            "company_region",
            "company_country",
            "remote_working_policy"
        ])

In [341]:
# Text of the first <h2>/<a>/<span> node; it is used below as the category of
# every job. The [0] raises IndexError when the document has no such node.
position: str = document.xpath('//h2/a/span/text()')[0]

In [342]:
# Every table with this exact inline style is wrapped in one Job, all sharing
# the same position category.
raw_jobs: List[HtmlElement] = document.xpath('//table[@style="padding:10px 24px"]')
jobs: List[Job] = [
    Job(html_element=raw_job, position_category=position)
    for raw_job in raw_jobs
]

TypeError: Can't instantiate abstract class Job with abstract method as_dict

In [338]:
# One row per job, one column per extracted field
job_rows: List[Dict[str, Optional[str]]] = [job.as_dict() for job in jobs]
df = pd.DataFrame(job_rows)

In [339]:
df

Unnamed: 0,position_category,position_name,company_name,company_logo,company_city,company_region,company_country,remote_working_policy
0,Data Engineer,Data Engineer,Bricsys,https://media-exp1.licdn.com/dms/image/C4E0BAQ...,Gand,Region flamande,Belgique,Hybride
1,Data Engineer,Data Engineer,Apollo Solutions,https://media-exp1.licdn.com/dms/image/C4E0BAQ...,Laakdal,Region flamande,Belgique,Hybride
2,Data Engineer,MES & Data Historian Engineer,Process Automation Solutions,https://media-exp1.licdn.com/dms/image/C4E0BAQ...,Anvers,Region flamande,Belgique,Sur site
3,Data Engineer,Data Engineer,Ossia Belgium,https://media-exp1.licdn.com/dms/image/C4D0BAQ...,,,,
4,Data Engineer,Senior (Big) Data Engineer,Ordina Belgium,https://media-exp1.licdn.com/dms/image/C4D0BAQ...,Malines,Region flamande,Belgique,Sur site
5,Data Engineer,Data Warehouse Engineer and BI Expert,Exellys,https://media-exp1.licdn.com/dms/image/C560BAQ...,Malines,Region flamande,Belgique,Sur site
