## Imports

In [1]:
import email
import quopri
import unicodedata
from email.message import Message
from typing import Optional, List, Dict, Union
import lxml.html
from lxml.html import HtmlElement
from pygrok import Grok
import pandas as pd

In [2]:
# Relative path of the raw e-mail file that the following cells open and parse.
raw_email_file: str = "./bp2p1gint3o7nnnr7k7m3tnoitr3ah0v7tgf1jg1"

## Extraction

### Open email

In [3]:
# Read the whole e-mail file in binary mode; no decoding happens at this point.
with open(raw_email_file, mode='rb') as file:
    raw_email: bytes = file.read()

In [4]:
# Parse straight from the bytes read above instead of force-decoding them as
# UTF-8 first: a message containing non-UTF-8 bytes no longer aborts with
# UnicodeDecodeError before the parser even runs.
message: Message = email.message_from_bytes(raw_email)

### Read headers

In [5]:
# Header lookups on the parsed message (header names are case-insensitive).
sender: Optional[str] = message.get('from')
date: Optional[str] = message.get('date')

In [6]:
# Show both header values, one per line.
print(sender, date, sep="\n")

Joffrey Bienvenu <joffreybvn@gmail.com>
Wed, 6 Apr 2022 13:08:43 +0200


### Get HTML email

Process: decode the raw (quoted-printable) HTML part of the message into plain HTML.

In [7]:
def decode_email(html_email: str) -> str:
    """Decode a quoted-printable HTML e-mail body into one line of ASCII HTML.

    The payload goes through four steps: quoted-printable decoding, UTF-8
    decoding, NFKD normalisation followed by removal of every non-ASCII
    character, and finally removal of all line breaks.

    Args:
        html_email: The quoted-printable encoded HTML payload.

    Returns:
        The decoded HTML as a single-line, ASCII-only string. An empty
        payload yields an empty string.

    Raises:
        UnicodeDecodeError: If the quoted-printable decoded bytes are not
            valid UTF-8.
    """
    # Decode quoted-printable (this also drops the "=\n" soft line breaks).
    utf8_string: str = quopri.decodestring(html_email).decode("utf-8")

    # NFKD splits accented characters into base letter + combining mark, so
    # the ASCII filter below keeps the base letter ("é" becomes "e").
    normalized: str = unicodedata.normalize("NFKD", utf8_string)

    # Drop every character that is NOT ASCII.
    ascii_only: str = normalized.encode('ascii', 'ignore').decode('ascii')

    # Remove line breaks by joining ALL lines. Taking only the first line
    # would silently discard the rest of a multi-line document and raise
    # IndexError on an empty payload.
    return "".join(ascii_only.splitlines())

In [8]:
# Stays None when the message has no text/html part.
html_email: Optional[str] = None

# Visit every MIME part and decode the HTML one; if several parts are
# text/html, the last one visited is the one that is kept.
for part in message.walk():
    if part.get_content_type() != 'text/html':
        continue
    html_email = decode_email(part.get_payload())

In [9]:
# Save the decoded HTML to disk. The encoding is pinned explicitly so the
# written file does not depend on the platform's default locale encoding.
with open("Output.html", "w", encoding="utf-8") as text_file:
    text_file.write(html_email)

### Get email data

In [10]:
# Parse the decoded HTML string with lxml; the XPath queries in the later cells run on this object.
document = lxml.html.fromstring(html_email)

In [11]:
from mailxtract.extractors import HTMLExtractor, field


class Job(HTMLExtractor):
    """A single job offer extracted from one HTML fragment of the e-mail.

    Every ``@field``-decorated method declares one extracted value. The
    decorator comes from ``mailxtract`` and its implementation is not visible
    here. NOTE(review): ``missing_func`` presumably names the method called
    (with ``args`` as keyword arguments) to compute a value that is not yet
    stored in ``self._data`` - confirm against mailxtract.
    """

    def __init__(self, html_element: Union[str, HtmlElement], position_category: str):
        """Initialise the extractor and record the position category.

        Args:
            html_element: HTML of the job offer, passed on to ``HTMLExtractor``.
            position_category: Category stored under ``'position_category'``
                in ``self._data``.
        """
        super().__init__(html_element)
        self._data['position_category'] = position_category

    def extract_description(self) -> dict:
        """Parse city, region, country and remote-working policy.

        The text of the ``<p>`` elements is read, the company name is removed
        from it, and the remainder is matched against the pattern
        ``<city>, <region>, <country> (<remote working policy>)``.

        Returns:
            A dict with the keys ``company_city``, ``company_region``,
            ``company_country`` and ``remote_working_policy``. All values are
            ``None`` when there is no description or it does not match.
        """
        # Fallback result: every field present, every value None.
        fields = ['company_city', 'company_region', 'company_country', 'remote_working_policy']
        default_values: Dict[str, None] = dict.fromkeys(fields)

        # Extract the description minus the company name.
        description: Optional[str] = self.get_xpath(xpath='.//p/text()')
        if not description:
            # What get_xpath returns when nothing matches is not visible
            # here; guard so a missing description gives the defaults
            # instead of failing on ``.replace``.
            return default_values
        description = description.replace(self._data['company_name'], '')

        # Raw string: "\(" and "\)" are regex escapes for literal parentheses,
        # not Python string escapes (a non-raw literal triggers an
        # invalid-escape-sequence warning).
        pattern = r'%{DATA:company_city}, %{DATA:company_region}, %{WORD:company_country} \(%{DATA:remote_working_policy}\)'
        data: Optional[dict] = Grok(pattern).match(description)

        return data or default_values

    @field()
    def position_category(self, context) -> str:
        """Position category; the value is written to ``_data`` in ``__init__``."""

    @field(
        missing_func='get_xpath',
        args={'xpath': './/td[@style="padding-bottom:4px"]/a/text()'}
    )
    def position_name(self, context) -> Optional[str]:
        """Position name: text of the link inside the ``padding-bottom:4px`` cell."""

    @field(
        missing_func='get_xpath',
        args={'xpath': './/img[@width="48"]/@alt'}
    )
    def company_name(self, context) -> Optional[str]:
        """Company name: ``alt`` attribute of the 48-wide image."""

    @field(
        missing_func='get_xpath',
        args={'xpath': './/img[@width="48"]/@src'}
    )
    def company_logo(self, context) -> Optional[str]:
        """Company logo URL: ``src`` attribute of the 48-wide image."""

    @field(missing_func='extract_description')
    def company_city(self, context) -> Optional[str]:
        """City, taken from the parsed description."""

    @field(missing_func='extract_description')
    def company_region(self, context) -> Optional[str]:
        """Region, taken from the parsed description."""

    @field(missing_func='extract_description')
    def company_country(self, context) -> Optional[str]:
        """Country, taken from the parsed description."""

    @field(missing_func='extract_description')
    def remote_working_policy(self, context) -> Optional[str]:
        """Remote-working policy, taken from the parsed description."""

    def as_dict(self, **kwargs) -> Dict[str, Optional[str]]:
        """Return the job as a dict restricted to the field names listed below.

        Delegates to ``HTMLExtractor.as_dict`` with a fixed list of field
        names; ``kwargs`` is accepted but not used.
        """
        return super().as_dict([
            "position_category",
            "position_name",
            "company_name",
            "company_logo",
            "company_city",
            "company_region",
            "company_country",
            "remote_working_policy"
        ])

In [12]:
# First text node matched under //h2/a/span; it is used below as the
# position category of every job.
position_matches = document.xpath('//h2/a/span/text()')
position: str = position_matches[0]

In [13]:
# Every table carrying this exact inline style is treated as one job offer.
raw_jobs: List[HtmlElement] = document.xpath('//table[@style="padding:10px 24px"]')

# Wrap each table in a Job, all tagged with the same position category.
jobs: List[Job] = [
    Job(html_element=raw_job, position_category=position)
    for raw_job in raw_jobs
]

In [14]:
# Build one dict per job first, then a DataFrame with one row per dict.
job_records = [job.as_dict() for job in jobs]
df = pd.DataFrame(job_records)

In [15]:
# Display the DataFrame of extracted jobs.
df

Unnamed: 0,position_category,position_name,company_name,company_logo,company_city,company_region,company_country,remote_working_policy
0,Data Engineer,Data Engineer,Bricsys,https://media-exp1.licdn.com/dms/image/C4E0BAQ...,Gand,Region flamande,Belgique,Hybride
1,Data Engineer,Data Engineer,Apollo Solutions,https://media-exp1.licdn.com/dms/image/C4E0BAQ...,Laakdal,Region flamande,Belgique,Hybride
2,Data Engineer,MES & Data Historian Engineer,Process Automation Solutions,https://media-exp1.licdn.com/dms/image/C4E0BAQ...,Anvers,Region flamande,Belgique,Sur site
3,Data Engineer,Data Engineer,Ossia Belgium,https://media-exp1.licdn.com/dms/image/C4D0BAQ...,,,,
4,Data Engineer,Senior (Big) Data Engineer,Ordina Belgium,https://media-exp1.licdn.com/dms/image/C4D0BAQ...,Malines,Region flamande,Belgique,Sur site
5,Data Engineer,Data Warehouse Engineer and BI Expert,Exellys,https://media-exp1.licdn.com/dms/image/C560BAQ...,Malines,Region flamande,Belgique,Sur site
