In [67]:
import pdfplumber
from dataclasses import dataclass, replace, asdict
import re
import pandas as pd
from datetime import datetime
import locale
from pyarrow import parquet as pq
import os
import io
#import urllib3

In [58]:
for file in os.listdir("pdf"):
    if file.endswith("vertretungenfremderstaatendl-data.pdf"):
        print(os.path.join("pdf", file))

pdf/2022.03.10_vertretungenfremderstaatendl-data.pdf
pdf/2022.03.25_vertretungenfremderstaatendl-data.pdf
pdf/2022.03.14_vertretungenfremderstaatendl-data.pdf
pdf/2021.04.13_vertretungenfremderstaatendl-data.pdf
pdf/2022.03.17_vertretungenfremderstaatendl-data.pdf
pdf/2020.09.23_vertretungenfremderstaatendl-data.pdf
pdf/2022.02.03_vertretungenfremderstaatendl-data.pdf
pdf/2020.11.19_vertretungenfremderstaatendl-data.pdf
pdf/2020.02.05_vertretungenfremderstaatendl-data.pdf
pdf/2022.02.17_vertretungenfremderstaatendl-data.pdf
pdf/2021.06.25_vertretungenfremderstaatendl-data.pdf
pdf/2022.01.26_vertretungenfremderstaatendl-data.pdf
pdf/2022.03.04_vertretungenfremderstaatendl-data.pdf


### Open PDF directly from web without saving locally

In [43]:
url = 'https://www.auswaertiges-amt.de/blob/199684/ada2000f738fe8e4cec27ab998eabc02/vertretungenfremderstaatendl-data.pdf'
http = urllib3.PoolManager()
temp = io.BytesIO()
temp.write(http.request("GET", url).data)
t = pdfplumber.open(temp)

FileNotFoundError: [Errno 2] No such file or directory: 'https://www.auswaertiges-amt.de/blob/199684/ada2000f738fe8e4cec27ab998eabc02/vertretungenfremderstaatendl-data.pdf'

In [18]:
pdf = pdfplumber.open("pdf/2022.03.10_vertretungenfremderstaatendl-data.pdf")

### date of release


In [19]:
for line in pdf.pages[0].extract_text().splitlines():

    if line.startswith('Stand:'):
        line = line.strip('Stand: ')
        locale.setlocale(locale.LC_TIME, "de_DE") 
        date = datetime.strptime(line, '%d. %B %Y').date()

In [25]:
date

datetime.date(2022, 3, 10)

In [29]:
str(date.year)

'2022'

In [35]:
os.path.join(None, str(date.year), str(date.month), str(date.day))

TypeError: expected str, bytes or os.PathLike object, not NoneType

### Country Detection
Test if first word on page is upper case.

In [406]:
# get_country_index
country_index = {}

# Start on page 2, because page 1 is the cover and would be a false positive
for page in pdf.pages[1:]:
    
    # split text on page into lines
    lines = page.extract_text().splitlines()
    # if there is at least one line and the first line is uppercase
    if len(lines) and lines[0] == lines[0].upper():
        
        # set end page for previous country in tupel (index start page, index end page+1)
        if country_index:
            country_index[country_name] = (country_index[country_name][0], page.page_number-1)
        
        # save country and page index (page number - 1) in index
        country_name = lines[0].title()
        country_index[country_name] = (page.page_number-1, None)

### Select all page for a country

In [407]:
country = country_index['Heiliger Stuhl']
pages = []

for page in pdf.pages[slice(*country)]:
    pages.append(page.extract_text())

text = '\n'.join(pages)

In [409]:
lines = text.splitlines()

In [410]:
country_name = lines[0].capitalize()
country_name_long = lines[1]

In [441]:
people = extract_diplomats(lines)

In [442]:
df = pd.json_normalize(asdict(obj) for obj in people.values())
df['country'] = country_name
df['country_long'] = country_name_long
df['date'] = date


In [443]:
df

Unnamed: 0,title,gender,name,position_name,date_since,order,partner_gender,partner_name,country,country_long,date
0,S. E.,,Erzbischof Nikola ETEROVIĆ,außerordentlicher und bevollmächtigter Botscha...,20.11.2013,1,,,Heiliger Stuhl,Heiliger Stuhl,2022-03-10
1,,Herr,Chun Yean CHOONG,Nuntiatursekretär,10.02.2020,2,,,Heiliger Stuhl,Heiliger Stuhl,2022-03-10
2,,Herr,Sebastiano SANNA,Nuntiatursekretär,07.02.2022,3,,,Heiliger Stuhl,Heiliger Stuhl,2022-03-10


### Name Extraction
Startet mit S. E., Herr, Frau

Vor- und Nachname: manchmal nichts groß, manchmal komplett groß, manchmal nur Nachname groß.

Sonderfall Indonesion (Seite 100): drei Bindestricht vor Namen: Herr --- YUL EDISON

Manche Länder haben keine Personen, z. B. Sao Tome (Seite 225)

### Extract features
* country_name
* country_name_long
* date
* person
    * title [S. E., NULL]
    * gender = [Herr, Frau]
    * name
    * position_name
    * order (position on the list)
    * since_date
    * ? belongs_to = Millitärattachestab
    * partner_gender = NULL
    * partner_name = NULL

In [None]:
@dataclass
class Person:
    """Class for keeping track of an item in inventory."""
    title: str
    gender: str
    name: str
    position_name: str = None
    date_since: str = None
    order: int = None
    partner_gender: str = None
    partner_name: str = None


In [440]:
def extract_diplomats(lines):
    diplomats = {}
    order = 0

    for idx, line in enumerate(lines):
        
        if line.lower().startswith(('s. e.', 'i. e.', 'herr', 'frau')):
            # new name found
            if 'idx_last_found' in locals() and idx == idx_last_found + 1:
                # new name follows after last name -> last name ist partner
                # previous_person is diplomat
                # person is partner
                diplomats[previous_person_key] = replace(diplomats[previous_person_key], 
                                                partner_gender=person.gender, 
                                                partner_name=person.name)
                del diplomats[person.order] # delete partner object
                order -= 1
                del idx_last_found
            
            if line.lower().startswith(('s. e.', 'i. e.')):
                title = line[:5]
                line = line[5:].strip()
            else:
                title = None

            if line.lower().startswith(('herr', 'frau')):

                line = line.split(' ', maxsplit=1)
                if len(line) != 2:
                    continue
                
                gender = line[0]
                line = line[1]
            else:
                gender = None

            name = line.strip('- ')

            idx_last_found = idx
            previous_person_key = order
            order += 1
            person = Person(title, gender, name, order=order)

            diplomats[person.order] = person
        
        elif re.search('\(\d{2}\.\d{2}\.\d{4}\)', line):
            # second line of person containing position and date

            line = line.split(',')

            position: str = ', '.join(line[:-1]).strip() # first part until the last comma
            date_since: str = line[-1].strip('() ') # date in brackers

            diplomats[person.order] = replace(person, 
                                            position_name=position, 
                                            date_since=date_since)
            #previous_person = diplomats[person.order]
        
        else:
            # if nothing was found in this line, reset 
            if 'idx_last_found' in locals() and idx == idx_last_found + 1:
                idx_last_found = idx
                

    # if last line was person -> this was a partner
    if 'idx_last_found' in locals() and idx == idx_last_found:
        diplomats[previous_person_key] = replace(diplomats[previous_person_key], 
                                                partner_gender=person.gender, 
                                                partner_name=person.name)
        del diplomats[person.order] # delete partner object

    return diplomats

### Example Afghanistan

country_name: line 0, capitalize  
country_name_long: line 1

In [415]:
p2 = pdf.pages[95]

In [None]:
im = p2.to_image(resolution=150)
im.draw_rects(p2.extract_words())

In [416]:
p2_text = p2.extract_text()
p2_lines = p2_text.splitlines()

In [419]:
country_name = p2_lines[0].title()
country_name_long = p2_lines[1]

In [420]:
print(country_name)
print(country_name_long)

Heiliger Stuhl
Heiliger Stuhl


### Name detection

In [421]:
diplomats = {}
order = 0

for idx, line in enumerate(p2_lines):
    
    if line.lower().startswith(('s. e.', 'herr', 'frau')):
        # new name found
        if 'idx_last_found' in locals() and idx == idx_last_found + 1:
            # new name follows after last name -> last name ist partner
            # previous_person is diplomat
            # person is partner
            diplomats[previous_person_key] = replace(diplomats[previous_person_key], 
                                               partner_gender=person.gender, 
                                               partner_name=person.name)
            del diplomats[person.order] # delete partner object
            order -= 1
        
        if line.lower().startswith('s. e.'):
            title = 'S. E.'
            line = line[5:].strip()
        else:
            title = None

        if line.lower().startswith(('herr', 'frau')):

            line = line.split(' ', maxsplit=1)
            if len(line) != 2:
                continue
            
            gender = line[0]
            line = line[1]
        else:
            gender = None

        name = line.strip('- ')

        idx_last_found = idx
        previous_person_key = order
        order += 1
        person = Person(title, gender, name, order=order)

        diplomats[person.order] = person
    
    elif 'idx_last_found' in locals() and \
         idx == idx_last_found + 1 and \
         re.search('\(\d{2}\.\d{2}\.\d{4}\)', line):
        # second line of person containing position and date
        line = line.split('(')

        position: str = ', '.join(line[:-1]).strip()
        date_since: str = line[-1].strip('() ')

        diplomats[person.order] = replace(person, 
                                          position_name=position, 
                                          date_since=date_since)
        previous_person = diplomats[person.order]

# if last line was person -> this was a partner
if 'idx_last_found' in locals() and idx == idx_last_found:
    diplomats[previous_person_key] = replace(diplomats[previous_person_key], 
                                             partner_gender=person.gender, 
                                             partner_name=person.name)
    del diplomats[person.order] # delete partner object

In [424]:
text.splitlines()

['HEILIGER STUHL',
 'Heiliger Stuhl',
 'Kanzlei:',
 'Postfach:',
 '61 30 58',
 'Adresse:',
 'Lilienthalstraße 3 a',
 '10965 Berlin',
 'Telefon:',
 '+49 30 61 62 40',
 'Fax:',
 '+49 30 61 62 43 00',
 'E-Mail:',
 'apostolische@nuntiatur.de',
 'Homepage:',
 'http://www.nuntiatur.de',
 'Konsularbezirke:',
 'Bundesgebiet',
 'S. E. Erzbischof  Nikola  ETEROVIĆ',
 'außerordentlicher und bevollmächtigter Botschafter, (20.11.2013)',
 'Herr  Chun Yean  CHOONG',
 'Nuntiatursekretär, (10.02.2020)',
 'Herr  Sebastiano  SANNA',
 'Nuntiatursekretär, (07.02.2022)']

In [422]:
diplomats

{1: Person(title=None, gender='Herr', name='Chun Yean  CHOONG', position_name='Nuntiatursekretär,', date_since='10.02.2020', order=1, partner_gender=None, partner_name=None),
 2: Person(title=None, gender='Herr', name='Sebastiano  SANNA', position_name='Nuntiatursekretär,', date_since='07.02.2022', order=2, partner_gender=None, partner_name=None)}

In [435]:
line = 'S. E. Erzbischof  Nikola  ETEROVIĆ'

In [436]:
if line.lower().startswith('s. e.'):
    title = 'S. E.'
    line = line[5:].strip()

In [437]:
line

'Erzbischof  Nikola  ETEROVIĆ'

In [434]:
line.split('  ', maxsplit=1)

ValueError: not enough values to unpack (expected 2, got 1)

In [432]:
b

'Nikola  ETEROVIĆ'

In [346]:
line[-1].strip('() ')

'15.06.2021'

In [348]:
', '.join(line[:-1]).strip()

'Heeres-,  Luftwaffen- und Marineattaché'

In [301]:
line[1].strip('() ')

'29.03.2021'

### Read in Parquet