In [1]:
import json
import os
from collections import Counter

import lxml
from bs4 import BeautifulSoup as bs

In [42]:
# Directory containing the name-register files, one file per person.
path = "dcp-data/nameregs/"
# os.listdir returns entries in arbitrary order; sort so that everything
# derived from this listing is reproducible from run to run.
files = sorted(os.listdir(path))

In [43]:
def get_common_occupations(min_count=5):
    """Return the occupation strings that occur often enough across all files.

    Every name in the module-level ``files`` list is opened relative to the
    module-level ``path`` and parsed with BeautifulSoup's "lxml" parser. For
    each candidate ``type`` attribute value, the text of the first element
    carrying that value is lowercased and counted, unless it is empty or
    contains a comma.

    Args:
        min_count: Minimum number of occurrences for an occupation to be
            considered common. Defaults to 5.

    Returns:
        set[str]: Lowercased occupation strings seen at least ``min_count``
        times.
    """
    # The original list contained 'd,''h', which Python concatenates into the
    # single token 'd,h'; 'd' and 'h' are meant to be separate entries.
    tokens = ['occupation', 'I', 'a', 'c', 'd', 'h', 'l', 'm', 'p', 'z']

    occ_counts = Counter()

    for filename in files:
        with open(path + filename, "r", encoding="utf8") as handle:
            bs_content = bs(handle.read(), "lxml")

        for token in tokens:
            element = bs_content.find(type=token)
            if element is None:
                # No element with this type value in the current file.
                continue
            occ = element.text.lower()
            if occ != '' and ',' not in occ:
                occ_counts[occ] += 1

    # Single pass over the tallies instead of list.count() per element.
    return {occ for occ, seen in occ_counts.items() if seen >= min_count}

In [45]:
def extractPersonData(filepath, common_occs):
    """Extract id, sex, name and occupation from one person file.

    The file is read as UTF-8 text and parsed with BeautifulSoup's "lxml"
    parser. Missing elements or attributes do not raise; the affected field
    falls back to the default described below.

    Args:
        filepath: Path of the file to read.
        common_occs: Container of lowercased occupation strings. Only
            occupations found in it are recorded.

    Returns:
        dict: A record with the keys
            'id': the "xml:id" attribute of the first ``person`` element, or
                None if the element or the attribute is missing.
            'sex': 'M', or 'F' (for source values 'W' or 'F', in any case),
                or '' when missing or unrecognised.
            'name': the texts of all ``forename`` elements followed by the
                text of the first ``surname`` element ('' if absent), joined
                with single spaces.
            'occupation': an occupation present in ``common_occs``, or
                'no common occupation'.
    """
    person_dict = {}

    with open(filepath, "r", encoding="utf8") as handle:
        content = handle.read()
    # Parsing does not need the file handle, so it happens after closing it.
    bs_content = bs(content, "lxml")

    # extract id
    # (could cut this down to just an id number)
    try:
        person_dict['id'] = bs_content.person["xml:id"]
    except (TypeError, KeyError):
        # TypeError: no person element (None is not subscriptable);
        # KeyError: the element has no xml:id attribute.
        person_dict['id'] = None

    # extract sex
    try:
        sex = bs_content.sex["value"].upper()
    except (TypeError, KeyError, AttributeError):
        sex = ''
    # 'W' and 'F' are both normalised to 'F'; anything else becomes ''.
    person_dict['sex'] = {'M': 'M', 'W': 'F', 'F': 'F'}.get(sex, '')

    # extract name
    names = [name.text for name in bs_content.find_all("forename")]
    surname_tag = bs_content.surname
    names.append(surname_tag.text if surname_tag is not None else '')
    person_dict['name'] = ' '.join(names)

    # extract occupation
    # The original list contained 'd,''h', which Python concatenates into the
    # single token 'd,h'; 'd' and 'h' are meant to be separate entries.
    tokens = ['occupation', 'I', 'a', 'c', 'd', 'h', 'l', 'm', 'p', 'z']
    for token in tokens:
        element = bs_content.find(type=token)
        if element is None:
            continue
        occ = element.text.lower()
        if occ in common_occs:
            # No break: if several tokens match, the last one wins.
            person_dict['occupation'] = occ
    if not person_dict.get('occupation'):
        person_dict['occupation'] = 'no common occupation'

    return person_dict

In [46]:
# Rebuild the listing in this cell so it does not rely on earlier kernel state.
# os.listdir returns entries in arbitrary order; sorting keeps the order of
# the records written to people.json stable between runs.
path = "dcp-data/nameregs/"
files = sorted(os.listdir(path))

common_occs = get_common_occupations()

# NOTE: despite its name, people_dict is a list holding one record per file.
# Files for which no id could be extracted are kept; their 'id' is None.
people_dict = []
for file in files:
    people_dict.append(extractPersonData(path + file, common_occs))

# dump to json
with open('people.json', 'w', encoding='utf8') as jsonfile:
    json.dump(people_dict, jsonfile)