In [2]:
import requests
from bs4 import BeautifulSoup

# debug/test data: prepare bs4-soup from 1st officer name list
# note: this performs a live http request; the resulting `soup` is what the
# later debug call to parse_name_url() works on.
url = "http://www.pdavis.nl/SeaOfficers.php?page=1"
req = requests.get(url)
cont = req.content  # response body of the name list page
soup = BeautifulSoup(cont, 'lxml')  # parsed with the lxml parser

# parsing names and biography links

individual biography pages display the whole name as one unbroken string, while the officer name lists keep given names and family name in separate table columns. because separate columns greatly reduce the probability of name parsing errors, we do not take the names from the individual biography pages. instead, we first parse the names and the urls of the biography pages from the name lists, then iterate over these urls to extract personal data.

## plan

**input**:

* name list url

**output**:

* list of namedtuples:
  * given name(s)
  * surname
  * url to this officer’s biography page

In [3]:
from collections import namedtuple

# one record per officer, as read from a row of a name list page
person = namedtuple('person', ['givenname', 'surname', 'url'])

def parse_name_url(soup):
    """collect given name(s), surname and biography url for every listed officer.

    looks up the table with class 'tbl width100' in `soup`, skips its first
    row and reads each remaining row: the first cell holds the given name(s)
    (leading whitespace is stripped), the link in the following sibling holds
    the surname and the relative url of the biography page.

    returns a list of `person` tuples with absolute biography urls.
    """
    name_table = soup.find('table', class_='tbl width100')

    people = []
    for entry in name_table.find_all('tr')[1:]:
        given_cell = entry.td
        givenname = given_cell.string.lstrip()
        # the surname is the text of the biography link in the next sibling
        surname_link = given_cell.next_sibling.a
        people.append(person(
            givenname,
            surname_link.string,
            'http://www.pdavis.nl/' + surname_link['href']))
    return people


# debug/test data: name list soup -> names, urls
# uses the `soup` of the first name list page prepared in the debug code further up
name_data = parse_name_url(soup)
name_data  # bare expression so the notebook displays the parsed list

[person(givenname='Charles', surname='Adam', url='http://www.pdavis.nl/ShowBiog.php?id=1'),
 person(givenname='John', surname='Adams', url='http://www.pdavis.nl/ShowBiog.php?id=114'),
 person(givenname='Richard', surname='Adams', url='http://www.pdavis.nl/ShowBiog.php?id=1451'),
 person(givenname='Edward Stanley', surname='Adeane', url='http://www.pdavis.nl/ShowBiog.php?id=1519'),
 person(givenname='John de Courcy Andrew', surname='Agnew', url='http://www.pdavis.nl/ShowBiog.php?id=1150'),
 person(givenname='William Cornwallis', surname='Aldham', url='http://www.pdavis.nl/ShowBiog.php?id=696'),
 person(givenname='Pelham', surname='Aldrich', url='http://www.pdavis.nl/ShowBiog.php?id=1472'),
 person(givenname='Robert Dawes', surname='Aldrich', url='http://www.pdavis.nl/ShowBiog.php?id=1132'),
 person(givenname='John Williams', surname='Aldridge', url='http://www.pdavis.nl/ShowBiog.php?id=365'),
 person(givenname='John Hobhouse Inglis', surname='Alexander', url='http://www.pdavis.nl/ShowBi

# extracting data from biography pages

each officer biography page contains all its information in one big table that spans several topics. for easier processing, we first read the table and split it into discrete chunks of rows, each spanning one topic: biographical data, rank data, and stationing data.

## personal data

**input**:

* table snippet

**output**:

* date of birth
* date of death

## rank data

**input**:

* table snippet

**output**:

* list of namedtuples
  * date of seniority
  * attained rank

## stationing data

stationing statements are usually of the form ‘(rank) in (ship name)’, so we extract them with a regex. most ship names are links, but not all. using a regex allows us to also catch stationing statements that do not contain a link.

**input**:

* table snippet

**output**:

* list of namedtuples
  * begin of stationing
  * end of stationing
  * ship name
  * ship id (taken from the ship link; empty if the ship name is not linked)

In [4]:
import re

def _split_rowgroups(bio_rows):
    """split table rows into groups, closing a group after every spacer row."""
    bio_rowgroups = []
    bio_rowgroup = []

    for row in bio_rows:
        bio_rowgroup.append(row)
        first_cell = row.td
        # a row without any <td> cannot be a spacer row; keep it in the
        # current group instead of failing on attribute access on None
        if first_cell is None:
            continue
        # a spacer row is a single cell spanning all 4 columns whose text
        # starts with whitespace; it ends the current topical chunk
        if first_cell.get('colspan') == '4' and re.match(r'\s', first_cell.text):
            bio_rowgroups.append(bio_rowgroup)
            bio_rowgroup = []
    # whatever follows the last spacer row forms the final group
    bio_rowgroups.append(bio_rowgroup)
    return bio_rowgroups


def get_bio_tables(url):
    """from a biography page url, read table and split into topical chunks.

    fetches `url`, looks up the table with class 'tbl width100' and returns
    its rows as a list of row groups (lists of <tr> tags). a group ends with
    the spacer row that closes it; the rows after the last spacer row are
    returned as the final group, which may be empty.
    """
    bio_request = requests.get(url)
    bio_soup = BeautifulSoup(bio_request.content, 'lxml')

    bio_table = bio_soup.find('table', class_='tbl width100')
    bio_rows = bio_table.find_all('tr')

    return _split_rowgroups(bio_rows)

# debug/test data: extract biography table snippets for first name in name list (charles adam)
blah = name_data[0]  # first `person` tuple parsed from the name list
bio_rowgroups = get_bio_tables(blah.url)  # fetches the biography page over http
bio_rowgroups  # bare expression so the notebook displays the row groups

[[<tr><td style="border:none;"> </td><td style="border:none;"> </td><td style="border:none;"> </td><td style="border:none;"> </td></tr>,
  <tr><td class="tblfil" colspan="3" style="border-right:none;"><b> Charles Adam R.N.</b></td><td class="tblfil" style="border-left:none;"><a href="OfficerExplanation.htm">Explanation</a></td></tr>,
  <tr><td colspan="4">Son of the Rt Hon. William Adam, Lord Chief Commissioner of the Jury Court of Scotland.</td></tr>,
  <tr><td colspan="4"> </td></tr>],
 [<tr><td class="tblfil2 right">Date (from)</td><td class="tblfil2 right">(Date to)</td><td class="tblfil2" colspan="2">Personal</td>
  </tr>,
  <tr><td class="right">6 October 1780</td><td> </td><td colspan="2">Born.</td></tr>,
  <tr><td class="right">20 May 1831</td><td class="right">6 July 1841</td><td colspan="2">M.P. for Clackmannan and Kinross</td></tr>,
  <tr><td class="right">10 January 1835</td><td> </td><td colspan="2">K.C.B. (Knight Commander of the Bath).</td></tr>,
  <tr><td class="right">

In [17]:
def format_date(datestring):
    """format a datestring into an underscore-separated, numeric-month datestring.

    every whitespace character becomes '_', english month names are replaced
    by their month number and parentheses are removed, e.g.
    '6 October 1780' -> '6_10_1780'. text that is not a month name is left
    as it is; an empty string stays empty.
    """
    monthnames = {
        'January': '1',
        'February': '2',
        'March': '3',
        'April': '4',
        'May': '5',
        'June': '6',
        'July': '7',
        'August': '8',
        'September': '9',
        'October': '10',
        'November': '11',
        'December': '12'
    }
    # raw strings: '\s' in a plain string literal is an invalid escape sequence
    datestring = re.sub(r'\s', '_', datestring)
    for name, number in monthnames.items():
        datestring = datestring.replace(name, number)
    return re.sub(r'[()]', '', datestring)


def born_die(bio_born_die):
    """from a biography table snippet, extract date of birth and death.

    `bio_born_die` is an iterable of table rows. for every row with at least
    three cells, the third cell is checked for a 'Born'/'Died' style entry
    and the first cell is taken as the matching date. if several rows match,
    the last one wins.

    returns a tuple (born, died) of dates formatted with format_date(); a
    date that was not found is returned as an empty string.
    """
    date_born = ''
    date_died = ''
    for row in bio_born_die:
        cells = row.find_all('td')
        # the event text is read from the third cell, so rows with fewer than
        # three cells (e.g. spacer rows) carry nothing to read and are skipped
        if len(cells) < 3:
            continue
        event = cells[2].text
        if re.match('.orn', event):
            date_born = cells[0].text
        if re.match('.ied', event):
            date_died = cells[0].text
    return (format_date(date_born), format_date(date_died))

def process_ranks(table_soup):
    """from a rank table snippet, parse ranks and their seniority dates.

    the first and the last row of `table_soup` are dropped before parsing.
    for each remaining row, the text of the first cell is taken as the date
    and the text of the cell with colspan 3 as the rank title.

    returns a list of `rank` namedtuples (date, title), with dates formatted
    by format_date(). returns None if no rows remain or if the first
    remaining row contains 'No rank data'.
    """
    ranks = table_soup[1:-1]
    # nothing left to parse (snippet too short) or an explicit "no data" row
    if not ranks or 'No rank data' in ranks[0].text:
        return None
    rank = namedtuple('rank', ['date', 'title'])
    return [
        rank(format_date(row.td.text), row.find('td', {'colspan': '3'}).text)
        for row in ranks
    ]

def process_stations(table_soup):
    """from a stationing table snippet, extract stationing on ships and return with dates.

    the first row of `table_soup` is skipped. for each remaining row, the
    first two cells are read as begin and end date and the third cell is
    searched (case-insensitively) for a statement of the form
    '(rank) [(acting) ]in (ship name)'. text inside <sup> tags is ignored.
    rows without such a statement are skipped.

    returns a list of `stationing` namedtuples (datefrom, dateto, station,
    station_id). dates are formatted with format_date(); station_id is taken
    from the first 'ShowShip...' link in the cell and is '' if there is none.
    """
    stationing = namedtuple('stationing', ['datefrom', 'dateto', 'station', 'station_id'])
    # every entry needs its own comma: adjacent string literals are silently
    # concatenated, which previously merged 'Volunteer' and 'Mate' into one
    # alternative that matched neither rank
    ranks = [
        'cadet',
        'Volunteer',
        'Mate',
        'Midshipman',
        'Lieutenant',
        'Commander',
        'Captain',
        'Admiral',
    ]
    # group 3 is the ship name: everything after 'in ' up to the next '(' or ','
    rankpattern = r'(' + '|'.join(ranks) + r') (\(acting\) )?in ([^(,]+)'
    result_list = []

    for row in table_soup[1:]:
        datefrom = format_date(row.td.text)
        dateto = format_date(row.td.next_sibling.text)
        station_td = row.td.next_sibling.next_sibling
        # join all text nodes of the cell, leaving out superscripts
        text = ''.join(piece for piece in station_td.find_all(text=True)
                       if piece.parent.name != 'sup')
        match = re.search(rankpattern, text, re.IGNORECASE)
        if match is None:
            continue
        station = match.group(3).rstrip()
        # not every ship name is a link; without a link there is no id
        ship_link = station_td.find('a', href=re.compile('ShowShip.+'))
        if ship_link is not None:
            station_id = ship_link['href'].replace('ShowShip.php?id=', '')
        else:
            station_id = ''
        result_list.append(stationing(datefrom, dateto, station, station_id))

    return result_list

# debug/test data: extract biographical data, rank progression, stationing of charles adam from table snippets
# row groups 1, 2 and 3 are passed as the personal, rank and stationing snippets respectively
print(born_die(bio_rowgroups[1]))
print(process_ranks(bio_rowgroups[2]))
print(process_stations(bio_rowgroups[3]))

('6_10_1780', '19_9_1853')
[rank(date='15_12_1790', title='Entered Navy'), rank(date='8_2_1798', title='Lieutenant'), rank(date='16_5_1798', title='Commander'), rank(date='12_6_1799', title='Captain'), rank(date='27_5_1825', title='Rear-Admiral'), rank(date='10_1_1837', title='Vice-Admiral'), rank(date='8_1_1848', title='Admiral')]
[stationing(datefrom='1799', dateto='_', station='Sybille', station_id='2463'), stationing(datefrom='23_5_1803', dateto='1805', station='Chiffonne', station_id='2391'), stationing(datefrom='27_8_1805', dateto='6_4_1810', station='Resistance', station_id='1977'), stationing(datefrom='1811', dateto='1813', station='Invincible', station_id='1627'), stationing(datefrom='16_5_1814', dateto='29_6_1814', station='Impregnable', station_id='1599'), stationing(datefrom='15_12_1814', dateto='7_2_1816', station='Royal Sovereign', station_id='2015'), stationing(datefrom='20_7_1821', dateto='25_5_1825', station='Royal Sovereign', station_id='2015')]


In [18]:
import pandas as pd

# actual use case
# iterate over all officer name lists, process each officer, collect one record per output row
# then build the dataframes in one go and write them to .csv
#
# records are collected in plain lists and turned into dataframes once at the
# end: adding rows to a dataframe one at a time gets slower as it grows and
# needs manual offset bookkeeping. the resulting frames have the same
# columns and 0..n-1 row labels as before.

name_records = []
rank_records = []
station_records = []
urls = ["http://www.pdavis.nl/SeaOfficers.php?page="+str(n) for n in range(1,9)]

for url_n, url in enumerate(urls):
    req = requests.get(url)
    cont = req.content
    soup = BeautifulSoup(cont, 'lxml')
    names = parse_name_url(soup)

    for i, name in enumerate(names):
        print('processing', name.givenname, name.surname)
        # id is built from the name list index and the position within that list
        person_id = 'p_'+str(url_n)+'_'+str(i)

        bio_tables = get_bio_tables(name.url)

        # row groups 1, 2, 3 are used as personal, rank and stationing data
        born, died = born_die(bio_tables[1])
        name_records.append([person_id,
                             name.givenname,
                             name.surname,
                             born,
                             died,
                             name.url])

        # process_ranks() returns None when there is no rank data
        ranks = process_ranks(bio_tables[2])
        for rank in ranks or []:
            rank_records.append([person_id,
                                 rank.date,
                                 rank.title])

        stations = process_stations(bio_tables[3])
        for station in stations:
            station_records.append([person_id,
                                    station.datefrom,
                                    station.dateto,
                                    station.station,
                                    station.station_id])

names_df = pd.DataFrame(name_records, columns=["id", "forename", "surname", "born", "died", "link"])
ranks_df = pd.DataFrame(rank_records, columns=["id", "date", "rank"])
stations_df = pd.DataFrame(station_records, columns=["id", "date_from", "date_to", "station", "station_id"])

names_df.to_csv("names.csv", encoding='utf-8')
ranks_df.to_csv("ranks.csv", encoding='utf-8')
stations_df.to_csv("stations.csv", encoding='utf-8')

processing Charles Adam
processing John Adams
processing Richard Adams
processing Edward Stanley Adeane
processing John de Courcy Andrew Agnew
processing William Cornwallis Aldham
processing Pelham Aldrich
processing Robert Dawes Aldrich
processing John Williams Aldridge
processing John Hobhouse Inglis Alexander
processing John Richard Alexander
processing Prince Alfred
processing Bird Allen
processing Henry Murray Edward Allen
processing William Allen
processing Constantine O'Donnel Allingham
processing James Anderson
processing Warren Hastings Anderson
processing Charles William Andrew
processing William Henry Annesley
processing Charles Vernon Anson
processing Talavera Vernon Anson
processing Benjamin Aplin
processing Elphinstone D'Oyly D'Auvergne Aplin
processing William Arlett
processing Whaley Armitage
processing William Armytage
processing Richard Arthur
processing William Arthur
processing Charles John Austen
processing Francis William Austen
processing Horatio Thomas Austin
pr