In [169]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [170]:
seed_url = 'http://www.census.nationalarchives.ie/pages/1911/Tyrone/Tullyclunagh/'
request = requests.get(seed_url)
soup = BeautifulSoup(request.content, "html.parser")

townland_url_extensions = []
seed_url_extension = seed_url.replace(
    'http://www.census.nationalarchives.ie', '')
for link in soup.find_all('a'):
    if link.get('href').startswith(seed_url_extension):
        if link.get('href') != seed_url_extension:
            townland_url_extensions.append(link.get('href'))
            
townland_url_extensions

['/pages/1911/Tyrone/Tullyclunagh/Aghnamoe/',
 '/pages/1911/Tyrone/Tullyclunagh/Cornamucklagh/',
 '/pages/1911/Tyrone/Tullyclunagh/Coyagh/',
 '/pages/1911/Tyrone/Tullyclunagh/Coyagh_Glebe/',
 '/pages/1911/Tyrone/Tullyclunagh/Cranny/',
 '/pages/1911/Tyrone/Tullyclunagh/Drumderg_Glebe/',
 '/pages/1911/Tyrone/Tullyclunagh/Drummallard/',
 '/pages/1911/Tyrone/Tullyclunagh/Glengeen/',
 '/pages/1911/Tyrone/Tullyclunagh/Lissaneden/',
 '/pages/1911/Tyrone/Tullyclunagh/Meenagar/',
 '/pages/1911/Tyrone/Tullyclunagh/Polfore/',
 '/pages/1911/Tyrone/Tullyclunagh/Rahony/',
 '/pages/1911/Tyrone/Tullyclunagh/Rathkeeran_beg/',
 '/pages/1911/Tyrone/Tullyclunagh/Tattycor/',
 '/pages/1911/Tyrone/Tullyclunagh/Tullyclunagh/']

In [171]:
house_urls = []

for url_extension in townland_url_extensions:
    request = requests.get('http://www.census.nationalarchives.ie' 
                           + url_extension)
    soup = BeautifulSoup(request.content, "html.parser")
    
    for link in soup.find_all('a'):
        if link.get('href').startswith(url_extension):
            if link.get('href') != (url_extension):
                house_urls.append('http://www.census.nationalarchives.ie' +
                                  link.get('href'))
                
house_urls = list(set(house_urls))

house_urls

['http://www.census.nationalarchives.ie/pages/1911/Tyrone/Tullyclunagh/Aghnamoe/870688/',
 'http://www.census.nationalarchives.ie/pages/1911/Tyrone/Tullyclunagh/Aghnamoe/870703/',
 'http://www.census.nationalarchives.ie/pages/1911/Tyrone/Tullyclunagh/Polfore/870799/',
 'http://www.census.nationalarchives.ie/pages/1911/Tyrone/Tullyclunagh/Aghnamoe/870699/',
 'http://www.census.nationalarchives.ie/pages/1911/Tyrone/Tullyclunagh/Drummallard/870747/',
 'http://www.census.nationalarchives.ie/pages/1911/Tyrone/Tullyclunagh/Meenagar/870793/',
 'http://www.census.nationalarchives.ie/pages/1911/Tyrone/Tullyclunagh/Aghnamoe/870701/',
 'http://www.census.nationalarchives.ie/pages/1911/Tyrone/Tullyclunagh/Drumderg_Glebe/870732/',
 'http://www.census.nationalarchives.ie/pages/1911/Tyrone/Tullyclunagh/Rahony/870821/',
 'http://www.census.nationalarchives.ie/pages/1911/Tyrone/Tullyclunagh/Tullyclunagh/870838/',
 'http://www.census.nationalarchives.ie/pages/1911/Tyrone/Tullyclunagh/Coyagh_Glebe/870722

In [172]:
residents = []

for url in house_urls:
    split_url = url.split('/')
    townland_or_street = split_url[-3]
    house_id = split_url[-2]
    
    request = requests.get(url)
    soup = BeautifulSoup(request.content, "html.parser")
    
    heading = (str(soup.find_all('h1')[0]))
    trimmed_heading = heading.replace('<h1>Residents of a house ', '')
    split_heading = trimmed_heading.split(' ', 1)
    house = split_heading[0]
    house

    table = soup.find('table')
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    
    resident_number = 1
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        cols.insert(0, resident_number)
        cols.insert(0, house)
        cols.insert(0, townland_or_street)
        cols.insert(0, house_id)
        residents.append([ele for ele in cols])
        resident_number += 1

column_headers_1901 = ['house_id', 'townland_or_street', 'house',
                  'resident_number', 'surname', 'forename', 'age', 'sex', 
                  'relation_to_head', 'religion', 'birthplace', 'occupation', 
                  'literacy', 'irish_language', 'marital_status', 
                  'specified_illnesses']

column_headers_1911 = ['house_id', 'townland_or_street', 'house',
                  'resident_number', 'surname', 'forename', 'age', 'sex', 
                  'relation_to_head', 'religion', 'birthplace', 'occupation', 
                  'literacy', 'irish_language', 'marital_status', 
                  'specified_illnesses', 'years_married', 'children_born',
                  'children_living']

year = seed_url.split('/')[-4]
county = seed_url.split('/')[-3]
ded = seed_url.split('/')[-2]

if year == '1901':
    seed_DED_df = pd.DataFrame(residents, columns=column_headers_1901)

if year == '1911':
    seed_DED_df = pd.DataFrame(residents, columns=column_headers_1911)

seed_DED_df.insert(1, 'ded', ded)
seed_DED_df.insert(1, 'county', county)

seed_DED_df = seed_DED_df.sort_values(by=['house_id', 'resident_number'])
seed_DED_df = seed_DED_df.reset_index(drop='True')

seed_DED_df

Unnamed: 0,house_id,county,ded,townland_or_street,house,resident_number,surname,forename,age,sex,...,religion,birthplace,occupation,literacy,irish_language,marital_status,specified_illnesses,years_married,children_born,children_living
0,870683,Tyrone,Tullyclunagh,Aghnamoe,1,1,Kelly,Francis,33,Male,...,R Catholic,Tyrone,Farmer,Read write,English,Married,-,,-,-
1,870683,Tyrone,Tullyclunagh,Aghnamoe,1,2,Kelly,Ellie,30,Female,...,R Catholic,Tyrone,-,Read write,English,Married,-,,-,-
2,870683,Tyrone,Tullyclunagh,Aghnamoe,1,3,Creighton,Pat,22,Male,...,R Catholic,Tyrone,Farmer Servant,Read write,English,Single,-,-,-,-
3,870684,Tyrone,Tullyclunagh,Aghnamoe,2,1,Patterson,James,73,Male,...,Presbyterian,Co Tyrone,Retired Farmer,Cannot read,-,Widower,-,-,-,-
4,870685,Tyrone,Tullyclunagh,Aghnamoe,3,1,Lynch,Joseph,67,Male,...,Roman Catholic,Co Fermanagh,Labourer,Cannot write,English,Widow,-,19,-,-
5,870686,Tyrone,Tullyclunagh,Aghnamoe,4,1,Patterson,James,55,Male,...,Presbyterian,Co Tyrone,Farmer,Read only,-,Married,-,-,-,-
6,870686,Tyrone,Tullyclunagh,Aghnamoe,4,2,Patterson,Mary Anne,40,Female,...,Presbyterian,Co Tyrone,-,Read and write,-,Married,-,15,7,7
7,870686,Tyrone,Tullyclunagh,Aghnamoe,4,3,Patterson,Samuel Thomas,15,Male,...,Presbyterian,Co Tyrone,-,Read and write,-,Single,-,-,-,-
8,870686,Tyrone,Tullyclunagh,Aghnamoe,4,4,Patterson,Robert Andrew,13,Male,...,Presbyterian,Co Tyrone,Scholar,Read and write,-,Single,-,-,-,-
9,870686,Tyrone,Tullyclunagh,Aghnamoe,4,5,Patterson,Joseph,11,Male,...,Presbyterian,Co Tyrone,Scholar,Read and write,-,Single,-,-,-,-


In [173]:
seed_DED_df.to_csv(year + '_' + county + '_' + ded + '_raw.csv')