# Process BHO topographical dictionaries

In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup as bs
import lxml
import glob
import re
import json
import utils
import pathlib
import pandas as pd

In [2]:
# Root directory of the BHO topographical dictionaries (XML files). It must
# contain the "England/", "Wales/", and "Scotland/" subdirectories.
path_to_bho = "../materials/bho_topographical_dictionaries_xml/"

### Parse BHO topographical dictionaries

In [3]:
# Parse every BHO XML report into a flat dictionary of entries.
#
# Each file matched by "<path_to_bho>/*/*" is parsed with BeautifulSoup, its
# serialised body is split on the `<section id="` marker, and every section
# that yields a non-empty text and a title longer than one character is stored
# in `bho_dict` under an incrementing integer id.

# After splitting on `<section id="`, a real section starts with the rest of
# its id attribute: "s<digits>", the closing quote and the closing bracket.
regex_section = r"s[0-9]+\">.*"
section_pattern = re.compile(regex_section)

# Reports whose title is in this set are skipped (they appear to be front or
# back matter rather than place entries).
skipped_report_titles = {
    "Preface",
    "Index",
    "Appendix",
    "Elizabeth",
    "Appendix 2",
    "Corrections and additions",
    "Addenda",
    "Appendix 1",
    "Marginal references",
    "Tables explaining numbers",
    "Corrigenda",
    "Introduction",
}

entry_id = 0
bho_dict = dict()
# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps
# the entry ids stable from one run (or machine) to the next.
for path in sorted(glob.glob(path_to_bho + "*/*")):
    # pathlib handles both "/" and "\\" separators, unlike splitting on "/".
    xml_path = pathlib.Path(path)

    # Read the whole XML file at once; the files are decoded as ISO-8859-1.
    with open(path, "r", encoding="ISO-8859-1") as file:
        content = file.read()

    bs_content = bs(content, "lxml")
    report_title = bs_content.html.body.find("title").getText()
    if report_title in skipped_report_titles:
        continue

    # The country is the name of the directory that holds the file.
    lcountry = xml_path.parent.name
    sectext = str(bs_content.html.body)
    for s in sectext.split("<section id=\""):
        s = s.strip()
        # Ignore chunks that do not start with a section id (e.g. the text
        # preceding the first section).
        if not section_pattern.match(s):
            continue
        title, text = utils.clean_section(s)
        # Keep only sections with some text and a title longer than one
        # character.
        if text and len(title) > 1:
            entry_id += 1
            bho_dict[entry_id] = {"report_title": report_title,
                                  "place_name": title,
                                  "description": text,
                                  "xmlfile": xml_path.name,
                                  "country": lcountry}

In [4]:
# Persist the parsed entries to disk as JSON.
with open("bho_parsed.json", mode="w") as json_out:
    json.dump(bho_dict, fp=json_out)

### Clean parsed topographical dictionaries

In [5]:
# Clean the parsed entries and export them.
#
# Reloads "bho_parsed.json", walks the entries sorted by place name and builds
# one output row per entry. The rows are written to "bho.csv", and the unique
# toponyms collected along the way are written as candidate-ranker queries.

# One list per output column; all of them grow in lockstep, one item per entry.
queries = []
incrid = []
title = []
toponyms = []
contextwords = []
redirected = []
bhocontent = []
country = []
xmlfile = []
report_title = []

# Only the load needs the file handle; close it before processing.
with open('bho_parsed.json') as f:
    my_dict = json.load(f)

# (key, entry) pairs ordered by the entry's place name.
sorted_keys = sorted(my_dict.items(), key=lambda item: item[1]['place_name'])

for row_id, (_, dvalue) in enumerate(sorted_keys):
    place_name = dvalue["place_name"]

    # Titles containing "—See " are redirections to another entry: they are
    # kept as rows, flagged as redirected, with empty toponyms, context words
    # and content. Only the other entries are processed.
    is_redirected = "—See " in place_name
    if is_redirected:
        toponym_altnames = []
        context_words = []
        content = []
    else:
        toponym_altnames, context_words = utils.preprocess_title(place_name)
        content = utils.process_content(dvalue["description"])

    incrid.append(row_id)
    title.append(place_name)
    toponyms.append(toponym_altnames)
    contextwords.append(context_words)
    redirected.append(is_redirected)
    bhocontent.append(content)
    country.append(dvalue["country"])
    xmlfile.append(dvalue["xmlfile"])
    report_title.append(dvalue["report_title"])

    # Every alternate name becomes a query for the candidate ranker.
    queries.extend(toponym_altnames)

# De-duplicate the queries. Sorting makes the result identical across runs,
# whereas the iteration order of a set of strings varies with hash
# randomisation. Assumes the toponyms are strings -- TODO confirm against
# utils.preprocess_title.
queries = sorted(set(queries))
outputFolder = "../toponym_matching/toponyms/"

pathlib.Path(outputFolder).mkdir(parents=True, exist_ok=True)

# The output format is defined by utils.format_for_candranker.
utils.format_for_candranker(outputFolder + "bho_queries", queries)

bhodf = pd.DataFrame()
bhodf["id"] = incrid
bhodf["title"] = title
bhodf["toponyms"] = toponyms
bhodf["contextwords"] = contextwords
bhodf["redirected"] = redirected
bhodf["content"] = bhocontent
bhodf["country"] = country
bhodf["xmlfile"] = xmlfile
bhodf["report_title"] = report_title

bhodf.to_csv("bho.csv", index=False)

In [6]:
# Display the first rows of the cleaned dataframe as a quick sanity check.
bhodf.head()

Unnamed: 0,id,title,toponyms,contextwords,redirected,content,country,xmlfile,report_title
0,0,Ab-Kettleby (St. James),[Ab-Kettleby],[St. James],False,"[AB-KETTLEBY (St. James), a parish, in the uni...",England,50742.xml,Abingdon - Ackton
1,1,"Abbas-Combe, or Temple-Combe (St. Mary)","[Temple-Combe, Abbas-Combe]",[St. Mary],False,"[ABBAS-COMBE, or Temple-Combe (St. Mary), a pa...",England,50741.xml,Abbas-Combe - Aberystwith
2,2,"Abberbury, county Salop.—See Alberbury.",[],[],True,[],England,50741.xml,Abbas-Combe - Aberystwith
3,3,Abberley (St. Michael),[Abberley],[St. Michael],False,"[ABBERLEY (St. Michael), a parish, in the unio...",England,50741.xml,Abbas-Combe - Aberystwith
4,4,Abbertoft,[Abbertoft],[],False,"[ABBERTOFT, a hamlet, in the parish of Willoug...",England,50741.xml,Abbas-Combe - Aberystwith
