# Process BHO topographical dictionaries

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup as bs
import lxml
import glob
import re
import json
import utils
import pathlib
import pandas as pd
from difflib import SequenceMatcher

In [None]:
# Root directory of the BHO data. It must contain the "England/", "Wales/",
# and "Scotland/" subdirectories.
path_to_bho = "/resources/bho/"

### Parse BHO topographical dictionaries

In [None]:
# After splitting the body on `<section id="`, the chunks of interest start
# with an id of the form s<digits> immediately followed by `">`.
regex_section = r"s[0-9]+\">.*"
section_pattern = re.compile(regex_section)

# Reports with these titles are skipped entirely.
skipped_report_titles = {
    "Preface", "Index", "Appendix", "Elizabeth",
    "Appendix 2", "Corrections and additions",
    "Addenda", "Appendix 1", "Marginal references",
    "Tables explaining numbers", "Corrigenda",
    "Introduction",
}


def _is_place_entry(title, text, report_title):
    """Return True if a cleaned section should be stored as an entry.

    A section is rejected (wrong header or section title) when any of the
    following holds:

    * the title contains "TOPOGRAPHICAL";
    * the report title has no "-", the title is not a "—See" redirection,
      and the stripped title ends with ".";
    * the text is empty;
    * the title is at most one character long;
    * the stripped title is exactly "Ecclesiastical jurisdiction".
    """
    if "TOPOGRAPHICAL" in title:
        return False
    if ("-" not in report_title
            and "—See" not in title
            and title.strip().endswith(".")):
        return False
    if not text or len(title) <= 1:
        return False
    return title.strip() != "Ecclesiastical jurisdiction"


entry_id = 0
bho_dict = dict()
# glob returns paths in arbitrary, filesystem-dependent order; sorting makes
# the entry ids assigned below reproducible from one run/machine to another.
for path in sorted(glob.glob(path_to_bho + "*/*")):
    # pathlib splits on the OS-native separator, unlike path.split("/").
    xml_path = pathlib.Path(path)
    lcountry = xml_path.parent.name

    # Read the whole XML file at once.
    with open(path, "r", encoding="ISO-8859-1") as file:
        content = file.read()

    bs_content = bs(content, "lxml")
    report_title = bs_content.html.body.find("title").getText()
    if report_title in skipped_report_titles:
        continue

    sectext = str(bs_content.html.body)
    for s in sectext.split("<section id=\""):
        s = s.strip()
        if not section_pattern.match(s):
            continue
        title, text = utils.clean_section(s)
        if not _is_place_entry(title, text, report_title):
            continue
        # Drop any markup trailing the name, e.g.:
        # Stirling<figure graphic="\images\fig96.gif" id="fig96" ...>...</figure>
        title = title.split("<", 1)[0]
        entry_id += 1
        bho_dict[entry_id] = {
            "report_title": report_title,
            "place_name": title,
            "description": text,
            "xmlfile": xml_path.name,
            "country": lcountry,
        }

In [None]:
# Persist the parsed entries so the cleaning step can reload them from disk.
serialized_entries = json.dumps(bho_dict)
with open('bho_parsed.json', 'w') as out_file:
    out_file.write(serialized_entries)

### Clean parsed topographical dictionaries

In [None]:
# Column accumulators: each list receives one value per parsed entry, in the
# same order, and becomes one column of the final dataframe.
queries = []
incrid = []
title = []
toponyms = []
contextwords = []
redirected = []
bhocontent = []
country = []
xmlfile = []
report_title = []

# Load the parsed entries; the file is only needed for this read.
with open('bho_parsed.json') as f:
    my_dict = json.load(f)

# Process entries ordered by place name. sorted() is stable, so entries with
# the same name keep the order they have in the JSON file.
sorted_entries = sorted(my_dict.items(), key=lambda item: item[1]['place_name'])

for row_id, (_, dvalue) in enumerate(sorted_entries):
    place_name = dvalue["place_name"]
    description = dvalue["description"]

    # Titles containing "—See " are redirections: they are kept as rows but
    # get no toponyms, context words or content. Only the other entries are
    # preprocessed.
    is_redirected = "—See " in place_name
    if is_redirected:
        toponym_altnames, context_words, content = [], [], []
    else:
        toponym_altnames, context_words = utils.preprocess_title(place_name, dvalue["country"])
        content = utils.process_content(description)

    incrid.append(row_id)
    title.append(place_name)
    toponyms.append(toponym_altnames)
    contextwords.append(context_words)
    redirected.append(is_redirected)
    bhocontent.append(content)
    country.append(dvalue["country"])
    xmlfile.append(dvalue["xmlfile"])
    report_title.append(dvalue["report_title"])

    # Every alternate name is a candidate query for toponym matching.
    queries.extend(toponym_altnames)

# Prepare queries as input for DeezyMatch. Deduplicate and sort them so the
# queries are passed on in the same order on every run (plain set iteration
# order varies between runs). Assumes toponyms are strings.
queries = sorted(set(queries))
outputFolder = "../toponym_matching/toponyms/"
pathlib.Path(outputFolder).mkdir(parents=True, exist_ok=True)
utils.format_for_candranker(outputFolder + "bho_queries", queries)

# Store structured dataset as csv
bhodf = pd.DataFrame({
    "id": incrid,
    "title": title,
    "toponyms": toponyms,
    "contextwords": contextwords,
    "redirected": redirected,
    "content": bhocontent,
    "country": country,
    "xmlfile": xmlfile,
    "report_title": report_title,
})

bhodf.to_csv("bho.csv", index=False)

In [None]:
# Write one text file per non-redirected entry, holding its title and the
# first element of its processed content, for annotation.
annotations_folder = "../annotations/bho_content/"
pathlib.Path(annotations_folder).mkdir(parents=True, exist_ok=True)
for _, row in bhodf.iterrows():
    # Redirections carry no content, so there is nothing to annotate.
    if row["redirected"]:
        continue
    # File name is <country><id>.txt
    annotation_path = annotations_folder + row["country"] + str(row["id"]) + ".txt"
    # Explicit UTF-8: the text may contain non-ASCII characters, which the
    # platform's default encoding is not guaranteed to be able to encode.
    with open(annotation_path, "w", encoding="utf-8") as fw:
        fw.write("[TITLE] " + row["title"])
        fw.write("\n[1stPARAGRAPH] " + row["content"][0])