# Process BHO topographical dictionaries

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup as bs
import lxml
import glob
import re
import json
import utils
import pandas as pd

### Parse BHO topographical dictionaries

In [None]:
# After splitting the page body on `<section id="`, a chunk is kept only when
# it begins with an id of the form s<digits> immediately followed by `">`.
regex_section = r"s[0-9]+\">.*"

# Files whose <title> text is one of these are skipped entirely.
skipped_titles = ["Preface", "Index", "Appendix", "Elizabeth", "Appendix 2"]

entry_id = 0
bho_dict = dict()
for path in glob.glob("/resources/bho/*/*"):
    # Load the whole XML file as one string.
    with open(path, "r", encoding="ISO-8859-1") as file:
        content = file.read()

    bs_content = bs(content, "lxml")
    report_title = bs_content.html.body.find("title").getText()
    if report_title in skipped_titles:
        continue

    # Work on the serialised <body> and cut it at every section opening tag.
    sectext = str(bs_content.html.body)
    sections = sectext.split("<section id=\"")
    for s in sections:
        s = s.strip()
        if not re.match(regex_section, s):
            continue

        title, text = utils.clean_section(s)
        if not text:
            # Sections for which no text comes back are not recorded.
            continue

        # Ids are a running counter over all kept sections, starting at 1.
        entry_id += 1
        bho_dict[entry_id] = {
            "report_title": report_title,
            "place_name": title,
            "description": text,
            "filepath": path,
        }

In [None]:
# Persist the parsed entries as JSON so the cleaning step can reload them.
with open('bho_parsed.json', 'w') as out_file:
    out_file.write(json.dumps(bho_dict))

### Clean parsed topographical dictionaries

In [None]:
# Parallel per-entry columns; position i in every list describes the same
# entry. They are assembled into the output table at the end of the cell.
queries = []
incrid = []
title = []
toponyms = []
contextwords = []
redirected = []
bhocontent = []

# Only the load needs the file handle; the processing happens after it closes.
with open('bho_parsed.json') as f:
    my_dict = json.load(f)

# NOTE(review): never used in this cell; kept in case a later cell relies on it.
new_dict = dict()

for k, entry in my_dict.items():
    place_name = entry["place_name"]
    description = entry["description"]

    # A title containing "—See " is flagged as a redirection and is NOT
    # processed further: its toponym, context-word and content lists stay
    # empty. Every other entry goes through the title/content helpers.
    is_redirected = "—See " in place_name
    if is_redirected:
        toponym_altnames = []
        context_words = []
        content = []
    else:
        toponym_altnames, context_words = utils.preprocess_title(place_name)
        content = utils.process_content(description)

    incrid.append(k)
    title.append(place_name)
    toponyms.append(toponym_altnames)
    contextwords.append(context_words)
    redirected.append(is_redirected)
    bhocontent.append(content)

    # Every alternate name becomes a candidate-ranker query.
    queries.extend(toponym_altnames)

# De-duplicate the queries. Sorting makes the emitted file identical from one
# run to the next; plain set iteration order for strings is not stable across
# interpreter runs.
queries = sorted(set(queries))
utils.format_for_candranker("../toponym_matching/gazetteers/bho_queries", queries)

# One row per parsed entry; the dict order fixes the CSV column order.
bhodf = pd.DataFrame({
    "id": incrid,
    "title": title,
    "toponyms": toponyms,
    "contextwords": contextwords,
    "redirected": redirected,
    "content": bhocontent,
})
bhodf.to_csv("bho.csv", index=False)