In [None]:
import pandas as pd
import itertools
import string
import re
import json
import numpy as np

In [None]:
# Company lookup, read from a JSON file (per the original note: taken from
# the introduction of Quicks). Only its keys are used further down.
with open('companies.txt') as fh:
    companies = json.load(fh)

# Station-type abbreviations (per the original note: from the Quicks intro).
# Repeated entries are kept exactly as they were listed.
stntypes = [
    "AOT", "CLO", "CO", "CO N", "CO TT", "ND",
    "NG", "NON-TT", "OP", "P", "REOP", "TT",
    "HL", "LL", "HB", "HBA", "NG", "TT",
]

# Most common last tokens of station names. Repeated entries are kept
# exactly as they were listed.
keywords = [
    "PLATFORM", "PLATFORMS", "HALT", "INTERNATIONAL", "JUNCTION", "CAMP",
    "CENTRAL", "ROAD", "JUNCTION", "BRIDGE", "STREET", "PARK",
    "LANE", "HILL", "COLLIERY", "TOWN", "GREEN", "CENTRAL",
    "CROSSING", "NORTH", "LEVEL", "EAST", "DOCK", "WEST",
    "GATE", "CROSS", "HALT", "SOUTH", "MILL", "END",
    "SIDING", "HALL", "HOUSE",
]

In [None]:
def process_altnames(altnames, mainst, subst):
    """Turn the raw name matches of one description into clean alternate names.

    Parameters
    ----------
    altnames : sequence
        At least five lists of strings, in this order: replaced-by names,
        previous names, alternative names, added tokens, dropped tokens.
    mainst : str
        Name of the main station.
    subst : str
        Name of the substation.

    Returns
    -------
    list of str
        Alternate names with single-letter initials expanded, without
        duplicates and in a deterministic order (order of first appearance).
        Names shorter than three characters, names equal to ``mainst`` or
        ``subst``, and names found in the module-level ``companies`` keys,
        ``stntypes`` or ``keywords`` are left out.
    """
    repl, prev, altrn, added, dropped = (altnames[i] for i in range(5))

    # Add or drop tokens from main or sub station:
    # Often the description mentions that a token has been added to
    # or dropped from the station name, e.g.
    # "op 1 January 1857; JUNCTION added 1 April 1864"
    # For every such token we build the station name with the token appended
    # (when the name does not contain it) or removed (when it does).
    modaltnames = []
    for token in list(added) + list(dropped):
        # Escape the token: extracted names may contain regex metacharacters,
        # which would otherwise raise re.error or be misread as a pattern.
        token_re = r"\b%s\b" % re.escape(token)
        for station in (mainst, subst):
            if token not in station:
                modaltnames.append(station + " " + token)
            else:
                modaltnames.append(re.sub(token_re, "", station))
    # Order-preserving de-duplication keeps the result reproducible
    # (a set would order strings differently from run to run).
    modaltnames = list(dict.fromkeys(modaltnames))
    name_groups = list(altnames)
    name_groups.append(modaltnames)

    # Find full tokens:
    # Substations and alternate names are very often abbreviated, e.g.
    # "[...] became Y R & B P 1 November 1870 [...]",
    # where "Y R & B P" is short for "York Road and Battersea Park".
    # Full tokens are collected from the main station, the substation and
    # all alternate names, in that order; an initial is assumed to refer to
    # the first mentioned token that starts with it.
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    flt_altnames = [mainst, subst]
    flt_altnames += dict.fromkeys(name for group in name_groups for name in group)
    altname_tokens = []
    for name in flt_altnames:
        for word in name.translate(translator).split():
            if (re.match(r"\b[A-Z]{2,}\b", word)
                    and word not in ("AND", "FOR")
                    and word not in altname_tokens):
                altname_tokens.append(word)

    # Replace initials with token:
    proc_altnames = []
    for name in dict.fromkeys(list(repl) + list(prev) + list(altrn) + modaltnames):
        expanded = []
        for word in name.split():
            if word == "&":
                word = "AND"
            if len(word) != 1:
                expanded.append(word)
                continue
            candidates = [tok for tok in altname_tokens if tok.startswith(word)]
            unused = next((tok for tok in candidates if tok not in expanded), None)
            if unused is not None:
                expanded.append(unused)
            elif not candidates:
                # No known token starts with this character: keep it as is.
                expanded.append(word)
            # Otherwise every matching token is already part of this name and
            # the initial is left out (behaviour of the original code).
        # Joining the words also normalises the whitespace left behind by
        # dropped tokens.
        proc_altnames.append(" ".join(expanded))

    # Only accept an alternate name if it's not a keyword in Quicks, is not the
    # station name itself and has a length of at least three characters.
    # The exclusion set is built once instead of once per candidate.
    excluded = set(companies.keys()) | set(stntypes) | set(keywords)
    excluded.update((mainst, subst))
    return [name for name in dict.fromkeys(proc_altnames)
            if len(name) >= 3 and name not in excluded]

In [None]:
def parse_description(description, mainst, subst):
    """Extract alternate and referenced station names from a description.

    Parameters
    ----------
    description : str
        Free-text description of a station entry.
    mainst : str
        Name of the main station (passed on to ``process_altnames``).
    subst : str
        Name of the substation (passed on to ``process_altnames``).

    Returns
    -------
    tuple
        ``(proc_altnames, referenced)``: the result of ``process_altnames``
        for the names found in the description, and the list of names that
        follow "see" / "see under".
    """
    # -----------------------------------
    # Replace every character outside string.printable (this includes all
    # non-ASCII characters) with a space:
    description = ''.join(ch if ch in string.printable else ' ' for ch in description)

    # -----------------------------------
    # Remove notes (in parentheses), and text in square and curly brackets.
    # Each pattern excludes its own closing bracket, so a stray ")" inside
    # "[...]" or "{...}" does not prevent the removal.
    description = re.sub(r'\([^)]*?\)', '', description)
    description = re.sub(r'\[[^\]]*?\]', '', description)
    description = re.sub(r'\{[^}]*?\}', '', description)

    # -----------------------------------
    # Remove extra white spaces:
    description = re.sub(' +', ' ', description)

    # -----------------------------------
    # Capture alternate names: upper-case words, optionally joined by spaces,
    # "&", "'", "-" or one of the connecting words St/for/at/on/upon. The
    # connecting words are a real alternation; inside a character class they
    # would only admit single characters such as "(", "|", "t" or "o".
    re_altname = r"(\b[A-Z]+(?:[A-Z &'\-]|\b(?:St|for|at|on|upon)\b)*[A-Z])+\b"

    # Optional day, optional month (short or long form), four-digit year:
    re_date = (r"(?:(?:[0-9]{1,2})? *(?:Jan ?(?:uary)?|Feb ?(?:ruary)?|Mar ?(?:ch)?"
               r"|Apr ?(?:il)?|May ?|Jun ?(?:e)?|Jul ?(?:y)?|Aug ?(?:ust)?"
               r"|Sep ?(?:tember)?|Oct ?(?:ober)?|Nov ?(?:ember)?|Dec ?(?:ember)?)?"
               r" *(?:[12][0-9]{3}))")

    # ... in their context:
    re_replacedby = r"\b(?:[Bb]ecame|[Rr]enamed|[Ll]ater|[Aa]ltered to|[Rr]eplaced by)\b " + re_altname
    re_previously1 = r"\b(?:as|[Ww]as|[Oo]riginally|[Aa]t first|[Ee]arly)\b:? " + re_altname
    re_previously2 = re_altname + " (?:until )" + re_date
    # NOTE(review): a looser "NAME until" pattern used to be assigned to
    # re_previously3 and was immediately overwritten by the pattern below, so
    # it never took part in the search. It is left out to keep that behaviour;
    # confirm whether it was meant to be a fourth pattern.
    re_previously3 = r"\b(?:[Uu]ntil [0-9]{4}) " + re_altname
    re_alternatively1 = r"\b(?:[Rr]eferred to|[Rr]efers to|[Ee]rratically|[Aa]lias|[Bb]rad had|hb had|[Ll]isted under|[Ii]ndiscriminately|[Nn]otice has|where)\b " + re_altname
    re_alternatively2 = re_altname + " (?:(?:(?:in )?(?:hb|[Bb]rad|NB|[Mm]urray))|(?:until renamed))"
    re_alternatively3 = re_altname + " " + re_date
    re_added1 = r"\b(?:[Aa]dded)\b " + re_altname
    re_added2 = re_altname + " (?:(?:was )?added)"
    re_dropped1 = r"\b(?:[Dd]ropped)\b " + re_altname
    re_dropped2 = re_altname + " (?:dropped)"
    re_referenced = r"\b(?:[Ss]ee|[Ss]ee under)\b " + re_altname

    # Find all occurrences of alternate names in the description. Every
    # pattern has exactly one capturing group (the name), so findall returns
    # plain lists of names.
    replacedby = re.findall(re_replacedby, description)
    previously = (re.findall(re_previously1, description)
                  + re.findall(re_previously2, description)
                  + re.findall(re_previously3, description))
    alternatively = (re.findall(re_alternatively1, description)
                     + re.findall(re_alternatively2, description)
                     + re.findall(re_alternatively3, description))
    added = re.findall(re_added1, description) + re.findall(re_added2, description)
    dropped = re.findall(re_dropped1, description) + re.findall(re_dropped2, description)
    referenced = re.findall(re_referenced, description)

    alternate_names = (replacedby, previously, alternatively, added, dropped)
    proc_altnames = process_altnames(alternate_names, mainst, subst)

    return proc_altnames, referenced

In [None]:
# Load the processed records.
# NOTE(review): read_pickle must only be used on trusted files.
df = pd.read_pickle("quicks_processed.pkl")

# Parallel accumulators: one entry per extracted name together with the
# main and sub ids of the row it came from.
altnames, alt_mainId, alt_substId = [], [], []
referenced, ref_mainId, ref_substId = [], [], []

for _, station in df.iterrows():
    found_names, found_refs = parse_description(
        station["Description"], station["MainStation"], station["SubStFormatted"]
    )
    if found_names:
        altnames.extend(found_names)
        alt_mainId.extend([station["MainId"]] * len(found_names))
        alt_substId.extend([station["SubId"]] * len(found_names))
    if found_refs:
        referenced.extend(found_refs)
        ref_mainId.extend([station["MainId"]] * len(found_refs))
        ref_substId.extend([station["SubId"]] * len(found_refs))

# Dataframe of alternate names:
df_altnames = pd.DataFrame().assign(
    Altname=altnames, MainId=alt_mainId, SubId=alt_substId
)

# Dataframe of referenced names:
df_referenced = pd.DataFrame().assign(
    Referenced=referenced, MainId=ref_mainId, SubId=ref_substId
)

# Persist both tables.
df_altnames.to_pickle("quicks_altnames_df.pkl")
df_referenced.to_pickle("quicks_referenced_df.pkl")

In [None]:
def format_for_candranker(gazname, unique_placenames_array, encoding="utf-8"):
    """Write place names to ``<gazname>.txt``, one tab-separated record per line.

    Each line has the form ``<name>\\t0\\tfalse``; the docstring of the
    original code states this is the format required by the DeezyMatch
    candidate ranker.

    Parameters
    ----------
    gazname : str
        Output path without the ``.txt`` extension.
    unique_placenames_array : iterable of str
        Place names to write. Double quotes are removed and surrounding
        whitespace is stripped; names that are empty afterwards are skipped.
    encoding : str, optional
        Text encoding of the output file. Defaults to UTF-8 so the file does
        not depend on the platform's locale.
    """
    with open(gazname + ".txt", "w", encoding=encoding) as fw:
        for pl in unique_placenames_array:
            # Clean first, test for emptiness afterwards: a name consisting
            # only of quotes must not produce a record with an empty name.
            pl = pl.replace('"', "").strip()
            if pl:
                fw.write(pl + "\t0\tfalse\n")

In [None]:
# Export the distinct names of each table for the candidate ranker.
# The names are sorted so that the files come out in the same line order on
# every run; iterating a plain set of strings gives a different order per run.
unique_placenames_array = sorted(set(df_altnames["Altname"]))
format_for_candranker("../toponym_matching/toponyms/quicks_altnames", unique_placenames_array)

unique_placenames_array = sorted(set(df_referenced["Referenced"]))
format_for_candranker("../toponym_matching/toponyms/quicks_referenced", unique_placenames_array)