In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import zipfile
import re
import json
import collections
from lxml import etree
from xml.etree.ElementTree import XML
from random import shuffle
import pathlib
import itertools
import string
from difflib import SequenceMatcher
import numpy as np

docxFileName = "../resources/quicks/quick_section4.docx"
# The document body is read from the 'word/document.xml' member of the archive.
# The archive is opened in a ``with`` block so the file handle is released as
# soon as the XML bytes are in memory (``docxZip`` stays bound, but closed).
with zipfile.ZipFile(docxFileName) as docxZip:
    documentXML = docxZip.read('word/document.xml')
# Parsed document tree; every xpath query below runs against this root.
et = etree.XML(documentXML)
# Namespace prefix map required by all 'w:' xpath expressions.
ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

### Using xpath to find main stations

In [None]:
# Every main-station expression starts from the same run: one with no preceding
# sibling run that contains a w:t, and whose own run properties do not set
# w:sz to "16".
_first_text_run = './w:r[not(preceding-sibling::w:r//w:t)][not(w:rPr/w:sz[@w:val="16"])]'
# Paragraph-style condition shared by the Heading1..Heading4 variants.
_heading_condition = '[../w:pPr[w:pStyle[@w:val="Heading{}"]]/w:rPr]/w:t[1]'

# Variant 1: the paragraph contains a w:b element whose w:val is not "0".
mainstation_xpath = _first_text_run + '[..//w:b[not(@w:val="0")]]/w:t[1]'
# Variant 2: styled paragraph whose paragraph-level run properties carry w:b with w:val "0".
mainstation_xpath2 = _first_text_run + '[../w:pPr[w:pStyle]/w:rPr/w:b[@w:val="0"]]/w:t[1]'
# Variants 3-6: paragraph style is Heading1, Heading2, Heading3, Heading4 respectively.
mainstation_xpath3, mainstation_xpath4, mainstation_xpath5, mainstation_xpath6 = (
    _first_text_run + _heading_condition.format(level) for level in range(1, 5)
)
# First w:t child of each run of the paragraph.
first_token_para_xpath = "./w:r[//w:t]/w:t[1]"

stations = pd.DataFrame(columns=["station", "type", "description"])
        
def is_mainst(para, mainstation, initial_letter, counter):
    """Decide whether ``para`` opens a new main-station entry.

    Parameters
    ----------
    para : element exposing ``xpath(expr, namespaces=...)``
        The paragraph to inspect.
    mainstation : str
        Name of the main station currently in effect.
    initial_letter : str
        First character of ``mainstation`` ("" before any station was found).
    counter : int
        Number of main stations found so far.

    Returns
    -------
    tuple
        ``(mainstation, initial_letter, counter)``. When the paragraph starts a
        new main station the tuple holds the new name, its first character and
        ``counter + 1``; otherwise the three inputs are returned unchanged.

    Notes
    -----
    Reads the module-level ``ns``, ``first_token_para_xpath`` and
    ``mainstation_xpath`` .. ``mainstation_xpath6`` definitions.
    """
    unchanged = (mainstation, initial_letter, counter)

    first_tokens = para.xpath(first_token_para_xpath, namespaces=ns)
    if not first_tokens:
        return unchanged

    # An empty <w:t/> element has ``text`` set to None; treat it as an empty
    # string instead of raising AttributeError.
    first_text = first_tokens[0].text or ""

    # If text is capitalized (with exception for stations starting with "Mc")
    is_capitalized = first_text.isupper() or (
        first_text.startswith("Mc") and first_text[2:].isupper()
    )
    if not is_capitalized:
        return unchanged

    # See if the paragraph matches a mainstation xpath; the first expression
    # that yields a result wins.
    candidates = []
    for expression in (mainstation_xpath, mainstation_xpath2, mainstation_xpath3,
                       mainstation_xpath4, mainstation_xpath5, mainstation_xpath6):
        candidates = para.xpath(expression, namespaces=ns)
        if candidates:
            break
    if not candidates:
        return unchanged

    name = candidates[0].text or ""

    # Filter out station names of length 1, station names that start with initial
    # of previous main station (e.g. "Y BOOTHAM JUNCTION") or that start with an
    # open square bracket or parenthesis:
    if (len(name.strip()) <= 1
            or name.startswith(initial_letter + " ")
            or name.startswith(("[", "("))):
        return unchanged

    return name, name[0], counter + 1

# Running state threaded through is_mainst() while walking the paragraphs.
mainstation = ""
initial_letter = ""
lowerstation = ""
# (counter, mainstation) -> list of paragraph texts belonging to that station.
dText = dict()
counter = 0
for i, para in enumerate(et.xpath('//w:p', namespaces=ns)):
    text = para.xpath('./w:r/w:t', namespaces=ns)
    # Empty <w:t/> elements have text None; skip them so join() cannot raise
    # TypeError.
    description = " ".join([t.text for t in text if t.text is not None])
    mainstation, initial_letter, counter = is_mainst(para, mainstation, initial_letter, counter)
    print(mainstation)
    description = description.lstrip('\x01').strip()
    if description:
        if (counter, mainstation) in dText:
            dText[(counter, mainstation)].append(description)
        else:
            # First paragraph of a station: drop the leading station name.
            # (The previous replacement string '\1' was not a backreference but
            # the control character \x01, which the following lstrip removed
            # again; removing the prefix directly gives the same result.)
            description = re.sub('^' + re.escape(mainstation), '', description).lstrip('\x01').strip()
            # Collapse runs of spaces left over from joining the text runs.
            description = re.sub(r" +", " ", description).lstrip('\x01').strip()
            if description:
                dText[(counter, mainstation)] = [description]

### Indexing main stations

In [None]:
# Copy of the (counter, station) -> paragraphs mapping, in insertion order.
dStations = collections.OrderedDict(dText)

# Split every key into its two parts and collect the paragraph lists alongside.
indices = [key[0] for key in dStations]
stations = [key[1] for key in dStations]
descriptions = [dStations[key] for key in dStations]

# One row per main station, indexed by the station counter.
stationdf = pd.DataFrame(columns=["Index", "Station", "Description"])
for column_name, column_values in (
    ("Index", indices),
    ("Station", stations),
    ("Description", descriptions),
):
    stationdf[column_name] = column_values
stationdf = stationdf.set_index("Index")

### Using regex to find substations

In [None]:
# From Quicks intro:
# Maps a company abbreviation to its full name. Keys must carry no surrounding
# whitespace: process_decription() looks up stripped tokens, so a key such as
# "WMCQ " (with trailing space) could never be found.
companies = {"AN Jt":"Ashby & Nuneaton Joint.",
    "ANSW":"Alexandra (Newport & South Wales) Docks & Railway.",
    "Ax Jt":"Axholme Joint.",
    "Bak":"Bakerloo Line.",
    "BC":"Bishops Castle.",
    "BE":"Bristol & Exeter.",
    "BG":"Birmingham & Gloucester.",
    "BLCJ":"Birkenhead, Lancashire & Cheshire Junction.",
    "BM":"Brecon & Merthyr.",
    "BPGV":"Burry Port & Gwendraeth Valley.",
    "BR":"British Railways (whilst nationalised).",
    "BT":"Blyth & Tyne.",
    "BWA":"Bideford, Westward Ho! & Appledore. ",
    "Cal":"Caledonian.",
    "Cam":"Cambrian.",
    "Camp":"Campbeltown & Machrihanish Light.",
    "CE":"Clifton Extension Joint.",
    "Cen":"Central Line.",
    "CGU":"City of Glasgow Union.",
    "CHP":"Cromford & High Peak.",
    "CKP":"Cockermouth, Keswick & Penrith.",
    "CLC":"Cheshire Lines Committee. ",
    "CMDP":"Cleobury Mortimer & Ditton Priors Light.",
    "CO Jt":"Croydon & Oxted Joint.",
    "Croydon":"Croydon Tramlink.",
    "CVH":"Colne Valley & Halstead.",
    "CW Jc":"Cleator & Workington Junction.",
    "DA":"Dundee & Arbroath (original and later Joint).",
    "DB Jt":"Dumbarton & Balloch Joint.",
    "Dist":"District Line (strictly Metropolitan District).",
    "Dock":"Docklands Light.",
    "DPA":"Dundee & Perth & Aberdeen Junction.",
    "EA":"East Anglian.",
    "EC":"Eastern Counties.",
    "Ed & Dalk":"Edinburgh & Dalkeith.",
    "EG":"Edinburgh & Glasgow.",
    "EK":"East Kent Light.",
    "EU":"Eastern Union.",
    "EWYU":"East & West Yorkshire Union.",
    "Fur":"Furness.",
    "FYN":"Freshwater, Yarmouth & Newport.",
    "GBK Jt":"Glasgow, Barrhead & Kilmarnock Joint.",
    "GC":"Great Central.",
    "GC GI":"Great Central (Grimsby & Immingham electric tramway).",
    "GE":"Great Eastern.",
    "GJ":"Grand Junction. ",
    "Glyn":"Glyn Valley Tramway. ",
    "GN":"Great Northern.",
    "GNS":"Great North of Scotland.",
    "GP Jt":"Glasgow & Paisley Joint.",
    "GSW":"Glasgow & South Western.",
    "GU":"Glasgow Underground.",
    "GW":"Great Western. ",
    "HB":"Hull & Barnsley.",
    "HC":"Hammersmith & City Joint.",
    "High":"Highland.",
    "IoW":"Isle of Wight.",
    "IWC":"Isle of Wight Central.",
    "Jub":"Jubilee Line.",
    "KB":"Kilsyth & Bonnybridge Joint.",
    "KE":"Knot(t) End.",
    "KES":"Kent & East Sussex Light.",
    "L&B":"London & Birmingham.",
    "LBSC":"London, Brighton & South Coast.",
    "LCD":"London Chatham & Dover. ",
    "LM":"Liverpool & Manchester.",
    "LMS":"London, Midland & Scottish.",
    "LNE":"London & North Eastern.",
    "LNW":"London & North Western.",
    "LO":"Liverpool Overhead.",
    "LPJ":"Lancaster & Preston Junction.",
    "LPTB":"London Passenger Transport Board.",
    "LSW":"London & South Western.",
    "LTS":"London, Tilbury & Southend.",
    "LU":"Lancashire Union.",
    "LY":"Lancashire & Yorkshire.",
    "Lynton":"Lynton & Barnstaple. ",
    "Manch":"Manchester Metrolink.",
    "MC":"Maryport & Carlisle.",
    "Met":"Metropolitan Railway/ Line",
    "Met GNC":"Metropolitan (Great Northern & City Section).",
    "MGN":"Midland and Great Northern Joint line. ",
    "Mid":"Midland.",
    "MK":"Monkland & Kirkintilloch.",
    "MS&L":"Manchester, Sheffield & Lincolnshire.",
    "MSJA":"Manchester, South Junction & Altrincham.",
    "MSWJ":"Midland & South Western Junction.",
    "N&B":"Neath & Brecon.",
    "NB":"North British.",
    "NC":"Newcastle & Carlisle.",
    "NE":"North Eastern.",
    "Newtyle":"Dundee & Newtyle.",
    "Nidd":"Nidd Valley Light.",
    "NL":"North London.",
    "Nor":"Northern Line.",
    "Norfolk & S":"Norfolk & Suffolk Joint.",
    "NS":"North Staffordshire.",
    "NSWJ":"North & South Western Junction Joint.",
    "NU":"North Union/North Union Joint.",
    "NWNG":"North Wales Narrow Gauge.",
    "OAGB":"Oldham, Ashton & Guide Bridge Junction Joint. ",
    "PDSW":"Plymouth, Devonport & South Western Junction.",
    "Picc":"Piccadilly Line.",
    "PLA":"Port of London Authority.",
    "PPW Jt":"Portpatrick & Wigtownshire Joint.",
    "PT":"Port Talbot Railway & Docks.",
    "PW":"Preston & Wyre (original and later Joint).",
    "Raven":"Ravenglass & Eskdale.",
    "RHD":"Romney, Hythe & Dymchurch. ",
    "Rhy":"Rhymney.",
    "RSB":"Rhondda & Swansea Bay.",
    "Rye & C":"Rye & Camber Tramway.",
    "S&D":"Stockton & Darlington.",
    "Scot Cent":"Scottish Central.",
    "SD Jt":"Somerset & Dorset Joint.",
    "SE":"South Eastern.",
    "SEC":"South Eastern & Chatham.",
    "SH Jt":"Shrewsbury & Hereford Joint.",
    "SIT":"Swansea Improvements & Tramways Company (Swansea & Mumbles).",
    "SK":"Swinton & Knottingley Joint.",
    "SM":"Shropshire & Montgomeryshire Light.",
    "SMJ":"Stratford-upon-Avon & Midland Junction. ",
    "SR":"Southern.",
    "SSMWC":"South Shields, Marsden & Whitburn Colliery.",
    "SW Jt":"Severn & Wye Joint.",
    "SY":"South Yorkshire (later part of GC).",
    "TFG Jt":"Tottenham & Forest Gate Joint. ",
    "TH Jt":"Tottenham & Hampstead Joint.",
    "TV":"Taff Vale.",
    "TWM":"Tyne & Wear Metro.",
    "Vic":"Victoria Line.",
    "VoR":"Vale of Rheidol.",
    "W Lancs":"West Lancashire.",
    "WCE Jt":"Whitehaven, Cleator & Egremont Joint.",
    "WCP":"Weston, Clevedon & Portishead Light.",
    "WELCP":"West End of London & Crystal Palace.",
    "WH":"Welsh Highland.",
    "WL":"West London Joint (including Extension). ",
    "WMC":"Wilsontown, Morningside & Coltness.",
    "WMCQ":"Wrexham, Mold & Connah’s Quay.",
    "WP Jt":"Weymouth & Portland Joint.",
    "WRG Jt":"West Riding & Grimsby Joint.",
    "WS":"West Sussex (Selsey Tramway).",
    "WSC Jt":"Woodside & South Croydon Jt.",
    "WSM":"West Somerset Mineral"}

# From Quicks intro:
# Station-type abbreviations (some entries are listed twice; the list is only
# used for membership tests in this notebook).
stntypes = [
    "AOT", "CLO", "CO", "CO N", "CO TT", "ND",
    "NG", "NON-TT", "OP", "P", "REOP", "TT",
    "HL", "LL", "HB", "HBA", "NG", "TT",
]

# Most common last tokens:
# Every keyword is a single word, so the list is written as one
# whitespace-separated string and split.
keywords = (
    "PLATFORM PLATFORMS HALT INTERNATIONAL JUNCTION CAMP CENTRAL ROAD "
    "JUNCTION BRIDGE STREET PARK LANE HILL COLLIERY TOWN GREEN "
    "CENTRAL CROSSING NORTH LEVEL EAST DOCK WEST GATE CROSS HALT "
    "SOUTH MILL END SIDING HALL HOUSE"
).split()

In [None]:
def process_decription(mainst, description, substationId):
    """Split the paragraphs of one main station into substation entries.

    Each line is tested for a leading substation name: either an abbreviated
    form that starts with the main station's first character followed by a
    space, ``|`` or ``-`` (e.g. "B NEW STREET" under "BIRMINGHAM"), or any
    leading run of upper-case text. Candidates that are keys of the
    module-level ``companies`` dict are ignored. A line without a new name is
    appended to the entry of the most recent name; before any name has been
    seen the main station name itself is used.

    Parameters
    ----------
    mainst : str
        Main station name. A trailing "[" is moved onto the first line.
    description : list of str
        Paragraph texts of the station. The list is not modified.
    substationId : int
        Last substation id handed out so far.

    Returns
    -------
    (dict, int)
        Mapping ``(substation id, substation name) -> text`` and the last
        substation id used.
    """
    print(mainst)

    # Work on a copy so the list stored in the caller's DataFrame is untouched.
    lines = list(description)

    # Original formatting error such as "DYKEBAR [", "Cal]"
    if mainst.endswith("["):
        mainst = mainst[:-1]
        if lines:
            lines[0] = "[" + lines[0]

    dSubstations = dict()

    # Characters allowed in a substation name: A-Z, space, "?", "-" and "&".
    # ("?" is a literal member of the class, exactly as in the original pattern.)
    name_chars = r"[A-Z ?\-&]"
    rsubst = re.compile(name_chars + r"+ ")
    # The abbreviated form needs the main station's first character; an empty
    # main name has none, so that pattern is skipped instead of raising
    # IndexError. The character is escaped in case it is a regex metacharacter.
    rsubstInitial = None
    if mainst:
        rsubstInitial = re.compile(
            r"^(" + re.escape(mainst[0]) + r"[ |\-]" + name_chars + r"+) "
        )

    substname = ""

    for line in lines:
        match1 = rsubst.match(line)
        match2 = rsubstInitial.match(line) if rsubstInitial is not None else None
        if match2:
            candidate = match2.group(0).strip()
            if candidate not in companies:
                substname = candidate
                substationId += 1
        elif match1:
            candidate = match1.group(0).strip()
            if len(candidate) > 1 and candidate not in companies:
                substname = candidate
                substationId += 1
        if substname == "":
            substname = mainst
            substationId += 1
        stup = (substationId, substname)
        if stup not in dSubstations:
            # Drop the leading name literally; building a regex from the name
            # would misread characters such as "?" or "(".
            if line.startswith(substname):
                line = line[len(substname):]
            dSubstations[stup] = line.strip()
        else:
            dSubstations[stup] += " " + line.strip()

    return dSubstations, substationId

In [None]:
cols = ['MainId', 'MainStation', 'SubId', 'SubStation', 'Description']
lst = []
# Running substation id, carried across main stations so ids keep increasing.
subInd = 0
for i, row in stationdf.iterrows():
    main_station, description = row["Station"], row["Description"]
    dSubstations, subInd = process_decription(main_station, description, subInd)
    # One output record per (substation id, substation name) key.
    lst.extend(
        [i, main_station, sub_id, sub_name, sub_text]
        for (sub_id, sub_name), sub_text in dSubstations.items()
    )
subsdf = pd.DataFrame(lst, columns=cols)

### Renaming abbreviated substations

In [None]:
def subst_rename(main, sub):
    """Expand an (often abbreviated) substation name using its main station name.

    Both names are tokenised after replacing "&" with " AND " (substation only)
    and mapping punctuation to spaces. Depending on how many tokens each name
    has, initials are replaced by the matching main-station word, the main name
    is prepended, or the substation name is kept. Finally the stand-alone
    tokens "LL" and "HL" are expanded to "LOW LEVEL" and "HIGH LEVEL" (not
    applied when both names tokenise identically).

    Parameters
    ----------
    main : str
        Main station name.
    sub : str
        Substation name as extracted from the text.

    Returns
    -------
    str
        The formatted substation name, tokens joined by single spaces. Names
        that tokenise to nothing are handled without raising: an empty
        substation falls back to the main station name.

    Notes
    -----
    Reads the module-level ``keywords`` and ``stntypes`` lists.
    """
    # Minimum difflib similarity ratio for two tokens to count as the same word.
    similarity_threshold = 0.8

    sub = sub.replace("&", " AND ")
    sub = re.sub(' +', ' ', sub)
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))  # map punctuation to space
    main = main.translate(translator).split()
    sub = sub.translate(translator).split()

    if sub == main:
        return " ".join(sub)

    # Sometimes, first token is split by whitespace. Join split tokens:
    # e.g. ['F', 'ISHPONDS'] or ['L', 'ITTLE', 'ORMESBY']
    # (guarded so that an empty main name cannot raise IndexError)
    if main and len(sub) >= 2 and sub[0] + sub[1] == main[0]:
        sub = [sub[0] + sub[1]] + sub[2:]

    # Same repair for the main name:
    # e.g. 'CROSS KEYS' and 'CROSSKEYS'
    if sub and len(main) >= 2 and main[0] + main[1] == sub[0]:
        main = [main[0] + main[1]] + main[2:]

    # CASE 1: Main station is just one token, substation is more than one token:
    if len(main) == 1 and len(sub) > 1:
        head = main[0]

        # e.g. 'ALTON' and 'ALTON PARK'
        # e.g. 'BONNYRIGG' and 'BONNYRIGGE DEPOT'
        if sub[0].startswith(head):
            rsub = sub

        # e.g. 'BIRMINGHAM' and 'B NEW STREET'
        elif len(sub[0]) == 1 and sub[0] == head[0]:
            rsub = [head] + sub[1:]

        # e.g. 'BARGEDDIE' and 'BARGE DDI E'
        elif "".join(sub) == head:
            rsub = main

        # e.g. 'ALLOA' and 'SOUTH ALLOA'
        elif head in sub:
            rsub = sub

        # e.g. 'AIRDRIE' and 'COMMONHEAD A NORTH'
        elif head[0] in sub:
            rsub = [head if x == head[0] else x for x in sub]

        # e.g. 'CAERNARVON' and 'CARNARVON CASTLE'
        elif any(SequenceMatcher(None, x, head).ratio() >= similarity_threshold for x in sub):
            rsub = sub

        # e.g. 'SOUTHPORT' and 'STEAMPORT MUSEUM'
        else:
            rsub = [head] + sub

    # CASE 2: Substation has length 1, mainstation length > 1
    elif len(sub) == 1 and len(main) > 1:

        # e.g. 'CLYDACH ON TAWE' and 'CLYDACH'
        if any(SequenceMatcher(None, x, sub[0]).ratio() >= similarity_threshold for x in main):
            rsub = sub

        # e.g. 'HIGHGATE ROAD' and 'HL'
        elif sub[0] in keywords or sub[0] in stntypes:
            rsub = main + sub

        # e.g. 'BLAENAU FFESTINIOG' and 'DINAS'
        else:
            rsub = sub

    # CASE 3: Substation has length 1, mainstation length 1
    elif len(sub) == 1 and len(main) == 1:

        # e.g. 'BELMONT' and 'JUNCTION'
        if sub[0] in keywords or sub[0] in stntypes:
            rsub = main + sub

        # e.g. 'SELHURST' and 'SELHUST'
        elif SequenceMatcher(None, sub[0], main[0]).ratio() >= similarity_threshold:
            rsub = main

        # e.g. 'WALKER' and 'WALKERGATE'
        elif sub[0].startswith(main[0]):
            rsub = sub

        # e.g. 'TILBURY' and 'BERTHS'
        else:
            rsub = main + sub

    # CASE 4: otherwise
    # e.g. 'FINCHLEY ROAD' and 'F R AND FROGNAL'
    # e.g. 'B ON S AND QUORN' and 'BARROW ON SOAR AND QUORN'
    else:
        remsub = []  # initials that were expanded to a main-station word
        tsub = []
        for x in sub:
            if len(x) == 1:
                for m in main:
                    if m.startswith(x) and m not in tsub and m not in remsub:
                        tsub.append(m)
                        remsub.append(x)
            # The token itself is always kept at this point; initials that were
            # expanded are filtered out below through remsub.
            tsub.append(x)

        rsub = [x for x in tsub if x not in remsub]

        # e.g. "ST A" or "S C" as substation names (also covers an empty
        # result, for which max() would otherwise raise ValueError):
        if len(max(rsub, key=len, default="")) <= 2:
            rsub = main

    rsub = " ".join(rsub)
    rsub = re.sub(r"\bLL\b", "LOW LEVEL", rsub)
    rsub = re.sub(r"\bHL\b", "HIGH LEVEL", rsub)

    return rsub
        
# Derive the formatted substation name of every row from its main/sub name pair.
subsdf['SubStFormatted'] = subsdf.apply(
    lambda record: subst_rename(record["MainStation"], record["SubStation"]),
    axis=1,
)
# Keep the output columns in their final order and store the table as a pickle.
subsdf = subsdf.loc[:, ["MainId", "SubId", "MainStation", "SubStation", "SubStFormatted", "Description"]]
subsdf.to_pickle('quicks_processed.pkl')

In [None]:
def format_for_candranker(gazname, unique_placenames_array):
    """
    Write the unique alternate names of a gazetteer to ``gazname + ".txt"``
    in the format required by DeezyMatch candidate ranker.

    Each name is written on its own line as ``<name>\\t0\\tfalse``. Double
    quotes are removed and surrounding whitespace is stripped; names that are
    empty after this cleaning are skipped, so no blank query is written.

    Parameters
    ----------
    gazname : str
        Output path without the ".txt" extension.
    unique_placenames_array : iterable of str
        Place names to write.
    """
    # Explicit UTF-8 so the output does not depend on the platform default
    # encoding.
    with open(gazname + ".txt", "w", encoding="utf-8") as fw:
        for pl in unique_placenames_array:
            # Clean first, then test for emptiness: a name consisting only of
            # quotes must not produce an empty query line.
            pl = pl.replace('"', "").strip()
            if pl:
                fw.write(pl + "\t0\tfalse\n")

In [None]:
# Export the distinct main-station names and the distinct formatted substation
# names as two separate query files.
for column, query_file in (
    ("MainStation", "../toponym_matching/toponyms/quicks_mainst_queries"),
    ("SubStFormatted", "../toponym_matching/toponyms/quicks_subst_queries"),
):
    unique_placenames_array = list(set(list(np.array(subsdf[column]))))
    format_for_candranker(query_file, unique_placenames_array)