In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import xml.etree.ElementTree as ET
import pandas as pd
import zipfile
import re
import json
import collections
from lxml import etree
from xml.etree.ElementTree import XML
from random import shuffle
import pathlib
import itertools
import numpy as np
import utils

# Location of the Word document to parse.
# NOTE(review): this is an absolute path, whereas the annotations file read at the
# end of the notebook uses the relative 'resources/' directory -- confirm that the
# leading slash is intended.
docxFileName = "/resources/quick/quick_section4.docx"

# A .docx file is a zip archive; read the main document part and close the archive
# as soon as the bytes are in memory so the file handle is not leaked.
with zipfile.ZipFile(docxFileName) as docxZip:
    documentXML = docxZip.read('word/document.xml')

# Parse the document body and declare the WordprocessingML namespace prefix used
# by the XPath queries in the following cells.
et = etree.XML(documentXML)
ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

# Make sure the output directory exists before any cell writes into it.
pathlib.Path('outputs/').mkdir(parents=True, exist_ok=True)

### Find main stations

In [3]:
# Group the text of every paragraph under the (counter, main station) pair that
# utils.is_mainst reports for it. dText maps that pair to the list of paragraph
# texts collected for it, in document order.
mainstation = ""
lowerstation = ""  # not read by any visible cell; kept in case other code relies on it
dText = dict()
counter = 0
for para in et.xpath('//w:p', namespaces=ns):
    # Concatenate the text runs of the paragraph. Elements without text expose
    # .text as None, which would make str.join raise TypeError, so skip them.
    text = para.xpath('./w:r/w:t', namespaces=ns)
    description = " ".join(t.text for t in text if t.text is not None)

    # is_mainst must be called for every paragraph (including empty ones) because
    # it carries the current station and counter forward.
    mainstation, counter = utils.is_mainst(para, mainstation, counter, ns)

    description = description.lstrip('\x01').strip()
    if not description:
        continue

    key = (counter, mainstation)
    if key in dText:
        dText[key].append(description)
        continue

    # First paragraph seen for this key: drop the station name from the start of
    # the text. (The previous code achieved this by substituting the control
    # character '\x01' -- the value of the non-raw literal '\1' -- and stripping
    # it afterwards; replacing with the empty string gives the same result.)
    description = re.sub('^' + re.escape(mainstation), '', description).lstrip('\x01').strip()
    # Collapse runs of spaces left behind by joining the text runs.
    description = re.sub(r" +", " ", description).lstrip('\x01').strip()
    if description:
        dText[key] = [description]

### Index main stations

In [4]:
# Turn the (counter, station) -> paragraphs mapping into a dataframe indexed by
# the counter, with one row per main station.
dStations = collections.OrderedDict(dText)

# Split each (counter, station) key into its two parts; the values are the lists
# of paragraph texts gathered for that station.
indices = [station_key[0] for station_key in dStations]
stations = [station_key[1] for station_key in dStations]
descriptions = [dStations[station_key] for station_key in dStations]

column_values = {
    "Index": indices,
    "Station": stations,
    "Description": descriptions,
}
stationdf = pd.DataFrame(columns=["Index", "Station", "Description"])
for column_name, values in column_values.items():
    stationdf[column_name] = values
stationdf = stationdf.set_index("Index")

### Detect substations

In [5]:
# NOTE(review): this empty frame is not read by any visible cell, and it rebinds
# the name `stations` that previously held a list -- confirm whether it is needed.
stations = pd.DataFrame(columns=['station','type','description'])
cols = ['MainId', 'MainStation', 'SubId', 'SubStation', 'Description']
lst = []

# subInd is handed to the helper and replaced by the value it returns, so the
# substation numbering carries on from one main station to the next.
subInd = 0
for main_id, station_row in stationdf.iterrows():
    main_station = station_row["Station"]
    description = station_row["Description"]
    dSubstations, subInd = utils.process_decription(main_station, description, subInd)
    # Each key of dSubstations supplies the substation id and name (positions 0
    # and 1); the mapped value is that substation's description.
    lst.extend(
        [main_id, main_station, ss[0], ss[1], dSubstations[ss]]
        for ss in dSubstations
    )
subsdf = pd.DataFrame(lst, columns=cols)

### Renaming abbreviated substations

In [6]:
def _formatted_substation(record):
    """Return utils.subst_rename applied to one row's main and substation names."""
    return utils.subst_rename(record["MainStation"], record["SubStation"])


# Add the formatted substation name, then fix the column order before storing.
subsdf['SubStFormatted'] = subsdf.apply(_formatted_substation, axis=1)
processed_columns = ["MainId", "SubId", "MainStation", "SubStation", "SubStFormatted", "Description"]
subsdf = subsdf[processed_columns]

# Persist the processed frame both as a pickle and as a tab-separated file.
subsdf.to_pickle('outputs/quicks_processed.pkl')
subsdf.to_csv('outputs/quicks_processed.tsv', index=False, sep="\t")

### Formatting station names as required by DeezyMatch

In [7]:
# Write the distinct main-station names and the distinct formatted substation
# names as query lists via utils.format_for_candranker.
#
# drop_duplicates keeps the first occurrence of each name, so the query order
# follows the dataframe and is identical on every run. Going through set() instead
# would order the names by string hash, which Python randomises per process.
unique_placenames_array = subsdf["MainStation"].drop_duplicates().tolist()
utils.format_for_candranker("../toponym_matching/toponyms/quicks_mainst_queries", unique_placenames_array)

unique_placenames_array = subsdf["SubStFormatted"].drop_duplicates().tolist()
utils.format_for_candranker("../toponym_matching/toponyms/quicks_subst_queries", unique_placenames_array)

### Find disambiguators and companies

In [8]:
def _company_fields(record):
    """Return the values yielded by utils.detect_companies for one row, as a Series."""
    return pd.Series(list(utils.detect_companies(record["Description"])))


# Work on a copy so the processed substation frame stays untouched.
parsedf = subsdf.copy()
company_columns = ['Disambiguator', 'Companies', 'FirstCompanyWkdt', 'AltCompaniesWkdt']
parsedf[company_columns] = parsedf.apply(_company_fields, axis=1)

### Extract map information

In [9]:
def _map_fields(record):
    """Return the values yielded by utils.detect_mapsInfo for one row, as a Series."""
    return pd.Series(list(utils.detect_mapsInfo(record["Description"])))


# Each value returned by the helper becomes one of the two new columns.
map_columns = ['LocsMaps', 'LocsMapsDescr']
parsedf[map_columns] = parsedf.apply(_map_fields, axis=1)

### Extract alternate and referenced railway stations

In [10]:
def _altname_fields(record):
    """Return the values yielded by utils.detect_altnames for one row, as a Series."""
    detected = utils.detect_altnames(
        record["Description"], record["MainStation"], record["SubStFormatted"]
    )
    return pd.Series(list(detected))


# Each value returned by the helper becomes one of the two new columns.
altname_columns = ['Altnames', 'Referenced']
parsedf[altname_columns] = parsedf.apply(_altname_fields, axis=1)

### Capture opening and closing dates

In [11]:
def _date_fields(record):
    """Return the values yielded by utils.capture_dates for one row, as a Series."""
    return pd.Series(list(utils.capture_dates(record["Description"])))


# Each value returned by the helper becomes one of the two new columns.
date_columns = ['Opening', 'Closing']
parsedf[date_columns] = parsedf.apply(_date_fields, axis=1)

### Prepare altnames, refs and disambiguators as DeezyMatch queries

In [12]:
# Pass the parsed dataframe and the "Altnames" column name to the utils helper.
# NOTE(review): the section heading also mentions refs and disambiguators, but
# only "Altnames" is passed here -- confirm whether the helper handles the other
# columns internally or whether further calls are missing.
utils.prepare_alt_queries(parsedf, "Altnames")

### Store resulting dataframe

In [13]:
# Persist the fully parsed frame both as a pickle and as a tab-separated file.
parsed_pickle_path = 'outputs/quicks_parsed.pkl'
parsed_tsv_path = 'outputs/quicks_parsed.tsv'
parsedf.to_pickle(parsed_pickle_path)
parsedf.to_csv(parsed_tsv_path, index=False, sep="\t")

### Create dev and test dataframes

In [14]:
# Split the manual annotations into development and test sets according to the
# "DevTest" column, then attach the parsed columns to each set.
annotations = pd.read_csv('resources/annotations.tsv', sep='\t')
is_dev = annotations["DevTest"] == "Dev"
is_test = annotations["DevTest"] == "Test"

# Columns shared by the annotations and the parsed frame; pd.merge's default
# inner join keeps only annotation rows that match a parsed row on all of them.
merge_keys = ["MainId", "SubId", "MainStation", "SubStation", "SubStFormatted", "Description"]
df_test = pd.merge(annotations[is_test], parsedf, on=merge_keys)
df_dev = pd.merge(annotations[is_dev], parsedf, on=merge_keys)

# Store each set both as a pickle and as a tab-separated file.
df_dev.to_pickle('outputs/quicks_dev.pkl')
df_dev.to_csv('outputs/quicks_dev.tsv', index=False, sep="\t")

df_test.to_pickle('outputs/quicks_test.pkl')
df_test.to_csv('outputs/quicks_test.tsv', index=False, sep="\t")