In [1]:
import pandas as pd
import itertools
import string
import re
import string
from difflib import SequenceMatcher

In [6]:
# ---------------------------------------------------------------------------
# Regex fragments shared by the date patterns below. Every assembled pattern
# contains exactly ONE capturing group (the date itself), so ``findall``
# always yields plain date strings.
# ---------------------------------------------------------------------------

# Month names, abbreviated or in full. The optional space tolerates names that
# were split in the source text (e.g. "Jan uary").
_MONTH = (
    r"(?:Jan ?(?:uary)?|Feb ?(?:ruary)?|Mar ?(?:ch)?|Apr ?(?:il)?|May ?"
    r"|Jun ?(?:e)?|Jul ?(?:y)?|Aug ?(?:ust)?|Sep ?(?:tember)?"
    r"|Oct ?(?:ober)?|Nov ?(?:ember)?|Dec ?(?:ember)?)"
)
# "[day] month year", e.g. "1 June 1903" or "April 1983".
_DMY = r"((?:[0-9]{1,2})? *" + _MONTH + r" *(?:[12][0-9]{3}))"
# "year month [day]", e.g. "1876 April 30".
_YMD = r"((?:[12][0-9]{3}) *" + _MONTH + r" *(?:[0-9]{1,2})?)"
# "year [month] [day]": also accepts a bare year, e.g. "1915".
_YMD_LOOSE = r"((?:[12][0-9]{3}) *" + _MONTH + r"? *(?:[0-9]{1,2})?)"

# "op" / "open" / "opened" / "reop"..., optionally followed by a "[Company]" tag.
_OP = r"\b(?:[Rr]e)?[Oo]p(?:en(?:ed)?)?\b(?: \[(?:[A-Z].*?)\])? *"
# "clo" / "closed" (also matches a bare "lo", as the original pattern did).
_CL = r"\b(?:re)?[Cc]?lo(?:sed)?\b *"
# A single-token "[Tag]" that directly precedes a date without an "op" marker.
_BRACKET_TAG = r"\[[A-Za-z\&\;]+\] *"
# One or more filler words between the marker and the date.
_LOWER_WORDS = r"(?:(?:[a-z]+) +)+"
_ANY_WORDS = r"(?:(?:[A-Za-z]+) +)+"
_ANY_WORDS_SINGLE_SPACE = r"(?:(?:[A-Za-z]+) )+"

_OPENING_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        _OP + _DMY,                          # op 1 June 1903
        _OP + _YMD,                          # op 1903 June 1
        _BRACKET_TAG + _DMY,                 # [Mid] 1 June 1903
        _OP + _LOWER_WORDS + _DMY,           # op on 1 June 1903
        _OP + _LOWER_WORDS + _YMD_LOOSE,     # op after 1915
        _BRACKET_TAG + _ANY_WORDS + _DMY,    # [Mid] first in tt 1 June 1903
    )
)
# Fallback for openings: "first ... Brad... <date>".
_FIRST_IN_BRAD = re.compile(
    r"[Ff]irst *(?:(?:[a-z]+) +)+[Bb]rad*(?:(?:[a-z]+) +)+" + _DMY
)

_CLOSING_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        _CL + _DMY,                                   # clo 1 January 1905
        _CL + _YMD,                                   # clo 1905 January 1
        _CL + _ANY_WORDS_SINGLE_SPACE + _DMY,         # clo by April 1983
        _CL + _ANY_WORDS_SINGLE_SPACE + _YMD_LOOSE,   # clo by 1983
    )
)
# Fallback for closings: "last ... <date>".
_LAST_SEEN = re.compile(r"[Ll]ast *(?:(?:[A-Za-z]+) +)*" + _DMY)

# Alternate-name patterns.
# NOTE(review): inside a character class "(for)(at)(on)(upon)" is NOT a set of
# words -- it just adds the characters "(", ")", f, o, r, a, t, n, u, p to the
# class. That is what lets names such as "B-on-D for GOLDTHORPE" match, so the
# classes are kept exactly as they were; candidates that start or end with a
# lowercase letter are discarded afterwards.
_ALT_NAME_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        # <verb> NAME, e.g. "as TRINITY ROAD", "became S-on-SEA".
        r"\b(?:[Ww]as|as|[Bb]ecame|[Rr]eferred to|[Rr]efers to|[Oo]riginally|[Rr]enamed|[Ee]rratically|[Bb]rad had|hb had|[Aa]dded|[Ll]ater|[Ll]isted under|[Ii]ndiscriminately|[Nn]otice has|[Aa]ltered to)\b (\b[A-Z \&\'\-\/(for)(at)(on)(upon)]{3,}\b)",
        # NAME followed by a guide reference, "added" or "until renamed".
        r"(\b[A-Z \&\'\-\/(for)(at)(on)(upon)}]{3,}\b) (?:(?:(?:in )?(?:hb|[Bb]rad|NB list))|(?:added)|(?:until renamed))",
        # NAME followed by an (optionally "until"-prefixed) date.
        r"(\b[A-Z \&\'\-\/(for)(at)(on)(upon)]{3,}\b) (?:until )?(?:(?:[0-9]{1,2})? *"
        + _MONTH
        + r"? *(?:[12][0-9]{3}))",
    )
)

_PRINTABLE = frozenset(string.printable)


def _unique(items):
    """Return *items* as a list without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(items))


def _find_dates(patterns, text):
    """Return the distinct, whitespace-stripped dates *patterns* capture in *text*."""
    hits = []
    for pattern in patterns:
        hits.extend(hit.strip() for hit in pattern.findall(text))
    return _unique(hit for hit in hits if hit)


def parse_description(description):
    """Extract dates, company and alternate names from a station description.

    The description is first normalised: characters outside
    ``string.printable`` become spaces, parenthesised notes are removed and
    runs of spaces are collapsed. Regular expressions are then applied to the
    cleaned text.

    Args:
        description: Free-text description of a station entry. Anything that
            is not a ``str`` (for example a missing value) yields four empty
            lists instead of raising.

    Returns:
        A 4-tuple ``(opening_dates, closing_dates, companies,
        alternate_names)`` of lists of strings:

        * ``opening_dates`` -- dates found after an "op"/"open(ed)"/"reop"
          marker or directly after a one-word ``[Tag]``; if none are found,
          dates matching the "first ... Brad ..." pattern are used instead.
        * ``closing_dates`` -- dates found after a "clo"/"closed" marker, plus
          the literal ``"still open"`` when that phrase occurs; if the list
          is still empty, dates following "last ..." are used instead.
        * ``companies`` -- the content of the first ``[...]`` tag starting
          with an upper-case letter (at most one element).
        * ``alternate_names`` -- upper-case-style names introduced by verbs
          such as "as"/"became"/"renamed", or followed by a guide reference
          or a date.

        Every list is de-duplicated and ordered by pattern, then by position
        in the text, so the result is deterministic across runs.
    """
    if not isinstance(description, str):
        return [], [], [], []

    # Normalise: non-printable characters -> space, drop "(...)" notes,
    # collapse runs of spaces.
    description = "".join(ch if ch in _PRINTABLE else " " for ch in description)
    description = re.sub(r"\([^)]*?\)", "", description)
    description = re.sub(" +", " ", description)

    # Opening dates, with the "first in Brad" date as a fallback only.
    opening_dates = _find_dates(_OPENING_PATTERNS, description)
    if not opening_dates:
        opening_dates = _find_dates((_FIRST_IN_BRAD,), description)

    # Closing dates; "still open" counts as a closing entry, and the
    # "last ..." date is used only when nothing else was found.
    closing_dates = _find_dates(_CLOSING_PATTERNS, description)
    if "still open" in description.lower():
        closing_dates.append("still open")
    if not closing_dates:
        closing_dates = _find_dates((_LAST_SEEN,), description)

    # Company: first "[Xyz...]" tag only.
    company = re.search(r"\[([A-Z].*?)\]", description)
    companies = [company.group(1)] if company else []

    # Alternate names are searched in the text without [..] and {..} spans.
    description_alt = re.sub(r"\[[^\]]*\]", "", description)
    description_alt = re.sub(r"\{[^}]*\}", "", description_alt)
    candidates = []
    for pattern in _ALT_NAME_PATTERNS:
        candidates.extend(name.strip() for name in pattern.findall(description_alt))
    # Drop matches that begin or end with a lowercase letter: those come from
    # the permissive character class catching ordinary words.
    alternate_names = _unique(
        name
        for name in candidates
        if name and not name[0].islower() and not name[-1].islower()
    )

    return opening_dates, closing_dates, companies, alternate_names

In [7]:
# Load the pre-processed station descriptions and eyeball the alternate names
# extracted for a fixed (seeded) random sample of 1000 rows.
subsdf = pd.read_pickle("quick_subst_processed.pkl")

sampled = subsdf.sample(n=1000, random_state=19)
for station, text in zip(sampled["MainStation"], sampled["Description"]):
    print(station)
    # parse_description returns (opening, closing, companies, alternate names);
    # only the alternate names (index 3) are inspected here.
    alt_names = parse_description(text)[3]
    print(text)
    print("\tAlt:", alt_names)
    print()

BULWELL
op 23 May 1994 (BLN 733) ; near site of old B Market, south side of Highbury Road Bridge; still open.
	Alt: []

DEANSIDE
[GP Jt] op 1 June 1903 ( RCG ) ; clo 1 January 1905 (Sunday) ( RCG ) ; {Cardonald – Kings Inch}.
	Alt: []

SKIPTON
[Mid] op 7 September 1847**; re-sited 10 chains north 30 April 1876 (Mid) ; still open. Ticket platform , only reference seen: accident here, on Bradford side of station; (Bradford Daily Telegraph 30 August 1875).
	Alt: []

GILFACH GOCH
(non-tt): op after 1915; probably clo with above, 22 September 1930; not in wtt 20 July 1931 nor any later; {60 chains beyond G G}.
	Alt: []

LLANTWIT MAJOR
(a) [Barry] op 1 December 1897 (co n S Wales Echo 1 st ) ; clo 15 June 1964 (RM August) .
	Alt: []

BURITON SIDING STAFF HALT
[LSW or later] (non-tt): op ?; clo by April 1983; {Petersfield – Rowlands Castle} (@JF).
	Alt: []

TOOTING
[Nor] op 13 September 1926  (T 14 th )  as TRINITY ROAD; see 1922**; renamed 1 October 1950  (RM January 1951) ; still open.
	Alt

[GSW] op 1 March 1905 (Dumfries 1 st ) ; clo 3 May 1943 (RM January 1944) .
	Alt: []

NEATH
[GW] (non-tt); railwaymen; at least 1928 to 1954; {branch from main station  (U) .
	Alt: []

BACKWORTH
{map 26}.
	Alt: []

LOWTON
[GC] op 1 April 1884  (Wigan Obs 2 nd , item and tt) ; clo 2 November 1964  (RM December) .
	Alt: []

BERW ROAD
[TV]
	Alt: []

BURGESS HILL
[LBSC] line op 21 September 1841 and mentioned in line inspection report (Rtn) but not in tt co n for opening Sussex Advertiser 20 th and not in Brad until March 1842 but was mentioned in two adverts Brighton Gazette 27 January – delay in opening? Or intended station? Clo 2 October 1843 (co n T 2 nd ) ; reop 1 May 1844 (Sussex Advertiser 7 th ) ; moved about 130 yards north between 1875 and 1897 OS maps – buildings date from 1877 (see Stewart Smith, Chron January 2020) ; still open.
	Alt: []

SHIREBROOK
[Mid] (non-tt): op 1 July 1901 wtt  (Mid) ; clo by July 1954  (U) , however 11 September 1961 wtt shows stop by 5.30 from Worksop

[Raven]: undated ticket to Ravenglass, return half of Edmonson, headed ‘Narrow Guage’ [sic] exists; no indication of anything other than for normal public use (A. Porter). Nothing further known.
	Alt: []

ANDOVER ROAD
– see MICHELDEVER.
	Alt: []

BOLTON-UPON-DEARNE 
[SK Jt] op 1 July 1879 (Mid; co n T 28 June- line) as HICKLETON; became B-on-D 1   November   1879, B-on-D for GOLDTHORPE 15 January 1924, B-on-D 12 June 1961 (Mid) , B-UPON-D 3 April 2008 (AB Chron April 2011) but already this 1883 hb ; still open.
	Alt: ['HICKLETON', 'B-on-D for GOLDTHORPE', 'B-UPON-D', 'B-on-D']

BENTHAM
(alias HIGH B/B HIGH/B HIGHER) op 2 May 1850, as terminus for time being. Inspection report dated 29 May 1850  (Rtn) , prior to line opening through (would happen 1 June 1850 –  T 3 rd ) , shows HIGH B as already  in existence; distance given, 4  miles 15 chains to Clapham also fits HIGH B better than Low B. Still open as B.
	Alt: []

BREAMORE
[LSW] op 20 December 1866 (Salisbury & Winchester Journal 22 

[LTS] op 1 March 1856  (co n T 1 March)  as S ; became S-on-SEA 1 June 1876  (L) ; CENTRAL added 1 May 1949  (Mid) ; shortened to S C 20 February 1969  (JS – BR ER commercial of this date said ‘forthwith’) ; still open. Excursion Platform , only reference seen:  D aily  Herald 13 August 1920 –  excursion platform used for first time for six years.
	Alt: ['CENTRAL', 'S-on-SEA']

PELLON
[Halifax High Level] op 5 September 1890 (RCG) ; clo 1 January 1917 (RCH) . GN ticket (JB) shows as HALIFAX P.
	Alt: ['HALIFAX P']

SOUTH GOSFORTH
(non-tt): open day; 17 September 1989  (@JF).
	Alt: []

TALSARN
[GW] op 12 May 1911 (co n Lampeter) ; HALT; clo 12 February 1951 (Cl) – see 1951**. Added hb 1927a as T  PLATFORM and thus to 1938.
	Alt: ['T PLATFORM']

HOLBORN VIADUCT
[LCD] op 2 March 1874 (co n T 2 nd ) ; clo 29 January 1990 (last train Friday 26 th ) (T 27 th ).
	Alt: []

WOODHILL ROAD
(a) op 3 July 1905  (co n Bury Guardian 1 July) ;  HALT; clo 1 April 1918  (Cl) . LY ticket for BURY W R  (JB