# Parsedir

Parse Langley-style city directory into individual entries

Assumes OCR has been done and that directory is in text format

Currently has two hard-coded parameters (that should be inferred in a future version):
- window size
- threshold for clustering

In [1]:
from __future__ import division
import collections
import editdistance
import json
import matplotlib.pyplot as plt
import numpy as np
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
import regex as re
import scipy.cluster as cl
import scipy as sp
import sqlite3
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Scatter, Figure, Layout

In [2]:
%matplotlib inline
init_notebook_mode(connected=True)

In [3]:
datadir = "data"

In [6]:
def buildindex(indexfile):
    # load file
    # 'data/directoryindex.txt'
    f = open(indexfile, 'r')
    f.seek(0)
    contents = f.read()
    lines = re.split("\n", contents)
    conn = sqlite3.connect('data/index.db')
    c = conn.cursor()

    c.execute('''CREATE TABLE sections (publisher text, year int, file text, section text,
       subsection text, start int, end int)''')

    for line in lines:
        if len(line) > 0:
            fields = re.split("\|", line)
            fieldlist = [field for field in fields]
            execstring = "INSERT INTO sections VALUES ('%s', %s, '%s', '%s', '%s', %s, %s)" % tuple(fieldlist)
            c.execute(execstring)

    conn.commit()
    conn.close()

In [7]:
buildindex(datadir + "/directoryindex.txt")

OperationalError: table sections already exists

In [8]:
# how close in alphabetical order is b following a?
def alphabetized(a, b, token_dict):
    da = token_dict[a.lower()]#[i for i,x in enumerate(sorted_tokens) if x==a.lower()]
    db = token_dict[b.lower()]#[i for i,x in enumerate(sorted_tokens) if x==b.lower()]
    ld = da - db #np.diff(zip(da, db))[0][0]
    return ld

def letterdist(a, b):
    return ord(a) - ord(b)

def levenshtein(a, b):
    return float(editdistance.eval(a, b))

def sortdistance(a, b, token_dict):
    return abs(alphabetized(a, b, token_dict))

# pairwise distance matrix
def sortdist(z, token_dict):
    Y = np.zeros((len(z)*(len(z)-1))//2, dtype=np.int)
    k = 0
    for i in range(len(z) - 1):
        for j in range(i + 1,len(z)):
            Y[k] = np.double(abs(token_dict[z[i].lower()] - token_dict[z[j].lower()]))
            k = k+1
    return Y

def firstword(line):
    words = re.split(r'[,\(]+', line)
    return words[0]#.rstrip()

def rebuild(tokenlist, newline_index):
    outtokenlist = list(tokenlist)
    for i, c in enumerate(outtokenlist):
        if c=="\n":
            outtokenlist[i] = ""
    for i, c in enumerate(newline_index):
        outtokenlist[c] = "\n%s" % (outtokenlist[c])
    joinedstring = "".join(outtokenlist)
    outtokenlist = re.split("\n", joinedstring)
    outtokenlist = [re.sub(" $", "", i) for i in outtokenlist]
    return outtokenlist

def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]
    
def sectionindex(year, section, **kwargs):
    if 'publisher' in kwargs:
        publisher = kwargs['publisher']
    else:
        publisher = "Langley"
    conn = sqlite3.connect('data/index.db')
    conn.row_factory = sqlite3.Row
    c = conn.cursor()
    execstring = "select * from sections where year=%d and publisher like '%s' and section like '%s'" % (year, publisher, section)
    if 'subsection' in kwargs:
        execstring = "%s and subsection like '%s'" % (execstring, kwargs['subsection'])
    c.execute(execstring)
    r = c.fetchone()
    return {'file':r['file'], 'start':r['start'], 'end':r['end']}

def parsefile(source):
    # load file
    # '1867Langley/1867LangleyDirectoryIndividuals.txt'
    f = open(source['file'], 'r')
    f.seek(0)
    contents = f.read()
    lines = re.split("\n", contents)
    section = lines[source['start']:source['end']]
    contents = "\n".join(section)
    contents = re.sub("- \n", "-\n", contents)
    contents = contents.replace('\xa0', ' ')
    contents = contents.replace('\xa2', 'c')
    contents = contents.replace('\x80', '')
    contents = contents.replace('\xe2', 'a')
    contents = contents.replace('\x96', '')
    contents = contents.replace('\xc2\xbb', 's')
    return contents
    
def parsecontents(contents):
    # split by non-word characters, retaining split character
    #tokens = re.split("(\s)", contents)
    tokens = re.split("(\W)", contents)
    # remove '' tokens
    tokens = [token for i,token in enumerate(tokens) if token!='']
    return tokens

def indexnewlines(tokens):
    newline_idx = [i+1 for i,x in enumerate(tokens) if x=="\n"]
    del newline_idx[-1]
    newline_idx = [i for i in newline_idx if tokens[i]!="\n"]
    return newline_idx

def sorttokens(tokens, newline_idx):
    # find frequency of each lowercased token that is on a newline
    tokenslc = [token.lower() for token in [tokens[i] for i in newline_idx]]
    counter = collections.Counter(tokenslc)

    # sort the unique instances of each token. used for finding alphabetical proximity
    sorted_tokens = sorted(counter.keys())
    sorted_tokens_map = dict();
    for i in range(len(sorted_tokens)):
        sorted_tokens_map[sorted_tokens[i]] = i
    return sorted_tokens_map

def parselines(tokens, newline_idx, sorted_tokens_map, window, threshold):
    #window = 100
    #threshold = 200
    entries = []
    newline_idxs = [i for i in chunks(newline_idx, 100)]
    for chunk in newline_idxs:
        z = np.asarray([tokens[i] for i in chunk]) #newline_idx[0:window]])
        y = sortdist(z, sorted_tokens_map)
        Z = cl.hierarchy.single(y)
        T = cl.hierarchy.fcluster(Z, threshold, criterion='distance')
        clusters = collections.Counter(T)
        largest_cluster_id = clusters.most_common(1)[0][0]
        entries.extend([chunk[i] for i, c in enumerate(T) if c==largest_cluster_id])
    #dend = cl.hierarchy.dendrogram(Z, color_threshold=threshold)
    #entries = entries + rebuild(tokens[0:1580], [chunk[i] for i, c in enumerate(T) if c==largest_cluster])

    lines = rebuild(tokens, entries)
    return lines

### Iteration for fixing directory sections

- Get the indices and filename for the street directory section:
  `source = sectionindex(1863, 'individual')` or
  `source = sectionindex(1863, 'places', subsection='street_directory')` 

- Load the directory section:
  `sourcetext = parsefile(source)`
  
- Parse the contents of the section. This splits on non-word character:
  `tokens = parsecontents(sourcetext)`
  
- Get an index of tokens preceded by a newline
  `newline_idx = indexnewlines(tokens)`

- Sort the tokens at the head of each line
  `sorted_tokens_map = sorttokens(tokens, newline_idx)`

- Cluster head-of-line tokens by overall sort order in windows of size 50 (in this example). Threshold by 20 (in this case) and identify the biggest cluster. Join and then re-split by line.
  `lines = parselines(tokens, newline_idx, sorted_tokens_map, 50, 20)`
  
This process should result in a list of strings each corresonding to an entry of text in the directory section.

In [9]:
source=sectionindex(1863, 'individual')
sourcetext=parsefile(source)
tokens=parsecontents(sourcetext)
newline_idx = indexnewlines(tokens)
sorted_tokens_map = sorttokens(tokens, newline_idx)
lines = parselines(tokens, newline_idx, sorted_tokens_map, 100, 200)

In [11]:
# fixes for street directory
def streetdirectoryfix0(line, fixes):
    # fixes = streetname_fixes
    first_word = firstword(line)
    fix = fixes.get(first_word)
    if fix==None:
        line = "\n%s" % line
    else:
        if fix=='':
            line = " %s" % line
        elif fix=='\x7f':
            line = ''
        else:
            line = re.sub(re.escape(first_word), fix, line)
            line = "\n%s" % line
    return line

def streetdirectoryfix1(line, fixes):
    # fixes = lines_fixes
    fix = fixes.get(line)
    if fix==None:
        line = line
    else:
        line = fix
    line = re.sub('-\s', '-', line)
    return line

### Iteration for fixing the street directory

Given the parsed lines ('`lines`') as described in the previous section, do the following:

- Find the first-pass guess of the streetname in the directory.

- Turn this into a dictionary with each name as a key and 'None' as a value.

`
foundwords = [firstword(i) for i in lines]
streetname_fixes = {key: None for key in foundwords}
for key in streetfix[0]:
    streetname_fixes[key] = streetfix[0][key]
`

- Copy output of this dictionary to input and delete any key that is correct. Annotate the remaining in the value field with a corrective action. Corrective actions are
    - '' :the line was mistakenly given a newline and should be rejoined with the previous line
    - '\x7f' : the line should be deleted entirely
    - a replacement string, including corrections. If the streetname accidentally includes the whole line, a comma should be placed after the name proper. Alternatives such as "Benton or Devisidero" should be parenthesized: "Benton (or Devisidero)".
    
- The resulting dict with corrective actions should be stored in position 0 of a list 'streetfix'.

- 'streetfix' should be saved as a json file in the directory corresponding to the year and publisher.

In [23]:
source=sectionindex(1864, 'places', subsection='street_directory')
sourcetext=parsefile(source)
tokens=parsecontents(sourcetext)
newline_idx = indexnewlines(tokens)
sorted_tokens_map = sorttokens(tokens, newline_idx)
lines = parselines(tokens, newline_idx, sorted_tokens_map, 50, 20)
with open('1864Langley/streetfix.json', 'r') as fp:
    streetfix = json.load(fp)
fixedlines = [streetdirectoryfix0(i, streetfix[0]) for i in lines]
fixedlines2 = [streetdirectoryfix1(i, streetfix[1]) for i in fixedlines]
fixedcontent = ''.join(fixedlines2)
fixedlines3 = re.split('\n', fixedcontent)
fixedlines4 = [re.sub('- ','-',i) for i in fixedlines3]
Langley1864Streets = [firstword(i) for i in fixedlines3]

In [12]:
source=sectionindex(1863, 'places', subsection='street_directory')
sourcetext=parsefile(source)
tokens=parsecontents(sourcetext)
newline_idx = indexnewlines(tokens)
sorted_tokens_map = sorttokens(tokens, newline_idx)
lines = parselines(tokens, newline_idx, sorted_tokens_map, 50, 20)
with open('1863Langley/streetfix.json', 'r') as fp:
    streetfix = json.load(fp)
fixedlines = [streetdirectoryfix0(i, streetfix[0]) for i in lines]
fixedlines2 = [streetdirectoryfix1(i, streetfix[1]) for i in fixedlines]
fixedcontent = ''.join(fixedlines2)
fixedlines3 = re.split('\n', fixedcontent)
fixedlines4 = [re.sub('- ','-',i) for i in fixedlines3]
Langley1863Streets = [firstword(i) for i in fixedlines4]

In [24]:
Langley1864Streets

['',
 '',
 'Ada',
 'Ada Court',
 'Adelaide Place',
 'Adele Place',
 'Adelphi Place',
 'Adler',
 'Adona Place',
 'Agnes Lane',
 'Alabama',
 'Alameda',
 'Alamo Square',
 'Alcatraces Square',
 'Allen',
 'Almera',
 'Alta',
 'Alta ',
 'Alta Plaza',
 'Andrew',
 'Ankeny Place',
 'Ann',
 'Anna',
 'Annie',
 'Anthony',
 'Antonio',
 'Arkansas',
 'Ashburton Place ',
 'Ashland Place',
 'Auburn',
 'August Alley',
 'Austin',
 'Bagley Place',
 'Bailey Alley',
 'Baker',
 'Balance',
 'Baldwin Court',
 'Bannam Place',
 'Baright Place',
 'Barret Alley',
 'Bartlett Alley',
 'Bartol',
 'Battery',
 'Bay',
 'Bay Avenue',
 'Bay View Place',
 'Beach',
 'Beale',
 'Brannan',
 'Beale Place',
 'Bedford Place',
 'Balden',
 'Bellair Place',
 'Benton ',
 'Benzi',
 'Bernard',
 'Berry',
 'Berry',
 'Bertha',
 'Bestole',
 'Beverly Place',
 'Billings Place',
 'Birch',
 'Bluxome',
 'Bluxome East',
 'Bone Alley',
 'Boston Place',
 'Bower Place',
 'Boyd',
 'Brady',
 'Brady Place',
 'Brandon Alley',
 'Brannan',
 'Brenham Place

In [41]:
# 1863Langley street fixes

streetfix = [{
 'PUBLIC STREETS':'\x7f',
 'Keamv':'Kearny',
 'iJiero':'',
 'Lilierty':'Liberty',
 'Jovful Alley':'Joyful Alley',
 'Holf Avenue':'Hoff Avenue',
 'and Twenty -Second':'',
 'Buriitt':'Burritt',
 'Center. S s Biyant SE to South Park':'Center, S s Bryant SE to South Park',
 'Cohn Place S 8 Jackson bet Leav and Hyde':'Cohn Place, S s Jackson bet Leav and Hyde',
 'Dent Place. N s Jackson bet Stockton and Powell':'Dent Place, N s Jackson bet Stockton and Powell',
 'N to Lewis':'',
 'Mathews tlace':'Mathews Place',
 "O'Farrell Jlary":"O'Farrell Alley",
 'Adelaide Plate':'Adelaide Place',
 'Adtlk- Place':'Adelle Place',
 'Adeliilii Place':'Adelphi Place',
 'Aj':'Agnes',
 'AlaiiKi Siiuare':'Alamo Square',
 'Alcalraces Square':'Alcatraces Square',
 'Ayues Alley':'Aynes Alley',
 'Bag LEY':'Bagley',
 'Bakur':'Baker',
 'Barigiit Place':'Baright Place',
 'Bay Avemie':'Bay Avenue',
 'Benton or Devisidero':'Benton (or Devisidero)',
 'Bertha W s Beale bot Mission and Howard': 'Bertha, W s Beale bot Mission and Howard',
 'Brannau':'Brannan',
 'Beldeii':'Belden',
 'Ben/.i':'Benzi',
 'BUSWELL & CO':'\x7f',
 'BUSWELL & CO.':'\x7f',
 'Bnrritt':'Burritt',
 'Cbesley':'Chesley',
 'Columbia S':'Columbia Square',
 'Custom H/:>use Place':'Custom House Place',
 'Eagi.e ':'Eagle ',
 'Eighth ^late Price) S s Market bet Seventh and Ninth':'Eighth (late Price) S s Market bet Seventh and Ninth',
 'EUick Alley':'Ellick Alley',
 'Ennna':'Emma',
 'Folsom bet First and Second':'Folsom, bet First and Second',
 "Fort'Place":'Fort Place',
 'Fourteentli':'Fourteenth',
 'Fourteenth SE to Harrison':'Fourteenth, SE to Harrison',
 'Fremont Court or Clay Street Avenue':'Fremont Court (or Clay Street Avenue)',
 'Glenwood Plaie':'Glenwood Place',
 'Gonf':'Gough',
 'Grand Phice':'Grand Place',
 'Gueirero':'Guerrero',
 'Hall Couit':'Hall Court',
 'Havwood':'Haywood',
 'Hein-y':'Henry',
 'Hermatni Place':'Hermann Place',
 "Hodgea I'lace":'Hodges Place',
 "Holi' Avenue":'Hoff Avenue',
 'ladel Place':'Isdel Place',
 "I'ngton":'',
 'Jacksok':'Jackson',
 "Jctt'erson Square":'Jefferson Square',
 'Kearnv':'Kearny',
 'Lafayette Ave.vue':'Lafayette Avenue',
 'licwis':'Lewis',
 'Lick Place. N s Post bet Montgomery and Kearny':'Lick Place, N s Post bet Montgomery and Kearny',
 'Lima. N 8 Filbert bet Leavenworth and Hyde':'Lima, N s Filbert bet Leavenworth and Hyde',
 "Lombard I'lace":'Lombard Place',
 'Lone Momitain Avenue':'Lone Mountain Avenue',
 'Lone Jlountain Avenue':'Lone Mountain Avenue',
 'ington':'',
 'Mathew or Jane':'Mathew (or Jane)',
 'Mathew Lane or West Mathew':'Mathew Lane (or West Mathew)',
 'Melius':'Mellus',
 "Morel 'Place":'Morel Place',
 'Pfeiner':'Pfeiffer',
 'Poet Office Place':'Post Office Place',
 'Ninth SE to Channel':'',
 'Pike or Waverley Place':'Pike Place (or Waverley Place)',
 'Pike or Waverly Place':'Pike Place (or Waverley Place)',
 'QuiNCY':'Quincy',
 'Randai.i':'Randall Place',
 'Raspcttc Place':'Rassette Place',
 'Rped':'Reed',
 'Rilev':'Riley',
 'Ritcli':'Ritch',
 'Roacli Alley':'Roach Alley',
 'Ro88 ':'Ross ',
 'Rousch.':'Rousch',
 'Sacramekto':'Sacramento',
 'SAN FRANCISCO STREET DIRECTORY. 429':'\x7f',
 'SAN FRANCISCO STREET DIRECTORY.':'\x7f',
 "Sharp i'lace":'Sharp Place',
 'Sherwood PJace':'Sherwood Place',
 'Sonoma or Sonora Phice':'Sonoma Place (or Sonora Place)',
 'to the Bay':'',
 'to the Bay':'',
 'to Devisidero':'',
 "Sonoma or So'nora I'lace":'Sonoma Place (or Sonora Place)',
 'Sophie Teri-ace':'Sophie Terrace',
 'Spoflbni':'Spofford',
 'St. Charles IMace':'St. Charles Place',
 'ton':'',
 'Sunmier':'Summer',
 "Tay'":'Tay',
 'teenth':'',
 "Thompson I'lace":'Thompson Place',
 'Thome ':'Thorne ',
 'Tborne':'Thorne',
 'to tlie bay':'',
 'Ton':'Tonquin',
 'Touchard .Mley ':'Tonchard Alley ',
 'Thirteenth SE to Harrison':'',
 'Twenty-Seconcf ':'Twenty-Second ',
 'ty-First W to Castro':'',
 "Union I'lace":'Union Place',
 'Van Xess Avenue':'Van Ness Avenue',
 'Vareiine':'Varenne',
 'Ver Mebr':'Ver Mehr',
 'Vernoti Place':'Vernon Place',
 'Vincent or St. Vincent':'Vincent (or St. Vincent)',
 'Viiginia':'Virginia',
 'Wai-i.ace Place':'Wallace Place',
 "Wasliin^ton Avenue":"Washington Avenue",
 "Webstei'":'Webster',
 'to Lewis':'',
 'Willow Avenue.':'Willow Avenue',
 'Winter8 Alley':'Winters Alley',
 '[compiled fkom official and authentic sources.]': '\x7f',
 'and Twenty-Second': '',
 'bet Sutter and Post': '',
 'eenth and Twentieth W to Castro': '',
 'giuia': '',
 'ington': '',
 'kin SE to Mission Creek': '',
 'ladel Place': 'Isdel Place',
 'licwis': 'Lewis',
 'teenth': '',
 'to Devihidero': '',
 'to Lewis': '',
 'to the Bay': '',
 'to the bay':'',
 'ton': '',
 'ty-First W to Castro': '',
 'W^allace Place':'Wallace Place',
 'W^aller':'Waller',
 'to Lewis a':'',
 'White. N 8 Vallejo bet Hyde and Larkin':'White, N s Vallejo bet Hyde and Larkin',
 'ZoE':'Zoe'
},
{
"\nCastro, S s Ridley W of Xoe t'atarro Place, W s Taylor bet Post and Geary":
"\nCastro, S s Ridley W of Xoe\nCatarro Place, W s Taylor bet Post and Geary",
'\nCentral Place, S s Pine bet Dnpont and Kearny (Jhambers, W s Front bet Pacitic and Broadwiiy':
'\nCentral Place, S s Pine bet Dnpont and Kearny\nChambers, W s Front bet Pacific and Broadway',
'\nGibb, W s\'Slaiden Lane bet Jackson and Wash A. BOMAN" & CO., 417 & 419 Montgomery St., Booksellers, Importers, and Publishers. 25 386 SAN FRANCISCO STREET DIRECTORY.':
'\nGibb, W s\'Slaiden Lane bet Jackson and Wash',
'\nHubbard, S a Howard bet Second and Third llul.bcll, W a Fifth bet Irwin and South SW to Seventll':
'\nHubbard, S a Howard bet Second and Third\nHubbell, W a Fifth bet Irwin and South SW to Seventll',
'\nJasper Place, N s Union bet Dupont and Stockton a lefierson, W a Powell N of Beacli to Devisidero':
'\nJasper Place, N s Union bet Dupont and Stockton\nJefferson, W a Powell N of Beacli to Devisidero',
'\nLynch, W B Leavenworth bet Pacific and Broad-way BTJSWELL & CO., Blank Book Manufacturers and Paper Rvders, 517 Clay Street. SAN FRANCISCO STREET DIRECTORY, 387':
'\nLynch, W B Leavenworth bet Pacific and Broad-way',
'\nMahon Place, N s Geary bet Hyde and Larkin Slaiden Lane, S s Jackson bet Montgomery and':
'\nMahon Place, N s Geary bet Hyde and Larkin\nMaiden Lane, S s Jackson bet Montgomery and',
'\nKearny':'',
'\nMontgomery Place, W s Montgomery bet Union and Filbert ]\\Ioore, N s Union bet Hyde and Larkin':
'\nMontgomery Place, W s Montgomery bet Union and Filbert\nMoore, N s Union bet Hyde and Larkin',
'\nRandall Place, Pr.ACE, N s Greenwich bet Hyde and Leavenworth A. ROMAN & CO., 417 & 419 Montgomery Street, Standard and Miscellaneous Books. 388':
'\nRandall Place, Pr.ACE, N s Greenwich bet Hyde and Leavenworth',
"\nReed, X M Clay l)et JoncB and Leavenworth Kced Place, S's (Jreen hot Kearny and Montgomery":
"\nReed, X M Clay l)et JoncB and Leavenworth\nReed Place, S's (Jreen hot Kearny and Montgomery",
'\nRichard, S s Suiter near Jones Kichmoiul, W s Front het Sacramento and California':
'\nRichard, S s Suiter near Jones\nRichmond, W s Front het Sacramento and California',
"\nRitter, S s Harrison l)et Seventh and Eijjhth Kciacli. W s Zoe bet Bryant and ilari'ison":
"\nRitter, S s Harrison l)et Seventh and Eijjhth\nRoach, W s Zoe bet Bryant and ilari'ison",
'\nRousch. N 8 Folsom bet Sixth and Seventh':'\nRousch, N 8 Folsom bet Sixth and Seventh',
'\nTonquin(iuin, from Larkin bet Lewis and Jefferson W':
'\nTonquin, from Larkin bet Lewis and Jefferson W',
"\nVallejo, W s Davis bet Green and Broadway W to I'jevisidero ^'allejo I'lace, X^ s Vallejo bet Stockton and Po%yell":
"\nVallejo, W s Davis bet Green and Broadway W to I'jevisidero\nVallejo Place, X^ s Vallejo bet Stockton and Po%yell",
"\nValparaiso, W s Mason bet Filbert and Greenwich \\'ande water, W s Pcnvell bet Francisco and Bay":
"\nValparaiso, W s Mason bet Filbert and Greenwich\nVandewater, W s Pcnvell bet Francisco and Bay",
'\nVan Ness Avenue, junction Oak and Market N to Lewis BUSWEIiL & CO., Book Binders and Printers, 517 Clay Street, San Francisco. SAN FRANCISCO STREET DIRECTORY, 389':
'\nVan Ness Avenue, junction Oak and Market N to Lewis',
'\nWillow Avenue. W s Larkin bet Ellis and Eddy':
'\nWillow Avenue, W s Larkin bet Ellis and Eddy',
'\nTenth':'',
'\nStockton, rear':'\x7fStockton, rear',
'\nStockton':'\x7fStockton',
 '\nMaiden Lane, S s Jackson bet Montgomery and Kearny ^laiden Lane, N s Valiejo bet Powell and Stockton':
 '\nMaiden Lane, S s Jackson bet Montgomery and Kearny\nMaiden Lane, N s Valiejo bet Powell and Stockton',
 '\nMain, S s Market bet Spear and Beale SE to Bryant IMalvina Place, W s Mason bet Clay and Sacramento':
 '\nMain, S s Market bet Spear and Beale SE to Bryant\nMalvina Place, W s Mason bet Clay and Sacramento',
 '\nMargaret Place, N s Valiejo bet Dupont and Kearny !Maria, N s Howard bet Seventh and Eighth Jlariposa, W from the bay to Channel':
 '\nMargaret Place, N s Vallejo bet Dupont and Kearny\nMaria, N s Howard bet Seventh and Eighth\nMariposa, W from the bay to Channel',
 '\nJansen, N 8 Greenwich bet Mason and Taylor BIGELOW & BKOTHEB, Insurance Agents. All losses paid in United States Gold Coin. SAN FRANCISCO STREET DIRECTORY. 427':
 '\nJansen, N 8 Greenwich bet Mason and Taylor',
 "\nMason, junction Turk and Market'N to the bay JIassett, W 8 Seventh bet Mission and Howard":
 "\nMason, junction Turk and Market N to the bay\nMassett, W s Seventh bet Mission and Howard",
 '\nMiller, W s Powell bet Pacific and Broadway Jlills Place, W s Dupont bet Post and Sutter':
 '\nMiller, W s Powell bet Pacific and Broadway\nMills Place, W s Dupont bet Post and Sutter',
 '\nMissouri, Potrero Nuevo IMonroe, N s Bush bet Stockton and Powell':
 '\nMissouri, Potrero Nuevo\nMMonroe, N s Bush bet Stockton and Powell',
 '\nSilver, W s Second bet Harrison and Bryant BIGELOW & BROTHEB., Insurance Agents. All Losses promptly adjusted and paid in G-old.':
 '\nSilver, W s Second bet Harrison and Bryant',
 '\nMoore, N s Union bet Hyde and Larkin IMoore Place, N s Clay bet Hyde aud Jjarkin':
 '\nMoore, N s Union bet Hyde and Larkin\nMoore Place, N s Clay bet Hyde aud Larkin',
 '\nWaverly Court, S 8 Washington bet Dupont and\x7fStockton, rear':'\nWaverly Court, S s Washington bet Dupont and Stockton, rear',
 '\nWaverly Place, N s Sacramento bet Dupont and\x7fStockton':'\nWaverly Place, N s Sacramento bet Dupont and Stockton',
 "\nWilliam Place, W s Davis bet California and Sac-ramento A. SOMAI]' & CO., 417 and 419 Mont. St., Photograph Albums and Portraits of Notable Persons. 430 SAN FRANCISCO STREET DIRECTORY.":
 "\nWilliam Place, W s Davis bet California and Sac-ramento",
 "\nMorse Place, S s Broadway bet Hyde and Leaven-worth A. BOMAIl' & CO., 417 and 419 Montgomery Street, Ne'W Books for sale as soon as published. 428 SAN FEANCISCO STREET DIRECTORY,":
 "\nMorse Place, S s Broadway bet Hyde and Leavenworth"
}
             ]
with open('1863Langley/streetfix.json', 'w') as fp:
    json.dump(streetfix, fp, sort_keys=True, indent=4)

In [22]:
# 1864Langley street fixes

streetfix = [{
 '': None,
 'Benton or Devisidero': 'Benton (or Devisidero)',
 'Buclianan': 'Buchanan',
 'Center. S s Biyant SE to South Park':'Center, S s Bryant SE to South Park',
 'Cohn Place S 8 Jackson bet Leav and Hyde': 'Cohn Place, S 8 Jackson bet Leav and Hyde',
 'Dent Place. N s Jackson bet Stockton and Powell':'Dent Place, N s Jackson bet Stockton and Powell',
 'Duaue':'Duane',
 'Eighth ^late Price) S s Market bet Seventh and Ninth': 'Eighth (late Price) S s Market bet Seventh and Ninth',
 'Fourteenth SE to Harrison': 'Fourteenth, SE to Harrison',
 'Fremont Court (or Clay Street Avenue)': None,
 'Lick Place. N s Post bet Montgomery and Kearny': 'Lick Place, N s Post bet Montgomery and Kearny',
 'Lilierty': 'Liberty',
 'Lima. N 8 Filbert bet Leavenworth and Hyde': 'Lima, N 8 Filbert bet Leavenworth and Hyde',
 "Lombard I'lace": 'Lombard Place',
 'Lone Jlountain Avenue': 'Lone Mountain Avenue',
 'Mathew Lane or West Mathew': 'Mathew Lane (or West Mathew)',
 'Mathew or Jane': 'Mathew (or Jane)',
 'Mathews tlace': 'Mathews Place',
 'N to Lewis': '',
 'Ninth SE to Channel': '',
 "O'Farrell Jlary": "O'Farrell Alley",
 'Pfeifier': 'Pfeiffer',
 'Pike or Waverly Place': 'Pike Place (or Waverly Place)',
 'QuiNCY': 'Quincy',
 'Rusa Alley': 'Russ Alley',
 'SAN FRANCISCO STREET DIRECTORY. 429': '\x7f',
 'Sonoma or Sonora Phice': 'Sonoma Place (or Sonora Place)',
 'Tborne ': 'Thorne',
 'Thirteenth SE to Harrison': '',
 'Vincent or St. Vincent': 'Vincent (or St. Vincent)',
 'W^allace Place': 'Wallace Place',
 'W^aller': 'Waller',
 'White. N 8 Vallejo bet Hyde and Larkin': 'White, N 8 Vallejo bet Hyde and Larkin',
 'ZoE': 'Zoe',
 'Zoe Place': None,
 'and Twenty -Second': '',
 'bet Sutter and Post': '',
 'eenth and Twentieth W to Castro': '',
 'iJiero': '',
 'ington': '',
 'kin SE to Mission Creek': '',
 'sidero': '',
 'teenth': '',
 'to Devisidero': '',
 'to Lewis a': '',
 'to the bay': '',
 'to tlie bay': '',
 'ton': '',
 'ty-First W to Castro': ''},
{
 '\nBuchanan (now Lincoln) junction Market and Eid-ley N to Lewis * Reference is frequently made in the Register of Names to the following, and which are located as follows, viz. : Hoadley\'s Addition, situated west of Grant or Pierce bet Geary and Washington ; Horner\'s Addition, south and near the Mission Dolores; Western Addition, west of Larkin; San Miguel Ranch, south-west and near tlie Mission Dolores, and Bernal Heights, near the San Bruno Road and south of the Totrero Nuevo. See also prominent places, page 431. BIGELO"W & BBOTHEB, Insurance Agents. $250,000 taken in a single risk. SAN FRANCISCO STREET DIRECTORY. 425':
 '\nBuchanan (now Lincoln) junction Market and Eid-ley N to Lewis',
 '\nEllick Alley, N s Pacific bet Dupont and Stockton p]llick Lane, N s California bet Stockton and Powell':
 '\nEllick Alley, N s Pacific bet Dupont and Stockton\nEllick Lane, N s California bet Stockton and Powell',
 '\nEverett, W s Third bet Mission and Howard A. BOMATf & Co., 417 and 419 Montgomery Street, Standard and Miscellaneous Books. 426 SAN FRANCISCO STREET DIRECTORY.':
 '\nEverett, W s Third bet Mission and Howard',
 '\nJansen, N 8 Greenwich bet Mason and Taylor BIGELOW & BKOTHEB, Insurance Agents. All losses paid in United States Gold Coin. SAN FRANCISCO STREET DIRECTORY. 427':
 '\nJansen, N 8 Greenwich bet Mason and Taylor',
 '\nMaiden Lane, S s Jackson bet Montgomery and Kearny ^laiden Lane, N s Valiejo bet Powell and Stockton':
 '\nMaiden Lane, S s Jackson bet Montgomery and Kearny\nMaiden Lane, N s Vallejo bet Powell and Stockton',
 '\nMain, S s Market bet Spear and Beale SE to Bryant IMalvina Place, W s Mason bet Clay and Sacramento':
 '\nMain, S s Market bet Spear and Beale SE to Bryant\nMalvina Place, W s Mason bet Clay and Sacramento',
 '\nMiller, W s Powell bet Pacific and Broadway Jlills Place, W s Dupont bet Post and Sutter':
 '\nMiller, W s Powell bet Pacific and Broadway\nMills Place, W s Dupont bet Post and Sutter',
 '\nMoore, N s Union bet Hyde and Larkin IMoore Place, N s Clay bet Hyde aud Jjarkin':
 '\nMoore, N s Union bet Hyde and Larkin\nMoore Place, N s Clay bet Hyde aud Larkin',
 "\nMorse Place, S s Broadway bet Hyde and Leaven-worth A. BOMAIl' & CO., 417 and 419 Montgomery Street, Ne'W Books for sale as soon as published. 428 SAN FEANCISCO STREET DIRECTORY,":
 "\nMorse Place, S s Broadway bet Hyde and Leavenworth",
 '\nSilver, W s Second bet Harrison and Bryant BIGELOW & BROTHEB., Insurance Agents. All Losses promptly adjusted and paid in G-old.':
 '\nSilver, W s Second bet Harrison and Bryant',
 "\nWilliam Place, W s Davis bet California and Sac-ramento A. SOMAI]' & CO., 417 and 419 Mont. St., Photograph Albums and Portraits of Notable Persons. 430 SAN FRANCISCO STREET DIRECTORY.":
 "\nWilliam Place, W s Davis bet California and Sac-ramento",
    }
]

with open('1864Langley/streetfix.json', 'w') as fp:
    json.dump(streetfix, fp, sort_keys=True, indent=4)

### Tasks for line cleanup

- check for location indicators ("dwl", "cor", "bet")
- check for street names or near street names
- remove hyphenations

### Tools needed

- dictionary of words
- dictionary of streets
- dictionary of abbreviations

In [42]:
def builddictionary():
    f = open('data/unigrams.txt', 'r')
    f.seek(0)
    unigrams_raw = f.read()
    unigrams_lines = re.split("\n", unigrams_raw)
    unigrams = [re.split("\t", i) for i in unigrams_lines]
    dictionary_list = [{i[0] : i[1]} for i in unigrams if len(i) > 1]

    dictionary = {}
    for d in dictionary_list:
        dictionary.update(d)
    return dictionary

def buildstreetsdictionary():
    f = open('data/streets.txt', 'r')
    f.seek(0)
    contents = f.read()
    return contents
    
def parseline(line):
    tokens = re.split(",", line)
    names = re.split(" ", tokens[0])
    lastname = names[0]
    del names[0]
    firstnames = " ".join(names)
    entry_elements = {"lastname" : names[0], "names" : firstnames, "address" : tokens[-1], "middle" : [i for i in tokens[1:len(tokens)-1]]}
    return entry_elements

In [43]:
unigrams = builddictionary()

In [27]:
# Dynamic programming examples
def edDistRecursiveMemo(x, y, memo=None):
    if memo is None: memo = {}
    if len(x) == 0: return len(y)
    if len(y) == 0: return len(x)
    if (len(x), len(y)) in memo:
        return memo[(len(x), len(y))]
#    delt = 1 if x[-1] != y[-1] else 0
    delt = editdistance.eval(x[-1], y[-1])
    diag = edDistRecursiveMemo(x[:-1], y[:-1], memo) + delt
    vert = edDistRecursiveMemo(x[:-1], y, memo) + 1
    horz = edDistRecursiveMemo(x, y[:-1], memo) + 1
    ans = min(diag, vert, horz)
    memo[(len(x), len(y))] = ans
    return ans

def edDistDp(x, y):
    """ Calculate edit distance between sequences x and y using
        matrix dynamic programming. Return distance. """
    D = np.zeros((len(x)+1, len(y)+1), dtype=int)
    D[0, 1:] = range(1, len(y)+1)
    D[1:, 0] = range(1, len(x)+1)
    for i in range(1, len(x)+1):
        for j in range(1, len(y)+1):
            delt = 1 if x[i-1] != y[j-1] else 0
#            delt = editdistance.eval(x[-1], y[-1]) if x[i-1] != y[j-1] else 0
            D[i, j] = min(D[i-1, j-1]+delt, D[i-1, j]+1, D[i, j-1]+1)
#    return D[len(x), len(y)]
    return D

In [28]:
D = edDistDp(Langley1863Streets, Langley1864Streets)

In [29]:
D[0:20,0:15]

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
       [ 1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13],
       [ 2,  1,  1,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12],
       [ 3,  2,  2,  2,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
       [ 4,  3,  3,  3,  2,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10],
       [ 5,  4,  4,  4,  3,  2,  2,  3,  4,  5,  6,  7,  8,  9, 10],
       [ 6,  5,  5,  5,  4,  3,  3,  2,  3,  4,  5,  6,  7,  8,  9],
       [ 7,  6,  6,  6,  5,  4,  4,  3,  2,  3,  4,  5,  6,  7,  8],
       [ 8,  7,  7,  7,  6,  5,  5,  4,  3,  2,  3,  4,  5,  6,  7],
       [ 9,  8,  8,  8,  7,  6,  6,  5,  4,  3,  3,  4,  5,  6,  7],
       [10,  9,  9,  9,  8,  7,  7,  6,  5,  4,  4,  4,  5,  5,  6],
       [11, 10, 10, 10,  9,  8,  8,  7,  6,  5,  5,  5,  5,  6,  5],
       [12, 11, 11, 11, 10,  9,  9,  8,  7,  6,  6,  6,  6,  6,  6],
       [13, 12, 12, 12, 11, 10, 10,  9,  8,  7,  7,  7,  7,  7,  7],
       [14, 13, 13, 13, 12, 11, 11

In [49]:
print(len(Langley1863Streets), len(Langley1864Streets))

617 697


In [36]:
D[8,9]

2

In [34]:
(Langley1863Streets[1:10],Langley1864Streets[2:11])

(['Ada',
  'Ada Court',
  'Adelaide Place',
  'Adelle Place',
  'Adelphi Place',
  'Adler',
  'Adona Place',
  'Agnes',
  'Alamo Square'],
 ['Ada',
  'Ada Court',
  'Adelaide Place',
  'Adele Place',
  'Adelphi Place',
  'Adler',
  'Adona Place',
  'Agnes Lane',
  'Alabama'])

In [52]:
Langley1864Streets

['',
 '',
 'Ada',
 'Ada Court',
 'Adelaide Place',
 'Adele Place',
 'Adelphi Place',
 'Adler',
 'Adona Place',
 'Agnes Lane',
 'Alabama',
 'Alameda',
 'Alamo Square',
 'Alcatraces Square',
 'Allen',
 'Almera',
 'Alta',
 'Alta ',
 'Alta Plaza',
 'Andrew',
 'Ankeny Place',
 'Ann',
 'Anna',
 'Annie',
 'Anthony',
 'Antonio',
 'Arkansas',
 'Ashburton Place ',
 'Ashland Place',
 'Auburn',
 'August Alley',
 'Austin',
 'Bagley Place',
 'Bailey Alley',
 'Baker',
 'Balance',
 'Baldwin Court',
 'Bannam Place',
 'Baright Place',
 'Barret Alley',
 'Bartlett Alley',
 'Bartol',
 'Battery',
 'Bay',
 'Bay Avenue',
 'Bay View Place',
 'Beach',
 'Beale',
 'Brannan',
 'Beale Place',
 'Bedford Place',
 'Balden',
 'Bellair Place',
 'Benton ',
 'Benzi',
 'Bernard',
 'Berry',
 'Berry',
 'Bertha',
 'Bestole',
 'Beverly Place',
 'Billings Place',
 'Birch',
 'Bluxome',
 'Bluxome East',
 'Bone Alley',
 'Boston Place',
 'Bower Place',
 'Boyd',
 'Brady',
 'Brady Place',
 'Brandon Alley',
 'Brannan',
 'Brenham Place