In [42]:
import os
import re
from hydraseq import Hydraseq

## basic string breaker "word word2" -> ['word', 'word2']

In [43]:
def load_category_from_file(fpath):
    """Take a one word per line file and return a regex for the concatenation '^(w1|w2)$'"""
    with open(fpath, 'r') as source:
        ways = [line.strip().lower() for line in source]
    return r'^(' + "|".join(ways) + r')$'

def w(str_sentence):
    return re.findall(r"[\w'/-:]+|[.,!?;#&]", str_sentence)


In [44]:
seq = Hydraseq('input')

## Encoder, takes a single word and returns a list of 'features' characterizing word

In [45]:
ways = load_category_from_file('ways.csv')
apts = load_category_from_file('apts.csv')
nths = load_category_from_file('nth.csv')
dirs = load_category_from_file('dirs.csv')
arti = load_category_from_file('arti.csv')
pre  = load_category_from_file('pre.csv')
deleg = load_category_from_file('deleg.csv')
print(apts)
def encoder(word, trim=False):
    encodings = [
        ('POBOX', [r'^:box$']),
        ('LETTER', [r'^[a-z]$']),
        ('ALNUM', [r'^(\d+[a-z]+|[a-z]+\d+)[\da-z]*$']),
        ('AL_1NUM', [ r'^\d+[a-z]$' ]),
        ('NTH',    [ nths ]),
        ('ATTN', [r'^:deleg$']),
        ('WAY',  [ ways ]),
        ('DIR',  [ dirs ]),
        ('NUMSTR', [r'^\d+[a-z]+$' ]),
        ('APT', [ apts ]),
        ('DIGIT', [r'^\d+$']),
        ('COMMA', [r'^,$']),
        ('PERIOD', [r'^\.$']),
        ('POUND', [r'^#$']),
        ('DELEG', [r'^attn$', r'^attn:$', r'^c\/o$', r'^co$' ]),
        ('POB0', [r'^po$', r'^p\.o\.$']),
        ('POB2', [r'^box$']),
        ('PRE',  [ pre ]),
        ('ARTI', [ arti ]),
        ('ADDRESS', [r'^:adr$']),
        ('ALPHA', [r'^[a-z]+$']),
    ]
    if not trim:
        return [key for key, rexs in encodings for rex in rexs if re.match(rex, word)]
    else:
        encoding = [key for key, rexs in encodings for rex in rexs if re.match(rex, word)]
        if len(encoding) > 1 and 'ALPHA' in encoding:
            encoding.remove('ALPHA')
        if len(encoding) > 1 and 'ALNUM' in encoding:
            encoding.remove('ALNUM')
        return encoding
            
        


def encode_sentence(st):
    return [encoder(word.lower()) for word in w(st)]

def encode_nested_list(st):
    return [[' '.join(encoder(word))] for word in w(st)]

def encode_from_word_list(arr_st):
    """Expects ['123', 'main', 'st]"""
    assert isinstance(arr_st, list)
    if arr_st: assert isinstance(arr_st[0], str)
    return [encoder(word) for word in arr_st]

def train(seq, sts):
    return seq.insert(encode_nested_list(sts)).get_next_values()

def train_with_provided_list(seq, matrix_lst):
    """matrix list means [['DIGIT'],['ALPHA'],['WAY']] for example"""
    return seq.insert(matrix_lst, is_learning=True).get_next_values()

def train_arr(seq, arr_w):
    return seq.insert([[' '.join(encoder(word))] for word in arr_w]).get_next_values()

def think(seq, sts):
    return seq.insert(encode_nested_list(sts), is_learning=False).get_next_values()

st = "100 n main st"
print(train(seq, st))
print(train_arr(seq, ["1217", "iris", "ct"]))
print(train_with_provided_list(seq, [['DIGIT'],['ALPHA']]))
print(train_with_provided_list(seq, [['DIGIT'], ['ALPHA'], ['WAY'], ['ADDRESS']]))
print(encode_from_word_list(['123','st','james','st']))

^(apartment|apt|suite|ste|unit|lot|trlr|ro|spc|rm|fl|lbby|pmb|spc|)$
[]
[]
['WAY ALPHA']
[]
[['DIGIT'], ['WAY', 'PRE', 'ALPHA'], ['ALPHA'], ['WAY', 'PRE', 'ALPHA']]


In [46]:
# THOU SHALL NOT SEQUENCE THREE ALPHAS IN A ROW!
tuples_training = [
("123 main st",                    [['DIGIT'], ['ALPHA'], ['WAY']]),
("1217 iris ct",                   [['DIGIT'], ['ALPHA'], ['WAY']]),
("123 east main st",               [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY']]),
("310 W MAIN ST",                  [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY']]),
("123 south main street.",         [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY'], ['PERIOD']]),
("90 south park rd",               [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY']]),
("987 el canyon ave",              [['DIGIT'], ['ARTI'], ['ALPHA'], ['WAY']]),
("333 st james st",                [['DIGIT'], ['PRE'], ['ALPHA'], ['WAY']]),
("123 west hill",                  [['DIGIT'], ['DIR'], ['ALPHA']]),
("77 el camino real",              [['DIGIT'], ['ARTI'], ['ALPHA'], ['ALPHA']]),
("1 bay st # b",                   [['DIGIT'], ['ALPHA'], ['WAY'], ['POUND'], ['LETTER']]),
("1 bay rd # b",                   [['DIGIT'], ['ALPHA'], ['WAY'], ['POUND'], ['LETTER']]),
("43 HUDSON AVE # 2",              [['DIGIT'], ['ALPHA'], ['WAY'], ['POUND'], ['DIGIT']]),
("111 san vicente blvd",           [['DIGIT'], ['PRE'], ['ALPHA'], ['WAY']]),
("28 black watch way",             [['DIGIT'], ['ALPHA'], ['ALPHA'], ['WAY']]),
("249 route 206",                  [['DIGIT'], ['WAY'], ['DIGIT']]),
("5500 lost tree",                 [['DIGIT'], ['ALPHA'], ['ALPHA']]),
("996 san benito st",              [['DIGIT'], ['PRE'], ['ALPHA'], ['WAY']]),
("899 embarcadero",                [['DIGIT'], ['ALPHA']]),
("177 montague e",                 [['DIGIT'], ['ALPHA'], ['DIR']]),
("2001 W COURT ST",                [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY']]),
("12545 S HIGHWAY J",              [['DIGIT'], ['DIR'], ['WAY'], ['LETTER']]),
("437 S HIGHWAY 101 STE 501",      [['DIGIT'], ['DIR'], ['WAY'], ['DIGIT'], ['APT'], ['DIGIT']]),
("12940 S HIGHWAY 259",            [['DIGIT'], ['DIR'], ['WAY'], ['DIGIT']]),
("775 E 14 MILE RD",               [['DIGIT'], ['DIR'], ['DIGIT'], ['ALPHA'], ['WAY']]),
("4701 VAN DAM ST",                [['DIGIT'], ['ALPHA'], ['ALPHA'], ['WAY']]),
("3201 NEW MEXICO AVE NW STE 246", [['DIGIT'], ['ALPHA'], ['ALPHA'], ['WAY'], ['DIR'], ['APT'], ['DIGIT']]),
("111 RIO RANCHO BLVD SE",         [['DIGIT'], ['ALPHA'], ['ALPHA'], ['WAY'], ['DIR']]),
("17259 WILD HORSE CREEK RD",      [['DIGIT'], ['ALPHA'], ['ALPHA'], ['ALPHA'], ['WAY']]),
("12587 FAIR LAKES CIR STE 141",   [['DIGIT'], ['ALPHA'], ['ALPHA'], ['WAY'], ['APT'], ['DIGIT']]),
("107 E MARIPOSA DR",              [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY']]),
("401 W FORT WILLIAMS ST",         [['DIGIT'], ['DIR'], ['ALPHA'], ['ALPHA'], ['WAY']]),
("3242 E DESERT INN RD STE 17",    [['DIGIT'], ['DIR'], ['ALPHA'], ['ALPHA'], ['WAY'], ['APT'], ['DIGIT']]),
("14238 44ND AVE",                 [['DIGIT'], ['NUMSTR'], ['WAY']]),
("106 HARVEST HILL LN",            [['DIGIT'], ['ALPHA'], ['ALPHA'], ['WAY']]),
("206 ELMHURST DR APT F15",        [['DIGIT'], ['ALPHA'], ['WAY'], ['APT'], ['ALNUM']]),
("655 COUNTY ROAD 1300",           [['DIGIT'], ['ALPHA'], ['WAY'], ['DIGIT']]),
("50 LOS PATOS WAY",               [['DIGIT'], ['ARTI'], ['ALPHA'], ['WAY']]),
("1001 N 60TH ST",                 [['DIGIT'], ['DIR'], ['NUMSTR'], ['WAY']]),
("5350 S SANTA FE DR",             [['DIGIT'], ['DIR'], ['ALPHA'], ['ALPHA'], ['WAY']]),
("638 18TH ST",                    [['DIGIT'], ['NUMSTR'], ['WAY']]),
("210 E 3RD AVE",                  [['DIGIT'], ['DIR'], ['NUMSTR'], ['WAY']]),
("1390 SW 160TH AVE",              [['DIGIT'], ['DIR'], ['NUMSTR'], ['WAY']]),
("171 NE 212TH ST",                [['DIGIT'], ['DIR'], ['NUMSTR'], ['WAY']]),
("712 B MAIN ST",                  [['DIGIT'], ['LETTER'], ['ALPHA'], ['WAY']]),
("2060 B MOUNTAIN BLVD",           [['DIGIT'], ['LETTER'], ['ALPHA'], ['WAY']]),
("123 avenue c",                   [['DIGIT'], ['WAY'], ['LETTER']]),
("717B GRAVES ST",                 [['ALNUM'], ['ALPHA'], ['WAY']]),
("1234 7TH AVENUE",                [['DIGIT'], ['NTH'], ['WAY']]),
("205 3RD AVE APT 3K",             [['DIGIT'], ['NTH'], ['WAY'], ['APT'], ['ALNUM']]),
("462 7TH AVE FL 2",               [['DIGIT'], ['NTH'], ['WAY'], ['ALPHA'], ['DIGIT']]),
    ("219 2ND AVE SE STE 101",     [['DIGIT'], ['NTH'], ['WAY'], ['DIR'], ['APT'], ['DIGIT']]),
    ("10859 12TH AVE SW",          [['DIGIT'], ['NTH'], ['WAY'], ['DIR']]),
    ("4401 LOOP 322",              [['DIGIT'], ['ALPHA'], ['DIGIT']]),
    ("8690 EDGE OF TEXAS",         [['DIGIT'], ['ALPHA'], ['ARTI'], ['ALPHA']]),
    ("487 RITCHIE HWY B102",       [['DIGIT'], ['ALPHA'], ['WAY'], ['ALNUM']]),
    ("366 RAILROAD AVE LOT 31", [['DIGIT'], ['ALPHA'], ['WAY'], ['APT'], ['DIGIT']]),
    ("1631A S MAIN ST",         [['AL_1NUM'],['DIR'],['ALPHA'],['WAY']]),
    ("338A 7TH AVE", [['AL_1NUM'],['NTH'],['WAY']]),
    #("3RD ST", [['NTH'],['WAY']]), THIS IS IN INCOPLETE ADDRESS
    ("1771A PRINCESS ANNE RD", [['AL_1NUM'], ['ALPHA'], ['ALPHA'], ['WAY']]),
    ("751 ROUTE 37 W", [['DIGIT'], ['WAY'], ['DIGIT'], ['DIR']]),
    ("73140 HIGHWAY 111 STE 6", [['DIGIT'], ['WAY'], ['DIGIT'], ['APT'], ['DIGIT']]),
    ("476534 HIGHWAY 95 STE A", [['DIGIT'], ['WAY'], ['DIGIT'], ['APT'], ['LETTER']]),
    ("225 LA PALOMA APT A", [['DIGIT'], ['ARTI'], ['ALPHA'], ['APT'], ['LETTER']]), # DIFF!
    ("3200 SISK RD STE C", [['DIGIT'], ['ALPHA'], ['WAY'], ['APT'], ['LETTER']]), # DIFF!
    ("2584 US HIGHWAY 80 W", [['DIGIT'], ['ALPHA'], ['WAY'], ['DIGIT'], ['DIR']]), # DIFF!
    ("2387 PORTOLA RD STE A", [['DIGIT'], ['ALPHA'], ['WAY'], ['APT'], ['LETTER']]), # DIFF!
    ("118 N 300 W", [['DIGIT'], ['DIR'], ['DIGIT'], ['DIR']]),
    ("1027 S WESTERN AVE # 2", [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY'], ['POUND'], ['DIGIT']]),
    ("690 S STATE HIGHWAY 5", [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY'], ['DIGIT']]),
    ("3070 W CHAPMAN AVE STE A", [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY'], ['APT'], ['LETTER']]),
    ("1420 E PLAZA BLVD STE D5", [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY'], ['APT'], ['ALNUM']]),
    ("82 S MAIN ST # 2", [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY', 'PRE'], ['POUND'], ['DIGIT']]),
    ("473 S MAIN ST STE A", [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY'], ['APT'], ['LETTER']]),
    ("1029 W BATTLEFIELD ST APT E214", [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY', 'PRE'], ['APT'],['ALNUM']]),
    ("3843 S BRISTOL 213", [['DIGIT'], ['DIR'], ['ALPHA'], ['DIGIT']]),
    ("", [['DIGIT'], ['DIR'], ['PERIOD'], ['ALPHA'], ['WAY']]),
    ("", [['DIGIT'], ['LETTER', 'DIR'], ['PERIOD'], ['ALPHA'], ['WAY']]),
    ("", [['DIGIT'], ['ALPHA'], ['WAY'], ['DIR']]),
    ("", [['DIGIT'], ['ALPHA'], ['WAY', 'PRE'], ['PERIOD']]),
]
for ts, matrix_lst in tuples_training:
    matrix_lst.append(['ADDRESS'])
    train_with_provided_list(seq, matrix_lst)

# valid po box
train_samples = [
    ("po box 1234", [['POB0'],['POB2'],['DIGIT']]),
    ("box 999",      [['POB2'],['DIGIT']]),
]
for ts, matrix_lst in train_samples:
    matrix_lst.append(['POBOX'])
    train_with_provided_list(seq, matrix_lst)

# valid attn
train_samples = [
    ("c/o john smith",     [['DELEG'],['ALPHA'],['ALPHA']]),
    ("attn john smith",    [['DELEG'],['ALPHA'],['ALPHA']]),
    ("attn: john smith",   [['DELEG'],['ALPHA'],['ALPHA']]),
    ("c/o john",           [['DELEG'],['ALPHA']]),
]
for ts, matrix_lst in train_samples:
    matrix_lst.append(['ATTN'])
    train_with_provided_list(seq, matrix_lst)

## The lexicon and get_normalized fn determine how to intepret a word.
* So for example, 'st', can mean 'saint', or 'street', depending if we look at it in the context of 'pre'(prefix) or 'way' as in a street or road designation.

## Next steps...
* Combine encoded sentence with a predetermined pattern, say like expecting 'way' type of words at end and see if we can reliably interpret the meaning of abbrevs like st.
* Automate geneation of the address templates by running encoder on a looooong list of valid addresses
* Hopefully after that an address can be normalized reliably... and this can then be applied maybe to business names....

* and IF that works, ^^, then do it for 'c/o', 'attn:', and 'po box' phrases... and the write more code to break up a long sentence into its composite phrases.  So like "some very long string with lots of words" => "c/o substring" + "valid address" + "junk on end".....


In [47]:
def is_address(seq, arr_st):
        """Expects ["123","main","st"]"""
        return any([pred == 'ADDRESS' for pred in seq.look_ahead(encode_from_word_list(arr_st)).get_next_values()])

def is_pobox(seq, arr_st):
    """Expects ["123","main","st"]"""
    assert isinstance(arr_st, list)
    return any([pred == 'POBOX' for pred in seq.look_ahead(encode_from_word_list(arr_st)).get_next_values()])

sent = "po box 7001"
assert is_pobox(seq, w(sent)) == True

def is_deleg(seq, arr_st):
    """Expects ["123","main","st"]"""
    assert isinstance(arr_st, list)
    return any([pred == 'ATTN' for pred in seq.look_ahead(encode_from_word_list(arr_st)).get_next_values()])

sent = "attn john doe"
assert is_deleg(seq, w(sent)) == True


In [48]:
def get_markers(seq, sent, lst_targets):
    """Input is like '123 main str' and returns a list of lists"""
    arr_w = w(sent)
    idx_tail = len(arr_w)
    markers = []
    for idx_beg in range(idx_tail):
        for idx_end in range(idx_beg + 1, idx_tail +1):
            next_values = seq.look_ahead(encode_from_word_list(arr_w[idx_beg:idx_end])).get_next_values()
            matches = list(set(next_values) & set(lst_targets) )
            if matches:
                markers.append([idx_beg, idx_end, idx_end - idx_beg, matches, ' '.join(arr_w[idx_beg:idx_end])])
    return markers


#st = "c/o john smith 100 maple ave"
st = "317 N WOODWORK LANE LLC 317 WOODWORK LANE"
get_markers(seq, st.lower(), ['ADDRESS'])

[[0, 2, 2, ['ADDRESS'], '317 n'],
 [0, 3, 3, ['ADDRESS'], '317 n woodwork'],
 [0, 4, 4, ['ADDRESS'], '317 n woodwork lane'],
 [5, 7, 2, ['ADDRESS'], '317 woodwork'],
 [5, 8, 3, ['ADDRESS'], '317 woodwork lane']]

In [67]:
def get_markers2(seq, sent, lst_targets):
    """Input is like '123 main str' and returns a list of lists"""
    arr_w = w(sent)
    idx_tail = len(arr_w)
    markers = []
    for idx_beg in range(idx_tail):
        for idx_end in range(idx_beg + 1, idx_tail +1):
            next_values = seq.look_ahead(encode_from_word_list(arr_w[idx_beg:idx_end])).get_next_values()
            matches = list(set(next_values) & set(lst_targets) )
            if matches:
                markers.append([idx_beg, idx_end, idx_end - idx_beg, matches, ' '.join(arr_w[idx_beg:idx_end])])
    return markers


#st = "c/o john smith 100 maple ave"
st = "4 STAR CAMPS 2171 IVY RD STE 3"
get_markers(seq, st.lower(), ['ADDRESS', 'ATTN', 'POBOX'])

[[0, 2, 2, ['ADDRESS'], '4 star'],
 [0, 3, 3, ['ADDRESS'], '4 star camps'],
 [3, 5, 2, ['ADDRESS'], '2171 ivy'],
 [3, 6, 3, ['ADDRESS'], '2171 ivy rd'],
 [3, 8, 5, ['ADDRESS'], '2171 ivy rd ste 3']]

In [50]:
def return_max_address(seq, sent):
    sent = str(sent).lower().strip()
    arr_cands = get_markers(seq, sent, ['ADDRESS'])
    if not arr_cands:
        return str([encoder(word) for word in w(sent)])
    
    max_len = 0
    for cand in arr_cands:
        if cand[2] > max_len:
            max_len = cand[2]
            candidate_address = cand[4]
    
    candidate_address = candidate_address.upper()
    if candidate_address != sent.upper():
        return '("", {}),'.format(str([encoder(word, trim=True) for word in w(sent)]))
    else:
        return candidate_address

return_max_address(seq, "16w661 90th st")

"[['ALNUM'], ['ALNUM', 'NTH', 'NUMSTR'], ['WAY', 'PRE', 'ALPHA']]"

In [51]:
field1 = "280 NW 123RD LN"
field2 = "288 NE 53RD ST"
print(return_max_address(seq, field2))

288 NE 53RD ST


In [52]:
import pandas as pd

In [53]:
df = pd.read_csv('fivehundred_more_addresses_no_spaces_sorted.csv')

In [54]:
df_addresses = df[['address1']]
df_addresses.drop_duplicates(keep='first', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [55]:
%%time
df_addresses['NEW ADDRESS'] = df_addresses['address1'].apply(lambda x: return_max_address(seq, x))

CPU times: user 918 ms, sys: 7.17 ms, total: 925 ms
Wall time: 930 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [56]:
df_addresses.head()

Unnamed: 0,address1,NEW ADDRESS
0,& DECOR CENTER LLC 750 SOUTH 13TH STREET,"("""", [[], ['ALPHA'], ['ALPHA'], ['ALPHA'], ['D..."
1,0-794 CHICAGO DRIVE,"("""", [['DIGIT'], ['DIGIT'], ['ALPHA'], ['WAY']]),"
2,1 BRUSSELS STREET,1 BRUSSELS STREET
3,1 EAGLE SQUARE,1 EAGLE SQUARE
4,1 JFK STREET,1 JFK STREET


In [57]:
df_addresses.to_csv('processed_addresses.csv')
! open 'processed_addresses.csv'

In [58]:
from collections import Counter
wcount = Counter()

def count_words(ctr, st):
    arr_words = w(st)
    for idx, word in enumerate(arr_words):
        if not word.isdigit():
            ctr[word] += 1
            bigram = arr_words[idx:idx+2]
            #print(arr_words[idx:idx+2])
            if len(bigram) == 2:
                ctr["-".join(bigram)] += 1
    return st

df_addresses['address1'].apply(lambda x: count_words(wcount, x))
wcount.most_common()

[('STREET', 113),
 ('ST', 101),
 ('AVE', 95),
 ('RD', 87),
 ('.', 83),
 ('STE', 75),
 ('ROAD', 74),
 ('AVENUE', 49),
 ('S', 46),
 ('DR', 42),
 ('DRIVE', 40),
 ('E', 40),
 ('SUITE', 39),
 (',', 36),
 ('W', 32),
 ('BLVD', 31),
 ('N', 31),
 ('#', 31),
 ('MAIN', 29),
 ('WEST', 22),
 ('EAST', 19),
 ('ST-.', 18),
 ('UNIT', 17),
 ('MAIN-ST', 16),
 ('LANE', 16),
 ('HWY', 14),
 ('SOUTH', 13),
 ('A', 13),
 ('AVE-.', 13),
 ('NORTH', 12),
 ('MAIN-STREET', 12),
 (',-SUITE', 12),
 ('B', 11),
 ('WAY', 11),
 ('LLC', 10),
 (',-STE', 9),
 ('COURT', 9),
 ('NE', 9),
 ('S-.', 9),
 ('CENTER', 8),
 ('NW', 8),
 ('PARK', 8),
 ('ROUTE', 8),
 ('E-.', 8),
 ('OLD', 7),
 ('W-.', 7),
 ('ST-STE', 7),
 ('CIRCLE', 7),
 ('STREET-STE', 7),
 ('BROADWAY', 7),
 ('STATE', 7),
 ('CT', 7),
 ('HILL', 7),
 ('RIDGE', 7),
 ('WATER', 6),
 ('.-MAIN', 6),
 ('INDUSTRIAL', 6),
 ('RD-STE', 6),
 ('RIVER', 6),
 ('OAK', 6),
 ('APT', 6),
 ('BRIDGE', 6),
 ('PLAZA', 5),
 ('SE', 5),
 ('STREET-SUITE', 5),
 ('N-.', 5),
 (',-UNIT', 5),
 ('BOX', 5

In [59]:
df_addresses['NEW ADDRESS'].value_counts()

("", [['DIGIT'], ['ALPHA'], ['WAY'], ['PERIOD']]),                                                 12
("", [['DIGIT'], ['LETTER', 'DIR'], ['PERIOD'], ['ALPHA'], ['WAY']]),                              10
("", [['DIGIT'], ['ALPHA'], ['WAY', 'PRE'], ['PERIOD']]),                                           6
("", [['DIGIT'], ['DIGIT'], ['ALPHA'], ['WAY']]),                                                   5
("", [['DIGIT'], ['LETTER'], ['ALPHA'], ['WAY']]),                                                  3
("", [['DIGIT'], ['ALPHA'], ['WAY'], ['LETTER', 'DIR'], ['APT'], ['DIGIT']]),                       3
("", [['DIGIT'], ['LETTER', 'DIR'], ['PERIOD'], ['ALPHA'], ['WAY', 'PRE'], ['PERIOD']]),            3
("", [['DIGIT'], ['ALPHA'], ['WAY'], ['PERIOD'], ['APT'], ['DIGIT']]),                              3
[['ALPHA'], ['DIGIT']]                                                                              3
("", [['DIGIT'], ['ALPHA'], ['ALPHA'], ['WAY'], ['POUND'], ['DIGIT']]),           

In [60]:
paths = sorted([pattern for st, pattern in tuples_training])
print(len(paths))

def convert_to_str_rep(path):
    rep = []
    for sub_lst in path:
        rep.append("-".join(sub_lst))
    return "|".join(rep)

arr_paths = []
for path in paths:
    if len(path) > 0:
        arr_paths.append(convert_to_str_rep(path))
    
#assert len(arr_paths) == len(list(set(arr_paths))), "should be unique"
print("UNIQUE PATHS: ", len(set(arr_paths)))
lst_paths = sorted(list(set(arr_paths)))
for path in set(arr_paths):
    print(path)
    

80
UNIQUE PATHS:  61
DIGIT|ALPHA|ALPHA|WAY|ADDRESS
DIGIT|ALPHA|ALPHA|WAY|APT|DIGIT|ADDRESS
DIGIT|ARTI|ALPHA|ALPHA|ADDRESS
DIGIT|NTH|WAY|DIR|APT|DIGIT|ADDRESS
DIGIT|PRE|ALPHA|WAY|ADDRESS
DIGIT|ALPHA|ALPHA|WAY|DIR|ADDRESS
DIGIT|NTH|WAY|ALPHA|DIGIT|ADDRESS
ALNUM|ALPHA|WAY|ADDRESS
DIGIT|DIR|ALPHA|WAY|ADDRESS
DIGIT|ALPHA|WAY|POUND|LETTER|ADDRESS
DIGIT|ALPHA|ALPHA|ADDRESS
DIGIT|ALPHA|ALPHA|ALPHA|WAY|ADDRESS
DIGIT|DIR|ALPHA|WAY|POUND|DIGIT|ADDRESS
DIGIT|ALPHA|DIGIT|ADDRESS
DIGIT|DIR|WAY|DIGIT|APT|DIGIT|ADDRESS
DIGIT|WAY|DIGIT|APT|DIGIT|ADDRESS
DIGIT|DIR|WAY|DIGIT|ADDRESS
DIGIT|DIR|WAY|LETTER|ADDRESS
AL_1NUM|DIR|ALPHA|WAY|ADDRESS
DIGIT|ALPHA|WAY|DIGIT|ADDRESS
DIGIT|ALPHA|WAY|DIGIT|DIR|ADDRESS
DIGIT|DIR|ALPHA|WAY|APT|ALNUM|ADDRESS
DIGIT|DIR|PERIOD|ALPHA|WAY|ADDRESS
DIGIT|ALPHA|DIR|ADDRESS
DIGIT|ALPHA|WAY|APT|ALNUM|ADDRESS
DIGIT|DIR|NUMSTR|WAY|ADDRESS
DIGIT|ALPHA|ARTI|ALPHA|ADDRESS
DIGIT|ALPHA|WAY|ALNUM|ADDRESS
DIGIT|DIR|ALPHA|DIGIT|ADDRESS
DIGIT|ALPHA|ADDRESS
DIGIT|ALPHA|ALPHA|WAY|DIR|APT|DIGIT

In [61]:
lexicon = {
    'WAY': {
        'street': ['st', 'street'],
        'road':  ['rd', 'road'],
        'avenue': ['ave', 'avenue'],
        'court': ['ct']
        },
    'PRE': {
        'saint': ['st']
        },
    'ADDRESS': {
        ':adr'
        },
    'POBOX': {
        ':box'
    },
    'DIGIT': {
        'echo': []
    },
    'ALPHA': {
        'echo': []
    }
    
}

def get_normalized(word, _way, lexicon):
    for entry, lst_matches in lexicon[_way].items():
        if entry == 'echo':
            return word
        if word in lst_matches:
            return entry
    else:
        return None

print(get_normalized('st', 'PRE', lexicon))
print(get_normalized('st', 'WAY', lexicon))

saint
street


In [62]:
add = "333 st james st"
print(w(add))
st_lst = w(add)
print(encode_sentence(add))

valid_sequence = [["DIGIT"], ["PRE"], ["ALPHA"], ["WAY"]]
another_valid = [['DIGIT'], ['ALPHA'], ['ALPHA'],['WAY']]
print(valid_sequence)
print(another_valid)
for wrd, typ in zip(st_lst, valid_sequence):
    print(get_normalized(wrd, typ[0], lexicon))

['333', 'st', 'james', 'st']
[['DIGIT'], ['WAY', 'PRE', 'ALPHA'], ['ALPHA'], ['WAY', 'PRE', 'ALPHA']]
[['DIGIT'], ['PRE'], ['ALPHA'], ['WAY']]
[['DIGIT'], ['ALPHA'], ['ALPHA'], ['WAY']]
333
saint
james
street


In [63]:
seq.reset()
lookaheads = []
st = "333 st james st"
encoded_sequence = encode_sentence(st)
print(st)
print(encoded_sequence)
print('-------------')
for idx in range(len(encoded_sequence)):
    seq.reset()
    for idy, word in enumerate(encoded_sequence):
        seq.hit(word, [],is_learning=False)
        nexts = seq.get_next_values()
        if 'ADDRESS' in nexts:
            print()
            print(idx, idy)
            print(word)
            print(seq.get_active_sequences())
            print(seq.get_next_values())

#print(lookaheads)

333 st james st
[['DIGIT'], ['WAY', 'PRE', 'ALPHA'], ['ALPHA'], ['WAY', 'PRE', 'ALPHA']]
-------------

0 1
['WAY', 'PRE', 'ALPHA']
['(*)>DIGIT>ALPHA', '(*)>DIGIT>PRE', '(*)>DIGIT>WAY']
['ADDRESS', 'ALPHA', 'ARTI', 'DIGIT', 'DIR', 'LETTER', 'WAY', 'WAY ALPHA']

0 2
['ALPHA']
['(*)>DIGIT>ALPHA>ALPHA', '(*)>DIGIT>PRE>ALPHA']
['ADDRESS', 'ALPHA', 'WAY']

0 3
['WAY', 'PRE', 'ALPHA']
['(*)>DIGIT>ALPHA>ALPHA>ALPHA', '(*)>DIGIT>ALPHA>ALPHA>WAY', '(*)>DIGIT>PRE>ALPHA>WAY']
['ADDRESS', 'APT', 'DIR', 'WAY']

1 1
['WAY', 'PRE', 'ALPHA']
['(*)>DIGIT>ALPHA', '(*)>DIGIT>PRE', '(*)>DIGIT>WAY']
['ADDRESS', 'ALPHA', 'ARTI', 'DIGIT', 'DIR', 'LETTER', 'WAY', 'WAY ALPHA']

1 2
['ALPHA']
['(*)>DIGIT>ALPHA>ALPHA', '(*)>DIGIT>PRE>ALPHA']
['ADDRESS', 'ALPHA', 'WAY']

1 3
['WAY', 'PRE', 'ALPHA']
['(*)>DIGIT>ALPHA>ALPHA>ALPHA', '(*)>DIGIT>ALPHA>ALPHA>WAY', '(*)>DIGIT>PRE>ALPHA>WAY']
['ADDRESS', 'APT', 'DIR', 'WAY']

2 1
['WAY', 'PRE', 'ALPHA']
['(*)>DIGIT>ALPHA', '(*)>DIGIT>PRE', '(*)>DIGIT>WAY']
['ADDRESS', 'A

# Next
* digits that have dashes, and separating those from say 123 17 mile dr.  
* how about all the !DIFF cases
* put the non-addresses identified into a separate negative test file!
* clean up code, add tests and move to single file script
* the dictionary in encode is now a list because we need to preserve order, do not sort it, and also, we need to revistit the order, letter should come after dir etc, because it is more general.  
* after we do that, we need to test the heck out of the output, so include the output of the ['ALPHA'] encoding in the excel sheet for review
* then get onto testing the separation of po box from address from attn strings
* then if all goes well, apply this to business name to wrangle in dba selection and also to name fields to identify suffix'
* and then it's wherever this can find an application.