In [72]:
import os
import re
from sequemem import seqds

## basic string breaker "word word2" -> ['word', 'word2']

In [73]:
def w(str_sentence):
    return re.findall(r"[\w'/-:]+|[.,!?;]", str_sentence)
st = w("123 alpha alpha123 c/o alpha STREET")
st

['123', 'alpha', 'alpha123', 'c/o', 'alpha', 'STREET']

## Encoder, takes a single word and returns a list of 'features' characterizing word

In [111]:
def encoder(word):
    encodings = {
        'ALPHA': [r'^[a-z]+$'],
        'DIGIT': [r'^\d+$'],
        'ALNUM': [r'^(\d+[a-z]+|[a-z]+\d+)[\da-z]*$'],
        'COMMA': [r'^,$'],
        'PERIOD': [r'^\.$'],
        'WAY': [r'^street$', r'^st$',r'^road$',r'^rd$', r'^ave$',r'^avenue$',r'^hwy$', r'^highway$', r'^ct$', r'^court$', r'^way$'],
        'DELEG': [r'^attn$', r'^attn:$', r'^c\/o$', r'^co$' ],
        'POB0': [r'^po$', r'^p\.o\.$'],
        'POB2': [r'^box$'],
        'DIR':  [r'^east$',r'^west$',r'^north$',r'^south$',],
        'PRE':  [r'^st$'],
        'PRON': [r'^el$',r'^la$',r'^las$',r'^los$'],
        'ADDRESS': [r'^:adr$'],
        'POBOX': [r'^:box$'],
        'ATTN': [r'^:deleg$']
    
    }
    hits = []
    for key, rexs in encodings.items():
        for rex in rexs:
            if re.match(rex, word):
                hits.append(key)
    return hits


addresses = [
    '123 south main street.',
    '1217 iris box 222',
    '1217 iris box',
    'po box 117',
    'box 2111'
    '1470 la cima rd',
    '1217 iris court c/o stewy dewy',
    '1215 iris court c/o stewy',
    '123 elcrest hwy',
    '2323 elmhurst way',
    '2323 way elmhurst',
    'c/o frank 123 south main st.',
    '123 main st. attn: elmer fudge',
    '333 st james st',
    'c/o forever 21 111 south broadway'
]



for address in addresses:
    print(w(address))
    print([encoder(word) for word in w(address)])
    print()


['123', 'south', 'main', 'street', '.']
[['DIGIT'], ['DIR', 'ALPHA'], ['ALPHA'], ['WAY', 'ALPHA'], ['PERIOD']]

['1217', 'iris', 'box', '222']
[['DIGIT'], ['ALPHA'], ['ALPHA', 'POB2'], ['DIGIT']]

['1217', 'iris', 'box']
[['DIGIT'], ['ALPHA'], ['ALPHA', 'POB2']]

['po', 'box', '117']
[['ALPHA', 'POB0'], ['ALPHA', 'POB2'], ['DIGIT']]

['box', '21111470', 'la', 'cima', 'rd']
[['ALPHA', 'POB2'], ['DIGIT'], ['ALPHA', 'PRON'], ['ALPHA'], ['WAY', 'ALPHA']]

['1217', 'iris', 'court', 'c/o', 'stewy', 'dewy']
[['DIGIT'], ['ALPHA'], ['WAY', 'ALPHA'], ['DELEG'], ['ALPHA'], ['ALPHA']]

['1215', 'iris', 'court', 'c/o', 'stewy']
[['DIGIT'], ['ALPHA'], ['WAY', 'ALPHA'], ['DELEG'], ['ALPHA']]

['123', 'elcrest', 'hwy']
[['DIGIT'], ['ALPHA'], ['WAY', 'ALPHA']]

['2323', 'elmhurst', 'way']
[['DIGIT'], ['ALPHA'], ['WAY', 'ALPHA']]

['2323', 'way', 'elmhurst']
[['DIGIT'], ['WAY', 'ALPHA'], ['ALPHA']]

['c/o', 'frank', '123', 'south', 'main', 'st', '.']
[['DELEG'], ['ALPHA'], ['DIGIT'], ['DIR', 'ALPHA'], [

## The lexicon and get_normalized fn determine how to intepret a word.
* So for example, 'st', can mean 'saint', or 'street', depending if we look at it in the context of 'pre'(prefix) or 'way' as in a street or road designation.

In [112]:
lexicon = {
    'WAY': {
        'street': ['st', 'street'],
        'road':  ['rd', 'road'],
        'avenue': ['ave', 'avenue'],
        'court': ['ct']
        },
    'PRE': {
        'saint': ['st']
        },
    'ADDRESS': {
        ':adr'
        },
    'POBOX': {
        ':box'
    }
}

def get_normalized(word, _way, lexicon):
    for entry, lst_matches in lexicon[_way].items():
        if word in lst_matches:
            return entry
    else:
        return None

print(get_normalized('st', 'PRE', lexicon))
print(get_normalized('st', 'WAY', lexicon))

saint
street


## Next steps...
* Combine encoded sentence with a predetermined pattern, say like expecting 'way' type of words at end and see if we can reliably interpret the meaning of abbrevs like st.
* Automate geneation of the address templates by running encoder on a looooong list of valid addresses
* Hopefully after that an address can be normalized reliably... and this can then be applied maybe to business names....

* and IF that works, ^^, then do it for 'c/o', 'attn:', and 'po box' phrases... and the write more code to break up a long sentence into its composite phrases.  So like "some very long string with lots of words" => "c/o substring" + "valid address" + "junk on end".....


In [113]:
def train(seq, sts):
    return seq.predict(['|'.join(encoder(word)) for word in w(sts)]).sdr_predicted
def train_arr(seq, arr_w):
    return seq.predict(['|'.join(encoder(word)) for word in arr_w]).sdr_predicted
def think(seq, sts):
    return seq.predict(['|'.join(encoder(word)) for word in w(sts)], is_learning=False).sdr_predicted


In [114]:
seq = seqds.Seqds('input')
test_st = "123 east main st" " :adr"
seq.predict(['|'.join(encoder(word)) for word in w(test_st)])

uuid: input
n_init: <node: <start>>
predicted: []
active: [<node: ADDRESS>]
sdr_active: ['DIGIT DIR|ALPHA ALPHA WAY|PRE|ALPHA ADDRESS']
sdr_predicted: []

In [115]:
test_st = "1217 iris ct" " :adr"
seq.predict(['|'.join(encoder(word)) for word in w(test_st)])

uuid: input
n_init: <node: <start>>
predicted: []
active: [<node: ADDRESS>]
sdr_active: ['DIGIT ALPHA WAY|ALPHA ADDRESS']
sdr_predicted: []

In [132]:
# valid addresses 
train_samples = [
    "123 main st",
    "1217 iris ct",
    "123 east main st",
    "123 south main street.",
    "90 south park rd",
    "987 el canyon ave",
    "333 st james st",
    "123 west hill",
    "77 el camino real"
]
for ts in train_samples:
    train(seq, "{} {}".format(ts,":adr"))

# valid po box
train_samples = [
    "po box 1234",
    "box 999"
]
for ts in train_samples:
    train(seq, "{} {}".format(ts,":box"))

# valid attn
train_samples = [
    "c/o john smith",
    "attn john smith",
    "attn: john smith",
    "c/o john"
]
for ts in train_samples:
    train(seq, "{} {}".format(ts,":deleg"))

In [126]:
print(train(seq, "1217 iris ct"))
print(train_arr(seq, ["1217", "iris", "ct"]))
print(train_arr(seq, ["po", "box", "7001"]))

['ADDRESS', 'ALPHA|POB0', 'DELEG']
['ADDRESS', 'ALPHA|POB0', 'DELEG']
['POBOX']


In [127]:
valid_addresses = [
    "666 demons st",
    "999 devils ct"
]

for va in addresses:
    print(train(seq, va), "\t\t", va)

['ADDRESS'] 		 123 south main street.
[] 		 1217 iris box 222
['DIGIT'] 		 1217 iris box
['POBOX'] 		 po box 117
[] 		 box 21111470 la cima rd
[] 		 1217 iris court c/o stewy dewy
['ALPHA'] 		 1215 iris court c/o stewy
['ADDRESS', 'ALPHA|POB0', 'DELEG'] 		 123 elcrest hwy
['ADDRESS', 'ALPHA|POB0', 'DELEG'] 		 2323 elmhurst way
[] 		 2323 way elmhurst
[] 		 c/o frank 123 south main st.
[] 		 123 main st. attn: elmer fudge
['ADDRESS'] 		 333 st james st
[] 		 c/o forever 21 111 south broadway


In [128]:
def is_address(seq, arr_st):
    """Is this passed sequence a valid address?"""
    return any([pred == 'ADDRESS' for pred in train_arr(seq, arr_st)]) 
    
sent = "1217 iris court c/o stewy dewy"
assert is_address(seq, w(sent)) == False
assert is_address(seq, w("1217 iris ct")) == True

In [129]:
def is_pobox(seq, arr_st):
    """Is this passed sequence a valid address?"""
    assert isinstance(arr_st, list)
    return any([pred == 'POBOX' for pred in train_arr(seq, arr_st)])

sent = "po box 7001"
assert is_pobox(seq, w(sent)) == True

def is_deleg(seq, arr_st):
    """Is this passed sequence a valid address?"""
    assert isinstance(arr_st, list)
    return any([pred == 'ATTN' for pred in train_arr(seq, arr_st)])

sent = "attn john doe"
assert is_deleg(seq, w(sent)) == True

In [130]:
def chunk_sentence_array(seq, sent):
    """Return chunked identified substrings is possible"""
    print("Processing: ", sent)
    arr_w = w(sent)
    idx_tail = len(arr_w)
    for idx_beg in range(idx_tail):
        for idx_end in range(idx_beg + 1, idx_tail +1):
            if is_address(seq, arr_w[idx_beg:idx_end]):
                print(idx_beg, idx_end - 1,'ADDRESS', arr_w[idx_beg:idx_end])
            if is_pobox(seq, arr_w[idx_beg:idx_end]):
                print(idx_beg, idx_end - 1, 'POBOX', arr_w[idx_beg:idx_end])
            if is_deleg(seq, arr_w[idx_beg:idx_end]):
                print(idx_beg, idx_end - 1, 'ATTN', arr_w[idx_beg:idx_end])
                
chunk_sentence_array(seq, "attn jon doe 1217 iris ct po box 7001")

Processing:  attn jon doe 1217 iris ct po box 7001
0 2 ATTN ['attn', 'jon', 'doe']
3 5 ADDRESS ['1217', 'iris', 'ct']
6 8 POBOX ['po', 'box', '7001']
7 8 POBOX ['box', '7001']


In [131]:
for address in addresses:
    print()
    chunk_sentence_array(seq, address)


Processing:  123 south main street.
0 2 ADDRESS ['123', 'south', 'main']
0 3 ADDRESS ['123', 'south', 'main', 'street']
0 4 ADDRESS ['123', 'south', 'main', 'street', '.']

Processing:  1217 iris box 222
2 3 POBOX ['box', '222']

Processing:  1217 iris box

Processing:  po box 117
0 2 POBOX ['po', 'box', '117']
1 2 POBOX ['box', '117']

Processing:  box 21111470 la cima rd
0 1 POBOX ['box', '21111470']
1 4 ADDRESS ['21111470', 'la', 'cima', 'rd']

Processing:  1217 iris court c/o stewy dewy
0 2 ADDRESS ['1217', 'iris', 'court']
3 4 ATTN ['c/o', 'stewy']
3 5 ATTN ['c/o', 'stewy', 'dewy']

Processing:  1215 iris court c/o stewy
0 2 ADDRESS ['1215', 'iris', 'court']
3 4 ATTN ['c/o', 'stewy']

Processing:  123 elcrest hwy
0 2 ADDRESS ['123', 'elcrest', 'hwy']

Processing:  2323 elmhurst way
0 2 ADDRESS ['2323', 'elmhurst', 'way']

Processing:  2323 way elmhurst

Processing:  c/o frank 123 south main st.
0 1 ATTN ['c/o', 'frank']
2 4 ADDRESS ['123', 'south', 'main']
2 5 ADDRESS ['123', 'so