In [49]:
import os
import re
from sequemem import seqds

## basic string breaker "word word2" -> ['word', 'word2']

In [50]:
def w(str_sentence):
    return re.findall(r"[\w'/-:]+|[.,!?;]", str_sentence)
st = w("123 alpha alpha123 c/o alpha STREET")
st

['123', 'alpha', 'alpha123', 'c/o', 'alpha', 'STREET']

## Encoder, takes a single word and returns a list of 'features' characterizing word

In [106]:
def encoder(word):
    encodings = {
        'alpha': [r'^[a-z]+$'],
        'digit': [r'^\d+$'],
        'alnum': [r'^(\d+[a-z]+|[a-z]+\d+)[\da-z]*$'],
        'comma': [r'^,$'],
        'period': [r'^\.$'],
        'way': [r'^street$', r'^st$',r'^road$',r'^rd$', r'^ave$',r'^avenue$',r'^hwy$', r'^highway$', r'^ct$', r'^way$'],
        'deleg': [r'^attn$', r'^attn:$', r'^c\/o$', r'^co$' ],
        'pob0': [r'^po$', r'^p\.o\.$'],
        'pob2': [r'^box$'],
        'dir':  [r'^east$',r'^west$',r'^north$',r'^south$',],
        'pre':  [r'^st$'],
        'prnn': [r'^el$',r'^la$',r'^las$',r'^los$'],
        'ADRESS': [r'^:adr$'],
        'POBOX': [r'^:box$']
    
    }
    hits = []
    for key, rexs in encodings.items():
        for rex in rexs:
            if re.match(rex, word):
                hits.append(key)
    return hits

addresses = [
    '123 south main street.',
    '1217 iris box 222',
    '1217 iris box',
    'po box 117',
    'box 2111'
    '1470 la cima rd',
    '1217 iris court c/o stewy dewy',
    '1215 iris court c/o stewy',
    '123 elcrest hwy',
    '2323 elmhurst way',
    '2323 way elmhurst',
    'c/o frank 123 south main st.',
    '123 main st. attn: elmer fudge',
    '333 st james st',
    'c/o forever 21 111 south broadway'
]



for address in addresses:
    print(w(address))
    print([encoder(word) for word in w(address)])
    print()


['123', 'south', 'main', 'street', '.']
[['digit'], ['alpha', 'dir'], ['alpha'], ['alpha', 'way'], ['period']]

['1217', 'iris', 'box', '222']
[['digit'], ['alpha'], ['alpha', 'pob2'], ['digit']]

['1217', 'iris', 'box']
[['digit'], ['alpha'], ['alpha', 'pob2']]

['po', 'box', '117']
[['alpha', 'pob0'], ['alpha', 'pob2'], ['digit']]

['box', '21111470', 'la', 'cima', 'rd']
[['alpha', 'pob2'], ['digit'], ['alpha', 'prnn'], ['alpha'], ['alpha', 'way']]

['1217', 'iris', 'court', 'c/o', 'stewy', 'dewy']
[['digit'], ['alpha'], ['alpha'], ['deleg'], ['alpha'], ['alpha']]

['1215', 'iris', 'court', 'c/o', 'stewy']
[['digit'], ['alpha'], ['alpha'], ['deleg'], ['alpha']]

['123', 'elcrest', 'hwy']
[['digit'], ['alpha'], ['alpha', 'way']]

['2323', 'elmhurst', 'way']
[['digit'], ['alpha'], ['alpha', 'way']]

['2323', 'way', 'elmhurst']
[['digit'], ['alpha', 'way'], ['alpha']]

['c/o', 'frank', '123', 'south', 'main', 'st', '.']
[['deleg'], ['alpha'], ['digit'], ['alpha', 'dir'], ['alpha'], ['al

## The lexicon and get_normalized fn determine how to intepret a word.
* So for example, 'st', can mean 'saint', or 'street', depending if we look at it in the context of 'pre'(prefix) or 'way' as in a street or road designation.

In [107]:
lexicon = {
        'way': {
            'street': ['st', 'street'],
            'road':  ['rd', 'road'],
            'avenue': ['ave', 'avenue'],
            'court': ['ct']
            },
        'pre': {
            'saint': ['st']
            },
        'ADR': {
            ':adr'
        },
    'POBOX': {
        ':box'
    }
}

def get_normalized(word, _way, lexicon):
    for entry, lst_matches in lexicon[_way].items():
        if word in lst_matches:
            return entry
    else:
        return None

print(get_normalized('st', 'pre', lexicon))
print(get_normalized('st', 'way', lexicon))

saint
street


## Next steps...
* Combine encoded sentence with a predetermined pattern, say like expecting 'way' type of words at end and see if we can reliably interpret the meaning of abbrevs like st.
* Automate geneation of the address templates by running encoder on a looooong list of valid addresses
* Hopefully after that an address can be normalized reliably... and this can then be applied maybe to business names....

* and IF that works, ^^, then do it for 'c/o', 'attn:', and 'po box' phrases... and the write more code to break up a long sentence into its composite phrases.  So like "some very long string with lots of words" => "c/o substring" + "valid address" + "junk on end".....


In [108]:
def train(seq, sts):
    return seq.predict(['|'.join(encoder(word)) for word in w(sts)]).sdr_predicted
def think(seq, sts):
    return seq.predict(['|'.join(encoder(word)) for word in w(sts)], is_learning=False).sdr_predicted

In [109]:
seq = seqds.Seqds('input')
test_st = "123 east main st" " :adr"
seq.predict(['|'.join(encoder(word)) for word in w(test_st)])

uuid: input
n_init: <node: <start>>
predicted: []
active: [<node: ADRESS>]
sdr_active: ['digit alpha|dir alpha alpha|way|pre ADRESS']
sdr_predicted: []

In [110]:
test_st = "1217 iris ct" " :adr"
seq.predict(['|'.join(encoder(word)) for word in w(test_st)])

uuid: input
n_init: <node: <start>>
predicted: []
active: [<node: ADRESS>]
sdr_active: ['digit alpha alpha|way ADRESS']
sdr_predicted: []

In [111]:
train_samples = [
    "123 main st",
    "1217 iris ct",
    "123 east main st",
    "123 south main street.",
    "90 south park rd",
    "987 el canyon ave"
]
for ts in train_samples:
    train(seq, "{} {}".format(ts,":adr"))
train_samples = [
    "po box 1234",
    "box 999"
]
for ts in train_samples:
    train(seq, "{} {}".format(ts,":box"))

In [112]:
train(seq, "1217 iris ct")

['ADRESS']

In [113]:
valid_addresses = [
    "666 demons st",
    "999 devils ct"
]

for va in addresses:
    print(train(seq, va), "\t\t", va)

['ADRESS'] 		 123 south main street.
[] 		 1217 iris box 222
['digit'] 		 1217 iris box
['POBOX'] 		 po box 117
[] 		 box 21111470 la cima rd
[] 		 1217 iris court c/o stewy dewy
['alpha'] 		 1215 iris court c/o stewy
['ADRESS'] 		 123 elcrest hwy
['ADRESS'] 		 2323 elmhurst way
[] 		 2323 way elmhurst
[] 		 c/o frank 123 south main st.
[] 		 123 main st. attn: elmer fudge
[] 		 333 st james st
[] 		 c/o forever 21 111 south broadway
