In [27]:
import os
import re

## basic string breaker "word word2" -> ['word', 'word2']

In [28]:
def w(str_sentence):
    """Split a sentence into word tokens and standalone punctuation marks.

    A word token is a run of word characters (letters, digits, underscore)
    that may also contain an apostrophe, slash, colon or hyphen, so
    "c/o", "attn:", "o'neil" and "winston-salem" each stay in one piece.
    Each of the marks . , ! ? ; becomes a token of its own. Whitespace and
    any other character only separate tokens.

    Args:
        str_sentence: the text to tokenize.

    Returns:
        A list of token strings in the order they appear in the text.
    """
    # The hyphen must sit at the end of the character class: written as
    # "/-:" it is read as the range from "/" to ":" (which is "/", the
    # digits and ":"), and a literal "-" would never be matched.
    return re.findall(r"[\w'/:-]+|[.,!?;]", str_sentence)
# Quick check of the tokenizer on a sample string; the bare `st` on the last
# line makes the notebook display the resulting token list.
st = w("123 alpha alpha123 c/o alpha STREET")
st

['123', 'alpha', 'alpha123', 'c/o', 'alpha', 'STREET']

## Encoder, takes a single word and returns a list of 'features' characterizing word

In [29]:
def encoder(word):
    """Return the names of the feature categories that a single token matches.

    The token is lower-cased before matching, so 'STREET', 'Street' and
    'street' all produce the same features.

    Args:
        word: one token, e.g. an element of the list returned by ``w``.

    Returns:
        A list of category names (keys of the pattern table below), each
        listed at most once, in the table's iteration order. The list is
        empty when no pattern matches. A token can carry several features:
        'st' is 'alpha', 'way' and 'pre' at the same time.
    """
    # category name -> anchored patterns; a token belongs to the category
    # when any one of its patterns matches.
    encodings = {
        'alpha': [r'^[a-z]+$'],
        'digit': [r'^\d+$'],
        # letters and digits mixed, starting with either kind
        'alnum': [r'^(\d+[a-z]+|[a-z]+\d+)[\da-z]*$'],
        'comma': [r'^,$'],
        'period': [r'^\.$'],
        'way': [r'^street$', r'^st$', r'^road$', r'^rd$', r'^ave$', r'^avenue$', r'^hwy$', r'^highway$'],
        'deleg': [r'^attn$', r'^attn:$', r'^c\/o$', r'^co$'],
        # 'p.o.' only matches when the caller passes the periods attached to
        # the letters; the tokenizer `w` emits each period as its own token.
        'pob0': [r'^po$', r'^p\.o\.$'],
        'pob2': [r'^box$'],
        'dir': [r'^east$', r'^west$', r'^north$', r'^south$'],
        # 'st' as a prefix; deliberately overlaps with the 'st' in 'way'
        'pre': [r'^st$'],
    }

    # The patterns are written in lower case, so normalize the token once
    # instead of silently returning no features for upper/mixed-case input.
    token = word.lower()

    hits = []
    for key, rexs in encodings.items():
        # any() stops at the first matching pattern, so a category is
        # recorded at most once even if several of its patterns match.
        if any(re.match(rex, token) for rex in rexs):
            hits.append(key)
    return hits

# Sample address strings used to exercise the tokenizer and the encoder.
# They mix plain street addresses, 'box' lines, 'c/o' / 'attn:' delegation
# phrases and both uses of 'st' (as in 'st james' and as in 'main st').
addresses = [
    '123 south main street.',
    '1217 iris box 222',
    '1217 iris box',
    '1470 la cima rd',
    '1217 iris court c/o stewy dewy',
    '1215 iris court c/o stewy',
    '123 elcrest hwy',
    '2323 elmhurst way',
    'c/o frank 123 south main st.',
    '123 main st. attn: elmer fudge',
    '333 st james st',
    'c/o forever 21 111 south broadway'
]



# For every sample address show its tokens, then the feature list of each
# token, followed by an empty line as a separator.
for address in addresses:
    tokens = w(address)
    print(tokens)
    print(list(map(encoder, tokens)))
    print()


['123', 'south', 'main', 'street', '.']
[['digit'], ['dir', 'alpha'], ['alpha'], ['alpha', 'way'], ['period']]

['1217', 'iris', 'box', '222']
[['digit'], ['alpha'], ['alpha', 'pob2'], ['digit']]

['1217', 'iris', 'box']
[['digit'], ['alpha'], ['alpha', 'pob2']]

['1470', 'la', 'cima', 'rd']
[['digit'], ['alpha'], ['alpha'], ['alpha', 'way']]

['1217', 'iris', 'court', 'c/o', 'stewy', 'dewy']
[['digit'], ['alpha'], ['alpha'], ['deleg'], ['alpha'], ['alpha']]

['1215', 'iris', 'court', 'c/o', 'stewy']
[['digit'], ['alpha'], ['alpha'], ['deleg'], ['alpha']]

['123', 'elcrest', 'hwy']
[['digit'], ['alpha'], ['alpha', 'way']]

['2323', 'elmhurst', 'way']
[['digit'], ['alpha'], ['alpha']]

['c/o', 'frank', '123', 'south', 'main', 'st', '.']
[['deleg'], ['alpha'], ['digit'], ['dir', 'alpha'], ['alpha'], ['alpha', 'pre', 'way'], ['period']]

['123', 'main', 'st', '.', 'attn:', 'elmer', 'fudge']
[['digit'], ['alpha'], ['alpha', 'pre', 'way'], ['period'], ['deleg'], ['alpha'], ['alpha']]

['333

## The lexicon and the `get_normalized` function determine how to interpret a word.
* For example, 'st' can mean 'saint' or 'street', depending on whether we look at it in the context of 'pre' (prefix) or 'way' (a street or road designation).

In [32]:
# Lookup table used by get_normalized:
#   category name -> {normalized word: [spellings that map to it]}
# The same spelling may appear under several categories ('st' is listed
# under both 'way' and 'pre'), so the category selects the meaning.
lexicon = {
        'way': {
            'street': ['st', 'street'],
            'road':  ['rd', 'road'],
            'avenue': ['ave', 'avenue']
            },
        'pre': {
            'saint': ['st']
        }
}

def get_normalized(word, _way, lexicon):
    """Look up the normalized form of a word within one lexicon category.

    Args:
        word: the spelling to resolve, e.g. 'st'.
        _way: the category to resolve it in, e.g. 'way' or 'pre'.
        lexicon: mapping of category -> {normalized word: [spellings]}.

    Returns:
        The first normalized word whose spelling list contains ``word``, or
        None when the word is not listed in that category or the category
        is not present in the lexicon at all.
    """
    # .get() keeps categories that have no lexicon entry from raising
    # KeyError; they simply yield "no normalization", like an unknown word.
    for entry, lst_matches in lexicon.get(_way, {}).items():
        if word in lst_matches:
            return entry
    return None

# 'st' is ambiguous: resolve it first as a prefix, then as a street type,
# printing one result per line.
print(*(get_normalized('st', category, lexicon) for category in ('pre', 'way')), sep='\n')

saint
street


## Next steps...
* Combine encoded sentence with a predetermined pattern, say like expecting 'way' type of words at end and see if we can reliably interpret the meaning of abbrevs like st.
* Automate generation of the address templates by running the encoder on a looooong list of valid addresses
* Hopefully after that an address can be normalized reliably... and this can then be applied maybe to business names....

* and IF that works, ^^, then do it for 'c/o', 'attn:', and 'po box' phrases... and then write more code to break up a long sentence into its composite phrases.  So like "some very long string with lots of words" => "c/o substring" + "valid address" + "junk on end".....
