In [511]:
import os
import re
from sequemem import seqds

## basic string breaker "word word2" -> ['word', 'word2']

In [512]:
def w(str_sentence):
    return re.findall(r"[\w'/-:]+|[.,!?;#]", str_sentence)
st = w("123 alpha alpha123 c/o alpha STREET")
st
st = w("42 hamilton ave # 3")
st

['42', 'hamilton', 'ave', '#', '3']

## Encoder, takes a single word and returns a list of 'features' characterizing word

In [538]:
ways = "|".join(["ave", "avenue", "blvd", "boulevard", "canyon","cir","circle","cmn","common","court","creek",
                 "ct","cyn","dr","drive","freeway","fwy","highway","hill","hl","hwy","jct","lane","ln","mall",
                 "park","pike","pl","place","plc","point","pt","rd",
                 "road","route","sq","square","st","street","ter","terrace","tpke","trl","turnpike",
                 "walk","way","xing"])

apts = "|".join(["apartment", "apt", "suite", "ste", "unit"])

def encoder(word):
    encodings = {
        'ALPHA': [r'^[a-z]+$'],
        'LETTER': [r'^[a-z]$'],
        'DIGIT': [r'^\d+$'],
        'ALNUM': [r'^(\d+[a-z]+|[a-z]+\d+)[\da-z]*$'],
        'NUMSTR': [r'^\d+th$',r'^\d+nd$',r'^\d+st$',r'^\d+rd$'],
        'APT': [r'^(' + apts + ')$'],
        'COMMA': [r'^,$'],
        'PERIOD': [r'^\.$'],
        'POUND': [r'^#$'],
        'WAY': [r'^(' + ways + ')$'],
        'DELEG': [r'^attn$', r'^attn:$', r'^c\/o$', r'^co$' ],
        'POB0': [r'^po$', r'^p\.o\.$'],
        'POB2': [r'^box$'],
        'DIR':  [r'^east$',r'^west$',r'^north$',r'^south$',r'^e$',r'^w$',r'^n$',r'^s$',r'^[nsew][nsew]$',],
        'PRE':  [r'^st$', r'^san$'],
        'PRON': [r'^el$',r'^la$',r'^las$',r'^los$'],
        'ADDRESS': [r'^:adr$'],
        'POBOX': [r'^:box$'],
        'ATTN': [r'^:deleg$']
    
    }
    hits = []
    for key, rexs in encodings.items():
        for rex in rexs:
            if re.match(rex, word):
                hits.append(key)
    return sorted(hits)


addresses = [
    '123 south main street.',
    '1217 iris box 222',
    '1217 iris box',
    'po box 117',
    'box 2111'
    '1470 la cima rd',
    '1217 iris court c/o stewy dewy',
    '1215 iris court c/o stewy',
    '123 elcrest hwy',
    '2323 elmhurst way',
    '2323 way elmhurst',
    'c/o frank 123 south main st.',
    '123 main st. attn: elmer fudge',
    '333 st james st',
    'c/o forever 21 111 south broadway',
    "1 bay st # b",
    "1 bay rd # b",
    "14238 44nd ave"
]


rex = r'^(' + ways + ')$'
matches = re.findall(rex, "st")
print(matches)

def encode_sentence(st):
    return [encoder(word.lower()) for word in w(st)]



['st']


## The lexicon and get_normalized fn determine how to intepret a word.
* So for example, 'st', can mean 'saint', or 'street', depending if we look at it in the context of 'pre'(prefix) or 'way' as in a street or road designation.

## Next steps...
* Combine encoded sentence with a predetermined pattern, say like expecting 'way' type of words at end and see if we can reliably interpret the meaning of abbrevs like st.
* Automate geneation of the address templates by running encoder on a looooong list of valid addresses
* Hopefully after that an address can be normalized reliably... and this can then be applied maybe to business names....

* and IF that works, ^^, then do it for 'c/o', 'attn:', and 'po box' phrases... and the write more code to break up a long sentence into its composite phrases.  So like "some very long string with lots of words" => "c/o substring" + "valid address" + "junk on end".....


In [515]:
def train(seq, sts):
    return seq.predict(['|'.join(encoder(word)) for word in w(sts)]).sdr_predicted
def train_arr(seq, arr_w):
    return seq.predict(['|'.join(encoder(word)) for word in arr_w]).sdr_predicted
def think(seq, sts):
    return seq.predict(['|'.join(encoder(word)) for word in w(sts)], is_learning=False).sdr_predicted


In [516]:
seq = seqds.Seqds('input')
test_st = "123 east main st" " :adr"
seq.predict(['|'.join(encoder(word)) for word in w(test_st)])

uuid: input
n_init: <node: <start>>
predicted: []
active: [<node: ADDRESS>]
sdr_active: ['DIGIT ALPHA|DIR ALPHA ALPHA|PRE|WAY ADDRESS']
sdr_predicted: []

In [517]:
test_st = "1217 iris ct" " :adr"
seq.predict(['|'.join(encoder(word)) for word in w(test_st)])

uuid: input
n_init: <node: <start>>
predicted: []
active: [<node: ADDRESS>]
sdr_active: ['DIGIT ALPHA ALPHA|WAY ADDRESS']
sdr_predicted: []

In [518]:
# valid addresses 
train_samples1 = [
    "123 main st",
    "1217 iris ct",
    "123 east main st",
    "310 W MAIN ST",
    "123 south main street.",
    "90 south park rd",
    "987 el canyon ave",
    "333 st james st",
    "123 west hill",
    "77 el camino real",
    "1 bay st # b",
    "1 bay rd # b",
    "43 HUDSON AVE # 2",
    "111 san vicente blvd",
    "28 black watch way",
    "249 route 206",
    "5500 lost tree",
    "996 san benito st",
    "899 embarcadero",
    "177 montague e",
    "2001 W COURT ST",
    "12545 S HIGHWAY J",
    "437 S HIGHWAY 101 STE 501",
    "12940 S HIGHWAY 259",
    "775 E 14 MILE RD",
    "4701 VAN DAM ST",
    "3201 NEW MEXICO AVE NW STE 246",
    "111 RIO RANCHO BLVD SE",
    "17259 WILD HORSE CREEK RD",
    "12587 FAIR LAKES CIR STE 141",
    "107 E MARIPOSA DR",
    "401 W FORT WILLIAMS ST",
    "3242 E DESERT INN RD STE 17",
    "14238 44ND AVE",
    "106 HARVEST HILL LN",
    "206 ELMHURST DR APT F15",
    "655 COUNTY ROAD 1300",
    "50 LOS PATOS WAY",
    "1001 N 60TH ST",
    "5350 S SANTA FE DR",
    "638 18TH ST",
    "210 E 3RD AVE",
    "1390 SW 160TH AVE",
    "171 NE 212TH ST",
    "712 B MAIN ST",
    "2060 B MOUNTAIN BLVD",
]
for ts in train_samples1:
    train(seq, "{} {}".format(ts.lower(),":adr"))

# valid po box
train_samples = [
    "po box 1234",
    "box 999"
]
for ts in train_samples:
    train(seq, "{} {}".format(ts,":box"))

# valid attn
train_samples = [
    "c/o john smith",
    "attn john smith",
    "attn: john smith",
    "c/o john"
]
for ts in train_samples:
    train(seq, "{} {}".format(ts,":deleg"))

In [519]:
def is_address(seq, arr_st):
    """Is this passed sequence a valid address?"""
    return any([pred == 'ADDRESS' for pred in train_arr(seq, arr_st)]) 
    
for address in train_samples1:
    assert is_address(seq, w(address.lower())) == True, address

In [520]:
print(train(seq, "1217 iris ct"))
print(train_arr(seq, ["1217", "iris", "ct"]))
print(train_arr(seq, ["po", "box", "7001"]))
print(train(seq, "1 bay st # b"))

['ADDRESS', 'ALPHA|WAY', 'APT|ALPHA', 'DIGIT', 'POUND']
['ADDRESS', 'ALPHA|WAY', 'APT|ALPHA', 'DIGIT', 'POUND']
['POBOX']
['ADDRESS']


In [521]:
def is_pobox(seq, arr_st):
    """Is this passed sequence a valid address?"""
    assert isinstance(arr_st, list)
    return any([pred == 'POBOX' for pred in train_arr(seq, arr_st)])

sent = "po box 7001"
assert is_pobox(seq, w(sent)) == True

def is_deleg(seq, arr_st):
    """Is this passed sequence a valid address?"""
    assert isinstance(arr_st, list)
    return any([pred == 'ATTN' for pred in train_arr(seq, arr_st)])

sent = "attn john doe"
assert is_deleg(seq, w(sent)) == True

In [522]:
def chunk_sentence_array(seq, sent):
    """Return chunked identified substrings is possible"""
    print("Processing: ", sent)
    arr_w = w(sent)
    idx_tail = len(arr_w)
    for idx_beg in range(idx_tail):
        for idx_end in range(idx_beg + 1, idx_tail +1):
            if is_address(seq, arr_w[idx_beg:idx_end]):
                print(idx_beg, idx_end - 1,'ADDRESS', arr_w[idx_beg:idx_end])
            if is_pobox(seq, arr_w[idx_beg:idx_end]):
                print(idx_beg, idx_end - 1, 'POBOX', arr_w[idx_beg:idx_end])
            if is_deleg(seq, arr_w[idx_beg:idx_end]):
                print(idx_beg, idx_end - 1, 'ATTN', arr_w[idx_beg:idx_end])
                
chunk_sentence_array(seq, "attn jon doe 1217 iris ct po box 7001")

Processing:  attn jon doe 1217 iris ct po box 7001
0 2 ATTN ['attn', 'jon', 'doe']
3 4 ADDRESS ['1217', 'iris']
3 5 ADDRESS ['1217', 'iris', 'ct']
6 8 POBOX ['po', 'box', '7001']
7 8 POBOX ['box', '7001']


In [523]:
def return_max_address(seq, sent):
    sent = str(sent).lower().strip()
    if sent.startswith("po box"):
        return ''
    arr_w = w(sent)
    idx_tail = len(arr_w)
    candidate_address = ''
    max_length = 0
    for idx_beg in range(idx_tail):
        for idx_end in range(idx_beg + 1, idx_tail +1):
            if is_address(seq, arr_w[idx_beg:idx_end]):
                this_len = idx_end - idx_beg
                if this_len > max_length:
                    max_length = this_len
                    candidate_address = " ".join(arr_w[idx_beg:idx_end])
    
    if len(candidate_address) ==0:
        return str([encoder(word) for word in w(sent)])
    candidate_address = candidate_address.upper()
    if candidate_address != sent.upper():
        candidate_address = "!DIFF {} : {} => {}".format(str([encoder(word) for word in w(sent)]), sent.upper(), candidate_address)
    return candidate_address

In [524]:
field1 = "280 NW 123RD LN"
field2 = "288 NE 53RD ST"
print(return_max_address(seq, field2))

288 NE 53RD ST


In [525]:
import pandas as pd

In [526]:
df = pd.read_csv('/Users/efrain/niarfe/datasheets/company_uploads_the_hartford_20180305_054740_20180305_054740_245_dba_vs_legal_carpe_redo.csv')

In [527]:
df_addresses = df[['ACCT_STREET_ADDR']]
df_addresses.drop_duplicates(keep='first', inplace=True)
df_addresses['NEW ADDRESS'] = df_addresses['ACCT_STREET_ADDR'].apply(lambda x: return_max_address(seq, x))
df_addresses.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,ACCT_STREET_ADDR,NEW ADDRESS
0,11706 SAN VICENTE BLVD,11706 SAN VICENTE BLVD
1,28 BLACK WATCH WAY,28 BLACK WATCH WAY
2,385 S MAIN ST,385 S MAIN ST
3,7808 CHASEWOOD LOOP,7808 CHASEWOOD LOOP
4,996 SAN BENITO ST,996 SAN BENITO ST


In [528]:
df_addresses.to_csv('processed_addresses.csv')
! open 'processed_addresses.csv'

In [529]:
from collections import Counter
wcount = Counter()

def count_words(ctr, st):
    arr_words = w(st)
    for idx, word in enumerate(arr_words):
        if not word.isdigit():
            ctr[word] += 1
            bigram = arr_words[idx:idx+2]
            #print(arr_words[idx:idx+2])
            if len(bigram) == 2:
                ctr["-".join(bigram)] += 1
    return st

df_addresses['ACCT_STREET_ADDR'].apply(lambda x: count_words(wcount, x))
wcount.most_common()

[('ST', 10932),
 ('AVE', 8265),
 ('RD', 7797),
 ('DR', 5148),
 ('STE', 5006),
 ('W', 3286),
 ('S', 3177),
 ('E', 3153),
 ('BLVD', 3029),
 ('N', 2990),
 ('BOX', 2101),
 ('PO-BOX', 2092),
 ('PO', 2092),
 ('LN', 1653),
 ('MAIN', 1363),
 ('MAIN-ST', 1306),
 ('CT', 1132),
 ('RD-STE', 1009),
 ('APT', 987),
 ('ST-STE', 924),
 ('WAY', 861),
 ('AVE-STE', 846),
 ('HIGHWAY', 750),
 ('HWY', 731),
 ('#', 704),
 ('BLVD-STE', 648),
 ('PL', 561),
 ('CIR', 545),
 ('A', 520),
 ('UNIT', 504),
 ('B', 472),
 ('DR-STE', 467),
 ('NW', 466),
 ('PKWY', 454),
 ('NE', 453),
 ('STATE', 435),
 ('PARK', 434),
 ('BROADWAY', 414),
 ('SW', 362),
 ('ROUTE', 343),
 ('SE', 333),
 ('HILL', 310),
 ('STE-A', 299),
 ('LAKE', 295),
 ('WASHINGTON', 286),
 ('CREEK', 284),
 ('OLD', 272),
 ('C', 270),
 ('STE-B', 265),
 ('VALLEY', 265),
 ('ST-APT', 256),
 ('ROAD', 246),
 ('US', 246),
 ('OAK', 224),
 ('RIVER', 223),
 ('TRL', 222),
 ('COUNTY', 218),
 ('D', 209),
 ('US-HIGHWAY', 209),
 ('SAN', 200),
 ('RIDGE', 195),
 ('3RD', 193),
 (

In [530]:
df_addresses['NEW ADDRESS'].value_counts()

                                                                                                                                            2068
[['DIGIT'], ['ALPHA', 'DIR'], ['ALPHA'], ['ALPHA', 'WAY']]                                                                                   145
[['DIGIT'], ['LETTER', 'ALPHA', 'DIR'], ['ALPHA']]                                                                                           127
[['DIGIT'], ['ALPHA', 'WAY'], ['ALPHA', 'WAY']]                                                                                              110
[['DIGIT'], ['LETTER', 'ALPHA', 'DIR'], ['DIGIT'], ['LETTER', 'ALPHA', 'DIR']]                                                                99
[['ALNUM'], ['ALPHA'], ['ALPHA', 'WAY']]                                                                                                      83
[['DIGIT'], ['ALPHA', 'DIR'], ['ALPHA', 'PRE', 'WAY']]                                                                            

In [558]:
paths = sorted([encode_sentence(st) for st in train_samples1])
print(len(paths))

def convert_to_str_rep(path):
    rep = []
    for sub_lst in path:
        rep.append("-".join(sub_lst))
    return "|".join(rep)

arr_paths = []
for path in paths:
    if len(path) == 4:
        arr_paths.append(convert_to_str_rep(path))
    
assert len(arr_paths) == len(list(set(arr_paths))), "should be unique"

for path in arr_paths:
    print(path)
    

44
DIGIT|ALPHA|ALPHA|ALPHA-PRE-WAY
DIGIT|ALPHA|ALPHA|ALPHA-WAY
DIGIT|ALPHA|ALPHA-WAY|ALPHA-WAY
DIGIT|ALPHA|ALPHA-WAY|DIGIT
DIGIT|ALPHA-DIR|ALNUM-NUMSTR|ALPHA-PRE-WAY
DIGIT|ALPHA-DIR|ALNUM-NUMSTR|ALPHA-WAY
DIGIT|ALPHA-DIR|ALPHA|ALPHA-PRE-WAY
DIGIT|ALPHA-DIR|ALPHA-WAY|ALPHA-WAY
DIGIT|ALPHA-DIR-LETTER|ALNUM-NUMSTR|ALPHA-PRE-WAY
DIGIT|ALPHA-DIR-LETTER|ALNUM-NUMSTR|ALPHA-WAY
DIGIT|ALPHA-DIR-LETTER|ALPHA|ALPHA-PRE-WAY
DIGIT|ALPHA-DIR-LETTER|ALPHA|ALPHA-WAY
DIGIT|ALPHA-DIR-LETTER|ALPHA-WAY|ALPHA-LETTER
DIGIT|ALPHA-DIR-LETTER|ALPHA-WAY|ALPHA-PRE-WAY
DIGIT|ALPHA-DIR-LETTER|ALPHA-WAY|DIGIT
DIGIT|ALPHA-PRE|ALPHA|ALPHA-PRE-WAY
DIGIT|ALPHA-PRE|ALPHA|ALPHA-WAY
DIGIT|ALPHA-PRE-WAY|ALPHA|ALPHA-PRE-WAY
DIGIT|ALPHA-PRON|ALPHA|ALPHA
DIGIT|ALPHA-PRON|ALPHA|ALPHA-WAY
DIGIT|ALPHA-PRON|ALPHA-WAY|ALPHA-WAY


In [569]:
lexicon = {
    'WAY': {
        'street': ['st', 'street'],
        'road':  ['rd', 'road'],
        'avenue': ['ave', 'avenue'],
        'court': ['ct']
        },
    'PRE': {
        'saint': ['st']
        },
    'ADDRESS': {
        ':adr'
        },
    'POBOX': {
        ':box'
    },
    'DIGIT': {
        'echo': []
    },
    'ALPHA': {
        'echo': []
    }
    
}

def get_normalized(word, _way, lexicon):
    for entry, lst_matches in lexicon[_way].items():
        if entry == 'echo':
            return word
        if word in lst_matches:
            return entry
    else:
        return None

print(get_normalized('st', 'PRE', lexicon))
print(get_normalized('st', 'WAY', lexicon))

saint
street


In [574]:
add = "333 st james st"
print(w(add))
st_lst = w(add)
print(encode_sentence(add))

# these are the sequences we seek... a word can have mutliple interpretations, but it is the 1:1 match
# which allows us to interpret.  A PRE does not make sense at the end of a sentence, but that 'knowledge'
# is stored in the sequemem. Not a very good way to go for self learning, but a much better way to generalize
# what we glean from examples.  And once you see a pattern once, you have that option moving forward.
## NEXT, try a convoluted example, and see if it can handle both in same sequemem.
# 333 st james st
# 444 albert james st
#### !!!!! ORRRRRR maybe we can use the 'active' to weed out the bad predicts.
# Right now we 'predict' three things if we have a DIGIT, but as soon as the next word appears, say we have
# DIGIT, which predicts ALPHA-PRE-WAY.... then when the next word shows up and it's a DIGIT, then the active
# state of PRE should dissapear.  Thus the current state of the sequemem should be DIGIT->ALPHA->DIGIT...
# but is that only when we do single entry? or 
valid_sequence = [["DIGIT"], ["PRE"], ["ALPHA"], ["WAY"]]
another_valid = [['DIGIT'], ['ALPHA'], ['ALPHA'],['WAY']]
print(valid_sequence)
print(another_valid)
for wrd, typ in zip(st_lst, valid_sequence):
    print(get_normalized(wrd, typ[0], lexicon))

['333', 'st', 'james', 'st']
[['DIGIT'], ['ALPHA', 'PRE', 'WAY'], ['ALPHA'], ['ALPHA', 'PRE', 'WAY']]
[['DIGIT'], ['PRE'], ['ALPHA'], ['WAY']]
[['DIGIT'], ['ALPHA'], ['ALPHA'], ['WAY']]
333
saint
james
street
