In [1]:
import os
import re
from hydraseq import Hydraseq

## basic string breaker "word word2" -> ['word', 'word2']

In [2]:
def load_category_from_file(fpath):
    """Take a one word per line file and return a regex for the concatenation '^(w1|w2)$'"""
    with open(fpath, 'r') as source:
        ways = [line.strip().lower() for line in source]
    return r'^(' + "|".join(ways) + r')$'

def w(str_sentence):
    return re.findall(r"[\w'/-:]+|[.,!?;#&]", str_sentence)


In [3]:
seq = Hydraseq('input')

## Encoder, takes a single word and returns a list of 'features' characterizing word

In [4]:
ways = load_category_from_file('ways.csv')
apts = load_category_from_file('apts.csv')
nths = load_category_from_file('nth.csv')
dirs = load_category_from_file('dirs.csv')
arti = load_category_from_file('arti.csv')
pre  = load_category_from_file('pre.csv')
deleg = load_category_from_file('deleg.csv')
print(apts)
def encoder(word, trim=False):
    encodings = [
        ('POBOX', [r'^:box$']),
        ('LETTER', [r'^[a-z]$']),
        ('ALNUM', [r'^(\d+[a-z]+|[a-z]+\d+)[\da-z]*$']),
        ('AL_1NUM', [ r'^\d+[a-z]$' ]),
        ('NTH',    [ nths ]),
        ('ATTN', [r'^:deleg$']),
        ('WAY',  [ ways ]),
        ('DIR',  [ dirs ]),
        ('NUMSTR', [r'^\d+[a-z]+$' ]),
        ('APT', [ apts ]),
        ('DIGIT', [r'^\d+$']),
        ('COMMA', [r'^,$']),
        ('PERIOD', [r'^\.$']),
        ('POUND', [r'^#$']),
        ('DELEG', [r'^attn$', r'^attn:$', r'^c\/o$', r'^co$' ]),
        ('POB0', [r'^po$', r'^p\.o\.$']),
        ('POB2', [r'^box$']),
        ('PRE',  [ pre ]),
        ('ARTI', [ arti ]),
        ('ADDRESS', [r'^:adr$']),
        ('ALPHA', [r'^[a-z]+$']),
    ]
    if not trim:
        return [key for key, rexs in encodings for rex in rexs if re.match(rex, word)]
    else:
        encoding = [key for key, rexs in encodings for rex in rexs if re.match(rex, word)]
        if len(encoding) > 1 and 'ALPHA' in encoding:
            encoding.remove('ALPHA')
        if len(encoding) > 1 and 'ALNUM' in encoding:
            encoding.remove('ALNUM')
        return encoding
            

^(apartment|apt|suite|ste|unit|lot|trlr|ro|spc|rm|fl|lbby|pmb|spc|)$


In [5]:
# THOU SHALL NOT SEQUENCE THREE ALPHAS IN A ROW!
def train_with_provided_list(seq, matrix_lst):
    """matrix list means [['DIGIT'],['ALPHA'],['WAY']] for example"""
    return seq.insert(matrix_lst, is_learning=True).get_next_values()
tuples_training = [
("123 main st",                    [['DIGIT'], ['ALPHA'], ['WAY']]),
("1217 iris ct",                   [['DIGIT'], ['ALPHA'], ['WAY']]),
("123 east main st",               [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY']]),
("310 W MAIN ST",                  [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY']]),
("123 south main street.",         [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY'], ['PERIOD']]),
("90 south park rd",               [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY']]),
("987 el canyon ave",              [['DIGIT'], ['ARTI'], ['ALPHA'], ['WAY']]),
("333 st james st",                [['DIGIT'], ['PRE'], ['ALPHA'], ['WAY']]),
("123 west hill",                  [['DIGIT'], ['DIR'], ['ALPHA']]),
("77 el camino real",              [['DIGIT'], ['ARTI'], ['ALPHA'], ['ALPHA']]),
("1 bay st # b",                   [['DIGIT'], ['ALPHA'], ['WAY'], ['POUND'], ['LETTER']]),
("1 bay rd # b",                   [['DIGIT'], ['ALPHA'], ['WAY'], ['POUND'], ['LETTER']]),
("43 HUDSON AVE # 2",              [['DIGIT'], ['ALPHA'], ['WAY'], ['POUND'], ['DIGIT']]),
("111 san vicente blvd",           [['DIGIT'], ['PRE'], ['ALPHA'], ['WAY']]),
("28 black watch way",             [['DIGIT'], ['ALPHA'], ['ALPHA'], ['WAY']]),
("249 route 206",                  [['DIGIT'], ['WAY'], ['DIGIT']]),
("5500 lost tree",                 [['DIGIT'], ['ALPHA'], ['ALPHA']]),
("996 san benito st",              [['DIGIT'], ['PRE'], ['ALPHA'], ['WAY']]),
("899 embarcadero",                [['DIGIT'], ['ALPHA']]),
("177 montague e",                 [['DIGIT'], ['ALPHA'], ['DIR']]),
("2001 W COURT ST",                [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY']]),
("12545 S HIGHWAY J",              [['DIGIT'], ['DIR'], ['WAY'], ['LETTER']]),
("437 S HIGHWAY 101 STE 501",      [['DIGIT'], ['DIR'], ['WAY'], ['DIGIT'], ['APT'], ['DIGIT']]),
("12940 S HIGHWAY 259",            [['DIGIT'], ['DIR'], ['WAY'], ['DIGIT']]),
("775 E 14 MILE RD",               [['DIGIT'], ['DIR'], ['DIGIT'], ['ALPHA'], ['WAY']]),
("4701 VAN DAM ST",                [['DIGIT'], ['ALPHA'], ['ALPHA'], ['WAY']]),
("3201 NEW MEXICO AVE NW STE 246", [['DIGIT'], ['ALPHA'], ['ALPHA'], ['WAY'], ['DIR'], ['APT'], ['DIGIT']]),
("111 RIO RANCHO BLVD SE",         [['DIGIT'], ['ALPHA'], ['ALPHA'], ['WAY'], ['DIR']]),
("17259 WILD HORSE CREEK RD",      [['DIGIT'], ['ALPHA'], ['ALPHA'], ['ALPHA'], ['WAY']]),
("12587 FAIR LAKES CIR STE 141",   [['DIGIT'], ['ALPHA'], ['ALPHA'], ['WAY'], ['APT'], ['DIGIT']]),
("107 E MARIPOSA DR",              [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY']]),
("401 W FORT WILLIAMS ST",         [['DIGIT'], ['DIR'], ['ALPHA'], ['ALPHA'], ['WAY']]),
("3242 E DESERT INN RD STE 17",    [['DIGIT'], ['DIR'], ['ALPHA'], ['ALPHA'], ['WAY'], ['APT'], ['DIGIT']]),
("14238 44ND AVE",                 [['DIGIT'], ['NUMSTR'], ['WAY']]),
("106 HARVEST HILL LN",            [['DIGIT'], ['ALPHA'], ['ALPHA'], ['WAY']]),
("206 ELMHURST DR APT F15",        [['DIGIT'], ['ALPHA'], ['WAY'], ['APT'], ['ALNUM']]),
("655 COUNTY ROAD 1300",           [['DIGIT'], ['ALPHA'], ['WAY'], ['DIGIT']]),
("50 LOS PATOS WAY",               [['DIGIT'], ['ARTI'], ['ALPHA'], ['WAY']]),
("1001 N 60TH ST",                 [['DIGIT'], ['DIR'], ['NUMSTR'], ['WAY']]),
("5350 S SANTA FE DR",             [['DIGIT'], ['DIR'], ['ALPHA'], ['ALPHA'], ['WAY']]),
("638 18TH ST",                    [['DIGIT'], ['NUMSTR'], ['WAY']]),
("210 E 3RD AVE",                  [['DIGIT'], ['DIR'], ['NUMSTR'], ['WAY']]),
("1390 SW 160TH AVE",              [['DIGIT'], ['DIR'], ['NUMSTR'], ['WAY']]),
("171 NE 212TH ST",                [['DIGIT'], ['DIR'], ['NUMSTR'], ['WAY']]),
("712 B MAIN ST",                  [['DIGIT'], ['LETTER'], ['ALPHA'], ['WAY']]),
("2060 B MOUNTAIN BLVD",           [['DIGIT'], ['LETTER'], ['ALPHA'], ['WAY']]),
("123 avenue c",                   [['DIGIT'], ['WAY'], ['LETTER']]),
("717B GRAVES ST",                 [['ALNUM'], ['ALPHA'], ['WAY']]),
("1234 7TH AVENUE",                [['DIGIT'], ['NTH'], ['WAY']]),
("205 3RD AVE APT 3K",             [['DIGIT'], ['NTH'], ['WAY'], ['APT'], ['ALNUM']]),
("462 7TH AVE FL 2",               [['DIGIT'], ['NTH'], ['WAY'], ['ALPHA'], ['DIGIT']]),
    ("219 2ND AVE SE STE 101",     [['DIGIT'], ['NTH'], ['WAY'], ['DIR'], ['APT'], ['DIGIT']]),
    ("10859 12TH AVE SW",          [['DIGIT'], ['NTH'], ['WAY'], ['DIR']]),
    ("4401 LOOP 322",              [['DIGIT'], ['ALPHA'], ['DIGIT']]),
    ("8690 EDGE OF TEXAS",         [['DIGIT'], ['ALPHA'], ['ARTI'], ['ALPHA']]),
    ("487 RITCHIE HWY B102",       [['DIGIT'], ['ALPHA'], ['WAY'], ['ALNUM']]),
    ("366 RAILROAD AVE LOT 31", [['DIGIT'], ['ALPHA'], ['WAY'], ['APT'], ['DIGIT']]),
    ("1631A S MAIN ST",         [['AL_1NUM'],['DIR'],['ALPHA'],['WAY']]),
    ("338A 7TH AVE", [['AL_1NUM'],['NTH'],['WAY']]),
    #("3RD ST", [['NTH'],['WAY']]), THIS IS IN INCOPLETE ADDRESS
    ("1771A PRINCESS ANNE RD", [['AL_1NUM'], ['ALPHA'], ['ALPHA'], ['WAY']]),
    ("751 ROUTE 37 W", [['DIGIT'], ['WAY'], ['DIGIT'], ['DIR']]),
    ("73140 HIGHWAY 111 STE 6", [['DIGIT'], ['WAY'], ['DIGIT'], ['APT'], ['DIGIT']]),
    ("476534 HIGHWAY 95 STE A", [['DIGIT'], ['WAY'], ['DIGIT'], ['APT'], ['LETTER']]),
    ("225 LA PALOMA APT A", [['DIGIT'], ['ARTI'], ['ALPHA'], ['APT'], ['LETTER']]), # DIFF!
    ("3200 SISK RD STE C", [['DIGIT'], ['ALPHA'], ['WAY'], ['APT'], ['LETTER']]), # DIFF!
    ("2584 US HIGHWAY 80 W", [['DIGIT'], ['ALPHA'], ['WAY'], ['DIGIT'], ['DIR']]), # DIFF!
    ("2387 PORTOLA RD STE A", [['DIGIT'], ['ALPHA'], ['WAY'], ['APT'], ['LETTER']]), # DIFF!
    ("118 N 300 W", [['DIGIT'], ['DIR'], ['DIGIT'], ['DIR']]),
    ("1027 S WESTERN AVE # 2", [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY'], ['POUND'], ['DIGIT']]),
    ("690 S STATE HIGHWAY 5", [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY'], ['DIGIT']]),
    ("3070 W CHAPMAN AVE STE A", [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY'], ['APT'], ['LETTER']]),
    ("1420 E PLAZA BLVD STE D5", [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY'], ['APT'], ['ALNUM']]),
    ("82 S MAIN ST # 2", [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY', 'PRE'], ['POUND'], ['DIGIT']]),
    ("473 S MAIN ST STE A", [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY'], ['APT'], ['LETTER']]),
    ("1029 W BATTLEFIELD ST APT E214", [['DIGIT'], ['DIR'], ['ALPHA'], ['WAY', 'PRE'], ['APT'],['ALNUM']]),
    ("3843 S BRISTOL 213", [['DIGIT'], ['DIR'], ['ALPHA'], ['DIGIT']]),
]
for ts, matrix_lst in tuples_training:
    matrix_lst.append(['ADDRESS'])
    train_with_provided_list(seq, matrix_lst)

# valid po box
train_samples = [
    ("po box 1234", [['POB0'],['POB2'],['DIGIT']]),
    ("box 999",      [['POB2'],['DIGIT']]),
]
for ts, matrix_lst in train_samples:
    matrix_lst.append(['POBOX'])
    train_with_provided_list(seq, matrix_lst)

# valid attn
train_samples = [
    ("c/o john smith",     [['DELEG'],['ALPHA'],['ALPHA']]),
    ("attn john smith",    [['DELEG'],['ALPHA'],['ALPHA']]),
    ("attn: john smith",   [['DELEG'],['ALPHA'],['ALPHA']]),
    ("c/o john",           [['DELEG'],['ALPHA']]),
]
for ts, matrix_lst in train_samples:
    matrix_lst.append(['ATTN'])
    train_with_provided_list(seq, matrix_lst)

In [6]:
def encode_from_word_list(arr_st):
    """Expects ['123', 'main', 'st]"""
    assert isinstance(arr_st, list)
    if arr_st: assert isinstance(arr_st[0], str)
    return [encoder(word) for word in arr_st]

def is_address(seq, arr_st):
    """Expects ["123","main","st"]"""
    return any([pred == 'ADDRESS' for pred in seq.look_ahead(encode_from_word_list(arr_st)).get_next_values()])

def is_pobox(seq, arr_st):
    """Expects ["123","main","st"]"""
    assert isinstance(arr_st, list)
    return any([pred == 'POBOX' for pred in seq.look_ahead(encode_from_word_list(arr_st)).get_next_values()])

sent = "po box 7001"
assert is_pobox(seq, w(sent)) == True

def is_deleg(seq, arr_st):
    """Expects ["123","main","st"]"""
    assert isinstance(arr_st, list)
    return any([pred == 'ATTN' for pred in seq.look_ahead(encode_from_word_list(arr_st)).get_next_values()])

sent = "attn john doe"
assert is_deleg(seq, w(sent)) == True


In [7]:
%%time
def get_markers(seq, sent, lst_targets):
    """Input is like '123 main str' and returns a list of lists"""
    arr_w = w(sent)
    idx_tail = len(arr_w)
    markers = []
    for idx_beg in range(idx_tail):
        for idx_end in range(idx_beg + 1, idx_tail +1):
            next_values = seq.look_ahead(encode_from_word_list(arr_w[idx_beg:idx_end])).get_next_values()
            matches = list(set(next_values) & set(lst_targets) )
            if matches:
                markers.append([idx_beg, idx_end, idx_end - idx_beg, matches, ' '.join(arr_w[idx_beg:idx_end])])
    return markers


#st = "c/o john smith 100 maple ave"
st = "333 st james st"
get_markers(seq, st, ['ADDRESS'])

CPU times: user 989 µs, sys: 36 µs, total: 1.03 ms
Wall time: 1.06 ms


In [9]:
def return_max_address(seq, sent):
    sent = str(sent).lower().strip()
    arr_cands = get_markers(seq, sent, ['ADDRESS'])
    if not arr_cands:
        return str([encoder(word) for word in w(sent)])
    
    max_len = 0
    for cand in arr_cands:
        if cand[2] > max_len:
            max_len = cand[2]
            candidate_address = cand[4]
    
    candidate_address = candidate_address.upper()
    if candidate_address != sent.upper():
        return '("", {}),'.format(str([encoder(word, trim=True) for word in w(sent)]))
    else:
        return candidate_address

return_max_address(seq, "16w661 90th st")

"[['ALNUM'], ['ALNUM', 'NTH', 'NUMSTR'], ['WAY', 'PRE', 'ALPHA']]"

In [10]:
field1 = "280 NW 123RD LN"
field2 = "288 NE 53RD ST"
print(return_max_address(seq, field2))

288 NE 53RD ST


In [11]:
import pandas as pd

In [12]:
df = pd.read_csv('originals.csv')

In [13]:
df_addresses = df[['ACCT_STREET_ADDR']]
df_addresses.drop_duplicates(keep='first', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
%%time
df_addresses['NEW ADDRESS'] = df_addresses['ACCT_STREET_ADDR'].apply(lambda x: return_max_address(seq, x))

CPU times: user 40.9 s, sys: 302 ms, total: 41.2 s
Wall time: 41.6 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [15]:
df_addresses.head()

Unnamed: 0,ACCT_STREET_ADDR,NEW ADDRESS
0,11706 SAN VICENTE BLVD,11706 SAN VICENTE BLVD
1,28 BLACK WATCH WAY,28 BLACK WATCH WAY
2,385 S MAIN ST,385 S MAIN ST
3,7808 CHASEWOOD LOOP,7808 CHASEWOOD LOOP
4,996 SAN BENITO ST,996 SAN BENITO ST


In [16]:
df_addresses.to_csv('processed_addresses.csv')
! open 'processed_addresses.csv'

In [17]:
from collections import Counter
wcount = Counter()

def count_words(ctr, st):
    arr_words = w(st)
    for idx, word in enumerate(arr_words):
        if not word.isdigit():
            ctr[word] += 1
            bigram = arr_words[idx:idx+2]
            #print(arr_words[idx:idx+2])
            if len(bigram) == 2:
                ctr["-".join(bigram)] += 1
    return st

df_addresses['ACCT_STREET_ADDR'].apply(lambda x: count_words(wcount, x))
wcount.most_common()

[('ST', 10932),
 ('AVE', 8265),
 ('RD', 7797),
 ('DR', 5148),
 ('STE', 5006),
 ('W', 3286),
 ('S', 3177),
 ('E', 3153),
 ('BLVD', 3029),
 ('N', 2990),
 ('BOX', 2101),
 ('PO', 2092),
 ('PO-BOX', 2092),
 ('LN', 1653),
 ('MAIN', 1363),
 ('MAIN-ST', 1306),
 ('CT', 1132),
 ('RD-STE', 1009),
 ('APT', 987),
 ('ST-STE', 924),
 ('WAY', 861),
 ('AVE-STE', 846),
 ('HIGHWAY', 750),
 ('HWY', 731),
 ('#', 704),
 ('BLVD-STE', 648),
 ('PL', 561),
 ('CIR', 545),
 ('A', 520),
 ('UNIT', 504),
 ('B', 472),
 ('DR-STE', 467),
 ('NW', 466),
 ('PKWY', 454),
 ('NE', 453),
 ('STATE', 435),
 ('PARK', 434),
 ('BROADWAY', 414),
 ('SW', 362),
 ('ROUTE', 343),
 ('SE', 333),
 ('HILL', 310),
 ('STE-A', 299),
 ('LAKE', 295),
 ('WASHINGTON', 286),
 ('CREEK', 284),
 ('OLD', 272),
 ('C', 270),
 ('VALLEY', 265),
 ('STE-B', 265),
 ('ST-APT', 256),
 ('ROAD', 246),
 ('US', 246),
 ('OAK', 224),
 ('RIVER', 223),
 ('TRL', 222),
 ('COUNTY', 218),
 ('D', 209),
 ('US-HIGHWAY', 209),
 ('SAN', 200),
 ('RIDGE', 195),
 ('N-MAIN', 193),

In [18]:
df_addresses['NEW ADDRESS'].value_counts()

[['POB0', 'ALPHA'], ['POB2', 'ALPHA'], ['DIGIT']]                                                2059
("", [['DIGIT'], ['ALPHA'], ['WAY'], ['DIR']]),                                                   306
("", [['DIGIT'], ['ALPHA'], ['WAY'], ['LETTER', 'DIR']]),                                         233
("", [['DIGIT'], ['LETTER', 'DIR'], ['NTH', 'NUMSTR'], ['WAY', 'PRE'], ['APT'], ['DIGIT']]),      127
("", [['DIGIT'], ['ALPHA'], ['ALPHA'], ['WAY'], ['APT'], ['LETTER']]),                            110
("", [['DIGIT'], ['DIGIT'], ['ALPHA'], ['WAY']]),                                                  86
("", [['DIGIT'], ['ALPHA'], ['WAY', 'PRE'], ['DIR']]),                                             74
("", [['DIGIT'], ['ALPHA'], ['ALPHA'], ['ALPHA']]),                                                67
("", [['DIGIT'], ['ALPHA'], ['WAY'], ['ALPHA']]),                                                  56
("", [['DIGIT'], ['ALPHA'], ['WAY'], ['DIR'], ['APT'], ['DIGIT']]),               

In [19]:
paths = sorted([pattern for st, pattern in tuples_training])
print(len(paths))

def convert_to_str_rep(path):
    rep = []
    for sub_lst in path:
        rep.append("-".join(sub_lst))
    return "|".join(rep)

arr_paths = []
for path in paths:
    if len(path) > 0:
        arr_paths.append(convert_to_str_rep(path))
    
#assert len(arr_paths) == len(list(set(arr_paths))), "should be unique"
print("UNIQUE PATHS: ", len(set(arr_paths)))
lst_paths = sorted(list(set(arr_paths)))
for path in set(arr_paths):
    print(path)
    

76
UNIQUE PATHS:  57
DIGIT|ARTI|ALPHA|WAY|ADDRESS
DIGIT|DIR|ALPHA|DIGIT|ADDRESS
DIGIT|ALPHA|ALPHA|ALPHA|WAY|ADDRESS
DIGIT|DIR|ALPHA|WAY|PERIOD|ADDRESS
DIGIT|NTH|WAY|APT|ALNUM|ADDRESS
DIGIT|NTH|WAY|DIR|ADDRESS
DIGIT|DIR|ALPHA|WAY|DIGIT|ADDRESS
DIGIT|DIR|ALPHA|WAY-PRE|APT|ALNUM|ADDRESS
ALNUM|ALPHA|WAY|ADDRESS
AL_1NUM|DIR|ALPHA|WAY|ADDRESS
DIGIT|DIR|ALPHA|ADDRESS
DIGIT|LETTER|ALPHA|WAY|ADDRESS
DIGIT|ALPHA|WAY|APT|ALNUM|ADDRESS
DIGIT|DIR|ALPHA|WAY|POUND|DIGIT|ADDRESS
DIGIT|DIR|ALPHA|ALPHA|WAY|APT|DIGIT|ADDRESS
DIGIT|NTH|WAY|ALPHA|DIGIT|ADDRESS
DIGIT|DIR|ALPHA|WAY|APT|ALNUM|ADDRESS
DIGIT|ALPHA|ALPHA|ADDRESS
DIGIT|ALPHA|WAY|APT|DIGIT|ADDRESS
DIGIT|ALPHA|ALPHA|WAY|APT|DIGIT|ADDRESS
DIGIT|ALPHA|WAY|POUND|LETTER|ADDRESS
DIGIT|DIR|ALPHA|WAY|APT|LETTER|ADDRESS
DIGIT|NTH|WAY|ADDRESS
DIGIT|DIR|ALPHA|WAY-PRE|POUND|DIGIT|ADDRESS
AL_1NUM|NTH|WAY|ADDRESS
DIGIT|PRE|ALPHA|WAY|ADDRESS
DIGIT|ALPHA|DIGIT|ADDRESS
DIGIT|ALPHA|WAY|DIGIT|DIR|ADDRESS
DIGIT|ALPHA|WAY|ADDRESS
DIGIT|DIR|WAY|DIGIT|ADDRESS
DIGIT|ALPH

In [20]:
lexicon = {
    'WAY': {
        'street': ['st', 'street'],
        'road':  ['rd', 'road'],
        'avenue': ['ave', 'avenue'],
        'court': ['ct']
        },
    'PRE': {
        'saint': ['st']
        },
    'ADDRESS': {
        ':adr'
        },
    'POBOX': {
        ':box'
    },
    'DIGIT': {
        'echo': []
    },
    'ALPHA': {
        'echo': []
    }
    
}

def get_normalized(word, _way, lexicon):
    for entry, lst_matches in lexicon[_way].items():
        if entry == 'echo':
            return word
        if word in lst_matches:
            return entry
    else:
        return None

print(get_normalized('st', 'PRE', lexicon))
print(get_normalized('st', 'WAY', lexicon))

saint
street


In [21]:
# def encode_sentence(st):
#     return [encoder(word.lower()) for word in w(st)]

# def encode_nested_list(st):
#     return [[' '.join(encoder(word))] for word in w(st)]

# def train(seq, sts):
#     return seq.insert(encode_nested_list(sts)).get_next_values()



# def train_arr(seq, arr_w):
#     return seq.insert([[' '.join(encoder(word))] for word in arr_w]).get_next_values()

# def think(seq, sts):
#     return seq.insert(encode_nested_list(sts), is_learning=False).get_next_values()

# st = "100 n main st"
# print(train(seq, st))
# print(train_arr(seq, ["1217", "iris", "ct"]))
# print(train_with_provided_list(seq, [['DIGIT'],['ALPHA']]))
# print(train_with_provided_list(seq, [['DIGIT'], ['ALPHA'], ['WAY'], ['ADDRESS']]))
# print(encode_from_word_list(['123','st','james','st']))