In [2]:
import hydraseq as hseq
import re

In [3]:
# Sanity check: with '$' anchoring the pattern at the end of the string, the
# recorded result is the single match ['martin'].
re.findall(r'martin$', 'martin')

['martin']

In [37]:
def w(str_sentence):
    """Split a sentence into word and punctuation tokens.

    Input: "123 main st"  ->  output: ['123', 'main', 'st']

    A word token is a run of word characters plus the in-word punctuation
    ' / : and -. Each of , ! ? ; # & becomes a token of its own. Any other
    character (whitespace included) separates tokens and is dropped.
    """
    # The hyphen is placed last in the class so it is a literal '-'. Written as
    # "/-:" it forms a range from '/' to ':' (slash, digits, colon), so a hyphen
    # inside a word was never matched and split the token in two.
    return re.findall(r"[\w'/:-]+|[,!?;#&]", str_sentence)
assert w("123 main st") == ['123','main','st'],"123 main st"

def isItAnAlpha(strt):
    """Tag a purely alphabetic token as 'STR_NAME'; any other token yields None."""
    if strt.isalpha():
        return 'STR_NAME'
    return None

def isItADigit(strt):
    """Tag an all-digit token as 'STR_NUM'; any other token yields None."""
    if strt.isdigit():
        return 'STR_NUM'
    return None

def isSpecialWord(start):
    """Return the token itself when it is one of the reserved name words, else None."""
    reserved = ('martin', 'luther', 'king', 'jr')
    if start in reserved:
        return start
    return None

def isUnitType(strt):
    """Tag a unit keyword ('apt', 'suite' or 'unit') as 'UNT_TYPE'; otherwise None."""
    known_units = ('apt', 'suite', 'unit')
    if strt in known_units:
        return 'UNT_TYPE'
    return None

def isUnitNumber(strt):
    """Tag a token as 'UNT_NUM' when it ends in digits or is one lowercase letter.

    Any other token yields None.
    """
    ends_in_digits = re.search(r'\d+$', strt) is not None
    if ends_in_digits or re.search(r'^[a-z]$', strt) is not None:
        return 'UNT_NUM'
    return None

def isPOType(strt):
    """Tag the two words of "po box": 'po' -> 'PO0', 'box' -> 'PO1', anything else -> None."""
    if strt == 'po':
        return 'PO0'
    return 'PO1' if strt == 'box' else None


# Token classifiers, applied in this order to each word by encode(). Each one
# returns a truthy label for a token it recognises and None otherwise.
sensors = [
    isItAnAlpha,
    isItADigit,
    isSpecialWord,
    isUnitType,
    isUnitNumber,
    isPOType
]


#sense_active = zip(activations, sensors)

def encode(words):
    """Tag every word with the labels of the sensors that recognise it.

    Input: ['w1', 'w2']
    Output: one list per word holding the truthy results of the functions in
    ``sensors``, in sensor order; a word no sensor recognises gets [].
    """
    # Run each sensor once per word and filter afterwards. Filtering with
    # ``if func(word)`` and then emitting ``func(word)`` called it twice per hit.
    return [
        [label for label in (func(word) for func in sensors) if label]
        for word in words
    ]
def encode_sentence(sentence):
    """Run encode() over each word list of a sentence and collect the results.

    Input: [['w1'],['w2']], also [['w1a','w2a'],['w1b','w2b']]
    """
    encoded_groups = []
    for words in sentence:
        encoded_groups.append(encode(words))
    return encoded_groups

# Demo: print the sensor tags produced for one tokenised example.
encode_examples = [
    ['123', 'martin']
]
for example in encode_examples:
    print('Example: ', example)
    print('Encoded: ', encode(example))
    # NOTE(review): encode_sentence() is fed the tag lists returned by encode(),
    # not word lists, so the tags themselves are run through the sensors
    # (the recorded output shows mostly empty lists as a result).
    print('Encode Sentence: ', encode_sentence(encode(example)))


Example:  ['123', 'martin']
Encoded:  [['STR_NUM', 'UNT_NUM'], ['STR_NAME', 'martin']]
Encode Sentence:  [[[], []], [[], ['STR_NAME', 'martin']]]


In [38]:
# Teach the sequencer the tag patterns. The last element of each inserted
# sequence is the label that the recorded runs show being returned by
# get_next_values() after the preceding tags (e.g. 'ADDRESS', 'POBOX').
seq = hseq.Hydraseq('input')
seq.insert([['STR_NUM'], ['STR_NAME'], ['ADDRESS']])
seq.insert([['STR_NUM'],['martin'],['luther'],['king'],['jr'],['ADDRESS']])
seq.insert([['UNT_TYPE'],['UNT_NUM'],['UNIT']])
seq.look_ahead([['STR_NUM'],['STR_NAME']])
# NOTE(review): this insert() runs between the look_ahead() above and the
# print below, and the recorded print shows [] - presumably insert() resets
# the active state; confirm against hydraseq.
seq.insert([['PO0'],['PO1'],['STR_NUM'],['POBOX']])
print(seq.get_next_values())
# NOTE(review): this look_ahead starts at 'martin', without the leading
# STR_NUM of the inserted sequence; the recorded result is [].
seq.look_ahead([['martin'],['luther'],['king'],['jr']])
seq.get_next_values()


[]


[]

In [39]:
# Minimal check on a fresh sequencer: insert one three-step sequence, feed its
# first two steps, and read the prediction (recorded result: ['_ADDRESS_']).
seq2 = hseq.Hydraseq('first')
seq2.insert([['DIGIT'],['ALPHA'],['_ADDRESS_']])
seq2.look_ahead([['DIGIT'],['ALPHA']])
seq2.get_next_values()

['_ADDRESS_']

In [40]:
# End-to-end check: tokenise with w(), tag with encode(), then ask the trained
# sequencer what follows (recorded result: ['ADDRESS']).
t1_encoded = encode(w("123 main"))
print(t1_encoded)
seq.look_ahead(t1_encoded)
seq.get_next_values()

[['STR_NUM', 'UNT_NUM'], ['STR_NAME']]


['ADDRESS']

In [41]:
def get_interpretation(seq, arr_words):
    """Encode the words, feed them to the sequencer, and return its next values.

    INPUT: ['123', 'main']
    Prints the encoded words and the returned values as a trace.
    """
    encoded = encode(arr_words)
    print("encoded: ", encoded)
    seq.look_ahead(encoded)
    predicted = seq.get_next_values()
    print("returning ", predicted)
    return predicted

# Demo: interpret a tokenised PO-box string (recorded result: ['POBOX']).
#get_interpretation(seq, w("123 martin luther king jr"))
get_interpretation(seq, w("po box 123"))

encoded:  [['STR_NAME', 'PO0'], ['STR_NAME', 'PO1'], ['STR_NUM', 'UNT_NUM']]
returning  ['POBOX']


['POBOX']

In [42]:
def decompose_into_dictionary_words(domain, _seq, types):
    """Record, per position, the length of a recognised run of tokens ending there.

    Args:
        domain: list of tokens, e.g. ['po', 'box', '600'].
        _seq: sequencer handed to get_interpretation().
        types: labels that count as a recognised run, e.g. ['ADDRESS', 'UNIT'];
            a single label may also be given as a plain string.

    Returns:
        A list as long as ``domain``. Entry i holds the length of the run
        ending at token i that get_interpretation() mapped to one of
        ``types``, or 0 when no such run was found.

    Prints a trace of every interpretation attempt.
    """
    # A bare string would turn ``ret_type in types`` into a substring test
    # ('ADDR' or '' would "match" 'ADDRESS'), so wrap it to compare whole labels.
    if isinstance(types, str):
        types = [types]

    last_length = [0] * len(domain)
    print("Length Domain: ", len(domain))
    for i in range(len(domain)):
        print("INPUT TO GET_INTERPRETATION: ", domain[:i + 1])
        # First try the whole prefix domain[0..i] as one run.
        inter1 = get_interpretation(_seq, domain[:i + 1])
        if any(ret_type in types for ret_type in inter1):
            last_length[i] = i + 1

        if last_length[i] == 0:
            # Otherwise try the suffixes domain[j+1..i], longest first, and
            # keep the first one that is recognised.
            for j in range(i):
                inter2 = get_interpretation(_seq, domain[j + 1:i + 1])
                print("SUB INPUT for INTERPRET: ", inter2)
                # NOTE(review): the previous version also tested
                # ``last_length[j] != -1``, which is always true because entries
                # start at 0. If the intent was to require that the prefix
                # ending at j is itself decomposable, the test should be
                # ``last_length[j] != 0`` - confirm before tightening.
                if any(ret_type in types for ret_type in inter2):
                    last_length[i] = i - j
                    break
        print("last_length: ", last_length)
    return last_length
#     decompositions = []
#     if last_length[-1] != 0:
#         idx = len(domain) - 1
#         while idx >= 0:
#             decompositions.append(domain[idx + 1 - last_length[idx]:idx + 1])
#             idx -= last_length[idx]
#         decompositions = decompositions[::-1]

#     return decompositions

# Demo: a PO box followed by a street address and a suite; the recorded
# result is [0, 0, 3, 0, 2, 0, 0, 0].
#decompose_into_dictionary_words(['apt','123','123','martin','luther','king','jr','apt','b'], seq, ['ADDRESS', 'UNIT'])
decompose_into_dictionary_words(['po', 'box', '600', '17', 'mile', 'rd','ste','19'], seq, ['ADDRESS', 'UNIT', 'POBOX'])

Length Domain:  8
INPUT TO GET_INTERPRETATION:  ['po']
encoded:  [['STR_NAME', 'PO0']]
returning  ['PO1']
last_length:  [0, 0, 0, 0, 0, 0, 0, 0]
INPUT TO GET_INTERPRETATION:  ['po', 'box']
encoded:  [['STR_NAME', 'PO0'], ['STR_NAME', 'PO1']]
returning  ['STR_NUM']
encoded:  [['STR_NAME', 'PO1']]
returning  []
SUB INPUT for INTERPRET:  []
last_length:  [0, 0, 0, 0, 0, 0, 0, 0]
INPUT TO GET_INTERPRETATION:  ['po', 'box', '600']
encoded:  [['STR_NAME', 'PO0'], ['STR_NAME', 'PO1'], ['STR_NUM', 'UNT_NUM']]
returning  ['POBOX']
last_length:  [0, 0, 3, 0, 0, 0, 0, 0]
INPUT TO GET_INTERPRETATION:  ['po', 'box', '600', '17']
encoded:  [['STR_NAME', 'PO0'], ['STR_NAME', 'PO1'], ['STR_NUM', 'UNT_NUM'], ['STR_NUM', 'UNT_NUM']]
returning  []
encoded:  [['STR_NAME', 'PO1'], ['STR_NUM', 'UNT_NUM'], ['STR_NUM', 'UNT_NUM']]
returning  []
SUB INPUT for INTERPRET:  []
encoded:  [['STR_NUM', 'UNT_NUM'], ['STR_NUM', 'UNT_NUM']]
returning  []
SUB INPUT for INTERPRET:  []
encoded:  [['STR_NUM', 'UNT_NUM']]
r

[0, 0, 3, 0, 2, 0, 0, 0]

In [224]:
# Two separate sequencers, named '00' and '01'; the next cell trains each one
# on a single pattern.
seq_00 = hseq.Hydraseq('00')
seq_01 = hseq.Hydraseq('01')

In [225]:
# seq_00 learns number + name -> '_ADR_STREET_'; seq_01 learns
# apartment type + number -> '_ADR_APT_'.
# NOTE(review): the 'APT_TYPE' tag is not produced by any encoder visible in
# this notebook - confirm which encoder is meant to feed seq_01.
seq_00.insert([['STR_NUM'],['STR_NAME'],['_ADR_STREET_']])
seq_01.insert([['APT_TYPE'],['APT_NUM'],['_ADR_APT_']])

uuid: 01
n_init: <node: (*),(*)>:
active values: []
next values: []

In [234]:
# Pass the accepted labels as a list: with a bare string, the function's
# ``ret_type in types`` check becomes a substring test instead of a label match.
decompose_into_dictionary_words(["123","main","123","broadway"], seq, ['ADDRESS'])

Length Domain:  4
INPUT TO GET_INTERPRETATION:  ['123']
encoded:  [['DIGIT']]
returning  ['ALPHA', 'martin']
INPUT TO GET_INTERPRETATION:  ['123', 'main']
encoded:  [['DIGIT'], ['ALPHA']]
returning  ['ADDRESS']
INPUT TO GET_INTERPRETATION:  ['123', 'main', '123']
encoded:  [['DIGIT'], ['ALPHA'], ['DIGIT']]
returning  []
encoded:  [['ALPHA'], ['DIGIT']]
returning  []
encoded:  [['DIGIT']]
returning  ['ALPHA', 'martin']
INPUT TO GET_INTERPRETATION:  ['123', 'main', '123', 'broadway']
encoded:  [['DIGIT'], ['ALPHA'], ['DIGIT'], ['ALPHA']]
returning  []
encoded:  [['ALPHA'], ['DIGIT'], ['ALPHA']]
returning  []
encoded:  [['DIGIT'], ['ALPHA']]
returning  ['ADDRESS']


[0, 2, 0, 2]

In [228]:
# look_ahead() is chained directly into get_next_values() here; the recorded
# result is ['_ADR_STREET_'].
seq_00.look_ahead([['STR_NUM'], ['STR_NAME']]).get_next_values()

['_ADR_STREET_']

In [230]:
# Inspect the sequencer's internal table; the recorded output is a defaultdict
# mapping each tag to a list of nodes.
seq_00.columns

defaultdict(list,
            {'STR_NAME': [<node: STR_NAME,(*)>STR_NUM>STR_NAME>],
             'STR_NUM': [<node: STR_NUM,(*)>STR_NUM>],
             '_ADR_STREET_': [<node: _ADR_STREET_,(*)>STR_NUM>STR_NAME>_ADR_STREET_>]})

In [31]:
def encoder(word, trim=True):
    """Return the category keys whose patterns match ``word``.

    Each category pairs a key with a list of patterns; the key is emitted once
    per pattern that ``re.match`` accepts (matching is anchored at the start of
    the word), in table order.

    With ``trim`` true, a generic key is dropped when a more specific key
    matched as well: 'ALPHA' gives way to the word-level categories, 'ALNUM'
    to 'NUMS_1AL'/'NUMSTR'/'NTH', and 'NUMSTR' to 'NUMS_1AL'/'NTH'.

    The patterns ways, wordways, apts, arti, sp_arti, pre, dirs and nths are
    read from the enclosing module.
    """
    encodings = [
        # LETTERS ONLY
        ('ALPHA', [r'^[a-z]+$']),
        ('LETTER', [r'^[a-z]$']),
        ('WAY', [ways]),
        ('WORDWAY', [wordways]),
        ('APT', [apts]),
        ('ARTI', [arti]),
        ('SP_ARTI', [sp_arti]),
        ('PRE', [pre]),
        ('DIR', [dirs]),
        ('POB2', [r'^box$']),
        ('DELEG', [r'^attn$', r'^attn:$', r'^c\/o$', r'^co$']),
        ('POB0', [r'^po$', r'^p\.o\.$']),

        # NUMBERS ONLY
        ('DIGIT', [r'^\d+$']),

        # MIXED LETTERS AND NUMBERS
        ('ALNUM', [r'^(\d+[a-z]+|[a-z]+\d+)[\da-z]*$']),
        ('NUMSTR', [r'^\d+[a-z]+$']),
        ('NTH', [nths]),
        ('NUMS_1AL', [r'^\d+[a-z]$']),
        ('APT_NUM', [r'apt\d+$', r'unit\d+$', r'bldg\d+$', r'ste\d+$', r'suite\d+$']),

        # SYMBOLS ONLY
        ('COMMA', [r'^,$']),
        ('PERIOD', [r'^\.$']),
        ('POUND', [r'^#$']),

        # INTERNAL MARKERS
        ('ADDRESS', [r'^:adr$']),
        ('ATTN', [r'^:deleg$']),
        ('POBOX', [r'^:box$']),
    ]

    matched = []
    for key, patterns in encodings:
        for pattern in patterns:
            if re.match(pattern, word):
                matched.append(key)

    if not trim:
        return matched

    def _drop_generic(generic, specifics):
        # Redundant category level if we have a probable, more specific meaning.
        if generic in matched and any(key in specifics for key in matched):
            matched.remove(generic)

    # Order matters: 'NUMSTR' may justify dropping 'ALNUM' before it is itself dropped.
    _drop_generic('ALPHA', ['SP_ARTI', 'LETTER', 'WORDWAY', 'WAY', 'APT', 'ARTI',
                            'PRE', 'DIR', 'DELEG', 'POB2', 'POB0'])
    _drop_generic('ALNUM', ['NUMS_1AL', 'NUMSTR', 'NTH'])
    _drop_generic('NUMSTR', ['NUMS_1AL', 'NTH'])
    return matched

In [7]:
# NOTE(review): insert() is given plain strings here, whereas the other cells
# pass lists of single-tag lists - presumably Hydraseq tokenises a string
# itself; confirm against the hydraseq documentation.
seq.insert("martin luther king jr WAY_NAME")
# NOTE(review): inserts an empty sentence; looks like leftover scratch work.
seq.insert("")

uuid: <bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x104b2b358>>
n_init: <node: (*),(*)>:
active values: []
next values: []

In [None]:
# Word lists intended for use when encoding tokens (this cell has not been run).
# Presumably street-type abbreviations - verify against the encoder that consumes them.
way_types = [
    "dr",
    "st",
    "ct",
    "rd",
]

# The same name words that isSpecialWord() recognises, minus 'jr'.
words = [
    "martin",
    "luther",
    "king"
]

