# Address Parser

Goal: As a data scientist, I want to build a model that can extract the street name, house number, postal code, and city from an arbitrary address.

Approach:
- Construct simple, standardized training addresses
- Test first iteration of model on this training set
- Introduce random permutations of addresses
- Test and iterate on the model until it handles the random permutations

Source for addresses: https://openaddresses.io/

In [1]:
import sys

import pandas as pd
import numpy as np
from tqdm import tqdm

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, RNN, Bidirectional, TimeDistributed, LeakyReLU, ReLU
from tensorflow.keras.optimizers import Adam

In [2]:
# Load the country-wide `pt` OpenAddresses CSV and keep a random subset of rows.
# A fixed random_state makes the subset (and everything derived from it)
# identical after a kernel restart.
open_addresses = pd.read_csv('data/openaddr-collected-europe/pt/countrywide.csv').sample(250000, random_state=42)

In [3]:
# Size reported by the interpreter for the sampled frame, converted from bytes
# to gigabytes for display.
frame_bytes = sys.getsizeof(open_addresses)
frame_bytes * 1e-9

0.117055876

### Create Addresses

In [4]:
# Preview the first five sampled rows to see which columns are populated.
open_addresses.head(n=5)

Unnamed: 0,LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID,HASH
4410514,-8.235179,39.479835,56,R CALDEIRAS,,RIO DE MOINHOS ABT,,,2200-788,pt.ine.add.PTCONT.4596805,9606f35c2b1b19d0
3848961,-8.684895,41.225435,957,R 31 DE JANEIRO,,PERAFITA,,,4455-584,pt.ine.add.PTCONT.4239550,4673b8ab0f970c7f
5823198,-25.637087,37.756201,68,CAN FAIAS,,PONTA DELGADA,,,9500-701,pt.ine.add.AC26.39272,eccf9fa4195ad2c9
1821027,-7.704664,37.09322,,R 25 DE ABRIL,,LUZ TVR,,,8800-113,pt.ine.add.PTCONT.1819207,08aefd1d61ac0310
3266093,-9.085827,38.876213,B,R ALMADA NEGREIROS,,VIALONGA,,,2625-638,pt.ine.add.PTCONT.2858732,908eda230b212261


In [5]:
# Missing fields become empty strings so they format as nothing instead of "nan".
open_addresses = open_addresses.fillna('')
const_matrix = open_addresses[['STREET', 'NUMBER', 'POSTCODE', 'CITY']].values

# Render every row as "<street> <number>, <postcode> <city>" in lower case.
# The replace() collapses " , " to ", ", e.g. when NUMBER is empty.
addresses_raw = tuple(
    f'{street} {number}, {postcode} {city}'.lower().replace(' , ', ', ')
    for street, number, postcode, city in tqdm(const_matrix)
)

# Right-pad each address with spaces up to the length of the longest one.
maxlen = max(map(len, addresses_raw))
addresses = tuple(raw.ljust(maxlen) for raw in tqdm(addresses_raw))

100%|██████████| 250000/250000 [00:00<00:00, 621211.61it/s]
100%|██████████| 250000/250000 [00:00<00:00, 757994.33it/s]


In [6]:
# Build the character vocabulary. Sorting makes the character <-> index mapping
# deterministic: iterating a bare set of strings can yield a different order in
# each interpreter session, which would silently change the one-hot layout.
# Joining with " " guarantees the space character is part of the vocabulary.
text = tuple(sorted(set(" ".join(addresses))))
encoding = {char: index for index, char in enumerate(text)}
decoding = {index: char for index, char in enumerate(text)}
del text

# One-hot encode: X[i, t, k] is True when character t of address i has index k.
X = np.zeros(shape=(len(addresses), maxlen, len(encoding)), dtype=bool)

# Wrapping the sized tuple (rather than the enumerate object) lets tqdm show
# the total and an ETA.
for i, row in enumerate(tqdm(addresses)):
    for t, letter in enumerate(row):
        X[i, t, encoding[letter]] = True

250000it [00:12, 20414.32it/s]


In [7]:
# The targets are an independent copy of the one-hot inputs.
y = X.copy(order='C')

In [8]:
def decode_matrix(matrix):
    """Turn a batch of per-character score vectors back into characters.

    Args:
        matrix: array-like indexed as [sample, position, class]. For every
            position the class holding the largest value is selected.

    Returns:
        A list with one tuple of characters per sample, looked up through the
        module-level ``decoding`` mapping.
    """
    # One argmax over the last axis replaces a separate np.argmax call for
    # every single character.
    best_classes = np.argmax(matrix, axis=-1)
    decoded = [
        tuple(decoding[class_index] for class_index in row)
        for row in tqdm(best_classes)
    ]

    return decoded

### Simple Model

In [9]:
# Three stacked LSTM layers (512 -> 256 -> 128 units), each returning the full
# sequence and followed by a LeakyReLU, then a per-timestep softmax over the
# character vocabulary.
simple = Sequential()

simple.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(encoding))))
simple.add(LeakyReLU())

simple.add(LSTM(256, return_sequences=True))
simple.add(LeakyReLU())

simple.add(LSTM(128, return_sequences=True))
simple.add(LeakyReLU())

# TimeDistributed applies the same Dense classifier at every character position.
simple.add(TimeDistributed(Dense(len(encoding), activation='softmax')))


# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
optimizer = Adam(learning_rate=0.01)

simple.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
simple.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 90, 512)           1177600   
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 90, 512)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 90, 256)           787456    
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 90, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 90, 128)           197120    
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 90, 128)           0         
_________________________________________________________________
time_distributed (TimeDistri (None, 90, 62)            7

In [None]:
# Fit the model on the one-hot sequences; 10% of the samples are held out for
# validation and the training data is reshuffled every epoch.
history = simple.fit(
    x=X,
    y=y,
    batch_size=128,
    epochs=5,
    shuffle=True,
    validation_split=0.1,
)

Train on 225000 samples, validate on 25000 samples
Epoch 1/5
 19072/225000 [=>............................] - ETA: 57:07 - loss: 1.6859 - accuracy: 0.6420