# Address Parser

Goal: As a data scientist (DS), I want to create a model that can extract the street name, the house number, the postal code, and the city from an arbitrary address.

Approach:
- Construct simple, standardized training addresses
- Test first iteration of model on this training set
- Introduce random permutations of addresses
- Test and iterate on the model so that it can deal with the random permutations

In [1]:
import sys

import pandas as pd
import numpy as np
from tqdm import tqdm

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, RNN, Bidirectional, TimeDistributed, LeakyReLU, ReLU
from tensorflow.keras.optimizers import Adam

In [2]:
# Load the address dump and keep a random sample of 250,000 rows.
# The sample is seeded so that a fresh "Restart & Run All" draws the same rows
# (and therefore produces the same training data) every time.
RANDOM_STATE = 42
open_addresses = pd.read_csv('data/openaddr-collected-europe/pt/countrywide.csv').sample(
    250000, random_state=RANDOM_STATE
)

In [3]:
# Size of the sampled frame as reported by sys.getsizeof, scaled from bytes to
# gigabytes (factor 1e-9); the scaled value is the cell's displayed result.
size_in_bytes = sys.getsizeof(open_addresses)
size_in_bytes * 1e-9

0.117071163

### Create Addresses

In [4]:
# Peek at the first five sampled rows to see which columns are available.
open_addresses.head(n=5)

Unnamed: 0,LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID,HASH
2921991,-9.230775,38.719125,22,R INÁCIO DUARTE,,CARNAXIDE,,,2790-226,pt.ine.add.PTCONT.2370307,e5a23ef05cfd9f32
668456,-8.177823,41.438743,895,AV S JORGE,,FAFE,,,4820-120,pt.ine.add.PTCONT.838517,1d2a392c1b4ab747
4090780,-8.754557,41.379241,1A,R SENHOR DO BONFIM,,PÓVOA DE VARZIM,,,4490-583,pt.ine.add.PTCONT.3590234,964c673865ecf110
5809156,-25.28367,37.851574,20,R MAGANA,,ACHADINHA,,,9630-043,pt.ine.add.AC26.16652,ea34b6a9a91231f4
9344,-8.449765,40.555953,113,TV ROMPIDAS,,RECARDÃES,,,3750-725,pt.ine.add.PTCONT.86070,86a9b0e0e247906e


In [5]:
# Missing values become empty strings so they format as nothing instead of "nan".
open_addresses = open_addresses.fillna('')
const_matrix = open_addresses[['STREET', 'NUMBER', 'POSTCODE', 'CITY']].values

# Render every row as "<street> <number>, <postcode> <city>" in lower case.
# The replace() collapses the " , " left behind when the house number is empty.
addresses_raw = tuple(
    f'{street} {number}, {postcode} {city}'.lower().replace(' , ', ', ')
    for street, number, postcode, city in tqdm(const_matrix)
)

# Right-pad every address with spaces to the length of the longest one so that
# all sequences share a common length.
maxlen = max(map(len, addresses_raw))
addresses = tuple(raw.ljust(maxlen) for raw in tqdm(addresses_raw))

100%|██████████| 250000/250000 [00:00<00:00, 895085.03it/s]
100%|██████████| 250000/250000 [00:00<00:00, 2521639.80it/s]


In [6]:
# Build the character vocabulary from every character that occurs in the
# padded addresses. The characters are sorted so each one maps to the same
# index on every run: iterating a bare set of strings can yield a different
# order in each interpreter session (string hashing is randomised), which would
# silently change the encoding between kernel restarts.
alphabet = tuple(sorted(set(" ".join(addresses))))
encoding = {char: index for index, char in enumerate(alphabet)}  # char -> index
decoding = dict(enumerate(alphabet))                             # index -> char
del alphabet

# One-hot tensor: X[i, t, k] is True when position t of address i holds the
# character whose index is k.
X = np.zeros(shape=(len(addresses), maxlen, len(encoding)), dtype=bool)

# tqdm wraps the sequence itself (not the enumerate iterator) so the progress
# bar knows the total number of addresses.
for i, row in enumerate(tqdm(addresses)):
    for t, letter in enumerate(row):
        X[i, t, encoding[letter]] = True

250000it [00:04, 54806.04it/s]


In [31]:
def decode_matrix(matrix):
    """Map a batch of per-position score vectors back to characters.

    For every row of ``matrix`` and every vector inside that row, the index of
    the largest entry (``np.argmax``) is looked up in the module-level
    ``decoding`` table.

    Returns:
        A list with one tuple of decoded symbols per row of ``matrix``.
        A tqdm progress bar tracks the rows while decoding.
    """
    def to_symbol(scores):
        return decoding[np.argmax(scores)]

    return [tuple(to_symbol(step) for step in row) for row in tqdm(matrix)]

### Simple Model

In [32]:
# Character-level sequence model: three stacked LSTM layers of decreasing
# width, each followed by a LeakyReLU, and a per-timestep softmax over the
# character vocabulary. Every LSTM uses return_sequences=True so the network
# emits one prediction for each of the `maxlen` input positions.
simple = Sequential()

simple.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(encoding))))
simple.add(LeakyReLU())

simple.add(LSTM(256, return_sequences=True))
simple.add(LeakyReLU())

simple.add(LSTM(128, return_sequences=True))
simple.add(LeakyReLU())

# TimeDistributed applies the same Dense/softmax layer to every timestep.
simple.add(TimeDistributed(Dense(len(encoding), activation='softmax')))


# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# triggers a warning and is rejected by recent Keras versions.
optimizer = Adam(learning_rate=0.01)

simple.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
simple.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 90, 512)           1179648   
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 90, 512)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 90, 256)           787456    
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 90, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 90, 128)           197120    
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 90, 128)           0         
_________________________________________________________________
time_distributed (TimeDistri (None, 90, 63)            8

In [33]:
# Train the stacked-LSTM model on the one-hot encoded addresses, holding out
# 10% of the samples for validation.
# NOTE(review): `y_city` is not defined in any cell shown in this notebook —
# confirm it is created before this cell so a fresh "Restart & Run All" works.
history = simple.fit(
    x=X,
    y=y_city,
    validation_split=0.1,
    shuffle=True,
    epochs=10,
    batch_size=128,
)

Train on 225000 samples, validate on 25000 samples
Epoch 1/10
 22528/225000 [==>...........................] - ETA: 40:51 - loss: 0.4726 - accuracy: 0.8963

KeyboardInterrupt: 