# Synthetic Addresses with GAN

The goal of this notebook is to find out how hard it is to generate synthetic addresses from existing addresses with the help of a GAN. The training data are the addresses collected by OpenAddresses for Portugal.

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LeakyReLU, Input, Flatten, LSTM, Bidirectional, TimeDistributed
from tensorflow.keras.optimizers import Adam

In [4]:
# Load the OpenAddresses dump for Portugal and keep a 15 % random sample of the
# rows. The fixed random_state makes the sample -- and therefore the vocabulary,
# the training tensor and every result below -- identical after a kernel restart.
open_address = pd.read_csv('./data/openaddr-collected-europe/pt/countrywide.csv').sample(frac=0.15, random_state=42)

In [5]:
# (rows, columns) of the sampled frame.
open_address.shape

(886671, 11)

This DataFrame is way too large: as it is, I do not have enough memory to work with it, so it will have to be downsampled to be useful.

In [6]:
# Column dtypes and non-null counts, to see which address fields are populated.
open_address.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 886671 entries, 5420398 to 2293475
Data columns (total 11 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   LON       886671 non-null  float64
 1   LAT       886671 non-null  float64
 2   NUMBER    661563 non-null  object 
 3   STREET    851397 non-null  object 
 4   UNIT      0 non-null       float64
 5   CITY      885473 non-null  object 
 6   DISTRICT  0 non-null       float64
 7   REGION    0 non-null       float64
 8   POSTCODE  886671 non-null  object 
 9   ID        886671 non-null  object 
 10  HASH      886671 non-null  object 
dtypes: float64(5), object(6)
memory usage: 81.2+ MB


In [7]:
# Show one random row as an example of a raw record.
open_address.sample()

Unnamed: 0,LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID,HASH
5212010,-8.49116,41.783872,46,R POMBAL,,GANDRA PTL,,,4990-640,pt.ine.add.PTCONT.5207942,1d4c6fcb006cdc1e


In [8]:
# Shape check repeated before feature engineering; no cell above changes the frame.
open_address.shape

(886671, 11)

For now, the only relevant information is the address, from which we will create sequences of Booleans. The final input shape will therefore be:

(number_of_samples x address_length x number_of_distinct_characters), where address_length is the length, in characters, of the longest address. Shorter addresses are padded at the end; in the one-hot tensor the padding positions simply stay all-False.

Every character that occurs in the data is eligible; the only normalization is that all strings are converted to title case.

### Feature Engineering

In [None]:
def _format_address(street, number, postcode, city):
    """Render one record as '<Street> <Number>, <Postcode> <City>' in title case.

    Missing fields are expected to arrive as empty strings. Whitespace runs are
    collapsed to a single space, and an empty half (no street and no number, or
    no postcode and no city) is dropped together with its comma, so the result
    never starts or ends with a stray ', '.
    """
    street_part = ' '.join(f'{street} {number}'.split())
    locality_part = ' '.join(f'{postcode} {city}'.split())
    return ', '.join(part for part in (street_part, locality_part) if part).title()


# Fill missing values on a copy of the four relevant columns only, so that the
# loaded frame stays untouched and this cell can be re-run safely.
construct_mat = open_address[['STREET', 'NUMBER', 'POSTCODE', 'CITY']].fillna('').values
addresses = tuple(_format_address(*row) for row in tqdm(construct_mat))

In [None]:
# Length, in characters, of the longest address; used as the sequence length.
maxlen = max(map(len, addresses))

In [None]:
# Vocabulary: every distinct character that occurs in any address. It is built
# with a set comprehension (no giant concatenated string) and then sorted, so
# the character <-> index mapping is the same in every kernel session. The
# iteration order of a bare set of strings can differ between sessions, which
# would make a saved generator impossible to decode later.
text = tuple(sorted({letter for address in addresses for letter in address}))

encoding = {letter: i for i, letter in enumerate(text)}
decoding = dict(enumerate(text))

# One-hot tensor of shape (n_addresses, maxlen, n_characters).
# NOTE(review): positions after the end of an address stay all-False; there is
# no dedicated padding character, so such a row decodes (via argmax) to index 0.
y = np.zeros((len(addresses), maxlen, len(encoding)), dtype=bool)
for i, address in enumerate(tqdm(addresses)):  # tqdm on the tuple -> bar knows the total
    for t, letter in enumerate(address):
        y[i, t, encoding[letter]] = True

In [None]:
# (n_addresses, maxlen, n_characters) of the one-hot training tensor.
y.shape

In [None]:
# Sequence length every address is padded to.
maxlen

### Make Generator

In [None]:
def _make_generator(noise_dim=90, learning_rate=2e-4):
    """Build the generator: a noise sequence in, a character distribution per step out.

    Parameters
    ----------
    noise_dim : int, default 90
        Width of the noise vector supplied at each of the ``maxlen`` time steps.
        The default keeps the input shape this notebook has always used.
    learning_rate : float, default 2e-4
        Step size of the Adam optimizer the model is compiled with. The
        previous hard-coded value of 0.1 is far above what Adam is normally
        run with; pass it explicitly to reproduce the old configuration.

    Returns
    -------
    tensorflow.keras.models.Sequential
        Compiled model mapping ``(batch, maxlen, noise_dim)`` to
        ``(batch, maxlen, len(encoding))``; the last axis is a softmax over
        the character vocabulary.

    Notes
    -----
    Reads the module-level ``maxlen`` and ``encoding``.
    """
    model = Sequential()

    model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(maxlen, noise_dim)))
    model.add(LeakyReLU(alpha=0.2))

    model.add(TimeDistributed(Dense(256)))
    model.add(LeakyReLU(alpha=0.2))

    model.add(Dense(512))
    model.add(LeakyReLU(alpha=0.2))

    # One probability per vocabulary character at every time step.
    model.add(Dense(len(encoding), activation='softmax'))

    # `learning_rate` is the documented keyword; `lr` is a deprecated alias.
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    return model

# Build one generator and print its layer table as a sanity check.
generator = _make_generator()
generator.summary()

In [None]:
# history = model.fit(
#     X,
#     y,
#     epochs=3,
#     batch_size=128,
#     validation_split=0.15,
#     shuffle=True
# )

In [None]:
# model.evaluate(X, y)

In [None]:
# X_test = np.zeros((len(addresses), maxlen), dtype=np.int64)
# for i in range(len(addresses)):
#     for t in range(X_test.shape[1]):
#         new = np.random.randint(10**10, 9*10**10)
#         X_test[i, t] = new
        
# X_test = X_test.reshape(len(addresses), maxlen, 1)

In [None]:
# y_hat = model.predict(X_test)

In [None]:
# def decode_prediction(matrix_row):
#     decoded = map(lambda unit: decoding[np.argmax(unit)], matrix_row)
    
#     return '.'.join(list(decoded))

In [None]:
# test = decode_prediction(y_hat[0])

In [None]:
# test

Obviously, the preliminary results of the generator are not what I expected. My hypothesis is that a generator was never meant to be trained that way: instead of an actual generator, what I built was the second half (the decoder) of a variational autoencoder.

In the next attempt I will create the generator and then train it ONLY at fooling the discriminator. I think that right now it is trying to learn a relationship between random noise and addresses, which does not exist. With the discriminator, it will learn how to manipulate random noise in a way that fools the discriminator.

### Make Discriminator

In [None]:
def _make_discriminator(learning_rate=2e-4):
    """Build the discriminator: a one-hot address sequence in, P(real) out.

    Parameters
    ----------
    learning_rate : float, default 2e-4
        Step size of the Adam optimizer used when the discriminator is trained
        on its own. The previous hard-coded value of 0.1 is far above what
        Adam is normally run with; pass it explicitly to reproduce the old
        configuration.

    Returns
    -------
    tensorflow.keras.models.Sequential
        Compiled binary classifier mapping ``(batch, maxlen, len(encoding))``
        to ``(batch, 1)`` through a sigmoid.

    Notes
    -----
    Reads the module-level ``maxlen`` and ``encoding``.
    """
    model = Sequential()

    model.add(Bidirectional(LSTM(256, return_sequences=True), input_shape=(maxlen, len(encoding))))
    model.add(LeakyReLU(alpha=0.2))

    model.add(TimeDistributed(Dense(256)))
    model.add(LeakyReLU(alpha=0.2))

    # Collapse (time, features) into one vector for the final classifier.
    # The LeakyReLU after Flatten is applied on top of the previous one, so
    # negative activations are scaled twice; kept as in the original design.
    model.add(Flatten())
    model.add(LeakyReLU(alpha=0.2))

    model.add(Dense(1, activation='sigmoid'))

    # `learning_rate` is the documented keyword; `lr` is a deprecated alias.
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=optimizer)

    return model

# Build one discriminator and print its layer table as a sanity check.
discriminator = _make_discriminator()
discriminator.summary()

### Make GAN

In [None]:
def create_gan(discriminator, generator):
    """Stack generator and discriminator into the model that trains the generator.

    Parameters
    ----------
    discriminator : keras model
        Already-compiled discriminator. Side effect: its ``trainable`` flag is
        set to ``False`` so that it is frozen inside the combined model.
    generator : keras model
        Generator whose output is fed straight into the discriminator.

    Returns
    -------
    tensorflow.keras.models.Model
        Compiled model mapping generator noise to the discriminator's verdict,
        trained with binary cross-entropy and the default Adam optimizer.
    """
    discriminator.trainable = False

    # Read the noise shape from the generator instead of repeating a literal,
    # so the stacked model always matches the generator it wraps.
    gan_input = Input(shape=generator.input_shape[1:])
    generated = generator(gan_input)
    gan_output = discriminator(generated)

    gan = Model(inputs=gan_input, outputs=gan_output)
    gan.compile(loss='binary_crossentropy', optimizer='adam')
    return gan
# Assemble the combined model from the two networks built above and inspect it.
gan = create_gan(discriminator,generator)
gan.summary()

### Train GAN

Given the most recent results, I am pretty sure I will have to enable the discriminator to analyse sequences instead of single vectors (duh...).

In [None]:
epochs = 1
batch_size = 128
# Number of discriminator/generator update pairs per epoch. The loop used to
# iterate over `range(batch_size)`, which silently tied the step count to the
# batch size; the value is unchanged but is now an independent knob.
steps_per_epoch = 128

# Start from freshly initialised networks.
generator = _make_generator()
discriminator = _make_discriminator()
gan = create_gan(discriminator, generator)
losses = []

# Take the noise shape from the generator so the two cannot drift apart.
noise_shape = (batch_size, *generator.input_shape[1:])

# Targets are identical for every step, so build them once.
# Discriminator batch layout: first half real (1), second half generated (0).
y_dis = np.zeros(2 * batch_size, dtype=np.float32)
y_dis[:batch_size] = 1
# The generator is rewarded when the discriminator calls its output real.
y_gen = np.ones(batch_size, dtype=np.float32)

for e in range(1, epochs + 1):
    print(f'Epoch: {e}')
    for _ in tqdm(range(steps_per_epoch)):
        # --- discriminator step ---
        noise = np.random.normal(0, 1, noise_shape)
        # predict_on_batch runs a single forward pass; Model.predict is meant
        # for large inputs and is not intended to be called inside a loop.
        fake_stuff = generator.predict_on_batch(noise)
        real_addresses = y[np.random.randint(low=0, high=y.shape[0], size=batch_size)]
        X = np.concatenate([real_addresses, fake_stuff])
        discriminator.trainable = True
        discriminator.train_on_batch(X, y_dis)

        # --- generator step (through the frozen discriminator) ---
        noise = np.random.normal(0, 1, noise_shape)
        discriminator.trainable = False
        acc_batch_loss = gan.train_on_batch(noise, y_gen, reset_metrics=False)
        losses.append(acc_batch_loss)

In [None]:
# 500 noise sequences for sampling. The trailing dimensions are read from the
# generator itself: the previous literal shape [500, maxlen, 1] did not match
# the (maxlen, 90) input the generator is built with.
test_noise = np.random.normal(0, 1, [500, *generator.input_shape[1:]])

In [None]:
# Run the trained generator on the test noise.
test_fake_stuff = generator.predict(test_noise)

In [None]:
# Shape of the generator output for the test noise.
test_fake_stuff.shape

In [None]:
def decode_synths(synth):
    """Turn generator output back into strings.

    For every sample in ``synth`` the index of the largest value at each time
    step is looked up in the module-level ``decoding`` table, and the resulting
    characters are concatenated in order.

    Returns a list with one decoded string per sample; a progress bar is shown
    while iterating over the samples.
    """
    return [
        ''.join(decoding[np.argmax(step)] for step in sample)
        for sample in tqdm(synth)
    ]

In [None]:
# Decode every generated sample into a string.
test_prediction = decode_synths(test_fake_stuff)

In [None]:
# Look at the first decoded sample.
test_prediction[0]

In [None]:
!mkdir -p saved_model
generator.save('saved_model/generator')
discriminator.save('saved_model/discriminator')
gan.save('saved_model/gan')

Obviously, the results are garbage. I have now some theories why they are like that:
- Batch size is too large. A lower batch size might increase diversity.
- Base data is too small. Larger data set might alleviate this issue.
- Something with the batch processing. If nothing else helps, I would like to try to train the models directly instead of in a loop.

More Results:
- Until now, I could not really test any new approaches, because the model very quickly gets too big for memory.