In [41]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding

In [42]:
# define documents
data = pd.read_csv('data\ZIP_TRACT_122021.csv',dtype={
                     'zip': str,
                     'tract': str,
                     'usps_zip_pref_city': str,
                     'usps_zip_pref_state': str,
                     'res_ratio': float,
                     'bus_ratio': float,
                     'oth_ratio': float,
                     'tot_ratio': float})




In [43]:
data = data.drop(['res_ratio', 'bus_ratio', 'oth_ratio'], axis=1)

In [44]:
docs = data[['zip', 'tract',  'usps_zip_pref_city', 'usps_zip_pref_state']].apply(lambda x: ' '.join(x), axis=1)

In [45]:
labels = data['tot_ratio']

In [46]:
# integer encode the documents
vocab_size = 131322
encoded_docs = [one_hot(d, vocab_size) for d in docs]
# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='linear'))
# compile the model
model.compile(optimizer='adam', loss='mean_squared_error')
# summarize the model
print(model.summary())


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 4, 8)              1050576   
                                                                 
 flatten_5 (Flatten)         (None, 32)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1,050,609
Trainable params: 1,050,609
Non-trainable params: 0
_________________________________________________________________
None


In [47]:
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=2)

Epoch 1/50
5381/5381 - 20s - loss: 0.0905 - 20s/epoch - 4ms/step
Epoch 2/50
5381/5381 - 19s - loss: 0.0691 - 19s/epoch - 4ms/step
Epoch 3/50
5381/5381 - 19s - loss: 0.0589 - 19s/epoch - 4ms/step
Epoch 4/50
5381/5381 - 19s - loss: 0.0540 - 19s/epoch - 4ms/step
Epoch 5/50
5381/5381 - 19s - loss: 0.0510 - 19s/epoch - 4ms/step
Epoch 6/50
5381/5381 - 19s - loss: 0.0489 - 19s/epoch - 4ms/step
Epoch 7/50
5381/5381 - 19s - loss: 0.0473 - 19s/epoch - 4ms/step
Epoch 8/50
5381/5381 - 20s - loss: 0.0461 - 20s/epoch - 4ms/step
Epoch 9/50
5381/5381 - 20s - loss: 0.0452 - 20s/epoch - 4ms/step
Epoch 10/50
5381/5381 - 20s - loss: 0.0443 - 20s/epoch - 4ms/step
Epoch 11/50
5381/5381 - 20s - loss: 0.0436 - 20s/epoch - 4ms/step
Epoch 12/50
5381/5381 - 20s - loss: 0.0430 - 20s/epoch - 4ms/step
Epoch 13/50
5381/5381 - 20s - loss: 0.0425 - 20s/epoch - 4ms/step
Epoch 14/50
5381/5381 - 20s - loss: 0.0421 - 20s/epoch - 4ms/step
Epoch 15/50
5381/5381 - 20s - loss: 0.0417 - 20s/epoch - 4ms/step
Epoch 16/50
5381/53

<keras.callbacks.History at 0x1b843c84eb0>

In [49]:
# evaluate the model
loss = model.evaluate(padded_docs, labels, verbose=2)
print(loss)

5381/5381 - 7s - loss: 0.0342 - 7s/epoch - 1ms/step
0.03421665355563164


In [54]:
embedding_layer = model.get_layer('embedding_5')

In [59]:
embeddings = embedding_layer.get_weights()[0]

In [60]:
embeddings.shape

(131322, 8)

In [61]:
padded_docs

array([[131184,  29458, 126416,  47504],
       [ 14742,  29458, 126416,  47504],
       [106321,  29458, 126416,  47504],
       ...,
       [ 38618,  49494,  10488,   4509],
       [ 67356, 123963,  47199,  58532],
       [ 67356,  77114,  47199,  58532]])

In [62]:
encoded_docs 

[[23000, 131184, 29458, 126416, 47504],
 [23000, 14742, 29458, 126416, 47504],
 [23000, 106321, 29458, 126416, 47504],
 [23000, 116376, 29458, 126416, 47504],
 [23000, 7760, 29458, 126416, 47504],
 [23000, 7150, 29458, 126416, 47504],
 [23000, 104667, 29458, 126416, 47504],
 [23000, 95516, 29458, 126416, 47504],
 [23000, 14784, 29458, 126416, 47504],
 [23000, 117439, 29458, 126416, 47504],
 [73402, 63517, 29458, 84478, 47504],
 [73402, 17052, 29458, 84478, 47504],
 [73402, 42840, 29458, 84478, 47504],
 [73402, 1386, 29458, 84478, 47504],
 [73402, 7668, 29458, 84478, 47504],
 [73402, 73972, 29458, 84478, 47504],
 [73402, 42107, 29458, 84478, 47504],
 [73402, 120852, 29458, 84478, 47504],
 [73402, 43173, 29458, 84478, 47504],
 [73402, 68140, 29458, 84478, 47504],
 [73402, 13030, 29458, 84478, 47504],
 [73402, 15123, 29458, 84478, 47504],
 [73402, 95359, 29458, 84478, 47504],
 [73402, 106406, 29458, 84478, 47504],
 [73402, 93861, 29458, 84478, 47504],
 [88261, 129072, 30630, 35173],
 [882