In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# define documents
data = pd.read_csv('data\ZIP_TRACT_122021.csv',dtype={
                     'zip': str,
                     'tract': str,
                     'usps_zip_pref_city': str,
                     'usps_zip_pref_state': str,
                     'res_ratio': float,
                     'bus_ratio': float,
                     'oth_ratio': float,
                     'tot_ratio': float})

In [3]:
data = data.drop(['res_ratio', 'bus_ratio', 'oth_ratio'], axis=1)
docs = data[['zip', 'tract',  'usps_zip_pref_city', 'usps_zip_pref_state']].apply(lambda x: ' '.join(x), axis=1)
labels = data['tot_ratio']

In [4]:
# integer encode the documents
vocab_size = 131322
# create the tokenizer
t = Tokenizer()
# fit the tokenizer on the documents
t.fit_on_texts(docs)

In [5]:
encoded_docs = t.texts_to_sequences(docs)

In [6]:
# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [7]:
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='linear'))
# compile the model
model.compile(optimizer='adam', loss='mean_squared_error')
# summarize the model
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 8)              1050576   
                                                                 
 flatten (Flatten)           (None, 32)                0         
                                                                 
 dense (Dense)               (None, 10)                330       
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 1,050,917
Trainable params: 1,050,917
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
# fit the model
model.fit(padded_docs, labels, epochs=250, verbose=2)

Epoch 1/250
5381/5381 - 22s - loss: 0.0881 - 22s/epoch - 4ms/step
Epoch 2/250
5381/5381 - 21s - loss: 0.0600 - 21s/epoch - 4ms/step
Epoch 3/250
5381/5381 - 21s - loss: 0.0450 - 21s/epoch - 4ms/step
Epoch 4/250
5381/5381 - 21s - loss: 0.0371 - 21s/epoch - 4ms/step
Epoch 5/250
5381/5381 - 21s - loss: 0.0309 - 21s/epoch - 4ms/step
Epoch 6/250
5381/5381 - 21s - loss: 0.0258 - 21s/epoch - 4ms/step
Epoch 7/250
5381/5381 - 21s - loss: 0.0221 - 21s/epoch - 4ms/step
Epoch 8/250
5381/5381 - 21s - loss: 0.0192 - 21s/epoch - 4ms/step
Epoch 9/250
5381/5381 - 22s - loss: 0.0171 - 22s/epoch - 4ms/step
Epoch 10/250
5381/5381 - 21s - loss: 0.0156 - 21s/epoch - 4ms/step
Epoch 11/250
5381/5381 - 21s - loss: 0.0144 - 21s/epoch - 4ms/step
Epoch 12/250
5381/5381 - 21s - loss: 0.0135 - 21s/epoch - 4ms/step
Epoch 13/250
5381/5381 - 21s - loss: 0.0127 - 21s/epoch - 4ms/step
Epoch 14/250
5381/5381 - 21s - loss: 0.0121 - 21s/epoch - 4ms/step
Epoch 15/250
5381/5381 - 21s - loss: 0.0116 - 21s/epoch - 4ms/step
Epoc

5381/5381 - 23s - loss: 0.0054 - 23s/epoch - 4ms/step
Epoch 124/250
5381/5381 - 24s - loss: 0.0054 - 24s/epoch - 4ms/step
Epoch 125/250
5381/5381 - 24s - loss: 0.0053 - 24s/epoch - 4ms/step
Epoch 126/250
5381/5381 - 23s - loss: 0.0053 - 23s/epoch - 4ms/step
Epoch 127/250
5381/5381 - 23s - loss: 0.0053 - 23s/epoch - 4ms/step
Epoch 128/250
5381/5381 - 23s - loss: 0.0053 - 23s/epoch - 4ms/step
Epoch 129/250
5381/5381 - 23s - loss: 0.0053 - 23s/epoch - 4ms/step
Epoch 130/250
5381/5381 - 23s - loss: 0.0053 - 23s/epoch - 4ms/step
Epoch 131/250
5381/5381 - 23s - loss: 0.0053 - 23s/epoch - 4ms/step
Epoch 132/250
5381/5381 - 23s - loss: 0.0053 - 23s/epoch - 4ms/step
Epoch 133/250
5381/5381 - 23s - loss: 0.0053 - 23s/epoch - 4ms/step
Epoch 134/250
5381/5381 - 23s - loss: 0.0053 - 23s/epoch - 4ms/step
Epoch 135/250
5381/5381 - 23s - loss: 0.0053 - 23s/epoch - 4ms/step
Epoch 136/250
5381/5381 - 27s - loss: 0.0053 - 27s/epoch - 5ms/step
Epoch 137/250
5381/5381 - 26s - loss: 0.0053 - 26s/epoch - 5ms

Epoch 244/250
5381/5381 - 25s - loss: 0.0049 - 25s/epoch - 5ms/step
Epoch 245/250
5381/5381 - 66s - loss: 0.0049 - 66s/epoch - 12ms/step
Epoch 246/250
5381/5381 - 70s - loss: 0.0049 - 70s/epoch - 13ms/step
Epoch 247/250
5381/5381 - 23s - loss: 0.0049 - 23s/epoch - 4ms/step
Epoch 248/250
5381/5381 - 22s - loss: 0.0049 - 22s/epoch - 4ms/step
Epoch 249/250
5381/5381 - 24s - loss: 0.0049 - 24s/epoch - 4ms/step
Epoch 250/250
5381/5381 - 33s - loss: 0.0049 - 33s/epoch - 6ms/step


<keras.callbacks.History at 0x1db08a94c40>

In [9]:
# evaluate the model
loss = model.evaluate(padded_docs, labels, verbose=2)
print(loss)

5381/5381 - 12s - loss: 0.0049 - 12s/epoch - 2ms/step
0.004895442631095648


In [11]:
embedding_layer = model.get_layer('embedding')

In [12]:
embeddings = embedding_layer.get_weights()[0]

In [13]:
embeddings.shape

(131322, 8)

In [14]:
type(embeddings)

numpy.ndarray

In [15]:
np.savetxt("embeddings_model1.csv", embeddings, delimiter=",")