In [3]:
%load_ext autoreload
%autoreload

## This expands a notebook to full width
from IPython.display import display, HTML

display(HTML("<style>.container { width:90% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))
## Show Python Version
import sys
print("Python: {0}".format(sys.version))

## Show Current Time
import datetime as dt
start = dt.datetime.now()
print("Notebook Last Run Initiated: "+str(start))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Python: 3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]
Notebook Last Run Initiated: 2022-06-14 15:44:02.866754


In [4]:
import warnings
warnings.filterwarnings("ignore")

seed = 23

import numpy as np
np.random.seed(seed)

from tensorflow.compat.v1 import set_random_seed 
set_random_seed(seed)

import pandas as pd

import tensorflow as tf

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Model

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold
from sklearn.metrics.pairwise import cosine_similarity

from scipy.spatial.distance import cdist

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
%matplotlib inline

In [5]:
# Input vector, returns nearest word(s)
def Cosine_Similarity(word,weight,word_to_index,vocab_size,index_to_word):
    
    #Get the index of the word from the dictionary
    index = word_to_index[word]
    
    #Get the correspondin weights for the word
    word_vector_1 = weight[index]
    
    
    word_similarity = {}

    for i in range(vocab_size):
        
        j = i
        
        word_vector_2 = weight[j]
        
        theta_sum = np.dot(word_vector_1, word_vector_2)
        theta_den = np.linalg.norm(word_vector_1) * np.linalg.norm(word_vector_2)
        theta = theta_sum / theta_den
        
        word = index_to_word[j]
        word_similarity[word] = theta
    
    return word_similarity #words_sorted

In [6]:
# define documents
data = pd.read_csv('data\ZIP2LATLON_VER1.csv',dtype={
                   'postal_code': str,
                   'country_code': str,
                   'place': str,
                   'state': str,
                   'statecode': str,
                   'province_or_county': str,
                   'province_or_countycode': str,
                   'latitude': float,
                   'longitude': float})

data['postal_code'] = data['postal_code'].str.zfill(5)

In [7]:
#data = data.fillna(0)

In [8]:
target = data[['latitude','longitude']]
data = data[['postal_code','country_code','place','state','statecode','province_or_county','province_or_countycode']]

In [9]:
target = StandardScaler().fit_transform(target.values)

In [10]:
#data['province_or_county'].replace([0], 'NONE ', inplace=True)

In [11]:
docs = data.iloc[:,0:6].apply(lambda x: ' '.join(x), axis=1)

In [12]:
# integer encode the documents
vocab_size = 61837
# create the tokenizer
t = Tokenizer()
# fit the tokenizer on the documents
t.fit_on_texts(docs)

In [13]:
encoded_docs = t.texts_to_sequences(docs)

In [14]:
# pad documents to a max length of 7 words
max_length = 6
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [15]:
def BuildModel(learning_rate):
        
    input_text  = tf.keras.layers.Input(shape=(max_length,),name='text')

    x = tf.keras.layers.Embedding(vocab_size, 20, input_length=max_length,name='embeddings')(input_text)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(40, activation='relu',name='act01')(x)
    output = tf.keras.layers.Dense(1, activation='linear',name='linear_output')(x)
    
    model = Model(input_text,output)
    
    model.compile(loss="mean_squared_error", optimizer=tf.keras.optimizers.Adam(learning_rate = learning_rate))
    return model

In [16]:
learning_rate = 1e-3
n_epochs = 250

model = BuildModel(learning_rate)
        
ckp_path = f'models/Model_05.hdf5'

cb_lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(monitor = 'loss', factor = 0.4, patience = 5, verbose = 0, min_delta = 0.001, mode = 'min')
        
es = tf.keras.callbacks.EarlyStopping(monitor='loss', mode='min', verbose=0, patience=10, restore_best_weights=True)
mc = tf.keras.callbacks.ModelCheckpoint(ckp_path, monitor='loss', mode='min', verbose=0, save_best_only=True, save_weights_only=True)
        
    #tb_cb = tf.keras.callbacks.TensorBoard(log_dir="logs")

# train the model
model.fit(padded_docs,target,epochs=n_epochs, batch_size=256,
          callbacks=[mc,cb_lr_schedule,es], shuffle=False,
          workers=-1, use_multiprocessing=True, verbose=2)
        


Epoch 1/250
161/161 - 2s - loss: 0.9882 - lr: 0.0010 - 2s/epoch - 14ms/step
Epoch 2/250
161/161 - 1s - loss: 0.7606 - lr: 0.0010 - 704ms/epoch - 4ms/step
Epoch 3/250
161/161 - 1s - loss: 0.6347 - lr: 0.0010 - 710ms/epoch - 4ms/step
Epoch 4/250
161/161 - 1s - loss: 0.5987 - lr: 0.0010 - 708ms/epoch - 4ms/step
Epoch 5/250
161/161 - 1s - loss: 0.5904 - lr: 0.0010 - 717ms/epoch - 4ms/step
Epoch 6/250
161/161 - 1s - loss: 0.5732 - lr: 0.0010 - 708ms/epoch - 4ms/step
Epoch 7/250
161/161 - 1s - loss: 0.5723 - lr: 0.0010 - 702ms/epoch - 4ms/step
Epoch 8/250
161/161 - 1s - loss: 0.5728 - lr: 0.0010 - 681ms/epoch - 4ms/step
Epoch 9/250
161/161 - 1s - loss: 0.5704 - lr: 0.0010 - 712ms/epoch - 4ms/step
Epoch 10/250
161/161 - 1s - loss: 0.5700 - lr: 0.0010 - 721ms/epoch - 4ms/step
Epoch 11/250
161/161 - 1s - loss: 0.5611 - lr: 0.0010 - 727ms/epoch - 5ms/step
Epoch 12/250
161/161 - 1s - loss: 0.5582 - lr: 0.0010 - 699ms/epoch - 4ms/step
Epoch 13/250
161/161 - 1s - loss: 0.5526 - lr: 0.0010 - 708ms/e

<keras.callbacks.History at 0x24aaebbca90>

In [17]:
# evaluate the model
loss = model.evaluate(padded_docs, target, verbose=2)
print(loss)

1286/1286 - 2s - loss: 0.5390 - 2s/epoch - 2ms/step
0.538978636264801


In [18]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text (InputLayer)           [(None, 6)]               0         
                                                                 
 embeddings (Embedding)      (None, 6, 20)             1236740   
                                                                 
 flatten (Flatten)           (None, 120)               0         
                                                                 
 act01 (Dense)               (None, 40)                4840      
                                                                 
 linear_output (Dense)       (None, 1)                 41        
                                                                 
Total params: 1,241,621
Trainable params: 1,241,621
Non-trainable params: 0
_________________________________________________________________


In [19]:
embedding_layer = model.get_layer('embeddings')

In [20]:
embeddings = embedding_layer.get_weights()[0]

In [21]:
np.savetxt("embeddings_model3.csv", embeddings, delimiter=",")

In [22]:
embeddings[t.word_index['60126'],:]

array([-0.02314374,  0.00356477, -0.02511154, -0.00574221, -0.02037801,
        0.00612416,  0.00661747,  0.01302327, -0.01203879,  0.03196121,
        0.06389735,  0.02450056,  0.05767334,  0.01309287,  0.05158229,
       -0.00494286, -0.07346611,  0.00055991, -0.03272279, -0.04898806],
      dtype=float32)

In [23]:
cosine_similarity(np.reshape(embeddings[t.word_index['60126'],:],(1,20)),np.reshape(embeddings[t.word_index['55929'],:],(1,20)))[0][0]

-0.4122358

In [24]:
1. - cdist(np.reshape(embeddings[t.word_index['60126'],:],(1,20)), np.reshape(embeddings[t.word_index['60163'],:],(1,20)), 'cosine')[0][0]

0.5723244716108641

In [25]:
states = pd.DataFrame(pd.unique(data.statecode),columns=['states'])

In [26]:
states_list = []

for index, row in states.iterrows():
    state = row[0].lower()
    states_list.append(pd.concat([pd.Series(state), pd.DataFrame(np.reshape(embeddings[t.word_index[state],:],(1,20)))   ],axis=1))   

In [27]:
states = pd.concat(states_list)
states = states.reset_index(drop=True)

In [28]:
states.columns = range(states.columns.size)

In [29]:
x = StandardScaler().fit_transform(states.iloc[:,1:-1].values)

In [32]:
#TSNE : Compressing the weights to 3 dimensions to plot the data
tsne_model = TSNE(perplexity=40, n_components=3, init='pca', n_iter=2500, random_state=seed)
new_values = tsne_model.fit_transform(x)



In [33]:
principalDf = pd.DataFrame(data = new_values
             , columns = ['component 1', 'component 2', 'component 3'])

In [34]:
#principalDf = pd.DataFrame(data = principalComponents
#             , columns = ['principal component 1', 'principal component 2'])

In [41]:
finalDf = pd.concat([principalDf, states.iloc[:,0]], axis = 1)
finalDf.columns = ['x', 'y', 'z', 'State']

In [None]:
word_to_index = t.word_index
index_to_word = dict()

for key in word_to_index:
    index_to_word.update({word_to_index[key] : key })

In [49]:
import plotly.express as px

fig = px.scatter_3d(finalDf, x='x', y='y', z='z',
              color='State')
fig.show()

In [52]:
postalcodes = pd.DataFrame(pd.unique(data.postal_code),columns=['postal_code'])

In [67]:
postalcodes_list = []

for index, row in postalcodes.iterrows():
    postalcode = row[0].lower()
    postalcodes_list.append(pd.concat([pd.Series(postalcode), pd.DataFrame(np.reshape(embeddings[t.word_index[postalcode],:],(1,20)))   ],axis=1))   

In [68]:
postalcodes = pd.concat(postalcodes_list)
postalcodes = postalcodes.reset_index(drop=True)

In [69]:
postalcodes.columns = range(postalcodes.columns.size)

In [70]:
x = StandardScaler().fit_transform(postalcodes.iloc[:,1:-1].values)

In [72]:
#TSNE : Compressing the weights to 3 dimensions to plot the data
tsne_model = TSNE(perplexity=40, n_components=3, init='pca', n_iter=2500, random_state=23)
new_values = tsne_model.fit_transform(x)


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.


The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence.



In [73]:
principalDf = pd.DataFrame(data = new_values
             , columns = ['component 1', 'component 2', 'component 3'])

In [74]:
principalDf

Unnamed: 0,component 1,component 2,component 3
0,-6.042058,37.164200,-45.383648
1,-19.057133,18.453873,13.318251
2,-6.696831,45.710239,1.665204
3,36.231270,36.668354,37.621300
4,-0.741387,50.336079,-21.107321
...,...,...,...
41131,11.463096,54.772797,-5.244538
41132,6.330010,-76.985588,-1.255011
41133,6.381685,-26.371647,30.920942
41134,31.961596,-15.484950,36.837898


In [86]:
finalDf = pd.concat([principalDf, postalcodes.iloc[:,0]], axis = 1)
finalDf.columns = ['x', 'y', 'z', 'postalcode']

In [100]:
finalDf = finalDf[finalDf['postalcode'].str.slice(start=0, stop=3) == '601']

In [101]:
import plotly.express as px

fig = px.scatter_3d(finalDf, x='x', y='y', z='z',
              color='postalcode')
fig.show()