In [None]:
%load_ext autoreload
%autoreload

## This expands a notebook to full width
from IPython.display import display, HTML

display(HTML("<style>.container { width:90% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))
## Show Python Version
import sys
print("Python: {0}".format(sys.version))

## Show Current Time
import datetime as dt
start = dt.datetime.now()
print("Notebook Last Run Initiated: "+str(start))

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import tensorflow_addons as tfa
import tensorflow as tf

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import load_model
from tensorflow.keras import layers

from sklearn.metrics.pairwise import cosine_similarity

from scipy.spatial.distance import cdist

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
%matplotlib inline

In [None]:
# --- Reproducibility ---------------------------------------------------------
seed = 23

# --- Optimisation ------------------------------------------------------------
learning_rate = 1e-3
weight_decay = 1e-4  # defined here but not referenced by any other cell shown
batch_size = 512
num_epochs = 50

# --- Token embedding ---------------------------------------------------------
embed_dim = 60
# Shape used when a single embedding row is handled as a (1, embed_dim) matrix.
embeddings_shape = (1, embed_dim)

# --- Transformer encoder -----------------------------------------------------
# The residual connections in create_classifier add the block output
# (width projection_dim) to the embedding stream (width embed_dim), so these
# two values have to stay equal.
projection_dim = 60
num_heads = 4
transformer_layers = 4
# Widths of the dense layers inside every transformer block.
transformer_units = [2 * projection_dim, projection_dim]

# --- Classification head -----------------------------------------------------
# Widths of the dense layers of the final classifier.
mlp_head_units = [1024, 512]

# --- Checkpoint location -----------------------------------------------------
ckp_path = 'models/Model_Embedding_transformers.hdf5'

In [None]:
# Seed the global random number generators with the notebook-wide `seed`.
# NOTE(review): the Keras documentation describes `set_random_seed` as also
# seeding NumPy and Python's `random`; if so the explicit NumPy call below is a
# harmless duplicate — confirm against the installed TensorFlow version.
tf.keras.utils.set_random_seed(seed)
np.random.seed(seed)

In [None]:
class PositionEmbedding(layers.Layer):
    """Add a learned position embedding to a sequence of token embeddings.

    Args:
        maxlen: Number of distinct positions the embedding table holds, i.e.
            the longest sequence the layer can be applied to.
        embed_dim: Width of each position vector; it has to match the last
            dimension of the tensor passed to `call` so the two can be added.
        **kwargs: Standard Keras layer arguments (e.g. `name`), forwarded to
            `layers.Layer`.
    """

    def __init__(self, maxlen, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Kept so that get_config() can describe how to rebuild the layer.
        self.maxlen = maxlen
        self.embed_dim = embed_dim
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Return `x` plus the embedding of each position along axis 1."""
        # Read the sequence length from the tensor itself.
        seq_len = tf.shape(x)[1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        # (seq_len, embed_dim) broadcasts over the leading batch axis of x.
        return positions + x

    def get_config(self):
        """Return the constructor arguments so Keras can serialise the layer."""
        config = super().get_config()
        config.update({"maxlen": self.maxlen, "embed_dim": self.embed_dim})
        return config

In [None]:
class PositionEmbeddingFixedWeights(layers.Layer):
    """Token + position embedding whose tables are fixed sinusoidal encodings.

    Both lookup tables are filled by `get_position_encoding` and frozen
    (`trainable=False`): the token table has one row per vocabulary id and the
    position table one row per sequence position.

    Args:
        sequence_length: Number of positions in the position table.
        vocab_size: Number of rows in the token table.
        output_dim: Width of every embedding vector.
        **kwargs: Standard Keras layer arguments, forwarded to `layers.Layer`.
    """

    def __init__(self, sequence_length, vocab_size, output_dim, **kwargs):
        super(PositionEmbeddingFixedWeights, self).__init__(**kwargs)
        # Kept so that get_config() can describe how to rebuild the layer.
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.output_dim = output_dim

        word_embedding_matrix = self.get_position_encoding(vocab_size, output_dim)
        position_embedding_matrix = self.get_position_encoding(sequence_length, output_dim)
        self.word_embedding_layer = Embedding(
            input_dim=vocab_size, output_dim=output_dim,
            weights=[word_embedding_matrix],
            trainable=False
        )
        self.position_embedding_layer = Embedding(
            input_dim=sequence_length, output_dim=output_dim,
            weights=[position_embedding_matrix],
            trainable=False
        )

    def get_position_encoding(self, seq_len, d, n=10000):
        """Build the sinusoidal encoding matrix.

        Args:
            seq_len: Number of rows (one per index k = 0 .. seq_len-1).
            d: Number of columns.
            n: Base of the geometric frequency progression.

        Returns:
            A `(seq_len, d)` float array with
            `P[k, 2i] = sin(k / n**(2i/d))` and `P[k, 2i+1] = cos(k / n**(2i/d))`.
            When `d` is odd the last column stays zero.
        """
        P = np.zeros((seq_len, d))
        pairs = int(d / 2)
        # The denominators depend only on the column pair, so compute them once
        # instead of once per (row, pair) inside a Python double loop.
        denominators = np.power(n, 2 * np.arange(pairs) / d)
        angles = np.arange(seq_len)[:, np.newaxis] / denominators
        P[:, 0:2 * pairs:2] = np.sin(angles)
        P[:, 1:2 * pairs:2] = np.cos(angles)
        return P

    def call(self, inputs):
        """Return token embeddings of `inputs` plus the position embeddings."""
        position_indices = tf.range(tf.shape(inputs)[1])
        embedded_words = self.word_embedding_layer(inputs)
        embedded_indices = self.position_embedding_layer(position_indices)
        return embedded_words + embedded_indices

    def get_config(self):
        """Return the constructor arguments so Keras can serialise the layer."""
        config = super().get_config()
        config.update({
            "sequence_length": self.sequence_length,
            "vocab_size": self.vocab_size,
            "output_dim": self.output_dim,
        })
        return config

In [None]:
def mlp(x, hidden_units, dropout_rate):
    """Stack GELU dense layers, each followed by dropout, on top of `x`.

    Args:
        x: Input tensor.
        hidden_units: Iterable with the width of each dense layer, in order.
        dropout_rate: Rate passed to the Dropout layer after every dense layer.

    Returns:
        The output tensor of the last Dropout layer (or `x` unchanged when
        `hidden_units` is empty).
    """
    for width in hidden_units:
        hidden = layers.Dense(width, activation=tf.nn.gelu)(x)
        x = layers.Dropout(dropout_rate)(hidden)
    return x

In [None]:
def create_classifier(learning_rate):
    """Build and compile the Transformer-encoder token classifier.

    The architecture is driven by notebook-level settings that must be defined
    before this function is called: `max_length`, `vocab_size`, `num_classes`,
    `embed_dim`, `projection_dim`, `num_heads`, `transformer_layers`,
    `transformer_units` and `mlp_head_units`.

    Args:
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        A compiled `Model` named 'Postal_Code_Embeddings' that maps
        `max_length - 1` integer token ids to a softmax over `num_classes`.
        The token embedding table is the layer named 'embeddings'.
    """
    # The network reads all tokens of a window except the last one.
    context_length = max_length - 1

    input_text = layers.Input(shape=(context_length,), dtype="int32", name='input_text')

    # `input_length` is set to the real input width; it was `max_length`, which
    # disagreed with the (max_length - 1,) input declared above.
    embeddings = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        input_length=context_length,
        output_dim=embed_dim,
        name='embeddings',
    )(input_text)

    embeddings = PositionEmbedding(maxlen=context_length, embed_dim=embed_dim)(embeddings)

    # Create multiple layers of the Transformer block.
    for _ in range(transformer_layers):
        # Layer normalization 1.
        x1 = layers.LayerNormalization(epsilon=1e-6)(embeddings)
        # Create a multi-head attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1)(x1, x1)
        # Skip connection 1.
        x2 = layers.Add()([attention_output, embeddings])
        # Layer normalization 2.
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        # MLP. Its last layer is transformer_units[-1] wide, which has to equal
        # the width of x2 for the residual addition below.
        x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1)
        # Skip connection 2.
        embeddings = layers.Add()([x3, x2])

    # Pool over the sequence axis to get one vector per example.
    representation = layers.LayerNormalization(epsilon=1e-6)(embeddings)
    representation = layers.GlobalAveragePooling1D()(representation)
    representation = layers.Dropout(0.5)(representation)

    # Add MLP.
    features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.5)

    # Classify outputs.
    outputs = layers.Dense(num_classes, activation='softmax', name='activation')(features)

    # Create the Keras model.
    model = Model(inputs=input_text, outputs=outputs, name='Postal_Code_Embeddings')
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss="sparse_categorical_crossentropy",
    )

    return model

In [None]:
# Input word, returns its cosine similarity to every word in the vocabulary
def Cosine_Similarity(word,weight,word_to_index,vocab_size,index_to_word):
    """Cosine similarity between `word` and each of the first `vocab_size` rows.

    Args:
        word: Token whose embedding is used as the query.
        weight: 2-D array-like of embeddings, one row per token id.
        word_to_index: Mapping token -> row index in `weight`.
        vocab_size: Number of rows (ids 0 .. vocab_size-1) to compare against.
        index_to_word: Mapping row index -> token; must contain every id in
            `range(vocab_size)`.

    Returns:
        dict mapping each token to its cosine similarity with `word`, in id
        order. A row with zero norm yields `nan`, as division by zero does in
        NumPy.

    Raises:
        KeyError: if `word` or one of the ids is missing from the mappings.
        IndexError: if `vocab_size` exceeds the number of rows in `weight`.
    """
    matrix = np.asarray(weight)
    query = matrix[word_to_index[word]]
    candidates = matrix[:vocab_size]

    # One matrix-vector product and one norm pass replace the per-row Python
    # loop, and the query norm is computed a single time.
    dot_products = candidates @ query
    norm_products = np.linalg.norm(candidates, axis=1) * np.linalg.norm(query)
    similarities = dot_products / norm_products

    # A dedicated loop variable keeps the `word` argument from being overwritten.
    return {index_to_word[j]: similarities[j] for j in range(vocab_size)}

In [None]:
# define documents
# Forward slashes are accepted on every OS. The previous literal used a
# backslash, which only resolves on Windows and relied on '\z' being an
# unrecognised (deprecated) escape sequence.
data = pd.read_csv('data/zipcodedata.csv')
# Select the column directly: DataFrame.squeeze() would collapse a one-row
# frame to a bare scalar instead of a Series of documents.
docs = data['data']

In [None]:
# create the tokenizer
t = Tokenizer(filters=' ')
# fit the tokenizer on the documents
t.fit_on_texts(docs)

In [None]:
# integer encode the documents
vocab_size = len(t.word_index) + 1
num_classes = vocab_size

In [None]:
encoded_docs = t.texts_to_sequences(docs)

In [None]:
# pad documents to a max length of 3 words, windows size = 3
max_length = 3
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [None]:
train = pd.DataFrame(padded_docs[:,[0,1]])
label = padded_docs[:,[2]]

In [None]:
# Build a fresh, compiled network.
model = create_classifier(learning_rate)

# All three callbacks monitor the training loss (fit receives no validation data).
# Multiply the learning rate by 0.4 after 2 epochs without a 0.001 improvement.
lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor = 'loss',
    factor = 0.4,
    patience = 2,
    verbose = 0,
    min_delta = 0.001,
    mode = 'min',
)
# Stop after 5 epochs without improvement and roll back to the best weights.
es = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    mode='min',
    verbose=0,
    patience=5,
    restore_best_weights=True,
)
# Write the weights (only) of the best epoch so far to ckp_path.
mc = tf.keras.callbacks.ModelCheckpoint(
    ckp_path,
    monitor='loss',
    mode='min',
    verbose=0,
    save_best_only=True,
    save_weights_only=True,
)

# train the model
history = model.fit(
    x=train,
    y=label,
    batch_size=batch_size,
    epochs=num_epochs,
    callbacks=[mc,lr,es],
    shuffle=True,
)


In [None]:
# Rebuild the architecture; the checkpoint stores weights only, so a matching
# model has to exist before they can be loaded.
model = create_classifier(learning_rate)

In [None]:
# Restore the weights written by the ModelCheckpoint callback.
model.load_weights(ckp_path)

In [None]:
# 'embeddings' is the name create_classifier gives to the token Embedding layer.
embedding_layer = model.get_layer('embeddings')

In [None]:
# First weight array of the layer: the lookup table, one row per token id
# (vocab_size rows, embed_dim columns as configured in create_classifier).
embeddings = embedding_layer.get_weights()[0]

In [None]:
# Persist the table as comma-separated text, one token id per line.
np.savetxt("embeddings_model_w2v.csv", embeddings, delimiter=",")

In [None]:
# Work on a copy: `t.word_index` is the tokenizer's own dictionary, so adding
# 'unk' through an alias would change the tokenizer itself (and with it any
# later `len(t.word_index)`).
word_to_index = dict(t.word_index)
# Inverse lookup, built before 'unk' is added.
index_to_word = {index: word for word, index in word_to_index.items()}

# Id 0 is not handed out by the tokenizer; expose it under the name 'unk'.
word_to_index['unk'] = 0
index_to_word[0] = 'unk'

In [None]:
embeddings[t.word_index['illinois']]

In [None]:
cosine_similarity(embeddings[t.word_index['illinois']].reshape(1,60),embeddings[t.word_index['oregon']].reshape(1,60))[0][0]

In [None]:
cosine_similarity(embeddings[t.word_index['60126']].reshape(1,60),embeddings[t.word_index['60181']].reshape(1,60))[0][0]

In [None]:
il_60126 = np.reshape(embeddings[t.word_index['illinois']],embeddings_shape) + np.reshape(embeddings[t.word_index['60126']],embeddings_shape)
il_60181 = np.reshape(embeddings[t.word_index['illinois']],embeddings_shape) + np.reshape(embeddings[t.word_index['60181']],embeddings_shape)
or_97035 = np.reshape(embeddings[t.word_index['oregon']],embeddings_shape) + np.reshape(embeddings[t.word_index['97035']],embeddings_shape)

In [None]:
cosine_similarity(il_60126,il_60181)[0][0]

In [None]:
cosine_similarity(il_60126,or_97035)[0][0]

In [None]:
il_60126 += np.reshape(embeddings[t.word_index['dupage'],:],embeddings_shape)
il_60181 += np.reshape(embeddings[t.word_index['dupage'],:],embeddings_shape)
or_97035 += np.reshape(embeddings[t.word_index['clackamas'],:],embeddings_shape)

In [None]:
cosine_similarity(il_60126,il_60181)[0][0]

In [None]:
cosine_similarity(il_60126,or_97035)[0][0]

In [None]:
# Load the postal-code reference table. Code-like columns are read as strings
# so they are not parsed as numbers.
# Forward slashes are accepted on every OS; the previous backslash literal only
# resolved on Windows and relied on '\Z' being an unrecognised (deprecated)
# escape sequence.
data = pd.read_csv('data/ZIP2LATLON_VER1.csv',dtype={
                   'postal_code': str,
                   'country_code': str,
                   'place': str,
                   'state': str,
                   'statecode': str,
                   'province_or_county': str,
                   'province_or_countycode': str,
                   'latitude': float,
                   'longitude': float})

# Left-pad postal codes with zeros to a width of 5 characters.
data['postal_code'] = data['postal_code'].str.zfill(5)

In [None]:
states = pd.DataFrame(pd.unique(data.statecode),columns=['states'])

In [None]:
states_list = []

for index, row in states.iterrows():
    state = row[0].lower()
    states_list.append(pd.concat([pd.Series(state), pd.DataFrame(np.reshape(embeddings[t.word_index[state],:],embeddings_shape))   ],axis=1))   

In [None]:
states = pd.concat(states_list)
states = states.reset_index(drop=True)

In [None]:
states.columns = range(states.columns.size)

In [None]:
x = StandardScaler().fit_transform(states.iloc[:,1:-1].values)

In [None]:
#TSNE : Compressing the weights to 3 dimensions to plot the data
tsne_model = TSNE(perplexity=40, n_components=3, init='pca', n_iter=2500, random_state=seed)
new_values = tsne_model.fit_transform(x)

In [None]:
principalDf = pd.DataFrame(data = new_values, columns = ['component 1', 'component 2', 'component 3'])

In [None]:
finalDf = pd.concat([principalDf, states.iloc[:,0]], axis = 1)
finalDf.columns = ['x', 'y', 'z', 'State']

In [None]:
import plotly.express as px

fig = px.scatter_3d(finalDf, x='x', y='y', z='z',color='State')
fig.show()

In [None]:
postalcodes = pd.DataFrame(pd.unique(data.postal_code),columns=['postal_code'])

In [None]:
postalcodes_list = []

for index, row in postalcodes.iterrows():
    postalcode = row[0].lower()
    postalcodes_list.append(pd.concat([pd.Series(postalcode), pd.DataFrame(np.reshape(embeddings[t.word_index[postalcode],:],(1,embed_dim)))   ],axis=1))   

In [None]:
postalcodes = pd.concat(postalcodes_list)
postalcodes = postalcodes.reset_index(drop=True)

In [None]:
postalcodes.columns = range(postalcodes.columns.size)

In [None]:
x = StandardScaler().fit_transform(postalcodes.iloc[:,1:-1].values)

In [None]:
#TSNE : Compressing the weights to 3 dimensions to plot the data
tsne_model = TSNE(perplexity=40, n_components=3, init='pca', n_iter=2500, random_state=23)
new_values = tsne_model.fit_transform(x)

In [None]:
principalDf = pd.DataFrame(data = new_values, columns = ['component 1', 'component 2', 'component 3'])

In [None]:
finalDf = pd.concat([principalDf, postalcodes.iloc[:,0]], axis = 1)
finalDf.columns = ['x', 'y', 'z', 'postalcode']

In [None]:
finalDf = finalDf[finalDf['postalcode'].str.slice(start=0, stop=3) == '601']

In [None]:
import plotly.express as px

fig = px.scatter_3d(finalDf, x='x', y='y', z='z',color='postalcode')
fig.show()