In [21]:
# Load IPython's autoreload extension. A bare `%autoreload` (no mode argument)
# reloads imported modules once, immediately, rather than before every cell.
%load_ext autoreload
%autoreload

## This expands a notebook to full width
from IPython.display import display, HTML

# Inject CSS: widen the notebook container to 90% and let tall outputs scroll.
display(HTML("<style>.container { width:90% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))
## Show Python Version
import sys
print("Python: {0}".format(sys.version))

## Show Current Time
import datetime as dt
# `start` records when this run began; it is printed below as a run timestamp.
start = dt.datetime.now()
print("Notebook Last Run Initiated: "+str(start))

Python: 3.8.8 (default, Feb 24 2021, 15:54:32) [MSC v.1928 64 bit (AMD64)]
Notebook Last Run Initiated: 2022-06-16 06:17:35.299261


In [1]:
import warnings
# NOTE(review): this hides every warning for the rest of the session,
# including deprecation warnings raised by the libraries imported below.
warnings.filterwarnings("ignore")

# Single seed shared by NumPy, TensorFlow and the t-SNE projection further down.
seed = 23

import numpy as np
np.random.seed(seed)

from tensorflow.compat.v1 import set_random_seed 
set_random_seed(seed)

import pandas as pd

import tensorflow as tf

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import load_model

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold
from sklearn.metrics.pairwise import cosine_similarity

from scipy.spatial.distance import cdist

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
%matplotlib inline

In [2]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that yields ``(X, y)`` batches for next-token training.

    Each batch contains ``batch_size`` rows of ``df_X`` as a 2-D array and the
    matching labels from ``seq_Y`` one-hot encoded to ``vocab_size`` classes.
    Rows are always addressed by *position*, so ``df_X`` may carry any index
    (the previous implementation only worked with a default ``RangeIndex``).

    Parameters
    ----------
    df_X : pandas.DataFrame
        Input features, one row per sample.
    seq_Y : array-like
        Exactly one integer class label per row of ``df_X``, in the same
        positional order. Shapes ``(n,)`` and ``(n, 1)`` are both accepted.
    batch_size : int, default 32
        Number of samples per batch. Trailing rows that do not fill a whole
        batch are not served (see ``__len__``).
    vocab_size : int or None
        Number of classes passed to ``to_categorical`` as ``num_classes``.
    shuffle : bool, default False
        When true, the row order is reshuffled in ``on_epoch_end``.
    """

    def __init__(self, df_X, seq_Y, batch_size=32, vocab_size=None, shuffle=False):
        # Harmless for Sequence bases without their own __init__; needed by
        # bases that define one.
        super().__init__()
        self.batch_size = batch_size
        self.df_X = df_X
        self.seq_Y = seq_Y
        # Index labels are kept for n()/len(); they are no longer used to
        # look rows up.
        self.indices = self.df_X.index.tolist()
        self.vocab_size = vocab_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Return the number of *full* batches per epoch."""
        return len(self.indices) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as an ``(X, y)`` tuple."""
        first = index * self.batch_size
        positions = self.index[first:first + self.batch_size]
        return self.__get_data(positions)

    def n(self):
        """Return the total number of samples."""
        return len(self.indices)

    def on_epoch_end(self):
        """Reset the row order, shuffling it in place when ``shuffle`` is set."""
        self.index = np.arange(len(self.indices))
        if self.shuffle:
            np.random.shuffle(self.index)

    def __get_data(self, batch):
        """Materialise the rows at the positions in ``batch``.

        Returns ``(X, y)`` where ``X`` has one row per position and ``y`` is
        the one-hot label matrix of shape ``(len(batch), num_classes)``.

        Raises
        ------
        ValueError
            If ``seq_Y`` does not provide exactly one label per selected row.
        """
        rows = np.asarray(batch, dtype=int)

        X = self.df_X.iloc[rows].values

        targets = np.asarray(self.seq_Y)[rows].reshape(-1)
        if targets.size != rows.size:
            raise ValueError("seq_Y must hold exactly one label per row of df_X")

        # Sizing y from the labels (not from batch_size) keeps a short batch valid.
        y = to_categorical(targets, num_classes=self.vocab_size)
        return X, y

In [3]:
# Cosine similarity of one word's vector against the first `vocab_size` rows of a weight matrix
def Cosine_Similarity(word,weight,word_to_index,vocab_size,index_to_word):
    """Score vocabulary entries against ``word`` by cosine similarity.

    Parameters
    ----------
    word : key of ``word_to_index`` identifying the query vector.
    weight : indexable collection of vectors; ``weight[i]`` is the vector of index ``i``.
    word_to_index : mapping from word to its row in ``weight``.
    vocab_size : number of rows (``0 .. vocab_size-1``) to compare against.
    index_to_word : mapping from row number back to the word; must contain
        every index in ``range(vocab_size)``.

    Returns
    -------
    dict mapping each compared word to its cosine similarity with ``word``,
    in row order (unsorted).
    """
    # Vector of the query word; its norm does not change inside the loop.
    query_vec = weight[word_to_index[word]]
    query_norm = np.linalg.norm(query_vec)

    scores = {}
    for row in range(vocab_size):
        candidate = weight[row]
        cosine = np.dot(query_vec, candidate) / (query_norm * np.linalg.norm(candidate))
        scores[index_to_word[row]] = cosine

    return scores

In [4]:
def BuildModel(learning_rate):
    """Build and compile the next-token model whose embedding layer is studied.

    Architecture: integer token input -> 20-dimensional ``embeddings`` layer ->
    flatten -> 400-unit ReLU dense layer -> softmax over the vocabulary.

    The function reads two notebook globals, which must be defined first:
    ``vocab_size`` (embedding rows and softmax width) and ``max_length``
    (window length; the model consumes ``max_length - 1`` context tokens, the
    remaining token being the label).

    Parameters
    ----------
    learning_rate : float
        Learning rate for the Adam optimizer.

    Returns
    -------
    A compiled Keras ``Model`` using categorical cross-entropy loss.
    """
    # Tokens fed to the network per sample (the window minus the label token).
    context_length = max_length - 1

    input_text  = tf.keras.layers.Input(shape=(context_length,),dtype="int32",name='text')

    # input_length must describe the sequences the layer actually receives,
    # i.e. the context tokens, not the full padded window.
    x = tf.keras.layers.Embedding(input_dim=vocab_size, input_length=context_length, output_dim=20,name='embeddings')(input_text)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(400, activation='relu',name='act01')(x)

    output = tf.keras.layers.Dense(vocab_size, activation='softmax', name='act02')(x)

    model = Model(input_text,output)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = learning_rate),loss="categorical_crossentropy")
    return model

In [5]:
# Load the source documents. A forward slash is used as the separator: it
# resolves on Windows, Linux and macOS, and avoids the invalid "\z" escape
# sequence that a backslash would put in this string literal.
data = pd.read_csv('data/zipcodedata.csv')

In [6]:
# Collapse the loaded frame to a Series when it has a single column.
# NOTE(review): later cells read data.statecode and data.postal_code; if `data`
# has several columns, squeeze() returns the frame unchanged and the tokenizer
# below would iterate column names instead of rows — confirm the CSV layout.
docs = data.squeeze()

In [7]:
# create the tokenizer
# (`t` is reused by later cells for `t.word_index` lookups)
t = Tokenizer()
# fit the tokenizer on the documents
t.fit_on_texts(docs)

In [8]:
# Vocabulary size used for the embedding and softmax layers: one slot per known
# word plus one extra — presumably because Tokenizer indices start at 1 and
# index 0 is left for padding (confirm against the Keras Tokenizer docs).
vocab_size = len(t.word_index) + 1

In [9]:
# integer encode the documents using the fitted tokenizer's word index
encoded_docs = t.texts_to_sequences(docs)

In [10]:
# Force every document to a window of exactly 3 token ids, padding at the end
# (padding='post'). The first two ids become the model input and the third the
# label in the next cell.
max_length = 3
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [11]:
# Columns 0-1 of each padded window are the input tokens; column 2 is the target.
# Indexing with the list [2] keeps `label` two-dimensional, shape (n_rows, 1).
train = pd.DataFrame(padded_docs[:,[0,1]])
label = padded_docs[:,[2]]

In [13]:
# Wrap the training frame in the batch generator; shuffle=True makes the
# generator reshuffle its row order in on_epoch_end().
batch_size=256
train_generator = DataGenerator(df_X=train, seq_Y=label, batch_size=batch_size, vocab_size=vocab_size, shuffle=True)
# Number of full batches per epoch (integer division drops any remainder rows).
STEP_SIZE_TRAIN=train_generator.n()//train_generator.batch_size

In [14]:
# Training hyper-parameters.
learning_rate = 1e-3
n_epochs = 512

model = BuildModel(learning_rate)

# The best weights (lowest training loss) are written to this file during training.
ckp_path = 'models/Model_w2v.hdf5'

# All three callbacks monitor the training loss; no validation data is passed to fit().
cb_lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='loss',
    mode='min',
    factor=0.4,
    patience=2,
    min_delta=0.001,
    verbose=0,
)

es = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    mode='min',
    patience=4,
    restore_best_weights=True,
    verbose=0,
)
mc = tf.keras.callbacks.ModelCheckpoint(
    ckp_path,
    monitor='loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
    verbose=0,
)

# tb_cb = tf.keras.callbacks.TensorBoard(log_dir="logs")

# train the model
history = model.fit(
    x=train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    epochs=n_epochs,
    callbacks=[mc, cb_lr_schedule, es],
    shuffle=True,
)


Epoch 1/512
Epoch 2/512
Epoch 3/512
Epoch 4/512
Epoch 5/512
Epoch 6/512
Epoch 7/512
Epoch 8/512
Epoch 9/512
Epoch 10/512
Epoch 11/512
Epoch 12/512
Epoch 13/512
Epoch 14/512
Epoch 15/512
Epoch 16/512
Epoch 17/512
Epoch 18/512
Epoch 19/512
Epoch 20/512
Epoch 21/512
Epoch 22/512
Epoch 23/512
Epoch 24/512
Epoch 25/512
Epoch 26/512
Epoch 27/512
Epoch 28/512
Epoch 29/512
Epoch 30/512
Epoch 31/512
Epoch 32/512
Epoch 33/512
Epoch 34/512
Epoch 35/512
Epoch 36/512
Epoch 37/512
Epoch 38/512
Epoch 39/512
Epoch 40/512
Epoch 41/512
Epoch 42/512
Epoch 43/512
Epoch 44/512
Epoch 45/512
Epoch 46/512
Epoch 47/512
Epoch 48/512
Epoch 49/512
Epoch 50/512
Epoch 51/512
Epoch 52/512
Epoch 53/512
Epoch 54/512
Epoch 55/512
Epoch 56/512
Epoch 57/512
Epoch 58/512
Epoch 59/512
Epoch 60/512
Epoch 61/512
Epoch 62/512
Epoch 63/512
Epoch 64/512
Epoch 65/512
Epoch 66/512
Epoch 67/512
Epoch 68/512
Epoch 69/512
Epoch 70/512
Epoch 71/512
Epoch 72/512
Epoch 73/512
Epoch 74/512
Epoch 75/512
Epoch 76/512
Epoch 77/512
Epoch 78

In [13]:
# Rebuild an untrained model with the same architecture; the trained weights
# are loaded from the checkpoint file in a later cell.
learning_rate = 1e-3
model = BuildModel(learning_rate)

In [14]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text (InputLayer)           [(None, 2)]               0         
                                                                 
 embeddings (Embedding)      (None, 2, 20)             1129360   
                                                                 
 flatten (Flatten)           (None, 40)                0         
                                                                 
 act01 (Dense)               (None, 400)               16400     
                                                                 
 act02 (Dense)               (None, 56468)             22643668  
                                                                 
Total params: 23,789,428
Trainable params: 23,789,428
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Restore weights from the same file the ModelCheckpoint callback writes to
# during training.
model.load_weights('models/Model_w2v.hdf5')

In [16]:
# evaluate the model
# NOTE(review): this scores the first 100 *training* rows, so it is a sanity
# check of the restored weights rather than a held-out metric.
loss = model.evaluate(train.iloc[0:100,:], to_categorical(label[0:100], num_classes=vocab_size), verbose=2)
print(loss)

4/4 - 2s - loss: 1.9769 - 2s/epoch - 453ms/step
1.976943850517273


In [17]:
# Fetch the embedding layer by the name it is given in BuildModel.
embedding_layer = model.get_layer('embeddings')

In [18]:
# First weight array of the layer: the learned embedding matrix, one row per
# vocabulary index and one column per embedding dimension.
embeddings = embedding_layer.get_weights()[0]

In [19]:
# Persist the embedding matrix as comma-separated text, one row per vocabulary index.
np.savetxt("embeddings_model_w2v.csv", embeddings, delimiter=",")

In [20]:
# Display the learned vector for the token '60126'.
embeddings[t.word_index['60126'],:]

array([ 0.26666126,  0.303636  ,  1.0568444 , -0.03580691,  0.49443388,
       -1.1844802 , -0.8280229 , -0.24679595,  1.3320878 , -1.2740679 ,
        0.04827994,  1.3138323 , -0.6633513 ,  0.8621474 ,  0.25619456,
        0.4419295 ,  0.06416243,  0.18444656, -0.3404408 , -0.23613566],
      dtype=float32)

In [42]:
# Cosine similarity between the 'villa' and 'elmhurst' embeddings.
# reshape(1, -1) turns each vector into a single-row matrix without
# hard-coding the embedding width.
cosine_similarity(embeddings[t.word_index['villa']].reshape(1, -1), embeddings[t.word_index['elmhurst']].reshape(1, -1))[0][0]

-0.056522503

In [36]:
# Cosine similarity (1 - cosine distance) between the '60126' and '65402'
# embeddings. reshape(1, -1) avoids hard-coding the embedding width.
1. - cdist(embeddings[t.word_index['60126']].reshape(1, -1), embeddings[t.word_index['65402']].reshape(1, -1), 'cosine')[0][0]

-0.19200744551648508

In [None]:
# One-column frame holding the distinct values of data.statecode.
states = pd.DataFrame(pd.unique(data.statecode),columns=['states'])

In [None]:
# Build one single-row frame per state: [state token, embedding values...].
states_list = []

# The first column is read by position (.iloc) rather than with an integer key,
# which pandas treats as a label lookup with a deprecated positional fallback.
for state_code in states.iloc[:, 0]:
    # Lower-cased before the lookup, presumably because the tokenizer stores
    # its vocabulary keys in lower case.
    state = str(state_code).lower()
    # reshape(1, -1): one row, as many columns as the embedding has dimensions.
    vector = embeddings[t.word_index[state]].reshape(1, -1)
    states_list.append(pd.concat([pd.Series(state), pd.DataFrame(vector)], axis=1))

In [None]:
# Stack the per-state rows into one frame and renumber the rows 0..n-1.
states = pd.concat(states_list).reset_index(drop=True)

In [None]:
# Relabel the columns 0..k-1 so that column 0 is the state token and the
# remaining columns are the embedding values (the concat above presumably
# leaves duplicate column labels — confirm).
states.columns = range(states.columns.size)

In [None]:
# Standardise every embedding dimension before t-SNE. Column 0 holds the state
# token, so the features are columns 1 through the end; the slice must be `1:`
# (a `1:-1` slice would silently drop the last embedding dimension).
x = StandardScaler().fit_transform(states.iloc[:, 1:].values)

In [None]:
#TSNE : Compressing the weights to 3 dimensions to plot the data
# NOTE(review): scikit-learn expects perplexity to be smaller than the number
# of samples, so this needs more than 40 distinct states — confirm.
tsne_model = TSNE(perplexity=40, n_components=3, init='pca', n_iter=2500, random_state=seed)
new_values = tsne_model.fit_transform(x)

In [None]:
# Wrap the t-SNE coordinates in a frame with one named column per output dimension.
principalDf = pd.DataFrame(
    new_values,
    columns=['component 1', 'component 2', 'component 3'],
)

In [None]:
#principalDf = pd.DataFrame(data = principalComponents
#             , columns = ['principal component 1', 'principal component 2'])

In [None]:
# Attach the state token (column 0 of `states`) to its 3-D coordinates and give
# the columns the short names used by the plotting cell.
finalDf = pd.concat([principalDf, states.iloc[:,0]], axis = 1)
finalDf.columns = ['x', 'y', 'z', 'State']

In [None]:
# Forward (word -> index) lookup from the tokenizer, and its inverse
# (index -> word) built by swapping every key/value pair.
word_to_index = t.word_index
index_to_word = {position: token for token, position in word_to_index.items()}

In [None]:
# NOTE(review): this import sits mid-notebook rather than in the import cell at the top.
import plotly.express as px

# Interactive 3-D scatter of the t-SNE coordinates, coloured by the 'State' column.
fig = px.scatter_3d(finalDf, x='x', y='y', z='z',
              color='State')
fig.show()

In [None]:
# One-column frame holding the distinct values of data.postal_code.
postalcodes = pd.DataFrame(pd.unique(data.postal_code),columns=['postal_code'])

In [None]:
# Build one single-row frame per postal code: [postal-code token, embedding values...].
postalcodes_list = []

# The first column is read by position (.iloc) rather than with an integer key,
# which pandas treats as a label lookup with a deprecated positional fallback.
for raw_code in postalcodes.iloc[:, 0]:
    # str() keeps the lookup working if the codes were parsed as integers;
    # for values that are already strings it is a no-op.
    postalcode = str(raw_code).lower()
    # reshape(1, -1): one row, as many columns as the embedding has dimensions.
    vector = embeddings[t.word_index[postalcode]].reshape(1, -1)
    postalcodes_list.append(pd.concat([pd.Series(postalcode), pd.DataFrame(vector)], axis=1))

In [None]:
# Stack the per-postal-code rows into one frame and renumber the rows 0..n-1.
postalcodes = pd.concat(postalcodes_list).reset_index(drop=True)

In [None]:
# Relabel the columns 0..k-1 so that column 0 is the postal-code token and the
# remaining columns are the embedding values.
postalcodes.columns = range(postalcodes.columns.size)

In [None]:
# Standardise every embedding dimension before t-SNE. Column 0 holds the
# postal-code token, so the features are columns 1 through the end; the slice
# must be `1:` (a `1:-1` slice would silently drop the last embedding dimension).
x = StandardScaler().fit_transform(postalcodes.iloc[:, 1:].values)

In [None]:
#TSNE : Compressing the weights to 3 dimensions to plot the data
# random_state uses the notebook-wide `seed` (same value as the literal it
# replaces) so the projection stays in step with the other seeded components.
tsne_model = TSNE(perplexity=40, n_components=3, init='pca', n_iter=2500, random_state=seed)
new_values = tsne_model.fit_transform(x)

In [None]:
# Wrap the t-SNE coordinates in a frame with one named column per output dimension.
principalDf = pd.DataFrame(
    new_values,
    columns=['component 1', 'component 2', 'component 3'],
)

In [None]:
# Display the t-SNE coordinates for a visual check.
principalDf

In [None]:
# Attach the postal-code token (column 0 of `postalcodes`) to its 3-D
# coordinates and give the columns the short names used by the plotting cell.
finalDf = pd.concat([principalDf, postalcodes.iloc[:,0]], axis = 1)
finalDf.columns = ['x', 'y', 'z', 'postalcode']

In [None]:
# Keep only the rows whose postal code begins with the three characters '601'.
finalDf = finalDf.loc[finalDf['postalcode'].str[:3] == '601']

In [None]:
# NOTE(review): repeated mid-notebook import; plotly.express is already imported by an earlier plotting cell.
import plotly.express as px

# Interactive 3-D scatter of the filtered t-SNE coordinates, coloured by postal code.
fig = px.scatter_3d(finalDf, x='x', y='y', z='z',
              color='postalcode')
fig.show()

In [None]:
# Scratch check: one-hot encode the first 32 labels in a single call.
# Presumably yields an array of shape (32, vocab_size) — confirm.
output_seq = to_categorical([label[0:32]], num_classes=vocab_size)[0] 

In [None]:
# Display the one-hot array built in the previous cell.
output_seq