In [1]:
import re
import zipfile
import numpy as np
import pandas as pd
from nltk.stem.snowball import PorterStemmer
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow import keras

In [2]:
def preprocess_text(sen):
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sen)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    
    stops = stopwords.words('english')
    #print(stops)
    porter = PorterStemmer()
    for word in sentence.split():
        if word in stops:
            sentence = sentence.replace(word, '')
        sentence = sentence.replace(word, porter.stem(word))
    sentence = " ".join(sentence.split())
    return sentence.lower()

In [3]:
def mapping(tokens):
    word_to_id = {}
    id_to_word = {}

    for i, token in enumerate(set(tokens)):
        word_to_id[token] = i
        id_to_word[i] = token

    return word_to_id, id_to_word

def Map(text):
    a = []
    for word in text.split(' '):
        a.append(word_to_id[word])
    return a

def prepare_training_data(text):
    return list(map(Map,text))

In [4]:
text = pd.read_csv('SPAM text message 20170820 - Data.csv')
text = list(text.Message)
text = [preprocess_text(text_) for text_ in text]

In [5]:
word_to_id, id_to_word = mapping(" ".join(text).split(" "))

In [6]:
train = prepare_training_data(text)

In [7]:
# prepare training data N X window_size, label as N X 1

def prepare_data(train,window_size=2):
    training_data = []
    training_label = []
    
    for data in train:
        if(len(data)<window_size):
            continue
        for i in range(window_size,len(data)-window_size-1):
            training_data.append(data[i])
            training_label.append(data[i-window_size:i] + data[i+1:i+window_size+1])
    return training_data,training_label

In [8]:
training_data,training_label = prepare_data(train)

In [9]:
def one_hot_vector(word_index,word_to_id):
    num = len(word_to_id)
    
    vector = np.zeros(shape=(num))
    
    pos = word_index
    
    vector[pos] =1
    return vector

In [10]:
# def initialize_parameter(word_to_id,latent_dim=5):
    
#     num = len(word_to_id)
#     latent_dim =latent_dim
    
#     parameter_0 = np.random.normal( size = (num,latent_dim))
#     parameter_1 = np.random.normal(size = (latent_dim,num))
    
#     return parameter_0,parameter_1

num,latent_dim = len(word_to_id) , 5

In [11]:
def create_batch(training_data,training_label,batch_size = 16):
    
    for i in range((len(training_label)//batch_size)-1):
        
        yield training_data[i*batch_size:(i+1)*batch_size], training_label[i*batch_size:(i+1)*batch_size]

In [12]:
def unit_vector(vector):
    """ Returns the unit vector of the vector.  """
    return vector / np.linalg.norm(vector)

def angle_between(v1, v2):
    
    v1_u = unit_vector(v1)
    v2_u = unit_vector(v2)
    return np.dot(v1_u, v2_u)

In [13]:
model = keras.models.Sequential()
model.add(keras.layers.Dense(latent_dim,input_shape = [num]))
model.add(keras.layers.Dense(num,activation='softmax'))

In [14]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 5)                 36155     
_________________________________________________________________
dense_1 (Dense)              (None, 7230)              43380     
Total params: 79,535
Trainable params: 79,535
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.compile(loss="categorical_crossentropy",
              optimizer=keras.optimizers.SGD(lr=0.01),
              metrics=["accuracy"])

In [16]:
## start forward network
NUM_EPOCH = 30
batch_size = 64
for i in range(NUM_EPOCH):
    
    gen = create_batch(training_data,training_label,batch_size)
    
    Loss = 0

    for iterat in range((len(training_label)//batch_size)-1):
        data, label = next(gen)
        trainX = []
        for data_ in label:
            data_ = np.sum([one_hot_vector(d,word_to_id) for d in data_],axis=0)
            trainX.append(data_)
            
        trainY = np.array([[one_hot_vector(y,word_to_id)] for y in label])
        trainX = np.array(trainX)

        trainY = np.squeeze(trainY,axis =1)
        loss = model.train_on_batch(trainX,trainY)
        Loss += loss[0]
    print("Epoch {} loss {}".format(i,np.round(Loss,2)))

W1208 15:36:48.745772 4552582592 deprecation.py:323] From /anaconda3/envs/generative/lib/python3.6/site-packages/tensorflow_core/python/ops/math_grad.py:1394: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 0 loss 15686.53
Epoch 1 loss 15642.09
Epoch 2 loss 15597.68
Epoch 3 loss 15552.65
Epoch 4 loss 15504.56
Epoch 5 loss 15444.15
Epoch 6 loss 15337.16
Epoch 7 loss 15073.32
Epoch 8 loss 14542.46
Epoch 9 loss 14018.74
Epoch 10 loss 13692.73
Epoch 11 loss 13513.98
Epoch 12 loss 13410.26
Epoch 13 loss 13344.11
Epoch 14 loss 13298.72
Epoch 15 loss 13265.36
Epoch 16 loss 13238.96
Epoch 17 loss 13216.33
Epoch 18 loss 13195.21
Epoch 19 loss 13173.88
Epoch 20 loss 13151.07
Epoch 21 loss 13126.17
Epoch 22 loss 13099.69
Epoch 23 loss 13073.22
Epoch 24 loss 13048.56
Epoch 25 loss 13026.84
Epoch 26 loss 13008.22
Epoch 27 loss 12992.02
Epoch 28 loss 12977.32
Epoch 29 loss 12963.44


In [17]:
W1 = np.array(model.get_weights()[0])
W2 = np.array(model.get_weights()[2].T)

In [20]:
WORD_EMBEDDING = (W1 + W2)/2

In [21]:
angle_between(WORD_EMBEDDING[word_to_id['walk']],WORD_EMBEDDING[word_to_id['run']])

0.95900124

In [22]:
# Very closely related 