In [1]:
import numpy as np
import pandas as pd
import gensim
import nltk

from sklearn.feature_extraction.text import CountVectorizer

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

# GloVe Embeddings

In [2]:
def read_glove_vecs(glove_file):
    """Load word vectors from a whitespace-separated text file.

    Every non-blank line is split on whitespace: the first field is the token
    and the remaining fields are parsed as the ``np.float64`` vector. Blank
    lines (for example a trailing empty line) are skipped rather than raising
    ``IndexError``.

    Parameters
    ----------
    glove_file : str or path-like
        Path of the embeddings file; it is opened as UTF-8 text.

    Returns
    -------
    words_to_index : dict
        Token -> index, numbered from 1 in sorted token order.
    index_to_words : dict
        Inverse of ``words_to_index``.
    word_to_vec_map : dict
        Token -> ``np.float64`` vector. If a token occurs more than once the
        last occurrence wins.
    """
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding="utf8") as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Nothing to parse on an empty / whitespace-only line.
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    # The file is fully consumed at this point, so the index tables are built
    # after the handle has been released.
    words_to_index = {}
    index_to_words = {}
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map

In [3]:
# Read the GloVe text file once and keep the three lookup tables it yields.
word_to_index, index_to_word, word2vec = read_glove_vecs(
    glove_file='data/glove.6B.200d.txt'
)

In [4]:
def get_embeddings(X, embedding, avg = False):
    """Look up the vector of every token in a 2-D token array.

    Parameters
    ----------
    X : array-like with ``.shape`` and ``X[i, j]`` indexing
        Two-dimensional grid of tokens (rows x positions).
    embedding : mapping
        Token -> vector. A token missing from the mapping raises ``KeyError``.
    avg : bool, default False
        When true, average the vectors of each row over the position axis.

    Returns
    -------
    numpy.ndarray
        Shape ``(rows, positions, dim)``, or ``(rows, dim)`` when ``avg`` is
        true.
    """
    n_rows, n_cols = X.shape[0], X.shape[1]

    # Take the vector width from a token that is actually being looked up, so
    # the mapping does not have to contain the literal key 'word'. The old
    # 'word' probe is kept only for an empty X, where there is no token to use.
    probe = X[0, 0] if n_rows and n_cols else 'word'
    dim = len(embedding[probe])

    ans = np.zeros((n_rows, n_cols, dim))
    for i in range(n_rows):
        for j in range(n_cols):
            ans[i, j] = embedding[X[i, j]]

    if avg:
        ans = np.average(ans, axis=1)

    return ans

# Preprocess Dataset

In [7]:
# NLTK's English stop-word list plus a few extra filler words.
stopwords = nltk.corpus.stopwords.words('english') + ['shall', 'also', 'upon', 'upto']

In [19]:
# Tokenise the corpus one line at a time.
#
# The file is iterated directly inside a `with` block, so the handle is always
# closed and every line is visited. The earlier version counted only non-empty
# lines and then called readline() that many times, which silently dropped the
# tail of the file whenever it contained blank lines.
num_lines = 0  # number of non-empty lines, kept for later inspection
data = []
with open('data/Embedding_Data.txt', 'r', encoding='utf8') as f:
    for raw_line in f:
        if raw_line.rstrip("\n"):
            num_lines += 1

        # Keep lower-cased alphabetic tokens that are not stop words.
        words = nltk.word_tokenize(raw_line)
        words = [word.lower() for word in words if word.isalpha() and word.lower() not in stopwords]

        # Only sentences with more than three remaining tokens are kept.
        if len(words) > 3:
            data.append(words)
        
# data

In [22]:
# Number of token lists that were appended to `data`.
len(data)

623

In [23]:
def create_dataset(data, n, add_reverse = True):
    """Build (context, target) arrays from sliding windows of ``n`` tokens.

    For every token sequence in ``data`` each window of ``n`` consecutive
    tokens becomes one sample: the first ``n - 1`` tokens are the context and
    the last token is the target. With ``add_reverse`` the reversed window is
    added as an extra sample, directly after the forward windows of the same
    sequence.

    Parameters
    ----------
    data : iterable of token sequences
    n : int
        Window length; must be at least 1.
    add_reverse : bool, default True
        Also emit every window in reversed token order.

    Returns
    -------
    X : numpy.ndarray, shape ``(samples, n - 1)``
    Y : numpy.ndarray, shape ``(samples, 1)``
        Both arrays are empty (zero rows) when no sequence is long enough to
        produce a window, instead of failing on 2-D indexing.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    ngram_list = []
    for text in data:
        tokens = list(text)
        # Sequences shorter than n give an empty range and contribute nothing.
        forward = [tokens[start:start + n] for start in range(len(tokens) - n + 1)]
        ngram_list.extend(forward)

        if add_reverse:
            ngram_list.extend(window[::-1] for window in forward)

    # reshape(-1, n) is a no-op for a populated list and forces the 2-D
    # (0, n) layout when there are no samples at all.
    dataset = np.array(ngram_list).reshape(-1, n)
    X = dataset[:, :-1]
    Y = dataset[:, -1].reshape(X.shape[0], 1)

    return X, Y

In [24]:
# Three-token windows (two context tokens, one target), reversed copies included.
X, Y = create_dataset(data, n=3, add_reverse=True)
print(X.shape, Y.shape, sep="\n")

(22220, 2)
(22220, 1)


In [25]:
def preprocess_data(X):
    """Convert rows of tokens into a bag-of-words count matrix.

    Each row of ``X`` is joined with spaces into one document, and a new
    ``CountVectorizer`` (unigrams, English stop words removed) is fitted on
    those documents. Because the vectorizer is fitted inside this call, the
    columns of the result depend only on the input passed to this call.

    Parameters
    ----------
    X : iterable of iterables of str
        One token sequence per sample.

    Returns
    -------
    Xt : numpy.ndarray
        Dense count matrix, one row per sample.
    cv_dataframe : pandas.DataFrame
        The same counts with the vocabulary terms as column labels.
    """
    documents = [" ".join(sent) for sent in X]

    # Unigram bag-of-words; ngram_range=(2, 2) would count bigrams instead.
    CountVec = CountVectorizer(ngram_range=(1,1), stop_words='english')
    Count_data = CountVec.fit_transform(documents)

    # get_feature_names() was removed in scikit-learn 1.2. Prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(CountVec, 'get_feature_names_out'):
        feature_names = CountVec.get_feature_names_out()
    else:
        feature_names = CountVec.get_feature_names()

    cv_dataframe = pd.DataFrame(Count_data.toarray(), columns=feature_names)

    return cv_dataframe.values, cv_dataframe

In [26]:
# Count-encode contexts and targets separately; the column labels of the
# target frame are kept as the vocabulary list.
Xdata, _ = preprocess_data(X)
Ydata, cv = preprocess_data(Y)
all_words = np.array(cv.columns)

print(Ydata.shape, Xdata.shape, sep="\n")

(22220, 2495)
(22220, 2495)


In [27]:
# Hidden width equals the length of one pre-trained vector.
embed_size = len(word2vec['word'])

# Two dense layers: count vector -> embed_size (ReLU) -> softmax over the
# same number of columns as the input.
model = Sequential()
model.add(Dense(embed_size, activation='relu', input_shape=(Xdata.shape[1],)))
model.add(Dense(Xdata.shape[1], activation='softmax'))

In [28]:
# Stack one vector per vocabulary word, in `all_words` order. Words that are
# missing from `word2vec` get an all-zero row.
weights = []
for word in all_words:
    try:
        weights.append(word2vec[word])
    except KeyError:
        # Catch only the missing-key case; a bare `except` would also hide
        # unrelated failures and swallow KeyboardInterrupt.
        weights.append(np.zeros((len(word2vec['word']),)))

weights = np.array(weights)
weights

array([[ 0.17784 ,  1.0057  ,  0.66254 , ..., -0.091201,  0.18042 ,
        -0.33554 ],
       [ 0.31527 ,  0.16268 , -0.18361 , ...,  0.52234 ,  0.33492 ,
         0.3082  ],
       [ 0.42923 ,  0.22089 , -0.22135 , ...,  0.59913 , -0.18488 ,
         0.069688],
       ...,
       [-0.57573 ,  0.2476  ,  0.60361 , ...,  0.083726,  0.38442 ,
         0.013646],
       [-0.11818 ,  0.040206, -0.74116 , ..., -0.12308 ,  0.11491 ,
        -0.030226],
       [-0.1524  ,  0.13831 , -0.82515 , ...,  0.12213 ,  0.2555  ,
         0.007435]])

In [29]:
# Adam optimiser, categorical cross-entropy loss, accuracy reported per epoch.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [30]:
# First layer: kernel taken from `weights`, bias drawn from a standard normal
# with the same shape as the layer's current bias.
model.layers[0].set_weights([
    weights,
    np.random.randn(*model.get_weights()[1].shape),
])

In [33]:
# Train for five epochs on the count-encoded contexts and targets.
model.fit(x=Xdata, y=Ydata, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1d800849970>

In [68]:
# Kernel matrix of the first layer after training.
weights = model.layers[0].get_weights()[0]

In [69]:
# Dimensions of the kernel matrix held in `weights`.
weights.shape

(2367, 100)

In [70]:
# Map every vocabulary word to its row of the trained kernel.
# The rows go into the new `w2v` dict. Writing them into `word2vec` (as the
# loop did before) left `w2v` empty and overwrote the pre-trained vectors
# that the later comparisons use as the baseline.
w2v = {}
for i, w in enumerate(all_words):
    w2v[w] = weights[i]

In [77]:
def get_score(word1, word2):
    """Print the sum of absolute differences between two words' `w2v` vectors.

    Restored from a commented-out definition: the cells below call
    ``get_score``, so leaving it disabled raises ``NameError`` on a fresh run.
    Raises ``KeyError`` if either word is missing from ``w2v``.
    """
    print(np.sum(np.abs(w2v[word1] - w2v[word2])))


def get_score2(word1, word2):
    """Print the sum of absolute differences between two words' `word2vec` vectors.

    Raises ``KeyError`` if either word is missing from ``word2vec``.
    """
    print(np.sum(np.abs(word2vec[word1] - word2vec[word2])))

In [80]:
# For each pair print the `w2v` distance followed by the `word2vec` distance.
for first, second in [
    ('performance', 'speed'),
    ('specification', 'requirements'),
    ('software', 'user'),
]:
    get_score(first, second)
    get_score2(first, second)

# This last pair is scored with `get_score` only.
get_score('software', 'marvel')

63.98974
45.098294
95.04921
42.49889600000001
85.340256
34.26461385000001
168.12871


Access, information, ensure, authentication, secure, malicious, percent, incorrect, login,
password, confidentiality, integrity, completeness, accuracy, perturbation, virus,
authorization, validation, encryption, decryption 

Second, response, time, longer, fast, minute, take, process, system, maximum,
minimum, flow, every, space, time, memory, storage, throughput, peak, mean, index,
compress, uncompress, runtime, perform, execute, dynamic, offset, reduce, fixing,
early, late, acceptable, capacity

In [115]:
# Ten hand-picked words, stored as a column vector (one word per row).
clus1 = np.array([
    'access', 'information', 'ensure', 'authentication', 'secure',
    'incorrect', 'login', 'password', 'authorization', 'validation',
]).reshape(-1, 1)

In [116]:
# Row-averaged vectors of the cluster words under both embedding tables.
new_vals = get_embeddings(clus1, w2v, avg=True)
old_vals = get_embeddings(clus1, word2vec, avg=True)

print(new_vals.shape)

(10, 100)


In [117]:
# Per-dimension standard deviation across the cluster words, summed (`w2v`).
np.std(new_vals, axis=0).sum()

77.3194915594963

In [118]:
# Per-dimension standard deviation across the cluster words, summed (`word2vec`).
np.std(old_vals, axis=0).sum()

38.57120067777425

In [121]:
# Show both vectors for 'security', separated by one blank line.
print(w2v['security'], word2vec['security'], sep="\n\n")

[-0.4177245  -0.05832736  0.69810474  0.12418088  0.69168586  0.6958351
 -0.33664554  0.758713    1.3777964   0.9757648   0.7765456  -0.06642433
  0.4850468   0.6928285  -0.20906155  0.38179663  1.1796879   0.43780074
 -1.0015404   0.09326015  0.15187779 -0.41366392  0.5962879   0.05639357
  0.02737966 -0.21387483 -0.01159031 -0.2525727  -0.57675683  1.1950502
  0.7578642  -0.09896418 -0.05481916  0.14516234  0.42515987  0.3766653
 -0.3547588   0.6432506   0.44843525  1.2638814   0.1365273  -0.40339026
  2.4992285   0.7677544   0.40470415 -0.23895818 -0.34537     0.5834055
 -0.57938975 -0.5073305   1.2602712   0.17959294  0.10180157  1.6459987
 -0.3406694  -2.3517368   1.5475576  -0.0892527   1.9136493   0.6960121
  0.00464339 -0.14616165 -0.12455428  0.02014696 -0.21472593  0.79754823
  0.08699043  0.999122    1.3614556   0.64468396  0.41722915 -0.0772491
  0.21912956 -0.47087342  0.7792182   0.41929328  0.7091661   0.5178715
 -1.345449    0.45738646  1.2294705  -0.00831395  0.4048674

In [38]:
# Write the string form of the `word2vec` dict to disk.
# The `with` block guarantees the handle is flushed and closed even if the
# write is interrupted, and the explicit UTF-8 encoding avoids relying on the
# platform default when the keys contain non-ASCII characters.
with open('data/word2Vec.txt', 'w', encoding='utf8') as file:
    file.write(str(word2vec))

KeyboardInterrupt: 

In [None]:
# Number of entries currently held in the `word2vec` dict.
len(word2vec)