In [1]:
import itertools
import pandas as pd
import numpy as np
import re
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.layers import Input, Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.models import Model
from scipy import sparse

In [2]:
texts1 = "Word2vec is a technique for natural language processing (NLP) published in 2013. The word2vec algorithm uses a neural network model to learn word associations from a large corpus of text. Once trained, such a model can detect synonymous words or suggest additional words for a partial sentence. As the name implies, word2vec represents each distinct word with a particular list of numbers called a vector. The vectors are chosen carefully such that they capture the semantic and syntactic qualities of words; as such, a simple mathematical function (cosine similarity) can indicate the level of semantic similarity between the words represented by those vectors. Word2vec is a group of related models that are used to produce word embeddings. These models are shallow, two-layer neural networks that are trained to reconstruct linguistic contexts of words. Word2vec takes as its input a large corpus of text and produces a vector space, typically of several hundred dimensions, with each unique word in the corpus being assigned a corresponding vector in the space. Word2vec can utilize either of two model architectures to produce these distributed representations of words: continuously sliding bag-of-words (CBOW) or continuously sliding skip-gram. In both architectures, word2vec considers both individual words and a sliding context window as it iterates over the corpus. The CBOW can be viewed as a ‘fill in the blank’ task, where the word embedding represents the way the word influences the relative probabilities of other words in the context window. Words which are semantically similar should influence these probabilities in similar ways, because semantically similar words should be used in similar contexts. The order of context words does not influence prediction (bag-of-words assumption). In the continuous skip-gram architecture, the model uses the current word to predict the surrounding window of context words.[1][2] The skip-gram architecture weighs nearby context words more heavily than more distant context words. According to the authors' note,[3] CBOW is faster while skip-gram does a better job for infrequent words. After the model has trained, the learned word embeddings are positioned in the vector space such that words that share common contexts in the corpus — that is, words that are semantically and syntactically similar — are located close to one another in the space.[1] More dissimilar words are located farther from one another in the space."

In [3]:
texts2 = 'GloVe, coined from Global Vectors, is a model for distributed word representation. The model is an unsupervised learning algorithm for obtaining vector representations for words. This is achieved by mapping words into a meaningful space where the distance between words is related to semantic similarity.[1] Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space. It is developed as an open-source project at Stanford[2] and was launched in 2014. As log-bilinear regression model for unsupervised learning of word representations, it combines the features of two model families, namely the global matrix factorization and local context window methods.[3] GloVe can be used to find relations between words like synonyms, company-product relations, zip codes and cities, etc. However, the unsupervised learning algorithm is not effective in identifying homographs, that is, words with the same spelling and different meanings. This is as the unsupervised learning algorithm calculates a single set of vectors for words with the same morphological structure.[4] The algorithm is also used by the SpaCy library to build semantic word embedding features, while computing the top list words that match with distance measures such as cosine similarity and Euclidean distance approach.[5] GloVe was also used as the word representation framework for the online and offline systems designed to detect psychological distress in patient interviews.'

In [4]:
texts1 = texts1.split('.')
texts2 = texts2.split('.')

In [5]:
f'len of text1: {len(texts1)}, len of text2: {len(texts2)}'

'len of text1: 19, len of text2: 12'

In [6]:
texts1

['Word2vec is a technique for natural language processing (NLP) published in 2013',
 ' The word2vec algorithm uses a neural network model to learn word associations from a large corpus of text',
 ' Once trained, such a model can detect synonymous words or suggest additional words for a partial sentence',
 ' As the name implies, word2vec represents each distinct word with a particular list of numbers called a vector',
 ' The vectors are chosen carefully such that they capture the semantic and syntactic qualities of words; as such, a simple mathematical function (cosine similarity) can indicate the level of semantic similarity between the words represented by those vectors',
 ' Word2vec is a group of related models that are used to produce word embeddings',
 ' These models are shallow, two-layer neural networks that are trained to reconstruct linguistic contexts of words',
 ' Word2vec takes as its input a large corpus of text and produces a vector space, typically of several hundred dime

In [7]:
texts2

['GloVe, coined from Global Vectors, is a model for distributed word representation',
 ' The model is an unsupervised learning algorithm for obtaining vector representations for words',
 ' This is achieved by mapping words into a meaningful space where the distance between words is related to semantic similarity',
 '[1] Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space',
 ' It is developed as an open-source project at Stanford[2] and was launched in 2014',
 ' As log-bilinear regression model for unsupervised learning of word representations, it combines the features of two model families, namely the global matrix factorization and local context window methods',
 '[3] GloVe can be used to find relations between words like synonyms, company-product relations, zip codes and cities, etc',
 ' However, the unsupervised learning algorithm is not effec

In [8]:
def text_preprocessing(TEXT: str, punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_“~''', stop_words=None) -> list:
    """
    A method to preprocess a text
    """
    if TEXT == '':
        return
    if stop_words is None:
        stop_words = ['and', 'a', 'is', 'the', 'in', 'be', 'will', 'was', 'but', 'this', 'were', 'with', 'of', 'also', 'on', '.', 'for', 'any', 'its', 'and', 'are', 'from', 'both', 'as']

    for x in TEXT.lower():
        if x in punctuations:
            TEXT = TEXT.replace(x, "")

    # Removing words that have numbers in them
    TEXT = re.sub(r'\w*\d\w*', '', TEXT)

    # Removing digits
    TEXT = re.sub(r'[0-9]+', '', TEXT)

    # Cleaning the whitespaces
    TEXT = re.sub(r'\s+', ' ', TEXT).strip()

    # Setting every word to lower
    TEXT = TEXT.lower()

    # Converting all our text to a list 
    TEXT = TEXT.split(' ')

    # Dropping empty strings
    TEXT = [x for x in TEXT if x != '']

    # Dropping stop words
    TEXT = [x for x in TEXT if x not in stop_words]

    return TEXT

In [9]:
def get_training_data(texts: list, window=2):
    # Defining the window for context
    # window = 2

    # Creating a placeholder for the scanning of the word list
    word_lists = []
    all_text = []
    for text in texts:
        if text is None:
            continue
        # Cleaning the text
        text = text_preprocessing(text)
        print (text)

        # Appending to the all text lists
        all_text += text

        # Creating a context dictionary
        for i, word in enumerate(text):
            for w in range(window):
                # Getting the context that is ahead by *window* words
                if i + 1 + w < len(text):
                    word_lists.append([word] + [text[(i + 1 + w)]])
                # Getting the context that is behind by *window* words
                if i - w - 1 >= 0:
                    word_lists.append([word] + [text[(i - w - 1)]])
    return word_lists, all_text

In [10]:
word_lists, all_text = get_training_data(texts1)
word_lists

['technique', 'natural', 'language', 'processing', 'nlp', 'published']
['algorithm', 'uses', 'neural', 'network', 'model', 'to', 'learn', 'word', 'associations', 'large', 'corpus', 'text']
['once', 'trained', 'such', 'model', 'can', 'detect', 'synonymous', 'words', 'or', 'suggest', 'additional', 'words', 'partial', 'sentence']
['name', 'implies', 'represents', 'each', 'distinct', 'word', 'particular', 'list', 'numbers', 'called', 'vector']
['vectors', 'chosen', 'carefully', 'such', 'that', 'they', 'capture', 'semantic', 'syntactic', 'qualities', 'words', 'such', 'simple', 'mathematical', 'function', 'cosine', 'similarity', 'can', 'indicate', 'level', 'semantic', 'similarity', 'between', 'words', 'represented', 'by', 'those', 'vectors']
['group', 'related', 'models', 'that', 'used', 'to', 'produce', 'word', 'embeddings']
['these', 'models', 'shallow', 'twolayer', 'neural', 'networks', 'that', 'trained', 'to', 'reconstruct', 'linguistic', 'contexts', 'words']
['takes', 'input', 'large', 

TypeError: 'NoneType' object is not iterable

In [None]:
all_text

In [None]:
def create_unique_word_dict(TEXT: list) -> dict:
    """
    A method that creates a dictionary where the keys are unique words
    and key values are indices
    """
    # Getting all the unique words from our text and sorting them alphabetically
    Words = list(set(TEXT))
    Words.sort()

    # Creating the dictionary for the unique words
    UniqueWordDict = {}
    for i, Word in enumerate(Words):
        UniqueWordDict.update({Word: i})

    return UniqueWordDict

In [None]:
unique_word_dict = create_unique_word_dict(all_text)
# Defining the number of features (unique words)
n_words = len(unique_word_dict)
unique_word_dict

In [None]:
# Getting all the unique words
words = list(unique_word_dict.keys())
print(words)

In [None]:
# Creating the X and Y matrices using one hot encoding
print(n_words)
X = []
Y = []
for i, word_list in tqdm(enumerate(word_lists)):
    # Getting the indices
    print(word_list)
    main_word_index = unique_word_dict.get(word_list[0])
    context_word_index = unique_word_dict.get(word_list[1])
    # print (word_list)
    print(word_list[0], main_word_index)
    print(word_list[1], context_word_index)

    # Creating the placeholders
    X_row = np.zeros(n_words)
    Y_row = np.zeros(n_words)

    # One hot encoding the main word
    X_row[main_word_index] = 1

    # One hot encoding the Y matrix words
    Y_row[context_word_index] = 1

    # Appending to the main matrices
    X.append(X_row)
    Y.append(Y_row)

# Converting the matrices into a sparse format because the vast majority of the data are 0s

In [None]:
X

In [None]:
Y

In [None]:
XX = tf.convert_to_tensor(X, dtype=tf.float32)
YY = tf.convert_to_tensor(Y, dtype=tf.float32)
print(XX.shape)
print(YY.shape)

In [None]:
def CreateModel():
    # Defining the size of the embedding
    embed_size = 2
    # Defining the neural network

    # inp = Input(shape=(X.shape[1],))
    inp = Input(shape=XX.shape[1])  # 21
    x = Dense(units=embed_size, activation='linear')(inp)
    # x = Dense(units=21, activation='softmax')(x)
    x = Dense(units=YY.shape[1], activation='softmax')(x)

    model = Model(inputs=inp, outputs=x)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics='accuracy')
    model.summary()
    return model

In [None]:
model = CreateModel()
# Optimizing the network weights
model.fit(
    x=XX,
    y=YY,
    batch_size=64,
    epochs=10000
)

In [None]:
plt.figure(figsize=(10, 10))
plt.plot(model.history.history['loss'])
plt.plot(model.history.history['accuracy'])
plt.title('Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()

In [None]:
# The input layer

weights = model.get_weights()[0]  # 21*2
print(weights.shape)
print(weights[1][1])
print(weights)

# weights = model.get_weights()[2]
# print (weights)

In [None]:
# get the weight for each unique word
embedding_dict = {}
for word in words:  # to pick a row of weight of two values for each unique word since weights = 21*2
    embedding_dict.update({word: weights[unique_word_dict.get(word)]})

In [None]:
embedding_dict

In [None]:
plt.figure(figsize=(10, 10))
i = 0
for word in list(unique_word_dict.keys()):
    print(i, ' >> ', word)
    coord = embedding_dict.get(word)
    plt.scatter(coord[0], coord[1])
    plt.annotate(word, (coord[0], coord[1]))
    i = i + 1

In [None]:
# The input layer
weights = model.get_weights()[0]
# weights[: , 0] = 0

In [None]:
plt.figure(figsize=(10, 10))
i = 0
for word in list(unique_word_dict.keys()):
    print(i, ' >> ', word)
    # coord = embedding_dict.get(word)
    coord = embedding_dict.get(word)
    if weights[i][0] < 0 < weights[i][1]:
        plt.scatter(0, weights[i][1])
        plt.annotate(word, (0, weights[i][1]))
    else:
        plt.scatter(weights[i][0], weights[i][1])
        plt.annotate(word, (weights[i][0], weights[i][1]))
    i = i + 1