# Word Vectorisation

Below are multiple techniques to convert words into vectors. 

Note: tokenisation here is a simple word based split and clean


In [None]:
import re
import math

## Prepare corpus

In [26]:
def load_corpus(filename):
    '''Load a corpus from a file and return a list of tokens and a list of documents.
    
    filename: the name of the file to load
    returns: a tuple containing a list of tokens and a list of documents'''
    
    corpus = []
    corpus_documents = []
    next_document = []
    with open(filename, 'r', encoding='mbcs') as file:
        for line in file:
            line_tokens = line.strip().split()
            cleaned_line_tokens = [
                re.sub(r'^\W+|\W+$', '', token.casefold()) 
                for token in line_tokens ]
            cleaned_line_tokens = [token for token in cleaned_line_tokens if token != '']
            corpus.extend(cleaned_line_tokens)
            next_document.extend(cleaned_line_tokens)
            if len(cleaned_line_tokens) == 0 and len(next_document) > 0:
                corpus_documents.append(next_document)
                next_document = []
    if len(next_document) > 0:
        corpus_documents.append(next_document)

    vocabulary = list(set(corpus))
    return corpus, vocabulary, corpus_documents

corpus, vocabulary, corpus_documents = load_corpus('./datasets/Alice.txt')

print('Corpus length:', len(corpus))
print('Vocabulary size:', len(vocabulary))

Corpus length: 29521
Vocabulary size: 4275


## Evaluation of vectors

We simply display the vector similarities for visual validation as evaluation.

In [27]:
def cosine_similarity(vector1, vector2):
    '''Compute the cosine similarity between two vectors.

    vector1: the first vector
    vector2: the second vector
    returns: the cosine similarity between the two vectors'''

    dot_product = sum([v1_dim*v2_dim for v1_dim, v2_dim in zip(vector1, vector2)])
    v1_magnitude = sum([v1_dim**2 for v1_dim in vector1])**0.5
    v2_magnitude = sum([v2_dim**2 for v2_dim in vector2])**0.5
    return dot_product / (v1_magnitude * v2_magnitude)

def print_similarities(vectors, tokens=["tea", "cakes", "tarts", "dormouse"]):
    '''Print the cosine similarity between pairs of tokens.

    vectors: a dictionary of word vectors
    tokens: a list of tokens to compare'''

    for index, token1 in enumerate(tokens[:-1]):
        for token2 in tokens[index+1:]:
            similarity = cosine_similarity(vectors[token1], vectors[token2])
            print(f'\'{token1}\' and \'{token2}\' similarity = {similarity:.4f}')
        

## Hand Crafted Vectors to demonstrate evaluation

Dimensions:
- potable
- edible
- animated-ness

In [28]:
hand_crafted_vectors = {
    "tea":      [1,     0.3,    0.1],
    "cakes":    [0.3,   1,      0.1],
    "tarts":    [0.31,  1,      0.1],
    "dormouse": [0.1,   0.1,    1]
}

print_similarities(hand_crafted_vectors)

'tea' and 'cakes' similarity = 0.5545
'tea' and 'tarts' similarity = 0.5621
'tea' and 'dormouse' similarity = 0.2171
'cakes' and 'tarts' similarity = 1.0000
'cakes' and 'dormouse' similarity = 0.2171
'tarts' and 'dormouse' similarity = 0.2175


## Word to word vector (colocation)

Based on a training set, look at all of the words that appear in the same context as the target word (i.e. X nearest words left/right). 

In [29]:
def calculate_word_to_word_vectors(vocabulary, corpus, context_size=10):
    '''Calculate the word-to-word vectors for a corpus.

    vocabulary: the list of words in the vocabulary
    corpus: the list of words in the corpus
    context_size: the number of words to consider before and after the target word
    returns: a dictionary of word vectors'''
    
    word_to_word_vectors = {}
    for word in vocabulary:
        word_to_word_vectors[word] = [0] * len(vocabulary)
    vocabulary_indices = {word: index for index, word in enumerate(vocabulary)}
    for corpus_index, word in enumerate(vocabulary):
        context_start = max(0, corpus_index - context_size)
        context_end = min(len(corpus), corpus_index + context_size + 1)
        for word_context_index in range(context_start, context_end):
            if corpus_index == word_context_index: 
                continue
            context_word = corpus[word_context_index]
            word_context_vocabulary_index = vocabulary_indices[context_word]
            word_to_word_vectors[word][word_context_vocabulary_index] += 1
    return word_to_word_vectors

word_to_word_vectors = calculate_word_to_word_vectors(vocabulary, corpus)

print_similarities(word_to_word_vectors)


'tea' and 'cakes' similarity = 0.0769
'tea' and 'tarts' similarity = 0.0769
'tea' and 'dormouse' similarity = 0.3947
'cakes' and 'tarts' similarity = 0.2308
'cakes' and 'dormouse' similarity = 0.1316
'tarts' and 'dormouse' similarity = 0.2193


## TF-IDF

Term Frequency - Inverse Document Frequency. This is used to account for the natural frequency of words in documents. This is based on a corpus documents, and ratio of documents that contain the word, as a divisor of vector word frequency.

Note: in this case we have one text file as the corpora, so have split by blank line separated paragraphs.

In [30]:
def tfidf_normalise_word_to_word_vectors(word_to_word_vectors, corpus_documents):
    '''Normalise word-to-word vectors using TF-IDF.

    word_to_word_vectors: a dictionary of word vectors
    corpus_documents: a list of documents in the corpus
    returns: a dictionary of TF-IDF normalised word vectors'''
    
    log_word_to_word_vectors = {
        word: [math.log(frequency + 1, 10) for frequency in vector] 
        for word, vector in word_to_word_vectors.items()}
    document_frequencies = {word: 0 for word in word_to_word_vectors}
    for document in corpus_documents:
        for word in set(document):
            document_frequencies[word] += 1
    idf_word_to_word_vectors = {
        word: math.log(len(corpus_documents) / document_frequencies[word], 10) 
        for word in word_to_word_vectors.keys()}
    tfidf_word_to_word_vectors = {
        word: [
            log_frequency * idf_word_to_word_vectors[word] 
            for log_frequency in vector] 
        for word, vector in log_word_to_word_vectors.items()}
    return tfidf_word_to_word_vectors

tfidf_word_to_word_vectors = tfidf_normalise_word_to_word_vectors(word_to_word_vectors, corpus_documents)

print_similarities(tfidf_word_to_word_vectors)

'tea' and 'cakes' similarity = 0.0745
'tea' and 'tarts' similarity = 0.0736
'tea' and 'dormouse' similarity = 0.4137
'cakes' and 'tarts' similarity = 0.1491
'cakes' and 'dormouse' similarity = 0.0976
'tarts' and 'dormouse' similarity = 0.2009


## PMI

Pointwise Mutual Information. This is a measure of how much more likely the words are to appear together than if they were independent.

Note: the following implementation includes a add-one smoothing, which is advised to remove bias to infrequent words, but also avoids a log of 0.

In [32]:
def pmi_normalise_word_to_word_vectors(word_to_word_vectors, corpus, vocabulary):
    '''Normalise word-to-word vectors using PMI.

    word_to_word_vectors: a dictionary of word vectors
    corpus: a list of words in the corpus
    vocabulary: a list of words in the vocabulary
    returns: a dictionary of PMI normalised word vectors'''
    
    word_frequency = {word: 0 for word in word_to_word_vectors}
    for word in corpus:
        word_frequency[word] += 1
    corpus_size = len(corpus)
    pmi_word_to_word_vectors = {
        word: [
            max(0, math.log((corpus_size * (frequency + 1)) / (word_frequency[word] * word_frequency[vocabulary[context_word_index]]), 10)) 
            for context_word_index, frequency in enumerate(vector)] 
        for word, vector in word_to_word_vectors.items()}
    return pmi_word_to_word_vectors

pmi_word_to_word_vectors = pmi_normalise_word_to_word_vectors(word_to_word_vectors, corpus, vocabulary)

print_similarities(pmi_word_to_word_vectors)

'tea' and 'cakes' similarity = 0.9997
'tea' and 'tarts' similarity = 1.0000
'tea' and 'dormouse' similarity = 0.9994
'cakes' and 'tarts' similarity = 0.9997
'cakes' and 'dormouse' similarity = 0.9983
'tarts' and 'dormouse' similarity = 0.9993


## And for Shakespeare

Note: this takes 40 mins

In [35]:

shakespeare_corpus, shakespeare_vocabulary, shakespeare_corpus_documents = load_corpus('../datasets/Shakespeare.txt')

print('Corpus length:', len(shakespeare_corpus))
print('Vocabulary size:', len(shakespeare_vocabulary))

shakespeare_word_to_word_vectors = calculate_word_to_word_vectors(shakespeare_vocabulary, shakespeare_corpus)

print_similarities(shakespeare_word_to_word_vectors, tokens=["wherefore", "art", "macbeth", "dagger", "poison"])

shakespeare_pmi_word_to_word_vectors = pmi_normalise_word_to_word_vectors(shakespeare_word_to_word_vectors, shakespeare_corpus, shakespeare_vocabulary)

print_similarities(shakespeare_pmi_word_to_word_vectors, tokens=["wherefore", "art", "macbeth", "dagger", "poison"])

Corpus length: 900989
Vocabulary size: 29117
'wherefore' and 'art' similarity = 0.0833
'wherefore' and 'macbeth' similarity = 0.0772
'wherefore' and 'dagger' similarity = 0.1250
'wherefore' and 'poison' similarity = 0.0870
'art' and 'macbeth' similarity = 0.3858
'art' and 'dagger' similarity = 0.2083
'art' and 'poison' similarity = 0.0870
'macbeth' and 'dagger' similarity = 0.1157
'macbeth' and 'poison' similarity = 0.1612
'dagger' and 'poison' similarity = 0.0435
'wherefore' and 'art' similarity = 0.9983
'wherefore' and 'macbeth' similarity = 0.9998
'wherefore' and 'dagger' similarity = 0.9997
'wherefore' and 'poison' similarity = 0.9999
'art' and 'macbeth' similarity = 0.9992
'art' and 'dagger' similarity = 0.9966
'art' and 'poison' similarity = 0.9975
'macbeth' and 'dagger' similarity = 0.9991
'macbeth' and 'poison' similarity = 0.9995
'dagger' and 'poison' similarity = 0.9999


## Word2Vec

In [47]:
from gensim.models import Word2Vec


model = Word2Vec(corpus_documents, vector_size=100, min_count=1, window=10)
model.train(corpus_documents, total_examples=len(corpus_documents), epochs=10)

vectors = {word: model.wv[word] for word in model.wv.key_to_index.keys()}

print_similarities(vectors)

'tea' and 'cakes' similarity = 0.9692
'tea' and 'tarts' similarity = 0.9937
'tea' and 'dormouse' similarity = 0.9971
'cakes' and 'tarts' similarity = 0.9576
'cakes' and 'dormouse' similarity = 0.9663
'tarts' and 'dormouse' similarity = 0.9972


## Word2Vec most_similiar()

This is a useful function of the library, based on the most similar vectors to the target word.

In [45]:
print(model.wv.most_similar(positive=["alice"], topn=5))


[('thought', 0.9988740682601929), ('herself', 0.9986680746078491), ('much', 0.99854975938797), ('very', 0.998485267162323), ('poor', 0.9984777569770813)]
