# word2vec

https://towardsdatascience.com/word2vec-explained-49c52b4ccb71

In [43]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/mazz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mazz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [44]:
import pandas as pd
import nltk
import string
import matplotlib.pyplot as plt

In [45]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from gensim.models import Word2Vec as w2v

In [46]:
PATH = '/Users/mazz/Documents/Programming/Python/NLP/data/shakespeare.txt'
sw = stopwords.words('english')

In [47]:
lines = []
with open(PATH, 'r') as f:
    for l in f:
        lines.append(l)

In [48]:
print(lines[:10])

['"ACT I"\n', '"SCENE I. London. The palace."\n', '"Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others"\n', '"So shaken as we are, so wan with care,"\n', '"Find we a time for frighted peace to pant,"\n', '"And breathe short-winded accents of new broils"\n', '"To be commenced in strands afar remote."\n', '"No more the thirsty entrance of this soil"\n', '"Shall daub her lips with her own children\'s blood,"\n', '"Nor more shall trenching war channel her fields,"\n']


### Data Analysis

Remove new lines

In [49]:
lines = [line.rstrip('\n') for line in lines]

In [50]:
print(lines[:10])

['"ACT I"', '"SCENE I. London. The palace."', '"Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others"', '"So shaken as we are, so wan with care,"', '"Find we a time for frighted peace to pant,"', '"And breathe short-winded accents of new broils"', '"To be commenced in strands afar remote."', '"No more the thirsty entrance of this soil"', '"Shall daub her lips with her own children\'s blood,"', '"Nor more shall trenching war channel her fields,"']


make all characters lower

In [51]:
lines = [line.lower() for line in lines]

In [52]:
print(lines[:10])

['"act i"', '"scene i. london. the palace."', '"enter king henry, lord john of lancaster, the earl of westmoreland, sir walter blunt, and others"', '"so shaken as we are, so wan with care,"', '"find we a time for frighted peace to pant,"', '"and breathe short-winded accents of new broils"', '"to be commenced in strands afar remote."', '"no more the thirsty entrance of this soil"', '"shall daub her lips with her own children\'s blood,"', '"nor more shall trenching war channel her fields,"']


remove punctuations

In [53]:
lines = [line.translate(str.maketrans('', '', string.punctuation)) for line in lines]

In [54]:
print(lines[:10])

['act i', 'scene i london the palace', 'enter king henry lord john of lancaster the earl of westmoreland sir walter blunt and others', 'so shaken as we are so wan with care', 'find we a time for frighted peace to pant', 'and breathe shortwinded accents of new broils', 'to be commenced in strands afar remote', 'no more the thirsty entrance of this soil', 'shall daub her lips with her own childrens blood', 'nor more shall trenching war channel her fields']


tokenize

In [55]:
lines = [word_tokenize(line) for line in lines]

In [56]:
print(lines[:10])

[['act', 'i'], ['scene', 'i', 'london', 'the', 'palace'], ['enter', 'king', 'henry', 'lord', 'john', 'of', 'lancaster', 'the', 'earl', 'of', 'westmoreland', 'sir', 'walter', 'blunt', 'and', 'others'], ['so', 'shaken', 'as', 'we', 'are', 'so', 'wan', 'with', 'care'], ['find', 'we', 'a', 'time', 'for', 'frighted', 'peace', 'to', 'pant'], ['and', 'breathe', 'shortwinded', 'accents', 'of', 'new', 'broils'], ['to', 'be', 'commenced', 'in', 'strands', 'afar', 'remote'], ['no', 'more', 'the', 'thirsty', 'entrance', 'of', 'this', 'soil'], ['shall', 'daub', 'her', 'lips', 'with', 'her', 'own', 'childrens', 'blood'], ['nor', 'more', 'shall', 'trenching', 'war', 'channel', 'her', 'fields']]


removing stopwords  

Stop words are a set of commonly used words in a language. Examples of stop words in English are “a”, “the”, “is”, “are” and etc

In [57]:
def remove_stopwords(lines, sw = sw):
    '''
    The purpose of this function is to remove stopwords from a given array of 
    lines.
    
    params:
        lines (Array / List) : The list of lines you want to remove the stopwords from
        sw (Set) : The set of stopwords you want to remove
        
    example:
        lines = remove_stopwords(lines = lines, sw = sw)
    '''
    
    res = []
    for line in lines:
        original = line
        line = [w for w in line if w not in sw]
        if len(line) < 1:
            line = original
        res.append(line)
    return res
    
filtered_lines = remove_stopwords(lines = lines, sw = sw)

In [58]:
print(filtered_lines[:10])

[['act'], ['scene', 'london', 'palace'], ['enter', 'king', 'henry', 'lord', 'john', 'lancaster', 'earl', 'westmoreland', 'sir', 'walter', 'blunt', 'others'], ['shaken', 'wan', 'care'], ['find', 'time', 'frighted', 'peace', 'pant'], ['breathe', 'shortwinded', 'accents', 'new', 'broils'], ['commenced', 'strands', 'afar', 'remote'], ['thirsty', 'entrance', 'soil'], ['shall', 'daub', 'lips', 'childrens', 'blood'], ['shall', 'trenching', 'war', 'channel', 'fields']]


### Embedding

In [59]:
print(dir(w2v))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_check_corpus_sanity', '_check_training_sanity', '_clear_post_train', '_do_train_epoch', '_do_train_job', '_get_next_alpha', '_get_thread_working_mem', '_job_producer', '_load_specials', '_log_epoch_end', '_log_epoch_progress', '_log_progress', '_log_train_end', '_raw_word_count', '_save_specials', '_scan_vocab', '_smart_save', '_train_epoch', '_train_epoch_corpusfile', '_worker_loop', '_worker_loop_corpusfile', 'add_lifecycle_event', 'add_null_word', 'build_vocab', 'build_vocab_from_freq', 'create_binary_tree', 'estimate_memory', 'get_latest_training_loss', 'init_sims', 'init_weights', 'load', 'make_cum_table', 'predict_outpu

In [60]:
help(w2v)

Help on class Word2Vec in module gensim.models.word2vec:

class Word2Vec(gensim.utils.SaveLoad)
 |  Word2Vec(sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=(), comment=None, max_final_vocab=None, shrink_windows=True)
 |  
 |  Method resolution order:
 |      Word2Vec
 |      gensim.utils.SaveLoad
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=100

sg : {0, 1}, optional
 |          Training algorithm: 1 for skip-gram; otherwise CBOW.

min_count : int, optional
 |          Ignores all words with total frequency lower than this.



In [61]:

w = w2v(
    filtered_lines,
    min_count=3,  
    sg = 1,       
    window=7      
)       

print(w.wv)
print(w.wv.most_similar('thou'))

<gensim.models.keyedvectors.KeyedVectors object at 0x7f978bab0fa0>
[('thyself', 0.8456547856330872), ('art', 0.8376896381378174), ('villain', 0.793484628200531), ('dost', 0.7878931760787964), ('wherefore', 0.7261180281639099), ('kill', 0.7127459049224854), ('wilt', 0.7055133581161499), ('hast', 0.6970827579498291), ('slave', 0.6943764090538025), ('didst', 0.6916892528533936)]


In [62]:
print(dir(w.wv))

['__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_load_specials', '_log_evaluate_word_analogies', '_save_specials', '_smart_save', '_upconvert_old_d2vkv', '_upconvert_old_vocab', 'add_lifecycle_event', 'add_vector', 'add_vectors', 'allocate_vecattrs', 'closer_than', 'cosine_similarities', 'distance', 'distances', 'doesnt_match', 'evaluate_word_analogies', 'evaluate_word_pairs', 'expandos', 'fill_norms', 'get_index', 'get_normed_vectors', 'get_vecattr', 'get_vector', 'has_index_for', 'index2entity', 'index2word', 'index_to_key', 'init_sims', 'intersect_word2vec_format', 'key_to_index', 'load', 'load_word2vec_format', 'l

In [63]:
print(w.wv.vocab)

AttributeError: The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4

In [64]:
w.wv.word_vec('thou')

  w.wv.word_vec('thou')


array([-0.15144841,  0.08171807,  0.75215757,  0.7282437 , -0.5837944 ,
       -0.66217893,  0.38535234,  0.44035137, -0.31881237, -0.2835821 ,
        0.7160918 , -0.14869876, -0.03428611, -0.08077147, -0.4392117 ,
       -0.34691635,  0.19411936,  0.01353056, -0.72241807, -0.8375711 ,
        0.24856794,  0.11444633,  0.7670785 , -0.04980489,  0.36199507,
       -0.10808496, -0.22805522,  0.26493514, -0.54204226, -0.26386055,
       -0.48448694,  0.17973013,  0.7107661 , -0.10974073, -0.0530719 ,
        0.02056419,  0.44529846, -0.6169424 , -0.3827765 , -0.49636447,
       -0.49311233, -0.0447424 , -0.6783579 , -0.20771371, -0.11619701,
       -0.08550096, -0.14903958,  0.4375947 , -0.04921498,  0.7413388 ,
       -0.46258855,  0.02722253, -0.4517082 ,  0.173224  ,  0.2497509 ,
       -0.41471267,  0.542515  , -0.6503098 , -0.3340822 ,  0.56054765,
        0.1749393 , -0.18986471, -0.20277207, -0.5215067 , -0.42000398,
        0.35314348, -0.02558299,  0.6867087 , -0.3585968 , -0.04

In [65]:
w.wv.sort_by_descending_frequency() # does what it says XD

In [66]:
w.wv.vectors_norm()

AttributeError: The `.vectors_norm` attribute is computed dynamically since Gensim 4.0.0. Use `.get_normed_vectors()` instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4

In [67]:
w.wv.get_normed_vectors()

array([[-0.036308  ,  0.01959096,  0.18032104, ..., -0.05673955,
         0.01367292,  0.12555578],
       [-0.01018749,  0.07913145,  0.17272131, ..., -0.11400472,
        -0.01301266,  0.01412703],
       [-0.02375143,  0.06602585,  0.01409985, ..., -0.01900571,
         0.1350618 ,  0.04146515],
       ...,
       [-0.02486376,  0.07495868,  0.01859752, ..., -0.11829139,
         0.1457074 ,  0.01621905],
       [-0.00813951,  0.09692871,  0.11010306, ..., -0.1742077 ,
         0.20440203, -0.01465781],
       [-0.06069689,  0.06077525,  0.07973552, ..., -0.1932309 ,
         0.11311837,  0.00585645]], dtype=float32)

In [68]:
print(w.wv.get_index('thou'))

0


In [69]:
print(w.wv.rank('thou', 0))

1


In [70]:
help(w.wv)

Help on KeyedVectors in module gensim.models.keyedvectors object:

class KeyedVectors(gensim.utils.SaveLoad)
 |  KeyedVectors(vector_size, count=0, dtype=<class 'numpy.float32'>, mapfile_path=None)
 |  
 |  Method resolution order:
 |      KeyedVectors
 |      gensim.utils.SaveLoad
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __contains__(self, key)
 |  
 |  __getitem__(self, key_or_keys)
 |      Get vector representation of `key_or_keys`.
 |      
 |      Parameters
 |      ----------
 |      key_or_keys : {str, list of str, int, list of int}
 |          Requested key or list-of-keys.
 |      
 |      Returns
 |      -------
 |      numpy.ndarray
 |          Vector representation for `key_or_keys` (1D if `key_or_keys` is single key, otherwise - 2D).
 |  
 |  __init__(self, vector_size, count=0, dtype=<class 'numpy.float32'>, mapfile_path=None)
 |      Mapping between keys (such as words) and vectors for :class:`~gensim.models.Word2Vec`
 |      and related models.
 |

### Advantages and Disadvantages
https://blog.actorsfit.com/a?ID=00800-99c55383-688c-4e1b-9873-6e6c43a4b883#word2vec

Advantages  

- gives context
- versatility in NLP

Disadvantages  

- word with different (on context) may cause problems
- The model could be very difficult to train if use the softmax function, since the number of categories is too large (the size of vocabulary). Though approxination algorithms like negative sampling (NEG) and hierarchical softmax (HS) are proposed to address the issue, other problems happen. For example, the word vectors by NEG are not distributed uniformally, they are located within a cone in the vector space hence the vector space is not sufficiently utilized.