In [1]:
import multiprocessing
import os
import requests
import re
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.contrib.tensorboard.plugins import projector

In [2]:
# Download the NLTK resources this notebook relies on (only needed the first
# time; on later runs NLTK just reports that the package is already up-to-date).
# "punkt" holds the sentence tokenizer models loaded further down.
nltk.download("punkt")
# "stopwords" is a separate NLTK package, not a tokenizer model.
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anuj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anuj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def sentence_to_wordlist(raw):
    """Split a raw sentence into a list of lowercase alphabetic words.

    Every character that is not an ASCII letter (digits, punctuation,
    accented characters, ...) is treated as a word separator.

    Args:
        raw: The sentence as a string.

    Returns:
        A list of lowercase words; empty if ``raw`` contains no ASCII letters.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", raw)
    return [word.lower() for word in letters_only.split()]

In [5]:
# Quick sanity check of the helper on a hand-written sentence
sample_sentence = 'Hello there, this is first sentence'
print(sentence_to_wordlist(sample_sentence))

['hello', 'there', 'this', 'is', 'first', 'sentence']


In [6]:
# Download the book text from the Gutenberg website
filepath = 'http://www.gutenberg.org/files/33224/33224-0.txt'
# Bound the wait with a timeout and fail loudly on an HTTP error; otherwise a
# 404/5xx error page would silently be tokenized as if it were the book.
response = requests.get(filepath, timeout=60)
response.raise_for_status()
corpus_raw = response.text

# Split the raw text into sentences with the English Punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
raw_sentences = tokenizer.tokenize(corpus_raw)

# One word list per sentence (see sentence_to_wordlist); empty raw strings
# are skipped
sentences = [sentence_to_wordlist(raw) for raw in raw_sentences if raw]
token_count = sum(len(sentence) for sentence in sentences)
print(f'The book corpus contains {token_count} tokens.')

The book corpus contains 425633 tokens.


In [9]:
# Dimensionality of the word vectors. More dimensions are more
# computationally expensive to train, but can capture more information.
num_features = 300

# Minimum word count threshold: rarer words are left out of the vocabulary.
min_word_count = 3

# Number of threads to run in parallel.
# The more workers, the faster we train.
num_workers = multiprocessing.cpu_count()

# Context window length.
context_size = 7

# Threshold for randomly downsampling high-frequency words. The gensim
# documentation gives (0, 1e-5) as the useful range; the value used here,
# 1e-3, is the library default.
downsampling = 1e-3

# Seed for the RNG. NOTE: per the gensim documentation, a fully reproducible
# run additionally requires workers=1 (and a fixed PYTHONHASHSEED); with
# several worker threads the results can vary slightly between runs.
seed = 1

# sg=1 selects the skip-gram training algorithm.
model2vec = w2v.Word2Vec(
    sg=1, seed=seed, workers=num_workers, size=num_features,
    min_count=min_word_count, window=context_size, sample=downsampling
)

# `sentences` is already a list, so it is passed as is instead of being
# copied with list() first.
model2vec.build_vocab(sentences)

In [12]:
# Working directory of the notebook; relative paths resolve against it.
# Override the default with the NLP_PROJECT_DIR environment variable instead
# of editing this machine-specific path.
project_dir = os.environ.get(
    'NLP_PROJECT_DIR',
    'C:\\Users\\Anuj\\Documents\\GitHub\\Natural Language Processing'
)
if os.path.isdir(project_dir):
    os.chdir(project_dir)
else:
    # Do not crash on machines that lack this folder: keep the current
    # working directory and tell the reader where files will end up.
    print(f'{project_dir!r} not found; staying in {os.getcwd()!r}.')

In [13]:
# Start training, this might take a minute or two...
model2vec.train(
    sentences, total_examples=model2vec.corpus_count, epochs=10
)

# Save to file, can be useful later
if not os.path.exists(os.path.join('trained', 'sample')):
    os.makedirs(os.path.join('trained', 'sample'))

model2vec.save(os.path.join('trained', 'sample', 'sample.w2v'))

## Analyzing the model
Now that we have trained our word2vec model, let's explore what it was able to learn.
We will use `most_similar()` to explore the relations between various words.
In the following example, you can see that the model learned that
the word "earth" is related to "crust", "globe", and other words.
It is interesting that we only provided the raw text, yet the model
was able to learn all of these relations and concepts automatically! The example follows:

In [14]:
# Query the trained word vectors for nearest neighbours
word_vectors = model2vec.wv

print('Most similar to "earth":')
for neighbour in word_vectors.most_similar("earth"):
    print(neighbour)

print('\nMost similar to "human":')
for neighbour in word_vectors.most_similar("human"):
    print(neighbour)

# Combined query: the positive words pull candidates closer,
# the negative word pushes them away
print('\nPositive words contribute positively towards similarity, negative words negatively:')
combined_matches = word_vectors.most_similar_cosmul(
    positive=['earth', 'moon'],
    negative=['orbit'],
)
for neighbour in combined_matches:
    print(neighbour)

Most similar to "earth":
('crust', 0.7226534485816956)
('globe', 0.6591298580169678)
('inequalities', 0.6219401955604553)
('planet', 0.6030153036117554)
('orbit', 0.5927286148071289)
('laboring', 0.567184329032898)
('moon', 0.5649024248123169)
('unevenness', 0.5619377493858337)
('remodelled', 0.5601164102554321)
('reduce', 0.5525733828544617)

Most similar to "human":
('man', 0.6671838760375977)
('art', 0.6557581424713135)
('race', 0.6433838605880737)
('industry', 0.6433637142181396)
('rude', 0.637246310710907)
('gods', 0.6188982725143433)
('affairs', 0.6096276044845581)
('beings', 0.6006961464881897)
('comparative', 0.5970616340637207)
('population', 0.590903103351593)

Positive words contribute positively towards similarity, negative words negatively:
('sound', 0.815029501914978)
('remodelled', 0.800189197063446)
('crust', 0.7973071932792664)
('employed', 0.795359194278717)
('planet', 0.7855542898178101)
('globe', 0.7845449447631836)
('laboring', 0.7828304767608643)
('sun', 0.7820301

## Word cluster

In [15]:
# Project the learned word vectors onto a plane with t-SNE
all_word_vectors_matrix = model2vec.wv.vectors
tsne = sklearn.manifold.TSNE(
    n_components=2,  # 2-D space output
    random_state=0,
)
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

In [16]:
# Build one (word, x, y) row per vocabulary entry; each entry's `index`
# selects that word's row in the 2-D t-SNE matrix
points = pd.DataFrame(
    [
        (token, *all_word_vectors_matrix_2d[entry.index])
        for token, entry in model2vec.wv.vocab.items()
    ],
    columns=["word", "x", "y"],
)