In [None]:
from __future__ import absolute_import, division, print_function
import codecs
import glob
#concurrency
import multiprocessing
import os
import pprint
import re
import nltk
import gensim.models.word2vec as w2v
# dimensionality reduction visualize a dataset easily
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns



In [None]:
# Download the NLTK "punkt" and "stopwords" resources.
# "punkt" backs the sentence tokenizer loaded later in this notebook.
# NOTE(review): "stopwords" is not referenced anywhere else in the visible
# notebook — confirm whether this download is still needed.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Collect every .txt file in the current directory; sorting keeps the read
# order stable from run to run.
book_filenames = sorted(glob.glob("./*.txt"))

In [None]:
# Show which files were matched before reading them.
print(book_filenames)

In [None]:
# Read every book and combine them into a single unicode string.
# The texts are collected in a list and joined once at the end, with a newline
# between consecutive books: without a separator, the last word of one file
# fuses with the first word of the next whenever a file has no trailing newline.
book_texts = []
corpus_length = 0
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        book_text = book_file.read()
    if book_texts:
        # Account for the newline the final join inserts before this book.
        corpus_length += 1
    book_texts.append(book_text)
    corpus_length += len(book_text)
    print("Corpus is now {0} characters long".format(corpus_length))
    print()
corpus_raw = u"\n".join(book_texts)


In [None]:
# Load the English Punkt tokenizer and use it to split the raw corpus into
# sentences.
# NOTE(review): this loads a pickled tokenizer by resource path; recent NLTK
# releases reportedly ship Punkt in a different format — confirm this path
# against the installed NLTK version.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
raw_sentences = tokenizer.tokenize(corpus_raw)

In [None]:
def sentence_to_wordlist(raw):
    """Split a raw sentence into a list of purely alphabetic words.

    Every character other than ASCII a-z / A-Z (digits, punctuation,
    accented letters, ...) is replaced by a space, and the result is split
    on whitespace. Case is preserved.
    """
    return re.sub("[^a-zA-Z]", " ", raw).split()

In [None]:
# Turn every non-empty raw sentence into its list of words.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

In [None]:
# Spot-check the tokenisation: print one raw sentence followed by the word
# list produced from it.
print(raw_sentences[5])
print(sentence_to_wordlist(raw_sentences[5]))

In [None]:
# Total number of word tokens across all tokenised sentences.
token_count = sum(map(len, sentences))
print("The book corpus contains {0:,} tokens".format(token_count))

## Train Word2Vec
### Once we have vectors we can build our model
### The three main tasks are: distance, similarity, ranking


In [None]:
# Dimensionality of the resulting word vectors.
# More dimensions are more expensive to train, but can give more accurate
# and more general vectors.
num_features = 300
# Minimum word count threshold (passed to Word2Vec as `min_count`)
min_word_count = 3


In [None]:
# Number of threads to run in parallel: one per CPU reported by
# multiprocessing.cpu_count()
num_workers = multiprocessing.cpu_count()

In [None]:
# Context window length (passed to Word2Vec as `window`)
context_size = 7

In [None]:
# Downsample setting for frequent words (passed to Word2Vec as `sample`).
# NOTE(review): the original note here recommended values between 0 and 1e-5,
# but the value actually used is 1e-3 — confirm which one is intended.
downsampling =  1e-3

In [None]:
# Seed for the random number generator, to make the results reproducible.
# A fixed seed makes runs deterministic, which is good for debugging.
# NOTE(review): the model is trained with num_workers threads — confirm
# whether gensim is still fully deterministic with more than one worker.
seed = 1

In [None]:
# Create the Word2Vec model from the hyperparameters defined above. No
# sentences are passed here; the vocabulary is built and the model trained
# in the following cells.
# NOTE(review): presumably sg=1 selects the skip-gram architecture — confirm.
# NOTE(review): keyword names such as `size` depend on the gensim version —
# verify against the installed release.
hp2vec = w2v.Word2Vec(
    sg = 1,
    seed = seed,
    workers = num_workers,
    size = num_features,
    min_count = min_word_count,
    window = context_size,
    sample = downsampling
    )

In [None]:
# Build the model's vocabulary from the tokenised sentences.
hp2vec.build_vocab(sentences)

In [None]:
# Report how many distinct words ended up in the vocabulary.
# NOTE(review): accessing `.vocab` directly on the model is gensim-version
# dependent — confirm against the installed release.
print("Word2Vec vocabulary length:", len(hp2vec.vocab))

### Start training; this might take a few minutes

In [None]:
# Train the model on the tokenised sentences.
# NOTE(review): newer gensim releases reportedly require explicit
# total_examples / epochs arguments to train() — confirm against the
# installed version before relying on this call.
hp2vec.train(sentences)

### Save to file, can be useful later


In [None]:
# Create the "trained" output directory on first use.
if not os.path.exists("trained"):
    os.makedirs("trained")

In [None]:
# Persist the trained model to trained/hp2vec.w2v.
hp2vec.save(os.path.join("trained","hp2vec.w2v"))

### explore the trained model

In [None]:
# Reload the model from trained/hp2vec.w2v, replacing the in-memory one.
hp2vec = w2v.Word2Vec.load(os.path.join("trained","hp2vec.w2v"))

### compress the word vectors into 2D space and plot them


In [None]:
# t-SNE reducer targeting 2 output dimensions, with a fixed random_state.
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)
# NOTE(review): presumably `syn0` is the matrix holding one word vector per
# vocabulary entry; the attribute name is gensim-version dependent — confirm.
all_word_vectors_matrix = hp2vec.syn0

### train t-SNE 

In [None]:
# Fit t-SNE on the word-vector matrix and keep the resulting 2-D embedding.
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

In [None]:
# One row per vocabulary word: the word plus the x/y coordinates found at the
# word's vocabulary index in the 2-D t-SNE matrix.
points = pd.DataFrame(
    [
        (word,) + tuple(all_word_vectors_matrix_2d[hp2vec.vocab[word].index][:2])
        for word in hp2vec.vocab
    ],
    columns=["word", "x", "y"],
)

In [None]:
# Preview the first 50 rows of the word/coordinate table.
points.head(50)

In [None]:
# Apply seaborn's "poster" plotting context to the figures that follow.
sns.set_context("poster")

In [None]:
%pylab inline

In [None]:
# Scatter plot of every word in the 2-D embedding.
points.plot.scatter("x","y",s=10,figsize=(20,12))

### zoom in to some interesting places

In [None]:
def plot_region(x_bounds, y_bounds):
    """Scatter-plot and label the words inside a rectangle of the 2-D map.

    Reads the module-level ``points`` DataFrame (columns "word", "x", "y").

    Args:
        x_bounds: (low, high) pair; rows with low <= x <= high are kept.
        y_bounds: (low, high) pair; rows with low <= y <= high are kept.
    """
    # Named `region` rather than `slice` so the builtin is not shadowed.
    # Series.between is inclusive on both ends, matching the <= comparisons.
    region = points[
        points.x.between(x_bounds[0], x_bounds[1]) &
        points.y.between(y_bounds[0], y_bounds[1])
    ]

    ax = region.plot.scatter("x", "y", s=35, figsize=(10, 8))
    # Label each dot with its word, nudged slightly so the text does not sit
    # on top of the marker.
    for point in region.itertuples(index=False):
        ax.text(point.x + 0.005, point.y + 0.005, point.word, fontsize=11)

In [None]:
# Zoom in on -3.4 <= x <= -3.2, 0.9 <= y <= 1.1.
# NOTE(review): the bounds are hard-coded — confirm they still contain points
# after the model or the t-SNE embedding is regenerated.
plot_region(x_bounds=(-3.4,-3.2), y_bounds=(0.9,1.1))

### Explore semantic similarities between book characters

In [None]:
# Zoom in on -4 <= x <= -3, 0.3 <= y <= 0.5.
# NOTE(review): the bounds are hard-coded — confirm they still contain points
# after the model or the t-SNE embedding is regenerated.
plot_region(x_bounds=(-4,-3), y_bounds=(0.3,0.5))

In [None]:
# Ask the model for the words most similar to "Harry".
# NOTE(review): calling most_similar directly on the model is gensim-version
# dependent — confirm against the installed release.
hp2vec.most_similar("Harry")

In [None]:
# Ask the model for the words most similar to "Hermione".
hp2vec.most_similar("Hermione")

In [None]:
# Ask the model for the words most similar to "Hogwarts".
hp2vec.most_similar("Hogwarts")

In [None]:
# Ask the model for the words most similar to "Ron".
hp2vec.most_similar("Ron")

In [None]:
# Ask the model for the words most similar to "Malfoy".
hp2vec.most_similar("Malfoy")

In [None]:
# Ask the model for the words most similar to "Muggle".
hp2vec.most_similar("Muggle")

### Linear relationships between word pairs

In [None]:
def nearest_similarity_cosmul(start1,end1,end2):
    """Complete the analogy "start1 : end1 :: ? : end2", print it, return the answer.

    Queries the module-level ``hp2vec`` model through ``most_similar_cosmul``
    with ``end2`` and ``start1`` as positive terms and ``end1`` as the
    negative term, and takes the first entry of the result.

    Args:
        start1: first word of the known pair.
        end1: second word of the known pair.
        end2: second word of the pair to complete.

    Returns:
        The word from the first result entry (printed as ``start2``).
    """
    similarities = hp2vec.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    start2 = similarities[0][0]
    # Explicit keyword arguments instead of **locals(): the message no longer
    # depends on whichever local names happen to exist in this function.
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(
        start1=start1, end1=end1, start2=start2, end2=end2
    ))
    return start2

                            

In [None]:
# Example analogies; each call prints its sentence, and the cell's displayed
# value is the return value of the last call.
nearest_similarity_cosmul("Harry", "Hermione", "Malfoy")
nearest_similarity_cosmul("boy", "Harry", "Hermione")
# NOTE(review): the commented-out example below looks like a leftover from a
# different corpus — remove it if it is not needed.
# nearest_similarity_cosmul("Arya", "Nymeria", "dragons")
