In [None]:
from __future__ import absolute_import, division, print_function
#future is the missing compatibility layer between python 2 and 3.

In [None]:
import codecs #to do word encoding
import glob #finds all pathnames matching a pattern, like regex. to effectively search for large text
import logging  #log events for libraries
import multiprocessing #concurrency: a way of running multiple threads and having each thread run a differen process. it's a way of running your program faster
import os #dealing with operating system, like reading file
import pprint #pretty print, human readable
import re #regular expressions

In [None]:
import nltk #importing natural language toolkit
import gensim.models.word2vec as w2v #word 2 vec
import sklearn.manifold #dimensionality reduction #the vectors are going to be multidimensional
import numpy as np #math
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%pylab inline

In [None]:
# Logging setup: timestamp, level and message for every record at INFO or above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [None]:
# Quick demonstration of NLTK word tokenization on one example sentence.
nltk.download('punkt')  # tokenizer data required by word_tokenize

from nltk import word_tokenize,sent_tokenize, pos_tag

# Double-quoted so the apostrophe in "didn't" needs no escaping and no stray
# leading quote character ends up in the text being tokenized.
sentence = "At eight oclock on Thursday morning Arthur didn't feel very good"

tokens = nltk.word_tokenize(sentence)

# Last expression of the cell: display the resulting token list.
tokens

In [None]:
# Fetch the NLTK resources used by this notebook.
nltk.download("punkt") #pretrained tokenizer
nltk.download("stopwords") # common function words such as "and", "the", "an", "a", "of"
# NOTE(review): the stopword list is downloaded here, but no cell in this notebook
# actually removes stopwords from the corpus -- confirm whether that step is missing.

In [None]:
# Every .txt file in the current working directory, in sorted order.
book_filenames = glob.glob("*.txt")
book_filenames.sort()

print("Found books:")
book_filenames

In [None]:
# Read every book as UTF-8 and concatenate the texts, in filename order,
# into one unicode string: corpus_raw.
book_texts = []
corpus_length = 0  # running character count, reported after each book
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        book_texts.append(book_file.read())
    corpus_length += len(book_texts[-1])
    print("corpus is now {0} characters long".format(corpus_length))
    print()

# The texts are joined with no separator between consecutive books.
corpus_raw = u"".join(book_texts)

In [None]:
# Load NLTK's pre-trained English Punkt sentence tokenizer.
# NOTE(review): presumably relies on the earlier nltk.download("punkt") cell
# having fetched this resource -- confirm on a fresh environment.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# Split the whole raw corpus into sentences.
raw_sentences = tokenizer.tokenize(corpus_raw)

In [None]:
#convert into list of words
#remove unnecessary characters, split into words

def sentence_to_wordlist(raw):
    """Return the runs of ASCII letters found in ``raw`` as a list of words.

    Every character outside ``a-z``/``A-Z`` (digits, punctuation, accented
    letters, ...) is replaced by a space before splitting, so ``"didn't"``
    yields ``["didn", "t"]``. Case is preserved.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", raw)
    return letters_only.split()
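#"[^a-zA-Z]": matches any single character that is NOT an ASCII letter (inside brackets, ^ negates the set)
#"^[a-zA-Z]": by contrast, matches an ASCII letter at the very start of the string (outside brackets, ^ is an anchor)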

In [None]:
# Turn every non-empty raw sentence into its list of words.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if raw_sentence
]

In [None]:
# Spot check: the raw sentence at index 5, as produced by the sentence tokenizer.
print(raw_sentences[5])

In [None]:
# The same raw sentence after cleaning and splitting into words.
print(sentence_to_wordlist(raw_sentences[5]))

In [None]:
# Entry 5 of the processed list.
# NOTE(review): this lines up with raw_sentences[5] only if no empty raw sentence was skipped before it.
sentences[5]

In [None]:
# Word2Vec hyperparameters, passed to the model constructor in the next cell.
seed = 1                                      # seed handed to the model
num_features = 300                            # dimensionality of the resulting word vectors
context_size = 7                              # window size, in words
min_word_count = 3                            # minimum word frequency threshold
downsampling = 0.001                          # downsampling setting for frequent words
num_workers = multiprocessing.cpu_count()     # number of threads to run in parallel

In [None]:
# Create the (still untrained) Word2Vec model from the hyperparameters above.
# NOTE(review): `size=` is the keyword used by gensim 3.x; gensim 4 renamed it to
# `vector_size` -- confirm against the installed version before upgrading.
thrones2vec = w2v.Word2Vec(
    sg=1,                       # 1 = skip-gram is employed; otherwise CBOW is used
    seed=seed,                  # seed value defined in the hyperparameter cell
    workers=num_workers,        # worker threads used to train the model (faster on multicore machines)
    size=num_features,          # dimensionality of the feature vectors
    min_count=min_word_count,   # ignore all words with total frequency lower than this
    window=context_size,        # maximum distance between the current and predicted word within a sentence
    sample=downsampling,        # threshold for randomly downsampling higher-frequency words; useful range is (0, 1e-5)
)

In [None]:
# Build the model's vocabulary from the tokenized sentences (must run before training).
thrones2vec.build_vocab(sentences) # the new words in `sentences` will be added to model's vocab.

In [None]:
# Number of distinct words kept in the vocabulary.
print("Word2Vec vocabulary length:", len(thrones2vec.wv.vocab))

In [None]:
#Start training
# Total number of word tokens in the corpus, passed to train() as total_words.
# A list (not a generator) is summed on purpose: `%pylab inline` in the import
# cell rebinds `sum` to numpy's version.
token_count = sum([len(words) for words in sentences])
thrones2vec.train(sentences, total_words=token_count, epochs=100)

In [None]:
# Make sure the output directory for the saved model exists.
model_dir = "trained"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

In [None]:
# Persist the trained model to trained/thrones2vec.w2v.
thrones2vec.save(os.path.join("trained", "thrones2vec.w2v"))

In [None]:
# Rebind thrones2vec to the model read back from the file written above.
# NOTE(review): gensim's load() unpickles the file -- only load model files you trust.
thrones2vec = w2v.Word2Vec.load(os.path.join("trained", "thrones2vec.w2v"))

In [None]:
# t-SNE reducer producing 2 output dimensions, with a fixed random_state.
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)

In [None]:
# Matrix of learned word vectors; rows are later looked up by each word's vocab index.
# NOTE(review): `syn0` is the legacy gensim attribute name -- confirm it exists in the installed version.
all_word_vectors_matrix = thrones2vec.wv.syn0

In [None]:
# Project every word vector down to 2-D coordinates.
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

In [None]:
#plot the big picture
# One row per vocabulary word: the word itself plus its 2-D t-SNE coordinates,
# looked up through the word's index in the vocabulary.
points = pd.DataFrame(
    [
        (
            word,
            all_word_vectors_matrix_2d[entry.index][0],
            all_word_vectors_matrix_2d[entry.index][1],
        )
        for word, entry in thrones2vec.wv.vocab.items()
    ],
    columns=["word", "x", "y"],
)

In [None]:
# Peek at the first ten rows of the word/coordinate table.
points.head(10)

In [None]:
# Switch seaborn to its "poster" plotting context.
sns.set_context("poster")

In [None]:
# Scatter plot of every word's 2-D position.
points.plot.scatter("x", "y", s=10, figsize=(25, 12))

In [None]:
#details
def plot_region(x_bounds, y_bounds):
    """Scatter-plot and label the words whose 2-D position lies inside a box.

    x_bounds, y_bounds: indexable (low, high) pairs; both ends are inclusive.
    Reads the notebook-level ``points`` frame (columns ``word``, ``x``, ``y``).
    """
    x_low, x_high = x_bounds[0], x_bounds[1]
    y_low, y_high = y_bounds[0], y_bounds[1]
    in_x_range = (x_low <= points.x) & (points.x <= x_high)
    in_y_range = (y_low <= points.y) & (points.y <= y_high)
    region = points[in_x_range & in_y_range]

    ax = region.plot.scatter("x", "y", s=35, figsize=(10, 8))
    # Put each word's label slightly up and to the right of its marker.
    for _, row in region.iterrows():
        ax.text(row.x + 0.005, row.y + 0.005, row.word, fontsize=11)

In [None]:
# Print the min/max of the x and y coordinates (presumably to help choose bounds for plot_region).
print("Range of t-sne viz in x = {}, {} and y = {}, {}" .format(min(points.x), max(points.x), min(points.y), max(points.y)))

In [None]:
#Semantic Similarities
thrones2vec.most_similar("Stark")

In [None]:
thrones2vec.most_similar("Dragons")

In [None]:
thrones2vec.most_similar("direwolf")

In [None]:
def nearest_similarity_cosmul(start1, end1, end2, model=None):
    """Complete the analogy "start1 is to end1 as ? is to end2" and print it.

    Parameters:
        start1, end1: the known pair of words.
        end2: the word whose counterpart is wanted.
        model: object exposing ``.wv.most_similar_cosmul``; defaults to the
            notebook-level ``thrones2vec`` model.

    Returns:
        The top-ranked word returned by ``most_similar_cosmul`` with
        ``positive=[end2, start1]`` and ``negative=[end1]``.
    """
    if model is None:
        model = thrones2vec
    # Go through the KeyedVectors (`.wv`) rather than the deprecated model-level shortcut.
    similarities = model.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    start2 = similarities[0][0]  # first entry is a (word, score) pair
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(
        start1=start1, end1=end1, start2=start2, end2=end2))
    return start2

In [None]:
# Analogy probes: each call prints "<start1> is related to <end1>, as <answer> is related to <end2>".
nearest_similarity_cosmul("Stark", "Winterfell", "Martell") # Leader
nearest_similarity_cosmul("Stark", "Winterfell", "Bolton")  # Heads of families
nearest_similarity_cosmul("Arya", "Horseface", "Daenerys") # Nicknames
nearest_similarity_cosmul("Stark", "Winterfell", "Riverrun") # Leaders
nearest_similarity_cosmul("Jaime", "sword", "wine")
nearest_similarity_cosmul("Tyrion", "wine", "dragons")
nearest_similarity_cosmul("Arya", "Nymeria", "dragons") # Mystic/extinct creatures
nearest_similarity_cosmul("Snow", "Jon", "Ellaria") # Bastards in the North / Dorne

In [None]:
# Odd-one-out queries: ask the word vectors which word in each list does not go with the others.
thrones2vec.wv.doesnt_match("Tyrion Daenerys Gendry Bran Jon".split())  # Iron Throne contenders

In [None]:
thrones2vec.wv.doesnt_match("Jaime Cersei Robert".split())

In [None]:
thrones2vec.wv.doesnt_match("Ramsay Jon Ellaria ".split())  # Snow