In [1]:
import codecs #unicode encoding
import glob #for reading files
import multiprocessing #to run model faster
import os #access the filesystem
import re # to run regular expressions

In [2]:
import nltk 
import gensim.models.word2vec as w2v
import numpy as np

In [3]:
# Collect the paths of all .txt files directly under data/.
# sorted() puts the paths in lexicographic order, so the books are always
# processed in the same sequence.

book_filenames = sorted(glob.glob("data/*.txt"))

In [4]:
# Display the matched paths to confirm which books were found.
book_filenames

['data/got1.txt',
 'data/got2.txt',
 'data/got3.txt',
 'data/got4.txt',
 'data/got5.txt']

In [5]:
# Combine all books into one unicode corpus string.
#
# The open file handle is bound to its own name (`book_file`). Binding it to
# `book_filenames` would replace the list of paths with a closed file object
# once the loop finishes, so re-running this cell (or using the list anywhere
# later) would fail.

corpus_raw = u""  # reset on every run so re-executing the cell does not append twice

for book in book_filenames:
    # codecs.open decodes the file as UTF-8; the `with` block closes the handle.
    with codecs.open(book, "r", "utf-8") as book_file:
        corpus_raw += book_file.read()
    # Running total of characters read so far, printed once per book.
    print("Corpus length{0}".format(len(corpus_raw)))

Corpus length1770659
Corpus length4071041
Corpus length6391405
Corpus length8107945
Corpus length9719485


In [6]:
# Download the nltk data packages "punkt" and "stopwords" into the local
# nltk_data directory.
# NOTE(review): "stopwords" is not referenced in the visible cells -- confirm
# whether it is still needed.

nltk.download("punkt")
nltk.download("stopwords")


[nltk_data] Downloading package punkt to /Users/rushalip/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rushalip/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# Split text into sentences.

# Load the pickled English tokenizer from the "punkt" nltk resource
# (fetched by the nltk.download("punkt") call in this notebook).
# NOTE(review): newer NLTK releases may distribute this resource in a
# different format ("punkt_tab") -- confirm against the installed version.
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

In [8]:
# Split the whole corpus into sentences with the loaded tokenizer.
# The result is kept as raw (uncleaned) sentence strings.

raw_sentences = tokenizer.tokenize(corpus_raw)


In [9]:
# Number of sentences the tokenizer produced.
len(raw_sentences)

128868

In [12]:
# Spot-check one raw sentence (index chosen arbitrarily) before cleaning.
raw_sentences[12886]

'Yet Theon had always called him uncle nonetheless.'

In [13]:
#cleaning data

def sentence_to_wordlist(raw):
    """Break a raw sentence into a list of purely alphabetic words.

    Every character that is not an ASCII letter (digits, punctuation,
    whitespace, non-ASCII characters) is replaced by a space, and the
    result is split on whitespace. Letter case is preserved.

    Args:
        raw: The sentence text to split.

    Returns:
        A list of word strings; empty when ``raw`` contains no ASCII letters.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", raw)
    return letters_only.split()

In [14]:
# Turn each non-empty raw sentence into its list of words.

sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

In [15]:
# Spot-check one cleaned sentence: it should be a list of word strings.
sentences[10]

['It', 'was', 'here', 'the', 'ravens', 'came', 'after', 'long', 'flight']

In [16]:
# Configure the Word2Vec model (no training happens here; no corpus is passed).
# Parameter meanings below follow the gensim Word2Vec documentation.
# NOTE(review): the `size` keyword matches older gensim releases; newer ones
# name it `vector_size` -- confirm the gensim version this notebook targets.

word2vec = w2v.Word2Vec(
    sg=1,  # 1 selects the skip-gram training algorithm
    seed=1,  # fixed seed for the random number generator
    # NOTE(review): with more than one worker thread, runs are presumably not
    # fully reproducible despite the seed -- verify if determinism matters.
    workers= multiprocessing.cpu_count(),  # one worker thread per CPU reported by the OS
    size=300,  # dimensionality of each word vector
    min_count=3,  # ignore words that occur fewer than 3 times
    window=7,  # maximum distance between the current and the predicted word
    sample=1e-3  # downsampling threshold for very frequent words
)

In [17]:
# Build the model vocabulary from the tokenized sentences; must run before train().
word2vec.build_vocab(sentences)

In [18]:
# Train the model on the tokenized sentences; the cell displays train()'s return value.
# NOTE(review): calling train() without total_examples/epochs matches older
# gensim; newer releases are believed to require them -- confirm the version.
word2vec.train(sentences)

7022305

In [22]:
# Query the trained vectors for the words most similar to "Khaleesi";
# the result is a list of (word, similarity score) pairs.
word2vec.wv.most_similar("Khaleesi")

[('Dragons', 0.7592031955718994),
 ('Missandei', 0.7435500621795654),
 ('khaleesi', 0.7377284169197083),
 ('Magnificence', 0.7320506572723389),
 ('maegi', 0.7285397052764893),
 ('Blue', 0.7243928909301758),
 ('Pyat', 0.724011242389679),
 ('Galazza', 0.7201337218284607),
 ('Hero', 0.7174426317214966),
 ('Jhogo', 0.7171635627746582)]

In [29]:
# Analogy-style query: words close to "Jaime" and "sword" but far from "hand".
# The result is a list of (word, score) pairs.
# NOTE(review): per the gensim docs this variant combines similarities
# multiplicatively rather than additively -- confirm if the scoring matters.
word2vec.wv.most_similar_cosmul(positive=["Jaime","sword"], negative=["hand"])

[('Lannister', 0.9097367525100708),
 ('Kevan', 0.9084922075271606),
 ('Kingslayer', 0.9070748090744019),
 ('Ilyn', 0.8966287970542908),
 ('Mandon', 0.8959121108055115),
 ('Gregor', 0.8896611928939819),
 ('Bronn', 0.8773239850997925),
 ('Addam', 0.8747161626815796),
 ('Thorne', 0.8745303750038147),
 ('Loras', 0.8678922653198242)]