In [71]:
import re
import nltk
import gensim.models.word2vec as w2v
import pandas as pd
import glob
import codecs
import multiprocessing
import os

In [72]:
# Fetch the NLTK resources used by this notebook. "punkt" provides the
# sentence tokenizer that is loaded from tokenizers/punkt further down.
nltk.download("punkt")
# NOTE(review): no cell visible in this notebook references the stopwords
# corpus -- confirm it is still needed before keeping this download.
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/trigoros/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/trigoros/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [73]:
# Gather the plain-text books stored under ./data. Sorting keeps the reading
# order (and therefore the corpus layout) stable from run to run.
book_filenames = glob.glob("./data/*.txt")
book_filenames.sort()
print("Found books:")
book_filenames

Found books:


['./data/HPBook1.txt',
 './data/HPBook2.txt',
 './data/HPBook3.txt',
 './data/HPBook4.txt',
 './data/HPBook5.txt',
 './data/HPBook6.txt',
 './data/HPBook7.txt']

In [74]:
# Read every book as UTF-8 and glue the texts together into a single corpus
# string, reporting the running character count after each file.
# Books are joined back-to-back with no separator between them.
book_texts = []
corpus_length = 0
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        book_text = book_file.read()
    book_texts.append(book_text)
    corpus_length += len(book_text)
    print("Corpus is now {0} characters long".format(corpus_length))
    print()
corpus_raw = u"".join(book_texts)

Reading './data/HPBook1.txt'...
Corpus is now 474429 characters long

Reading './data/HPBook2.txt'...
Corpus is now 1006137 characters long

Reading './data/HPBook3.txt'...
Corpus is now 1683115 characters long

Reading './data/HPBook4.txt'...
Corpus is now 2870365 characters long

Reading './data/HPBook5.txt'...
Corpus is now 4479128 characters long

Reading './data/HPBook6.txt'...
Corpus is now 5538150 characters long

Reading './data/HPBook7.txt'...
Corpus is now 6765174 characters long



In [75]:
# Load the English Punkt sentence tokenizer from the local NLTK data store
# (the "punkt" package is downloaded earlier in this notebook).
# NOTE(review): this resource is a pickle file; newer NLTK releases are
# reported to distribute Punkt as "punkt_tab" instead -- confirm this path
# still resolves with the installed NLTK version.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [76]:
# Split the whole corpus into sentence strings. Each entry is later turned
# into a word list by sentence_to_wordlist before being fed to Word2Vec.
raw_sentences = tokenizer.tokenize(corpus_raw)

In [77]:
def sentence_to_wordlist(raw):
    """Split a raw sentence into a list of purely alphabetic words.

    Every character that is not an ASCII letter (digits, punctuation,
    hyphens, apostrophes, line breaks) is treated as a word separator, so
    "Dursley's" yields ['Dursley', 's']. Letter case is preserved.

    Args:
        raw: The sentence text to split.

    Returns:
        The words in order of appearance; an empty list when ``raw``
        contains no ASCII letters.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", raw)
    return letters_only.split()

In [78]:
# Tokenize every non-empty raw sentence into its list of words; the result is
# the list-of-word-lists that Word2Vec consumes.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

In [79]:
# Show one sentence before and after word-level tokenization as a sanity check.
sample_sentence = raw_sentences[5]
print(sample_sentence)
print(sentence_to_wordlist(sample_sentence))

The Dursley s had a small son 
called Dudley and in their opinion there was no finer 
boy anywhere.
['The', 'Dursley', 's', 'had', 'a', 'small', 'son', 'called', 'Dudley', 'and', 'in', 'their', 'opinion', 'there', 'was', 'no', 'finer', 'boy', 'anywhere']


In [80]:
# Total number of word tokens across all tokenized sentences.
token_count = sum(map(len, sentences))
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 1,174,677 tokens


Building Vocab and Training Model


In [81]:
# --- Model shape (consumed by the Word2Vec constructor call) ---
# Dimensionality of each word vector (passed as vector_size).
num_features = 300
# Context window size around each word (passed as window).
context_size = 7
# Minimum word frequency threshold (passed as min_count).
min_word_count = 3
# Downsampling threshold for frequent words (passed as sample).
downsampling = 1e-3

# --- Runtime settings ---
# One training worker per CPU core reported by the OS (passed as workers).
num_workers = multiprocessing.cpu_count()
# Seed for the model's random number generator (passed as seed).
# NOTE(review): gensim's documentation states that a fully reproducible run
# also requires a single worker thread -- confirm whether that matters here.
seed = 1

In [82]:
# Configure the Word2Vec model from the hyperparameters defined above.
# No corpus is passed here; the vocabulary is built and the model trained in
# the following cells.
hp2vec = w2v.Word2Vec(
    # Algorithm: sg=1 selects skip-gram per the gensim documentation.
    sg=1,
    # Model shape and vocabulary filtering.
    vector_size=num_features,
    window=context_size,
    min_count=min_word_count,
    sample=downsampling,
    # Runtime settings.
    workers=num_workers,
    seed=seed,
)

In [83]:
# Build the model vocabulary from the tokenized sentences. This must run
# before training, which reads hp2vec.corpus_count.
hp2vec.build_vocab(sentences)

In [84]:
# Size of the model's keyed-vector store, reported as the vocabulary length.
vocab_len = len(hp2vec.wv)
print("Word2Vec vocabulary length:", vocab_len)

Word2Vec vocabulary length: 13105


In [85]:
# Train on the tokenized corpus. The epoch count is read from the model
# instead of being hard-coded, so it cannot drift from the model's own
# configuration. The constructor call passes no epochs argument, so this is
# gensim's default (5), the same value that was previously written literally.
hp2vec.train(sentences, total_examples=hp2vec.corpus_count, epochs=hp2vec.epochs)

(4426164, 5873385)

Save trained model


In [86]:
# Make sure the output directory for trained models exists. exist_ok=True
# makes the call safe to re-run and removes the gap between checking for the
# directory and creating it.
os.makedirs("trained", exist_ok=True)

In [87]:
# Persist the trained model to trained/hp2vec.w2v so it can be reloaded
# without retraining.
hp2vec.save(os.path.join("trained", "hp2vec.w2v"))

Load trained model


In [88]:
# Reload the model from disk, replacing the in-memory one, so the queries
# below run against the saved artifact.
# NOTE(review): gensim model files are pickle-based -- only load files from
# a trusted source.
hp2vec = w2v.Word2Vec.load(os.path.join("trained", "hp2vec.w2v"))

Test the model


In [89]:
# Nearest neighbours of "snap" in the embedding space; the results shown are
# mostly sound and sudden-movement words.
hp2vec.wv.most_similar("snap")

[('shutting', 0.8840312957763672),
 ('squeal', 0.86234050989151),
 ('creak', 0.8495935797691345),
 ('moan', 0.8477634191513062),
 ('stall', 0.8454773426055908),
 ('sob', 0.8436517119407654),
 ('yell', 0.8382592797279358),
 ('pop', 0.8347529768943787),
 ('tinkle', 0.8344127535820007),
 ('swing', 0.834400475025177)]

In [90]:
# Nearest neighbours of "wizard"; "witch" ranks first in the output shown.
hp2vec.wv.most_similar("wizard")

[('witch', 0.7098701000213623),
 ('man', 0.6763317584991455),
 ('thief', 0.6224886178970337),
 ('Marvolo', 0.5958719849586487),
 ('clever', 0.5892096757888794),
 ('Tri', 0.582741916179657),
 ('wandmaker', 0.5817017555236816),
 ('century', 0.5784750580787659),
 ('boy', 0.5772622227668762),
 ('adult', 0.5766698718070984)]

In [91]:
# Nearest neighbours of "Harry".
# NOTE(review): the neighbours shown are mostly capitalized sentence-opening
# words (Tomorrow, Finally, Back, Without) rather than characters -- possibly
# an effect of the case-sensitive tokenization; worth investigating.
hp2vec.wv.most_similar("Harry")

[('Tomorrow', 0.6662760376930237),
 ('Finally', 0.6638698577880859),
 ('Fascinating', 0.6610673666000366),
 ('Mine', 0.6609613299369812),
 ('Tm', 0.6536983847618103),
 ('LOVEGOOD', 0.6529023051261902),
 ('Back', 0.6526535153388977),
 ('Without', 0.6476196646690369),
 ('unprecedented', 0.646226167678833),
 ('Show', 0.6454821825027466)]

In [92]:
# Nearest neighbours of "Potter".
# NOTE(review): the neighbours shown are words from the book titles (Goblet,
# Prisoner, Order, Hallows, ...), presumably because title lines are part of
# the corpus text -- confirm and consider stripping them.
hp2vec.wv.most_similar("Potter")

[('Goblet', 0.7659329175949097),
 ('Prisoner', 0.7628973126411438),
 ('Order', 0.7598740458488464),
 ('Blood', 0.7514839768409729),
 ('Philosophers', 0.7509821057319641),
 ('Hallows', 0.7474702596664429),
 ('Chamber', 0.7467625737190247),
 ('Fire', 0.7454984188079834),
 ('Deathly', 0.7417952418327332),
 ('Half', 0.7298355102539062)]

In [93]:
# Nearest neighbours of "Hermione"; the output mixes other female students
# with adverbs that look like speech tags (miserably, excitedly, ...).
hp2vec.wv.most_similar("Hermione")

[('Lavender', 0.5959439277648926),
 ('Ginny', 0.595808207988739),
 ('Luna', 0.5891612768173218),
 ('Parvati', 0.5884323716163635),
 ('miserably', 0.5865969061851501),
 ('incredulously', 0.5780034065246582),
 ('amazed', 0.576050877571106),
 ('excitedly', 0.5711442232131958),
 ('fearfully', 0.5678232312202454),
 ('Romilda', 0.566021740436554)]

In [94]:
# Nearest neighbours of "Voldemort"; "Wormtail" and "Lord" rank highest in
# the output shown.
hp2vec.wv.most_similar("Voldemort")

[('Wormtail', 0.7454556226730347),
 ('Lord', 0.7206600904464722),
 ('power', 0.6617326736450195),
 ('Sorcerer', 0.6585953831672668),
 ('Gregorovitch', 0.657398521900177),
 ('prophecy', 0.656592845916748),
 ('Frank', 0.6487449407577515),
 ('Wand', 0.6456465125083923),
 ('Elder', 0.6451976299285889),
 ('Bellatrix', 0.6445302367210388)]

In [95]:
# Nearest neighbours of "Dumbledore"; the output is dominated by other
# surnames plus the word "headmaster".
hp2vec.wv.most_similar('Dumbledore')

[('Slughorn', 0.6427043676376343),
 ('Karkaroff', 0.6423910856246948),
 ('Lupin', 0.6403194665908813),
 ('Riddle', 0.6343071460723877),
 ('Quirrell', 0.622565746307373),
 ('headmaster', 0.6222773790359497),
 ('Fudge', 0.6083312034606934),
 ('Dippet', 0.5851658582687378),
 ('Scrimgeour', 0.5850939750671387),
 ('Aberforth', 0.5848838090896606)]

In [96]:
# Nearest neighbours of "James"; "Lily" ranks first, followed by family and
# relationship words (father, godfather, dad) in the output shown.
hp2vec.wv.most_similar('James')

[('Lily', 0.807587742805481),
 ('Peter', 0.7490341663360596),
 ('father', 0.7172917723655701),
 ('godfather', 0.7094799876213074),
 ('Aberforth', 0.7002931237220764),
 ('Morfin', 0.6974421143531799),
 ('hex', 0.6970621943473816),
 ('Regulus', 0.6930758357048035),
 ('Pettigrew', 0.6915147304534912),
 ('dad', 0.6820852756500244)]

In [97]:
# Similarity score between two house names.
hp2vec.wv.similarity('Gryffindor','Ravenclaw')

0.7488101

In [98]:
# Similarity score between two house names; this is the highest of the three
# house pairs queried in this notebook.
hp2vec.wv.similarity('Ravenclaw','Hufflepuff')

0.8418069

In [99]:
# Similarity score between two house names.
hp2vec.wv.similarity('Gryffindor','Slytherin')

0.76888597

In [100]:
# Similarity score between two character names.
hp2vec.wv.similarity('Harry','Hermione')

0.43279967

In [101]:
# Similarity score between two character names; the highest of the four
# "Harry" pairs queried in this notebook.
hp2vec.wv.similarity('Harry','Ron')

0.47812068

In [102]:
# Similarity score between two character names.
hp2vec.wv.similarity('Harry','Ginny')

0.4428412

In [103]:
# Similarity score between two character names.
hp2vec.wv.similarity('Harry','Draco')

0.45599478

In [104]:
# Raw embedding vector for the word "system".
# NOTE(review): this displays the entire vector; slicing the result (for
# example the first ten components) would keep the notebook output short.
hp2vec.wv.get_vector('system')

array([-0.04349536,  0.17675944, -0.00191631, -0.00189263, -0.04998869,
       -0.02312861,  0.12973048,  0.2328556 ,  0.04606979, -0.03963939,
       -0.03859284, -0.08789378,  0.03335472, -0.09642242, -0.07743017,
       -0.04697024, -0.00562109, -0.00777015,  0.00543146, -0.00374775,
       -0.01104846,  0.01855687,  0.06936868, -0.00309733,  0.04061465,
       -0.08505926, -0.04342438, -0.02491045, -0.04676975, -0.10168947,
       -0.0061103 ,  0.03896707, -0.00371508, -0.07508431,  0.02300765,
        0.08184256,  0.10690916, -0.0121919 , -0.05799888, -0.03149782,
       -0.08977556,  0.04436041,  0.05323583, -0.01539325,  0.03791279,
        0.05417796,  0.06638142,  0.0696206 , -0.01274208,  0.06091055,
       -0.00572199, -0.04293516, -0.05032465,  0.00269276, -0.04941019,
        0.0567391 , -0.02937475, -0.05634034,  0.04263777, -0.02708771,
       -0.0372689 ,  0.0063677 , -0.06550132,  0.00414233,  0.0337784 ,
       -0.10559937, -0.01871193,  0.00646665, -0.11498868,  0.09

In [105]:
# Raw embedding vector for the word "human".
# NOTE(review): this displays the entire vector; slicing the result (for
# example the first ten components) would keep the notebook output short.
hp2vec.wv.get_vector('human')

array([-0.08045113,  0.06063795, -0.00505731,  0.0371425 , -0.12732024,
        0.0708539 ,  0.04922074,  0.554285  ,  0.12218071, -0.01879714,
       -0.08789112, -0.06460731,  0.16549183, -0.12124288, -0.20773135,
       -0.05454928, -0.01517518,  0.16176088,  0.19269353, -0.05061263,
        0.1664173 ,  0.03067138,  0.1293357 , -0.04480602,  0.07088   ,
        0.06115659,  0.07420342,  0.09343738,  0.00056983, -0.105098  ,
        0.00916057, -0.02035011, -0.07311337,  0.03089395,  0.16654505,
        0.05730806,  0.16355793,  0.0603353 , -0.08437596,  0.07044517,
        0.00895525,  0.02151919,  0.00616799,  0.0359314 , -0.0920094 ,
        0.01261032,  0.1539332 ,  0.04732376,  0.09538186,  0.11063606,
       -0.04109185, -0.07032701,  0.06811712,  0.14533184, -0.05797526,
        0.20171033, -0.07405247, -0.11806196,  0.04018091, -0.10428611,
       -0.01041579, -0.0551133 ,  0.00644415,  0.05261223, -0.03813637,
       -0.09882693,  0.02096165,  0.06387884,  0.00135209,  0.17

In [106]:
# The model's full embedding array (2-D float32 in the output shown).
# NOTE(review): displaying .shape instead would convey the dimensions without
# printing array contents.
hp2vec.wv.vectors

array([[-0.10706326,  0.14897585,  0.07541998, ...,  0.08790331,
        -0.0688742 , -0.16693042],
       [ 0.13155799,  0.14000912, -0.0445254 , ...,  0.15634803,
        -0.162596  ,  0.01006121],
       [-0.14200254,  0.23447265, -0.04040307, ...,  0.2687296 ,
         0.11906455, -0.01007703],
       ...,
       [-0.04056953,  0.13463387,  0.00691438, ..., -0.0479149 ,
         0.04231695, -0.00899411],
       [-0.03606883,  0.10840391,  0.01783056, ..., -0.03091026,
         0.04272952,  0.01369259],
       [-0.05501752,  0.09110323,  0.02968378, ..., -0.03472082,
         0.02711793,  0.03779325]], dtype=float32)

In [None]:
# Save the model a second time, as word2vec.model in the working directory.
# NOTE(review): the model is already saved to trained/hp2vec.w2v earlier in
# this notebook -- confirm whether this extra copy is needed and whether it
# should live under the "trained" directory as well.
hp2vec.save('word2vec.model')