## Tutorial on how to use word2vec in gensim

In [30]:
from gensim.models import doc2vec
import logging

### Preparing the input

In [3]:
# import modules & set up logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentences = [['first', 'sentence'], ['second', 'sentence']]
# train word2vec on the two sentences
model = gensim.models.Word2Vec(sentences, min_count=1)

In [None]:
# for different texts if input stream is non-repeatable
# model = gensim.models.Word2Vec() # an empty model, no training
# model.build_vocab(some_sentences)  # can be a non-repeatable, 1-pass generator
# model.train(other_sentences)  # can be a non-repeatable, 1-pass generator

### Training

In [6]:
# prune vocab dictionary by ignoring very rare words
#model = gensim.models.Word2Vec(sentences, min_count=10)

### Using Model

In [None]:
#model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
#[('queen', 0.50882536)]
# model.doesnt_match("breakfast cereal dinner lunch".split())
#'cereal'
# model.similarity('woman', 'man')
#0.73723527
# model['computer']  # raw NumPy vector of a word
# array([-0.00449447, -0.00310097,  0.02421786, ...], dtype=float32)

### Load Data

In [7]:
from os import listdir
from os.path import isfile, join

In [8]:
listdir("TextCorpus")

['.DS_Store',
 'Berlin1.txt',
 'Berlin2.txt',
 'Berlin3.txt',
 'Berlin4.txt',
 'Berlin5.txt',
 'NewYork1.txt',
 'NewYork2.txt',
 'NewYork3.txt',
 'NewYork4.txt',
 'NewYork5.txt']

In [36]:
textLabels = []
textLabels = [f for f in listdir("TextCorpus") if f.endswith('.txt')]
print textLabels

['Berlin1.txt', 'Berlin2.txt', 'Berlin3.txt', 'Berlin4.txt', 'Berlin5.txt', 'NewYork1.txt', 'NewYork2.txt', 'NewYork3.txt', 'NewYork4.txt', 'NewYork5.txt']


In [59]:
data = []
for text in textLabels:
    f = open("TextCorpus/" + text, 'r')
    data.append(f.read())

### Preparing the data for Gensim Doc2vec

In [77]:
class LabeledLineSentence(object):
    def __init__(self, text_list, labels_list):
        self.labels_list = labels_list
        self.text_list = text_list
    def __iter__(self):
        for idx, text in enumerate(self.text_list):
            yield doc2vec.LabeledSentence(text.split(), [self.labels_list[idx]])

### Training the model

In [78]:
Sentences = LabeledLineSentence(data, textLabels)

In [109]:
# use fixed learning rate
model = gensim.models.Doc2Vec(size=40, window=10, min_count=5, workers=11,alpha=0.025, min_alpha=0.025)

In [110]:
model.build_vocab(Sentences)

In [111]:
for _ in range(10):
    model.train(Sentences)
    model.alpha -= 0.002 # decreasing the learning rate
    model.min_alpha = model.alpha
    model.train(Sentences)

In [112]:
model.save("doc2vec.model")

In [113]:
print model.most_similar("New")

[('City', 0.8016244769096375), ('at', 0.5614316463470459), ('area', 0.5085005164146423), ('their', 0.49119099974632263), ('States.', 0.46879851818084717), ('state.', 0.4626597464084625), ('C', 0.44130706787109375), ('with', 0.40370967984199524), ('or', 0.3933374285697937), ('than', 0.384970486164093)]


In [114]:
print model.most_similar("the")

[('Berlin.', 0.8318343162536621), ('C', 0.7524179220199585), ('an', 0.709546685218811), ('their', 0.6911014318466187), ('were', 0.6829645037651062), ('The', 0.6490375399589539), ('all', 0.6445956230163574), ('Germany,', 0.640369713306427), ('than', 0.5894448757171631), ('Battle', 0.5605651140213013)]


In [115]:
print model.most_similar("Berlin")

[('Germany', 0.8650304079055786), ('West', 0.762969970703125), ('East', 0.7270813584327698), ('but', 0.6876924633979797), ('Republic', 0.6717215180397034), ('Germany,', 0.5637524127960205), ('as', 0.5467364192008972), ('a', 0.5360344052314758), ('its', 0.5218937397003174), ('Berlin.', 0.5196490287780762)]


In [116]:
print model.most_similar('Hudson')

[('only', 0.6900793313980103), ('most', 0.6829732656478882), ('at', 0.6787878274917603), ('largest', 0.659099817276001), ('Great', 0.6515200138092041), ('during', 0.6449869871139526), ('United', 0.6372557282447815), ('be', 0.6264005899429321), ('on', 0.6208404898643494), ('Iroquois', 0.5854104161262512)]


In [117]:
print model["Berlin"]

[-0.02410355  0.01731708  0.03344956  0.00445156 -0.03585097 -0.10849235
 -0.09031273 -0.14000477 -0.04865702 -0.00177307 -0.06984299  0.09222768
 -0.01337598  0.09027207 -0.00017309 -0.05016653  0.10461748 -0.00132738
  0.00335197  0.04781001  0.0302741   0.07161038  0.03542482  0.07597228
  0.03054213  0.04453614  0.05662571 -0.06729747  0.06246189 -0.04098909
 -0.0585022  -0.10646819  0.06129624  0.0489495  -0.12531349 -0.1486468
  0.05472993 -0.03053526 -0.0924937  -0.05409999]
