The code in this notebook is based on [the following example](http://sujitpal.blogspot.nl/2016/04/predicting-movie-tags-from-plots-using.html).

In [13]:
import gensim
import json

In [80]:
def read_corpus(fname, tokens_only=False):
    """Yield the documents stored in ``../data_resources/topics/<fname>.json``.

    The JSON file must contain an iterable of objects; each object needs a
    ``content`` entry and, unless ``tokens_only`` is set, a ``sub_topic`` entry.

    Args:
        fname: Base name of the JSON file, without directory or ``.json`` suffix.
        tokens_only: When true, yield bare token lists. Otherwise yield
            ``TaggedDocument`` objects whose single tag is the ``sub_topic``.

    Yields:
        The output of ``gensim.utils.simple_preprocess`` for each object's
        ``content``, optionally wrapped in a ``TaggedDocument``.
    """
    path = '../data_resources/topics/{}.json'.format(fname)

    # The context manager closes the handle as soon as parsing is done, and the
    # explicit UTF-8 keeps non-ASCII text independent of the platform default.
    with open(path, encoding='utf-8') as handle:
        data = json.load(handle)

    for obj in data:
        tokens = gensim.utils.simple_preprocess(obj['content'])
        if tokens_only:
            yield tokens
        else:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [obj['sub_topic']])

In [81]:
# Materialise the corpus as lists of TaggedDocument (tokens_only defaults to False).
# NOTE(review): test_corpus is read from the same file as train_corpus, so the
# evaluation further down scores documents the model was trained on — confirm
# whether a separate held-out file was intended.
train_corpus = list(read_corpus('vng_training_cleaned_no_news'))
test_corpus = list(read_corpus('vng_training_cleaned_no_news'))

In [82]:
train_corpus[:1]

[TaggedDocument(words=['schulden', 'en', 'armoede', 'staan', 'volop', 'in', 'de', 'schijnwerpers', 'vandaag', 'praat', 'de', 'tweede', 'kamer', 'over', 'de', 'aanpak', 'van', 'problematische', 'schulden', 'wethouder', 'arjan', 'vliegenthart', 'neemt', 'namens', 'de', 'vng', 'deel', 'aan', 'het', 'met', 'de', 'vaste', 'kamercommissie', 'szw', 'hiervoor', 'dienden', 'vng', 'en', 'divosa', 'samen', 'een', 'position', 'paper', 'in', 'met', 'het', 'position', 'paper', 'roepen', 'gemeenten', 'de', 'tweede', 'kamer', 'en', 'het', 'kabinet', 'op', 'om', 'concreet', 'acht', 'punten', 'aan', 'te', 'pakken', 'deze', 'punten', 'zijn', 'om', 'de', 'gemeentelijke', 'ambitie', 'waar', 'te', 'kunnen', 'maken', 'wat', 'is', 'er', 'concreet', 'nodig', 'zorg', 'voor', 'financiële', 'educatie', 'in', 'het', 'onderwijs', 'neem', 'belemmeringen', 'privacy', 'en', 'weg', 'om', 'outreachende', 'integrale', 'te', 'kunnen', 'bieden', 'realiseer', 'sociale', 'incasso', 'en', 'pas', 'de', 'bijzondere', 'en', 'de'

In [83]:
test_corpus[:1]

[TaggedDocument(words=['schulden', 'en', 'armoede', 'staan', 'volop', 'in', 'de', 'schijnwerpers', 'vandaag', 'praat', 'de', 'tweede', 'kamer', 'over', 'de', 'aanpak', 'van', 'problematische', 'schulden', 'wethouder', 'arjan', 'vliegenthart', 'neemt', 'namens', 'de', 'vng', 'deel', 'aan', 'het', 'met', 'de', 'vaste', 'kamercommissie', 'szw', 'hiervoor', 'dienden', 'vng', 'en', 'divosa', 'samen', 'een', 'position', 'paper', 'in', 'met', 'het', 'position', 'paper', 'roepen', 'gemeenten', 'de', 'tweede', 'kamer', 'en', 'het', 'kabinet', 'op', 'om', 'concreet', 'acht', 'punten', 'aan', 'te', 'pakken', 'deze', 'punten', 'zijn', 'om', 'de', 'gemeentelijke', 'ambitie', 'waar', 'te', 'kunnen', 'maken', 'wat', 'is', 'er', 'concreet', 'nodig', 'zorg', 'voor', 'financiële', 'educatie', 'in', 'het', 'onderwijs', 'neem', 'belemmeringen', 'privacy', 'en', 'weg', 'om', 'outreachende', 'integrale', 'te', 'kunnen', 'bieden', 'realiseer', 'sociale', 'incasso', 'en', 'pas', 'de', 'bijzondere', 'en', 'de'

In [84]:
# Create an untrained Doc2Vec model; vocabulary building and training happen in later cells.
# NOTE(review): `size` and `iter` appear to be the older gensim keyword names
# (later releases use `vector_size` and `epochs`) — confirm against the installed
# gensim version before renaming, since the training cell reads `model.iter`.
# NOTE(review): no `seed` is passed, so results may differ between runs — confirm if reproducibility matters.
model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=2, iter=55)

In [85]:
# Build the model's vocabulary from the training corpus; this runs before model.train.
model.build_vocab(train_corpus)

In [86]:
# Train on the tagged corpus using the document count and pass count stored on
# the model; %time reports how long the call takes.
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

CPU times: user 1min 7s, sys: 889 ms, total: 1min 8s
Wall time: 26.3 s


25519793

In [87]:
def tokenize_text(text):
    """Split ``text`` into lower-cased word tokens using NLTK.

    The text is split into sentences first and each sentence into words.
    Tokens shorter than two characters are dropped.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of lower-cased tokens in document order.
    """
    # Imported locally: the visible notebook cells never import nltk, so a
    # module-level reference would raise NameError on a fresh kernel.
    import nltk

    return [
        word.lower()
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
        if len(word) >= 2
    ]
    
def tokenize_tags(label):
    """Split a ``::``-separated label string into marked tags.

    Args:
        label: Tag names joined by ``"::"``.

    Returns:
        A list with ``mark_tag`` applied to each tag name, in order. A real
        list is returned (not a lazy ``map`` object) so the result can be
        indexed, measured with ``len`` and iterated more than once.
    """
    return [mark_tag(tok) for tok in label.split("::")]

def jaccard_similarity(labels, preds):
    """Return the Jaccard similarity between two collections of tags.

    Both inputs are converted to sets, so duplicates and ordering are ignored.

    Args:
        labels: Iterable of the actual tags.
        preds: Iterable of the predicted tags.

    Returns:
        ``|labels ∩ preds| / |labels ∪ preds|`` as a float in ``[0, 1]``.
        When both inputs are empty the result is 1.0, because two empty sets
        are identical.
    """
    lset = set(labels)
    pset = set(preds)
    union = lset | pset
    if not union:
        # Both sets are empty: avoid 0/0 and use the conventional value.
        return 1.0
    return len(lset & pset) / len(union)

def mark_tag(s):
    """Turn a tag name into its marked form.

    Every space becomes an underscore and the result is prefixed with one
    extra underscore, e.g. ``"sci fi"`` -> ``"_sci_fi"``.
    """
    pieces = s.split(" ")
    return "_" + "_".join(pieces)
    
def unmark_tag(s):
    """Turn a marked tag back into a readable name.

    The first character is dropped unconditionally and every remaining
    underscore becomes a space, e.g. ``"_sci_fi"`` -> ``"sci fi"``. Underscores
    that were part of the original tag name also come back as spaces.
    """
    without_marker = s[1:]
    return " ".join(without_marker.split("_"))

In [None]:
from random import shuffle
import numpy as np

In [None]:
# evaluate the model
# For every test document: infer a vector from its words, look up the two most
# similar document tags, and score those tag names against the actual tags.
# NOTE(review): read_corpus attaches one tag per document while two predictions
# are compared, so the Jaccard score of a correct prediction is at most 0.5 —
# confirm this is the intended metric.
tot_sim = 0.0
for test_document in test_corpus:
    pred_vec = model.infer_vector(test_document.words)
    pred_tags = model.docvecs.most_similar([pred_vec], topn=2)
    # Each entry's first element is the tag name; only the names are compared.
    sim = jaccard_similarity(test_document.tags, [x[0] for x in pred_tags])
    tot_sim += sim

# An empty test set would otherwise divide by zero when averaging.
if len(test_corpus) > 0:
    print("Average Similarity on Test Set: {}".format(tot_sim / len(test_corpus)))
else:
    print("Average Similarity on Test Set: n/a (test set is empty)")

# Print ten randomly chosen documents from train_corpus together with their
# actual tags and the five most similar tags predicted by the model.
for i in range(10):
    docid = np.random.randint(len(train_corpus))
    sampled_doc = train_corpus[docid]
    pred_vec = model.infer_vector(sampled_doc.words)
    actual_tags = sampled_doc.tags
    pred_tags = model.docvecs.most_similar([pred_vec], topn=5)
    print("Text: {}".format(sampled_doc))
    print("... Actual tags: {}".format(", ".join(actual_tags)))
    print("... Predicted tags: {}".format(pred_tags))