Code is based on [The following example](http://sujitpal.blogspot.nl/2016/04/predicting-movie-tags-from-plots-using.html)

In [84]:
import gensim
import json
from random import shuffle
import numpy as np

In [140]:
def construct_topic_knowledge(fname):
    file = '../data_resources/topics/{}.json'.format(fname)
    data = json.load(open(file))
    
    topics = {}
    
    for obj in data:
        main = obj['main_topic']
        sub = obj['sub_topic']
        
        if not main in topics:
            topics[main] = []
        
        if not sub in topics[main]:
            topics[main].append(sub)
    
    return topics
        

def read_corpus(fname, tokens_only=False, topic_type='sub_topic'):
    file = '../data_resources/topics/{}.json'.format(fname)
    data = json.load(open(file))
    
    for obj in data:
        if tokens_only:
            yield gensim.utils.simple_preprocess(obj['content'])
        else:
            yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(obj['content']), [obj[topic_type]])
            
    
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens
    
def tokenize_tags(label):
    tags = label.split("::")
    tags = map(lambda tok: mark_tag(tok), tags)
    return tags

def jaccard_similarity(labels, preds):
    lset = set(labels)
    pset = set(preds)
    return len(lset.intersection(pset)) / len(lset.union(pset))


In [141]:
topics = construct_topic_knowledge('vng_training_cleaned_no_news')

In [142]:
corpus_sub = list(read_corpus('vng_training_cleaned_no_news'))
corpus_main = list(read_corpus('vng_training_cleaned_no_news', topic_type='main_topic'))

In [190]:
split = int(0.9*len(corpus))

train_corpus_sub = corpus_sub[0:split]
train_corpus_main = corpus_main[0:split]

test_corpus_sub = corpus_sub[split:]
test_corpus_main = corpus_main[split:]

In [191]:
model_sub = gensim.models.doc2vec.Doc2Vec(size=49, min_count=2, iter=100)
model_sub.build_vocab(train_corpus_sub)
%time model_sub.train(train_corpus_sub, total_examples=model_sub.corpus_count, epochs=model_sub.iter)

CPU times: user 2min 5s, sys: 1.69 s, total: 2min 7s
Wall time: 48.8 s


42023578

In [192]:
model_main = gensim.models.doc2vec.Doc2Vec(size=51, min_count=2, iter=100)
model_main.build_vocab(train_corpus_main)
%time  model_main.train(train_corpus_main, total_examples=model_main.corpus_count, epochs=model_main.iter)

CPU times: user 2min 8s, sys: 1.67 s, total: 2min 10s
Wall time: 49 s


42023811

In [269]:
def top_tags(model_main, model_sub, doc, topics_knowledge, top_n=1):
    
    pred_vec_main = model_main.infer_vector(doc.words)
    pred_tags_main = model_main.docvecs.most_similar([pred_vec_main], topn=15)

    pred_vec_sub = model_sub.infer_vector(doc.words)
    pred_tags_sub = model_sub.docvecs.most_similar([pred_vec_sub], topn=5)
    
    final_sub_tags = []
    
    for pred_tag_sub in pred_tags_sub:
        max_score = pred_tag_sub[1]
        
        for pred_tag_main in pred_tags_main:
            
            if pred_tag_sub[0] in topics_knowledge[pred_tag_main[0]]:
                
                score = max(max_score, pred_tag_sub[1] + pred_tag_main[1])       
                
        final_sub_tags.append((pred_tag_sub[0], max_score))

    while len(final_sub_tags) > top_n:
        min_certainty = 999
        min_certainty_tag = None

        for tag in final_sub_tags:
            if tag[1] < min_certainty:
                min_certainty = tag[1]
                min_certainty_tag = tag

        final_sub_tags.remove(tag)

    return final_sub_tags

In [270]:
def evaluate_dual_model(model_main, model_sub, corpus_test, topics_knowledge):
    tot_sim = 0
    for doc in corpus_test:
        pred_tags = top_tags(model_main, model_sub, doc, topics_knowledge, top_n=1)
        
        sim = jaccard_similarity(doc.tags, [x[0] for x in pred_tags])
        tot_sim += sim
    print("Average Similarity on Test Set: {}".format(tot_sim / len(corpus_test)))

In [271]:
evaluate_dual_model(model_main, model_sub, test_corpus_sub, topics)

Average Similarity on Test Set: 0.21172638436482086


In [265]:
# SIMPLE DOCTAG2VEC MODEL WORKS BETTER THAN COMBINING MAIN TOPICS WITH SUB TOPICS.

def evaluate_model(model, corpus):
    tot_sim = 0
    for doc in corpus:
        pred_vec = model.infer_vector(doc.words)
        pred_tags = model.docvecs.most_similar([pred_vec], topn=1)
        sim = jaccard_similarity(doc.tags, [x[0] for x in pred_tags])
        tot_sim += sim
    print("Average Similarity on Test Set: {}".format(tot_sim / len(corpus)))
    
evaluate_model(model_sub, test_corpus_sub)

Average Similarity on Test Set: 0.21498371335504887


In [34]:
# print out random test result
for i in range(10):
    docid = np.random.randint(len(train_corpus))
    pred_vec = model.infer_vector(train_corpus[docid].words)
#     actual_tags = map(lambda x: unmark_tag(x), train_corpus[docid].tags)
    actual_tags = train_corpus[docid].tags
    pred_tags = model.docvecs.most_similar([pred_vec], topn=5)
    
    print("Actual tags: {}".format(", ".join(actual_tags)))
    print("Predicted tags: {}".format(pred_tags))
    print('--')

Actual tags: huisvestingswet
Predicted tags: [('huisvestingswet', 0.8789634704589844), ('overheidsaansprakelijkheid', 0.8708899021148682), ('landbouw-en-veehouderij', 0.8487333059310913), ('luchtkwaliteit', 0.8310826420783997), ('grondbeleidgrondzaken', 0.8105636835098267)]
--
Actual tags: isd-informatievoorziening-sociaal-domein
Predicted tags: [('overheidsaansprakelijkheid', 0.7986610531806946), ('landbouw-en-veehouderij', 0.7889240384101868), ('winkeltijdenwet', 0.7824044823646545), ('wet-openbaarheid-van-bestuur-wob', 0.7823542952537537), ('awb', 0.7720432877540588)]
--
Actual tags: energie-en-klimaat
Predicted tags: [('bevolkingsdaling', 0.7620388865470886), ('leegstand-en-herbestemming', 0.7427639365196228), ('leegstand-transformatie-en-herbestemming', 0.7198410630226135), ('luchtkwaliteit', 0.7092674970626831), ('gemeenten-kapitaalmarkt', 0.7028281688690186)]
--
Actual tags: asielbeleid-en-integratie
Predicted tags: [('overheidsaansprakelijkheid', 0.8651185035705566), ('kernbele