# Эксперимент по построению третьего уровня иерархии

Модель построена на данных "Демо-корпус + Лента.ру", предлагается как родительская модель верхнего уровня иерархии, хотя бы для первых экспериментов построения следующих уровней иерархии.  
  
Данные для построения модели можно взять в общей папке с данными на Google Drive: архив 1lvl.zip

Рубрикатор верхнего уровня на 10 фиксированных категорий составляла Таснима Садекова (str12.01.94@gmail.com).  
  
Эксперименты по подбору гиперпараметров и измерению качества модели первого уровня (главным образом, точности классификации новостей с помощью модели) проводил Александр Романенко (angriff07@gmail.com).  

Эксперименты по подбору гиперпараметров и измерению качества модели второго и третьего уровней (главным уровнем, визуального восприятия матрицы связей psi, когерентности и перплексии тем) проводил Артём Попов (artmes-07@mail.ru).

Ниже представлен код самой модели, топ-слова, графики качества...

In [1]:
%matplotlib inline  
import time
import os
import codecs
import shutil
import sys
import numpy as np
import re
import glob
import sklearn.metrics

import artm
lc = artm.messages.ConfigureLoggingArgs()
lc.log_dir=r'C:/Users/Vlad/Documents/BigARTM_logs'
lib = artm.wrapper.LibArtm(logging_config=lc)

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib.ticker as ticker
from matplotlib import rc
from matplotlib.backends.backend_pdf import PdfPages
%matplotlib inline

sys.path.append('C:/Users/Vlad/Documents/Python_work/tm_nlp/tools')
import helpers 
import Collection
import ngrammer
reload(helpers)
reload(Collection)
reload(ngrammer)

import pickle

In [2]:
data_train_path = 'C:/Users/Vlad/Documents/Python_work/Mediaplaning/data_mediaplan/1lvl/trn/vw/data.vw.txt'
#batches_train_path = '../batches/trn'
batches_train_path = 'C:/Users/Vlad/Documents/Python_work/Mediaplaning/data_mediaplan/news_trn_vocab/batches/trn'

In [3]:
if len(glob.glob(batches_train_path + "/*.batch")) < 1:
    batch_vectorizer_train = artm.BatchVectorizer(data_path=data_train_path + '', collection_name='',
                                            data_format='vowpal_wabbit', batch_size = 1000, 
                                            target_folder=batches_train_path)
else:
    batch_vectorizer_train = artm.BatchVectorizer(data_path=batches_train_path, 
                                            data_format='batches')

In [4]:
my_dictionary = artm.Dictionary()

if len(glob.glob(batches_train_path + "/*.dict")) < 1:
    my_dictionary.gather(data_path=batches_train_path, vocab_file_path=data_train_path + '/vocab.mediaplanning.txt')
    my_dictionary.save(dictionary_path=batches_train_path + '/mediaplanning_dictionary')

my_dictionary.load(dictionary_path=batches_train_path + '/mediaplanning_dictionary.dict')

In [5]:
data_test_path = 'C:/Users/Vlad/Documents/Python_work/Mediaplaning/data_mediaplan/1lvl/tst/vw/data.vw.txt'
batches_test_path = 'C:/Users/Vlad/Documents/Python_work/Mediaplaning/data_mediaplan/news_trn_vocab/batches/tst'

In [6]:
if len(glob.glob(batches_test_path + "/*.batch")) < 1:
    batch_vectorizer_test = artm.BatchVectorizer(data_path=data_test_path + '', collection_name='',
                                            data_format='vowpal_wabbit', batch_size = 1000, 
                                            target_folder=batches_test_path)
else:
    batch_vectorizer_test = artm.BatchVectorizer(data_path=batches_test_path, 
                                            data_format='batches')

In [7]:
dictionary_ppmi = artm.Dictionary()
dictionary_ppmi.gather(
    data_path=batches_train_path,
    cooc_file_path = "C:/Users/Vlad/Documents/Python_work/Mediaplaning/data_mediaplan/1lvl/data/ppmi/ppmi.txt",
    vocab_file_path='C:/Users/Vlad/Documents/Python_work/Mediaplaning/data_mediaplan/1lvl/data/trn/UCI/vocab.news.txt',
    symmetric_cooc_values=True)

In [8]:
answers_test = helpers.loadFileAsStringArray('C:/Users/Vlad/Documents/Python_work/Mediaplaning/data_mediaplan/1lvl/labels/news.1lvlcat.tst.txt', True)
id_test = helpers.loadFileAsStringArray('C:/Users/Vlad/Documents/Python_work/Mediaplaning/data_mediaplan/1lvl/labels/news.id.tst.txt', True)
answers_train = helpers.loadFileAsStringArray('C:/Users/Vlad/Documents/Python_work/Mediaplaning/data_mediaplan/1lvl/labels/news.1lvlcat.trn.txt', True)
id_train = helpers.loadFileAsStringArray('C:/Users/Vlad/Documents/Python_work/Mediaplaning/data_mediaplan/1lvl/labels/news.id.trn.txt', True)

id2answer_tst = dict(zip(id_test,answers_test))
id2answer_trn = dict(zip(id_train,answers_train))
print 'Lengthes:',len(id2answer_tst),len(id2answer_trn)

Lengthes: 1000 28921


In [9]:
lvl1cats = sorted(list(set(id2answer_tst.values())))
for cat in lvl1cats:
    print cat,

бизнес культура медиа наука_и_техника общество_и_путешествия политика происшествия силовые_структуры спорт экономика_и_финансы


In [10]:
hartm = artm.hARTM()

## первый уровень

Код повторяет один из экспериментов Александра Романенко.

In [11]:
category2number = dict(zip(lvl1cats,range(len(lvl1cats))))

In [12]:
category_matrix = []
lenta_doc_ids = []
topic_num = 11
for idx,cat in id2answer_trn.iteritems():    
    if category2number.has_key(cat):
#         lenta_doc_ids += [idx]
        temp_list = [1] * topic_num
        # Здесь можно либо 0, либо -1
        temp_list[category2number[cat]] = -100
    else:
        temp_list = [0] * topic_num
    for x in range(10,topic_num):
        temp_list[x] = -1000
    lenta_doc_ids += [idx]
    category_matrix += [temp_list]


In [13]:
subj_num_1 = len(lvl1cats)
bckgrnd_num_1 = 2

topics_names_lvl1 = [u'lvl1_'+ x for x in lvl1cats + ['background']]

subj_topics = topics_names_lvl1[:10]
bckgrnd_topics = topics_names_lvl1[10:]

In [24]:
# моё
subj_num_1 = len(lvl1cats)
bckgrnd_num_1 = 2

topics_names_lvl1 = [u'lvl1_'+ x for x in lvl1cats + ['background1', 'background2']]

subj_topics = topics_names_lvl1[:10]
bckgrnd_topics = topics_names_lvl1[10:]

In [14]:
regularizers_lvl1 = []
regularizers_lvl1 += [artm.SmoothSparsePhiRegularizer(name='SparseCategory', class_ids=['1lvlcat'], tau=0)]
regularizers_lvl1 += [artm.DecorrelatorPhiRegularizer(name='DeccorTextSubj', class_ids=['text_ngramm'], tau=1000000)]
regularizers_lvl1 += [artm.SmoothSparseThetaRegularizer(name='SST', tau=-20, doc_titles=lenta_doc_ids,
                                                       doc_topic_coef=category_matrix)]



regularizers_lvl1 += [artm.SmoothSparsePhiRegularizer(name='SmoothPhi', class_ids=['text_ngramm'], 
                                                 topic_names=bckgrnd_topics,
                                                     tau=1)]

regularizers_lvl1 += [artm.SmoothSparseThetaRegularizer(name='SmoothTheta', 
                                                        topic_names=bckgrnd_topics,
                                                        tau=1)]

scores_lvl1 = []
scores_lvl1 += [artm.PerplexityScore(name='Perplexity', dictionary=my_dictionary, class_ids=['text_ngramm'])]
scores_lvl1 += [artm.PerplexityScore(name='Perplexity_with_cat', dictionary=my_dictionary, class_ids=['text_ngramm','1lvlcat'])]
scores_lvl1 += [artm.SparsityPhiScore(name='SparsityPhiText', class_id='text_ngramm')]
scores_lvl1 += [artm.SparsityPhiScore(name='SparsityPhiCategory', class_id='1lvlcat')]
scores_lvl1 += [artm.SparsityThetaScore(name='SparsityTheta')]
scores_lvl1 += [artm.TopicKernelScore(name='TopicKernelText', probability_mass_threshold=0.1, class_id='text_ngramm')]
scores_lvl1 += [artm.TopTokensScore(name='TopTokensText',class_id = 'text_ngramm', num_tokens=50)]
scores_lvl1 += [artm.TopTokensScore(name='TopTokensScoreNgramm',class_id='text_ngramm', 
                               num_tokens=10, dictionary=dictionary_ppmi)]

In [15]:
model_lvl1 = hartm.add_level(num_topics=topic_num,topic_names=topics_names_lvl1)
model_lvl1

artm.ARTM(num_topics=11, num_tokens=None)

In [16]:
for r in regularizers_lvl1:
    model_lvl1.regularizers.add(r) 
for s in scores_lvl1:
    model_lvl1.scores.add(s)
    
model_lvl1.class_ids = {'text_ngramm': 1.0, '1lvlcat':50.0}
model_lvl1.dictionary=my_dictionary
model_lvl1.reuse_theta=True
model_lvl1.cache_theta=True
model_lvl1.num_document_passes=1
model_lvl1.theta_columns_naming=u'title'
model_lvl1.initialize(dictionary=my_dictionary)

In [17]:
model_lvl1.fit_offline(batch_vectorizer=batch_vectorizer_train, num_collection_passes=30)

## второй уровень

In [20]:
# number of subject topics
subj_num_2 = 70
# old background topics from the 1st level
old_bckgrnd_num_2 = bckgrnd_num_1
# new background topics
bckgrnd_num_2 = 1

num_topics_2 = subj_num_2 + old_bckgrnd_num_2 + bckgrnd_num_2
topics_names_lvl2_subj = [u'lvl2_topic_subj_' + unicode(t) for t in range(subj_num_2)]
topics_names_lvl2_old_bckgrnd = [u'lvl2_topic_old_bck_' + unicode(t) for t in range(old_bckgrnd_num_2)]
topics_names_lvl2_bckgrnd = [u'lvl2_topic_bck_' + unicode(t) for t in range(bckgrnd_num_2)]
topics_names_lvl2 = topics_names_lvl2_subj + topics_names_lvl2_bckgrnd + topics_names_lvl2_old_bckgrnd

In [21]:
hartm.del_level(1)
model_lvl2 = hartm.add_level(num_topics=num_topics_2,
                             topic_names=topics_names_lvl2,
                             parent_level_weight=0.1)

In [22]:
# for psi regularizer
doc_topic_coef_1 = [[1] * (subj_num_2 + old_bckgrnd_num_2)  + [0] * bckgrnd_num_2]
doc_topic_coef_2 = [[0] * (subj_num_2 + old_bckgrnd_num_2) + [1] * bckgrnd_num_2]

Для улучшения модели более чётко сформулируем требования к фоновым темам. В этой модели фоновые темы с предыдущего уровня переносятся с помощью регуляризаторов матрицы пси с первого уровня на второй. Используется два регуляризатора: первый задаёт то, что ни у какой темы кроме old_background нет фоновых тем с первого уровня в качестве родителей, второй задаёт то, что у old_background тем на втором уровне в родителях только фоновая тема с первого уровня.

Регуляризатор HierSp контролирует вид матрицы Пси, его влияние проследить сложно, но он очень сильно улучшает модель.



In [23]:
regularizers_lvl2 = []
regularizers_lvl2 += [artm.DecorrelatorPhiRegularizer(name='DeccorText2', class_ids=['text_ngramm'],
                                                     tau=1 * (10 ** 6))]

regularizers_lvl2 += [artm.SmoothSparsePhiRegularizer(name='SparsePhi2_subj', tau=0,
                                                       topic_names=topics_names_lvl2_subj)]

regularizers_lvl2 += [artm.SmoothSparseThetaRegularizer(name='SparseTheta2_subj', tau=0,
                                                       topic_names=topics_names_lvl2_subj)]

regularizers_lvl2 += [artm.SmoothSparsePhiRegularizer(name='SmoothPhi2_bckgrnd', class_ids=['text_ngramm'], 
                                                 topic_names=topics_names_lvl2_bckgrnd, tau=10)]

regularizers_lvl2 += [artm.SmoothSparseThetaRegularizer(name='SmoothTheta2_bckgrnd',  
                                                        topic_names=topics_names_lvl2_bckgrnd,
                                                        tau=10)]

regularizers_lvl2 += [artm.HierarchySparsingThetaRegularizer(name='HierSp', tau=1000.0)]
regularizers_lvl2 += [artm.SmoothSparseThetaRegularizer(name='SparsePsi_bckgrnd1',
                                                        doc_titles=[u'lvl1_background'], 
                                                        doc_topic_coef=doc_topic_coef_1,
                                                        tau=-10 ** 6)]

regularizers_lvl2 += [artm.SmoothSparseThetaRegularizer(name='SparsePsi_bckgrnd2',
                                                          tau=-10 ** 4,
                                                          doc_titles=model_lvl1.topic_names[:-1],
                                                          doc_topic_coef=doc_topic_coef_2)]


scores_lvl2 = []
scores_lvl2 += [artm.PerplexityScore(name='Perplexity', dictionary=my_dictionary, class_ids=['text_ngramm'])]
scores_lvl2 += [artm.SparsityPhiScore(name='SparsityPhiText', class_id='text_ngramm')]
scores_lvl2 += [artm.SparsityThetaScore(name='SparsityTheta')]
scores_lvl2 += [artm.TopicKernelScore(name='TopicKernelText', probability_mass_threshold=0.1, class_id='text_ngramm')]
scores_lvl2 += [artm.TopTokensScore(name='TopTokensText',class_id = 'text_ngramm', num_tokens=50)]
scores_lvl2 += [artm.TopTokensScore(name='TopTokensScoreNgramm',class_id='text_ngramm', 
                               num_tokens=30, dictionary=dictionary_ppmi)]


In [24]:
for r in regularizers_lvl2:
    model_lvl2.regularizers.add(r, overwrite=True)

for s in scores_lvl2:
    model_lvl2.scores.add(s, overwrite=True)

    
model_lvl2.class_ids = {'text_ngramm' : 1.0}
model_lvl2.dictionary=my_dictionary
model_lvl2.reuse_theta=True
model_lvl2.cache_theta=True
model_lvl2.num_document_passes=1
model_lvl2.theta_columns_naming=u'title'
model_lvl2.initialize(dictionary=my_dictionary)

In [25]:
model_lvl2.fit_offline(batch_vectorizer_train, num_collection_passes=50)
psi12 = (hartm.get_level(1).get_psi())

## третий уровень

In [26]:
# number of subject topics
subj_num_3 = 400
# old background topics from the 1st level


old_bckgrnd_num_3 = bckgrnd_num_1 + bckgrnd_num_2
# new background topics
bckgrnd_num_3 = 0

num_topics_3 = subj_num_3 + old_bckgrnd_num_3 + bckgrnd_num_3
topics_names_lvl3_subj = [u'lvl3_topic_subj_' + unicode(t) for t in range(subj_num_3)]
topics_names_lvl3_old_bckgrnd = [u'lvl3_topic_old_bck_' + unicode(t) for t in range(old_bckgrnd_num_3)]
topics_names_lvl3_bckgrnd = [u'lvl3_topic_bck_' + unicode(t) for t in range(bckgrnd_num_3)]
topics_names_lvl3 = topics_names_lvl3_subj + topics_names_lvl3_bckgrnd + topics_names_lvl3_old_bckgrnd

In [27]:
hartm.del_level(2)
model_lvl3 = hartm.add_level(num_topics=num_topics_3,
                             topic_names=topics_names_lvl3,
                             parent_level_weight=0.1)

In [28]:
doc_topic_coef_3_1 = [[1] * (subj_num_3 + bckgrnd_num_3 + bckgrnd_num_2)  + [0] * bckgrnd_num_1]
doc_topic_coef_3_2 = [[0] * (subj_num_3 + bckgrnd_num_3 + bckgrnd_num_2) + [1] * bckgrnd_num_1]
doc_topic_coef_3_3 = [[1] * (subj_num_3 + bckgrnd_num_3) + [0] * bckgrnd_num_2 + [1] * bckgrnd_num_1]
doc_topic_coef_3_4 = [[0] * (subj_num_3 + bckgrnd_num_3) + [1] * bckgrnd_num_2 + [0] * bckgrnd_num_1]

In [29]:
regularizers_lvl3 = []
regularizers_lvl3 += [artm.DecorrelatorPhiRegularizer(name='DeccorText3', class_ids=['text_ngramm'],
                                                     tau=1 * (10 ** 5))]


regularizers_lvl3 += [artm.HierarchySparsingThetaRegularizer(name='HierSp', tau=1000.0)]

regularizers_lvl3 += [artm.SmoothSparseThetaRegularizer(name='SparsePsi23_bckgrnd1',
                                                        doc_titles=[u'lvl2_topic_old_bck_0'], 
                                                        doc_topic_coef=doc_topic_coef_3_1,
                                                        tau=-10 ** 6)]

regularizers_lvl3 += [artm.SmoothSparseThetaRegularizer(name='SparsePsi23_bckgrnd2',
                                                          tau=-10 ** 4,
                                                          doc_titles=model_lvl2.topic_names[:-1],
                                                          doc_topic_coef=doc_topic_coef_3_2)]


regularizers_lvl3 += [artm.SmoothSparseThetaRegularizer(name='SparsePsi23_bckgrnd3',
                                                        doc_titles=[u'lvl2_topic_bck_0'], 
                                                        doc_topic_coef=doc_topic_coef_3_3,
                                                        tau=-10 ** 6)]


regularizers_lvl3 += [artm.SmoothSparseThetaRegularizer(name='SparsePsi23_bckgrnd4',
                                                          tau=-10 ** 4,
                                                          doc_titles=model_lvl2.topic_names[:-2] + [model_lvl2.topic_names[-1]],
                                                          doc_topic_coef=doc_topic_coef_3_4)]


scores_lvl3 = []
scores_lvl3 += [artm.PerplexityScore(name='Perplexity', dictionary=my_dictionary, class_ids=['text_ngramm'])]
scores_lvl3 += [artm.SparsityPhiScore(name='SparsityPhiText', class_id='text_ngramm')]
scores_lvl3 += [artm.SparsityThetaScore(name='SparsityTheta')]
scores_lvl3 += [artm.TopicKernelScore(name='TopicKernelText', probability_mass_threshold=0.1, class_id='text_ngramm')]
scores_lvl3 += [artm.TopTokensScore(name='TopTokensText',class_id = 'text_ngramm', num_tokens=50)]
scores_lvl3 += [artm.TopTokensScore(name='TopTokensScoreNgramm',class_id='text_ngramm', 
                               num_tokens=30, dictionary=dictionary_ppmi)]


In [30]:
for r in regularizers_lvl3:
    model_lvl3.regularizers.add(r, overwrite=True)

for s in scores_lvl3:
    model_lvl3.scores.add(s, overwrite=True)

    
model_lvl3.class_ids = {'text_ngramm' : 1.0}
model_lvl3.dictionary=my_dictionary
model_lvl3.reuse_theta=True
model_lvl3.cache_theta=True
model_lvl3.num_document_passes=1
model_lvl3.theta_columns_naming=u'title'
model_lvl3.initialize(dictionary=my_dictionary)

In [31]:
model_lvl3.fit_offline(batch_vectorizer_train, num_collection_passes=50)
psi23 = (hartm.get_level(2).get_psi())


In [32]:
print 'Perplexity', model_lvl3.score_tracker['Perplexity'].value[-1]
print 'Coherence', model_lvl3.score_tracker['TopTokensScoreNgramm'].average_coherence[-1]

print 'Phi Sparsity', model_lvl3.score_tracker['SparsityPhiText'].value[-1]
print 'Theta Sparsity', model_lvl3.score_tracker['SparsityTheta'].value[-1]

Perplexity 2657.9113066
Coherence 0.0
Phi Sparsity 0.90613152502
Theta Sparsity 0.870452432076


In [33]:
phi = hartm.get_phi(class_ids=['text_ngramm'])
theta = hartm.get_theta()
psi12 = hartm.get_level(1).get_psi()
psi23 = hartm.get_level(2).get_psi()