## Выявление этнических тем в коллекции LiveJournal

#### Автор: Мурат Апишев (great-mel@yandex.ru)

In [1]:
# -*- coding: utf-8 -*-
import artm

import codecs
import os
import glob
import sys

def dynamic_print(str):
    """Write ``str`` to stdout after a carriage return and flush.

    No newline is emitted, so on a terminal successive calls redraw the
    same line (handy for progress messages).
    """
    stream = sys.stdout
    stream.write('\r{}'.format(str))
    stream.flush()
    
# Folder holding the BigARTM batch files. It is reused when it already contains
# *.batch files, is the target folder when batches are generated, and is also
# where the dictionary file is loaded from.
batches_folder = 'lj_batches_10k'


In [2]:
# Convert the Vowpal Wabbit file into batches only when no ready-made *.batch
# files are found in `batches_folder`; otherwise reuse the existing batches.
if not (os.path.isdir(batches_folder)
        and glob.glob(os.path.join(batches_folder, '*.batch'))):
    batches = artm.BatchVectorizer(data_path='lj_full.vw.txt',
                                   data_format='vowpal_wabbit',
                                   batch_size=10000,
                                   target_folder=batches_folder)
else:
    batches = artm.BatchVectorizer(data_path=batches_folder)
    

In [3]:
def print_scores(model):
    for topic_name in model.topic_names:
        print '{}: '.format(topic_name),
        for elem in model.score_tracker['TopTokensAll'].last_topic_info[topic_name].tokens:
            print u'{}, '.format(elem),
        print '\n'
    
    print 'Sparsity Phi: {}'.format(model.score_tracker['SparsityPhiAll'].last_value)
    print 'Sparsity Theta: {}'.format(model.score_tracker['SparsityThetaEthnic'].last_value)
    print 'Topic Mass E: {}'.format(model.score_tracker['TopicMassPhiEthnic'].last_value)
    print 'Topic Mass G: {}'.format(model.score_tracker['TopicMassPhiGeneral'].last_value)


def save_scores(model, filename='results_file.txt',
                ethnic_topic_names=None, general_topic_names=None):
    """Write top tokens, sparsity values and topic masses to a UTF-8 file.

    The file contains, in order: one line of top tokens per topic in
    ``model.topic_names``; the 'SparsityPhiAll' and 'SparsityThetaEthnic'
    values; the 'TopicMassPhiEthnic' total followed by the mass of every
    ethnic topic; the 'TopicMassPhiGeneral' total followed by the mass of
    every general topic.

    Args:
        model: object exposing ``topic_names`` and a ``score_tracker``
            mapping with the five score names listed above.
        filename: path of the file to (over)write.
        ethnic_topic_names: topics listed under "Topic Mass E"; defaults to
            the notebook-level ``e_topic_names``.
        general_topic_names: topics listed under "Topic Mass G"; defaults to
            the notebook-level ``g_topic_names``.

    Raises:
        NameError: if a topic list is omitted and the corresponding
            notebook-level list has not been defined yet.
    """
    # Resolve the topic lists before opening the file, so a missing global
    # fails early instead of leaving a truncated, half-written report.
    if ethnic_topic_names is None:
        ethnic_topic_names = e_topic_names
    if general_topic_names is None:
        general_topic_names = g_topic_names

    tracker = model.score_tracker
    with codecs.open(filename, 'w', 'utf-8') as fout:
        for topic_name in model.topic_names:
            fout.write('{}: '.format(topic_name))
            for token in tracker['TopTokensAll'].last_topic_info[topic_name].tokens:
                fout.write(u'{}, '.format(token))
            fout.write('\n')

        fout.write('Sparsity Phi: {}\n'.format(tracker['SparsityPhiAll'].last_value))
        fout.write('Sparsity Theta: {}\n'.format(tracker['SparsityThetaEthnic'].last_value))

        # (header template, score name, topics to list) for each topic group.
        mass_sections = (
            ('Topic Mass E: {}\n\n', 'TopicMassPhiEthnic', ethnic_topic_names),
            ('Topic Mass G: {}\n\n', 'TopicMassPhiGeneral', general_topic_names),
        )
        for header, score_name, names in mass_sections:
            mass_score = tracker[score_name]
            fout.write(header.format(mass_score.last_value))
            for topic_name in names:
                fout.write('{0}: {1}\n'.format(
                    topic_name, mass_score.last_topic_info[topic_name].topic_mass))


def save_top_tokens_with_weights(model, filename='top_tokens_file.txt'):
    """Write each topic's top tokens with their weights to a UTF-8 file.

    One line per topic in ``model.topic_names``, formatted as
    ``<topic>: <token>: <weight>, <token>: <weight>, ...`` with weights
    rounded to three decimals. Tokens and weights come from the
    'TopTokensAll' entry of ``model.score_tracker``.
    """
    with codecs.open(filename, 'w', 'utf-8') as fout:
        for name in model.topic_names:
            info = model.score_tracker['TopTokensAll'].last_topic_info[name]
            pairs = zip(info.tokens, info.weights)

            fout.write('{}: '.format(name))
            fout.write(u''.join(u'{0}: {1:0.3f}, '.format(tok, wgt)
                                for tok, wgt in pairs))
            fout.write('\n')


In [4]:
# Sizes of the two topic groups of the model.
num_ethnic_topics  = 250
num_general_topics = 150
# Forwarded to model.fit_online() as update_every / num_document_passes.
update_every = 1
num_document_passes = 25

# Topic names carry an e_/g_ prefix so that scores and regularizers can be
# restricted to one group through their topic_names argument.
e_topic_names = ['e_topic_{}'.format(i) for i in xrange(num_ethnic_topics)]
g_topic_names = ['g_topic_{}'.format(i) for i in xrange(num_general_topics)]

# Alternative set-up without modalities (only the default token class):
#model = artm.ARTM(topic_names=e_topic_names + g_topic_names,
#                  cache_theta=False,
#                  class_ids={'@default_class': 1.0},
#                  num_processors=1)

# Use the following code instead of the previous one if you use modalities.
# '@ethnic_class' is weighted 100.0 against 1.0 for '@default_class', and
# cache_theta is enabled here (the single-modality variant above disables it).
model = artm.ARTM(topic_names=e_topic_names + g_topic_names,
                  cache_theta=True,
                  class_ids={'@default_class': 1.0,
                             '@ethnic_class': 100.0})


In [5]:
# Load the dictionary file 'dictionary_ethnic_non_weighted' from the batches
# folder, register it under the name 'dictionary', and initialize the model
# from it. The same name is passed to some Phi regularizers.
model.load_dictionary(dictionary_name='dictionary',
                      dictionary_path=os.path.join(batches_folder, 'dictionary_ethnic_non_weighted'))
model.initialize(dictionary_name='dictionary')


In [6]:
# Register the scores tracked during training. The names are the keys that
# print_scores(), save_scores() and save_top_tokens_with_weights() look up in
# model.score_tracker, so they must stay in sync with those helpers.
# Top 20 tokens for every topic.
model.scores.add(artm.TopTokensScore(name='TopTokensAll', num_tokens=20))
# Phi sparsity with default settings (no topic restriction given).
model.scores.add(artm.SparsityPhiScore(name='SparsityPhiAll'))
# Theta sparsity restricted to the ethnic topics.
model.scores.add(artm.SparsityThetaScore(name='SparsityThetaEthnic', topic_names=e_topic_names))
# Topic mass in Phi, tracked separately for the ethnic and general groups.
model.scores.add(artm.TopicMassPhiScore(name='TopicMassPhiEthnic', topic_names=e_topic_names))
model.scores.add(artm.TopicMassPhiScore(name='TopicMassPhiGeneral', topic_names=g_topic_names))


In [7]:
# Regularizers are configured per topic group. For the SmoothSparse*
# regularizers BigARTM treats a positive tau as smoothing and a negative tau
# as sparsing.

# Theta: sparse the ethnic topics (tau < 0), smooth the general ones (tau > 0).
model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='SparseThetaEthnic', tau=-1.0, topic_names=e_topic_names))
model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='SmoothThetaGeneral', tau=0.05, topic_names=g_topic_names))

# Decorrelate the ethnic topics over the default token class ...
model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhiEthnic',
                                                       tau=1500000,
                                                       topic_names=e_topic_names,
                                                       class_ids=['@default_class']))

# ... and, with a much smaller tau, over the '@ethnic_class' modality.
model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhiEthnicModal',
                                                       tau=20000,
                                                       topic_names=e_topic_names,
                                                       class_ids=['@ethnic_class']))

# Phi smoothing of the ethnic topics, tied to the loaded dictionary.
# NOTE(review): presumably the dictionary limits/weights which tokens are
# affected - confirm against the contents of 'dictionary_ethnic_non_weighted'.
model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SmoothPhiEthnic',
                                                       tau=0.00011,
                                                       topic_names=e_topic_names,
                                                       class_ids=['@default_class'],
                                                       dictionary_name='dictionary'))

# Phi sparsing of the general topics, tied to the same dictionary.
model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SparsePhiGeneral',
                                                       tau=-100,
                                                       topic_names=g_topic_names,
                                                       class_ids=['@default_class'],
                                                       dictionary_name='dictionary'))

# Very weak Phi smoothing of the general topics; unlike the two regularizers
# above, no dictionary_name is passed here.
model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SmoothPhiGeneral',
                                                       tau=0.0000001,
                                                       topic_names=g_topic_names,
                                                       class_ids=['@default_class']))

In [8]:
import time
# Wall-clock timer around the training run.
start_time = time.time()

# Train on `batches` with the num_document_passes / update_every settings
# chosen in the configuration cell.
model.fit_online(batch_vectorizer=batches, num_document_passes=num_document_passes, update_every=update_every)

# Report the elapsed training time in seconds.
print 'Elapsed time: {} sec.'.format(time.time() - start_time)


Elapsed time: 6994.43866014 sec.


In [9]:
# Show the top tokens of every topic and the summary scores of the trained model.
print_scores(model)

e_topic_0:  станция,  линия,  метро,  метрополитен,  московская,  схема,  строительство,  вестибюль,  платформа,  открывать,  выход,  подзамок,  кольцевой,  участие,  наземный,  эскалатор,  вагон,  поезд,  перенос,  путч,  

e_topic_1:  команда,  гонка,  финам,  очаг,  кубок,  чемпионат,  соревнование,  участник,  победа,  болельщик,  выигрывать,  траур,  дистанция,  секунд,  гонять,  игра,  фантв,  ральф,  эстафета,  сезон,  

e_topic_2:  жид,  организм,  вещество,  виталик,  здоровый,  заболевание,  помогать,  полететь,  продукт,  кислота,  система,  пиво,  содержать,  содержаться,  повышать,  кишечный,  желать,  способствовать,  питание,  процесс,  

e_topic_3:  упс,  заграничный,  доран,  баргузинский,  дск,  сбор,  учетный,  дьюк,  улан-удэ,  сберегательный,  тетечка,  журки,  усср,  отворот,  поселок,  коверный,  компенсационный,  бан,  федика,  корреспондентский,  

e_topic_4:  лагерь,  заключать,  тюрьма,  тюремный,  гулаг,  смертельно,  концлагерь,  цифра,  лагерный,  исправит

In [None]:
# NOTE(review): `pd` is not referenced in the visible cells - confirm it is needed.
import pandas as pd

# Persist the trained model under the name 'model_full'.
model.save('model_full')

# Fetch Phi restricted to '@default_class', print it, and pickle it to
# 'phi_full' (the result exposes pandas' to_pickle).
phi_df = model.get_phi(class_ids=['@default_class'])
print phi_df
phi_df.to_pickle('phi_full')

In [None]:
topic_names = e_topic_names + g_topic_names

# Theta is fetched in eight consecutive slices of 50 topics (offsets 0..350),
# one model.fit_transform() call per slice, giving theta_df_1 .. theta_df_8.
# NOTE(review): presumably split to bound the size of each frame - confirm.
(theta_df_1, theta_df_2, theta_df_3, theta_df_4,
 theta_df_5, theta_df_6, theta_df_7, theta_df_8) = (
    model.fit_transform(topic_names=topic_names[offset: offset + 50])
    for offset in xrange(0, 400, 50)
)

In [None]:
# Pickle every theta slice to its own file, theta_full_1 .. theta_full_8.
for _theta_part, _target in (
        (theta_df_1, 'theta_full_1'),
        (theta_df_2, 'theta_full_2'),
        (theta_df_3, 'theta_full_3'),
        (theta_df_4, 'theta_full_4'),
        (theta_df_5, 'theta_full_5'),
        (theta_df_6, 'theta_full_6'),
        (theta_df_7, 'theta_full_7'),
        (theta_df_8, 'theta_full_8')):
    _theta_part.to_pickle(_target)
# Drop the loop helpers so no extra names are left in the notebook namespace.
del _theta_part, _target