In [47]:
%matplotlib inline
import glob
import os
import matplotlib.pyplot as plt
import datetime

import artm

# Path to the data directory. Other cells read the batches from it and keep
# the dictionary file ('/dictionary.dict') inside it.
# NOTE: the constant name is spelled "ADRESS" (sic); it is referenced by other
# cells under this exact name, so it is kept unchanged.
BATCH_ADRESS = '../batches_news_ria_1kk'

# Print the BigARTM version used for this run
print(artm.version())

0.10.0


In [48]:
def print_now():
    """Print the current date and time as ``YYYY-MM-DD HH:MM``.

    Used around long-running cells as a lightweight way to see when a step
    started and finished. Returns nothing.
    """
    moment = datetime.datetime.now()
    # format() on a datetime delegates to strftime() with the given pattern
    print(format(moment, "%Y-%m-%d %H:%M"))

In [49]:
# Batch source for the model: points BigARTM at the data stored under
# BATCH_ADRESS, declared to be in the 'batches' format.
batch_vectorizer = artm.BatchVectorizer(data_format='batches', data_path=BATCH_ADRESS)

In [50]:
# Dictionary mode:
#   'new'  - gather the dictionary from the batches and save it to disk;
#   'load' - reuse the dictionary file saved by a previous run.
MODE = 'load'
if MODE not in ('new', 'load'):
    # Without this check a mistyped mode silently behaves like 'load'.
    raise ValueError("MODE must be 'new' or 'load', got {!r}".format(MODE))

# File in which the model dictionary is stored
dictionary_address = BATCH_ADRESS + '/dictionary.dict'

# Dictionary object used by the model
dictionary = artm.Dictionary()


def _print_dictionary_size(dictionary, label):
    """Print the current number of dictionary entries followed by ``label``.

    The count is read from the dictionary's private ``_master`` component
    (first entry of ``get_info().dictionary``) and right-aligned to 7 chars.
    """
    num_entries = dictionary._master.get_info().dictionary[0].num_entries
    print('{:>7}'.format(num_entries), label)


if MODE == 'new':
    print_now()

    # Remove the dictionary file left over from previous runs
    if os.path.isfile(dictionary_address):
        os.remove(dictionary_address)

    # Gather the dictionary from the batches and save it to the file
    dictionary.gather(data_path=batch_vectorizer.data_path)
    dictionary.save(dictionary_path=dictionary_address)

    print_now()

# Load the dictionary from the file (in both modes)
dictionary.load(dictionary_path=dictionary_address)
_print_dictionary_size(dictionary, "- количество слов в словаре")

# Filtering: the steps are applied one after another and the dictionary size
# is reported after each step.
# NOTE(review): the recorded output of this cell shows the same size (32265)
# after every step, so for this dictionary the filters removed nothing or the
# reported entry is not the filtered dictionary - worth verifying.
filter_steps = (
    # Drop rare tokens: keep tokens whose total count in the collection
    # passes the min_tf threshold.
    ({'min_tf': 10},
     "- количество слов в словаре после фильтрации по min_tf"),
    # Drop frequent tokens: keep tokens whose total count in the collection
    # passes the max_tf threshold.
    ({'max_tf': 10000},
     "- количество слов в словаре после фильтрации по max_tf"),
    # Drop tokens that occur in too small a share of documents
    # (the rate is a fraction of documents, i.e. 0.0001 == 0.01%).
    ({'min_df_rate': 0.0001},
     "- количество слов в словаре после фильтрации по min_df_rate"),
    # Drop tokens that occur in too large a share of documents
    # (0.5 == half of the documents).
    ({'max_df_rate': 0.5},
     "- количество слов в словаре после фильтрации по max_df_rate"),
)
for filter_kwargs, report_label in filter_steps:
    dictionary.filter(**filter_kwargs)
    _print_dictionary_size(dictionary, report_label)

  32265 - количество слов в словаре
  32265 - количество слов в словаре после фильтрации по min_tf
  32265 - количество слов в словаре после фильтрации по max_tf
  32265 - количество слов в словаре после фильтрации по min_df_rate
  32265 - количество слов в словаре после фильтрации по max_df_rate


In [51]:
def new_default_model(topic_count, dictionary, seed=-1,
                      num_document_passes=1, cache_theta=True):
    """Create a new, not yet trained ``artm.LDA`` model.

    Args:
        topic_count: number of topics in the model.
        dictionary: dictionary passed to the model constructor.
        seed: seed for the random initialization. The default ``-1`` keeps
            the previous behaviour; per the BigARTM documentation ``-1``
            means "no seed", so results are NOT reproducible between runs.
            Pass a non-negative integer to fix the seed.
        num_document_passes: how many times each document is processed.
        cache_theta: whether the theta matrix may be kept in memory.

    Returns:
        The constructed ``artm.LDA`` instance.
    """
    return artm.LDA(
        num_topics=topic_count,
        cache_theta=cache_theta,
        num_document_passes=num_document_passes,
        seed=seed,
        dictionary=dictionary,
    )

In [52]:
# Model hyperparameters; 'topic_count' is the number of topics passed to
# new_default_model.
params = dict(topic_count=75)

In [53]:
# Build the base, still untrained model from the configured topic count and
# the dictionary prepared above.
lda = new_default_model(
    topic_count=params['topic_count'],
    dictionary=dictionary,
)

In [54]:
# Model 0_1

# Train the model offline; the timestamps printed before and after the call
# show how long the training took.
print_now()
lda.fit_offline(
    num_collection_passes=30,
    batch_vectorizer=batch_vectorizer,
)
print_now()

# Metric reporting via model_print_results(lda) is currently disabled.

# Save the trained model to disk and keep a separate copy of it under the
# name lda_0_1.
lda.save("news_model_0_1")
lda_0_1 = lda.clone()

2019-06-07 17:53
2019-06-07 19:00


In [58]:
# Print the 10 top tokens of every topic, one topic per line
top_tokens = lda.get_top_tokens(num_tokens=10)
for topic_no, tokens in enumerate(top_tokens):
    line = 'Topic #{0}: {1}'.format(topic_no, tokens)
    print(line)

Topic #0: ['компьютерный', 'поисковый', 'лукойл', 'яндекс', 'почтовый', 'катер', 'радиоактивный', 'радиационный', 'акватория', 'обнаружение']
Topic #1: ['блогер', 'соцсеть', 'персональный', 'нарышкин', 'контент', 'обновление', 'лекция', 'заблокировать', 'вконтакте', 'раздел']
Topic #2: ['соболезнование', 'таблица', 'сноудена', 'фифа', 'штрафной', 'тайм', 'эдвард', 'томь', 'сноудно', 'армеец']
Topic #3: ['паралимпийский', 'мурманский', 'мурманск', 'мокнуть', 'платежный', 'руслан', 'перо', 'романов', 'роберт', 'плакат']
Topic #4: ['наркотический', 'соотечественник', 'героин', 'фскн', 'сбыт', 'путевка', 'теннисист', 'ростуризм', 'кишинев', 'мадрид']
Topic #5: ['кострома', 'банда', 'астрахань', 'костромской', 'шувалов', 'крест', 'координационный', 'дополнение', 'предъявление', 'принадлежность']
Topic #6: ['удостоверение', 'часы', 'колледж', 'якутск', 'бостон', 'марафон', 'лариса', 'исполняться', 'борисов', 'велосипед']
Topic #7: ['прогреметь', 'магнитский', 'взрываться', 'ивановский', 'мем

In [59]:
# Phi matrix of the trained model. In the recorded output of this cell the
# rows are tokens and the columns are topics (topic_0 ... topic_74).
phi = lda.phi_
# Bare expression: the notebook renders the table as the cell output
phi

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_65,topic_66,topic_67,topic_68,topic_69,topic_70,topic_71,topic_72,topic_73,topic_74
преследователь,1.009288e-07,2.754655e-08,6.377641e-04,6.499994e-08,1.581421e-07,4.864109e-08,4.765715e-08,5.905504e-08,3.547854e-08,1.109372e-06,...,2.683473e-08,2.481710e-04,8.641394e-08,1.077703e-07,3.156505e-08,3.232194e-08,3.333024e-08,4.054005e-08,1.780844e-08,1.319599e-04
тайшет,4.778651e-08,2.049500e-08,1.440580e-08,4.354143e-08,7.271886e-08,4.859998e-08,4.411012e-08,3.975954e-08,7.965944e-08,9.668620e-09,...,3.045696e-05,3.399635e-06,2.730297e-08,4.069425e-08,1.136107e-07,6.175362e-08,2.906213e-08,1.525750e-07,2.176757e-08,1.344006e-04
финкельштейн,2.820610e-08,2.147400e-08,1.468912e-08,4.150844e-08,2.535255e-08,3.731184e-08,3.441043e-08,3.678342e-08,3.078940e-08,1.076503e-08,...,2.277629e-08,2.137013e-08,2.810934e-08,6.299770e-08,2.955387e-08,6.172731e-08,2.057950e-08,4.102543e-08,1.528488e-08,2.867671e-08
подкомитет,5.595131e-08,3.253394e-05,1.655614e-08,5.042933e-08,3.997419e-08,4.572534e-08,5.467282e-08,3.018645e-05,3.809138e-08,1.111110e-08,...,2.957282e-08,2.122746e-08,4.142955e-08,5.753636e-08,3.321210e-08,4.375759e-08,2.736990e-08,3.253030e-08,2.432172e-08,2.760120e-08
джулия,4.014306e-08,1.344259e-07,1.959893e-08,1.277749e-04,1.040717e-07,4.051167e-08,5.351148e-08,3.025037e-06,3.538910e-08,1.591284e-04,...,3.009477e-08,9.915867e-08,2.696784e-08,6.117870e-08,3.693792e-08,2.991289e-08,4.073094e-08,4.628089e-08,2.346028e-08,2.914421e-08
гомофобия,2.945471e-08,2.175597e-08,1.464927e-08,4.714284e-08,2.549857e-08,4.192806e-08,4.068975e-08,3.879123e-08,3.625357e-08,2.159684e-06,...,2.246734e-08,2.257933e-08,3.062548e-08,7.508476e-08,3.329552e-08,2.921927e-08,2.824810e-08,3.234523e-08,1.555992e-08,3.935394e-08
втискивать,2.996700e-08,2.863539e-08,1.595883e-08,4.128789e-08,3.157421e-08,5.027555e-08,4.978489e-08,4.086198e-08,4.475523e-08,1.022247e-05,...,2.453465e-08,2.035359e-08,2.563677e-08,4.251358e-08,3.213680e-08,2.997040e-08,2.207913e-08,3.617954e-08,1.195360e-07,2.571032e-08
роснано,2.866812e-08,2.113004e-08,1.418821e-08,3.911967e-08,2.656359e-08,4.155909e-08,3.312072e-08,3.651971e-08,3.286138e-08,9.328302e-09,...,2.388067e-08,2.103354e-08,2.375974e-08,3.917842e-08,2.927669e-08,2.960947e-08,2.022997e-08,2.843107e-08,1.803742e-08,2.761504e-08
побратим,3.193128e-08,1.959084e-08,1.517671e-08,5.164341e-08,3.139042e-08,4.511746e-08,4.602540e-08,4.529841e-08,3.372760e-08,1.137083e-08,...,2.524927e-08,2.071106e-08,3.572657e-08,4.609793e-08,3.160609e-08,3.222536e-08,2.132993e-08,4.628150e-08,1.571746e-08,2.880196e-08
конюхов,2.888450e-08,1.880509e-08,1.641485e-08,4.083784e-08,2.812027e-08,7.715382e-04,3.581058e-08,3.624388e-08,3.303385e-08,9.222003e-09,...,2.215531e-08,2.063510e-08,2.344317e-08,3.907115e-08,2.856051e-08,2.896405e-08,1.871295e-08,3.146467e-08,1.495525e-08,2.615887e-08


In [60]:
# Theta matrix of the trained model. In the recorded output of this cell the
# rows are topics and the columns carry numeric labels (presumably document
# ids - TODO confirm).
# NOTE(review): the model is created with cache_theta=True; presumably that is
# what makes theta retrievable here - confirm against the BigARTM docs.
theta = lda.get_theta()
# Bare expression: the notebook renders the table as the cell output
theta

Unnamed: 0,428000,428001,428002,428003,428004,428005,428006,428007,428008,428009,...,212990,212991,212992,212993,212994,212995,212996,212997,212998,212999
topic_0,0.007107,0.000329,0.000229,0.066912,0.000678,0.000224,0.001482,0.000391,0.000201,0.001190,...,0.475167,0.000420,0.000930,0.087747,0.000357,0.000257,0.000401,0.072540,0.030657,0.037020
topic_1,0.008688,0.000281,0.006455,0.000283,0.000678,0.000224,0.001482,0.000467,0.000168,0.001143,...,0.000376,0.000492,0.000936,0.000898,0.000320,0.000177,0.130170,0.000409,0.000301,0.000589
topic_2,0.000075,0.000328,0.000257,0.000553,0.000870,0.283229,0.552377,0.000391,0.000281,0.001360,...,0.000310,0.000470,0.000944,0.000569,0.000340,0.000181,0.000255,0.000479,0.000403,0.000668
topic_3,0.000075,0.000280,0.004317,0.000246,0.000681,0.000227,0.051876,0.000467,0.000168,0.001143,...,0.000306,0.000406,0.000933,0.000277,0.000652,0.000171,0.000254,0.000390,0.000304,0.000530
topic_4,0.000452,0.000282,0.023575,0.000248,0.101360,0.000241,0.001482,0.000391,0.000168,0.001194,...,0.000306,0.000389,0.000932,0.000277,0.000423,0.003564,0.025430,0.001461,0.000349,0.000520
topic_5,0.003070,0.000280,0.000238,0.000254,0.000678,0.000223,0.001482,0.000391,0.000167,0.001143,...,0.000309,0.000395,0.000938,0.000276,0.000288,0.000170,0.000336,0.000389,0.000392,0.000524
topic_6,0.000079,0.000280,0.000231,0.000310,0.000678,0.000224,0.001482,0.000388,0.000187,0.001143,...,0.000312,0.000400,0.000931,0.000282,0.000289,0.000171,0.029006,0.000423,0.000306,0.000529
topic_7,0.000073,0.000282,0.000237,0.000291,0.000678,0.000223,0.001482,0.000400,0.000168,0.001143,...,0.000311,0.000418,0.000937,0.000273,0.023754,0.000173,0.000260,0.000389,0.000297,0.004216
topic_8,0.171126,0.000297,0.000247,0.001324,0.000678,0.000232,0.001482,0.013848,0.000189,0.001143,...,0.000379,0.000422,0.000935,0.000512,0.000338,0.000174,0.000252,0.000390,0.000297,0.000565
topic_9,0.033561,0.000283,0.000723,0.000384,0.000678,0.000252,0.001481,0.000390,0.000206,0.001159,...,0.000416,0.000393,0.000936,0.000371,0.000296,0.000203,0.000310,0.000403,0.000319,0.000793
