In [1]:
import glob
import logging
import os
import re
import uuid
from pathlib import Path
from pymongo import MongoClient
import artm
import click
import mlflow
from tqdm.notebook import tqdm
import nltk

import multiprocessing as mp

  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""
  from collections import Sequence, defaultdict
  from collections import Counter, Iterable


In [2]:
# Configure the root logger: timestamped records, INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [3]:
# All locations are expressed relative to the directory two levels above the
# current working directory.
_project_root = os.path.join(os.pardir, os.pardir)
raw_data_path = os.path.join(_project_root, "data", "raw")
processed_data_path = os.path.join(_project_root, "data", "processed")
models_path = os.path.join(_project_root, "models")
experiments_path = os.path.join(_project_root, "experiments")

In [4]:
# Build the file with the training data

In [6]:
# Connect to the MongoDB server on localhost and select the collection that
# holds the documents.
logging.info("Loading main dataset...")
client = MongoClient(host='localhost', port=27017)
db = client['publicru_test']
collection = db['documents_collection']

2020-03-30 20:11:41,557 : INFO : Loading main dataset...


In [7]:
# Fetch one sample document: the unfiltered query skips the first 1000
# documents and is limited to a single result.
result = collection.find({}).skip(1000).limit(1)
document = result[0]

In [8]:
# Build one Vowpal Wabbit line for the sample document: the document id
# followed by the @title, @text, @bigrams and @trigrams modalities.
sentences = [sent for para in document["t_body"][0] for sent in para]
text = [token for sent in sentences for token in sent]
# N-gram members are glued with "!" so that every n-gram is a single token.
bigrams = ["!".join(pair) for sent in sentences for pair in nltk.bigrams(sent)]
trigrams = ["!".join(triple) for sent in sentences for triple in nltk.trigrams(sent)]

# The title is the first element of the first title block, or empty when
# that block is empty.
first_title_block = document['t_title'][0][0]
title = first_title_block[0] if first_title_block else []

parts = (
    [f"{document['_id']}"]
    + ['|@title'] + title
    + ['|@text'] + text
    + ['|@bigrams'] + bigrams
    + ['|@trigrams'] + trigrams
)
post = ' '.join(parts)

In [9]:
# Display the assembled Vowpal Wabbit line for inspection.
post

'5e7854ecc59124ce04bed321 |@title чат_S помощь_S |@text аукцион_S рубрика_S вести_V информационный_A центр_S ъ рисунок_S сергей_S голосов_S покупатель_S любить_V общаться_V интернет_S магазин_S данные_S последний_A опрос_S проводить_V facebook половина_S потребитель_S предпочитать_V онлайн_ADV покупка_S сайт_S функция_S чат_S вывод_S коммуникация_S мессенджер_S набирать_V сила_S собираться_V замедляться_V будущее_S респондент_S пользоваться_V мессенджер_S коммерческий_A цель_S посылать_V сообщение_S ожидать_V активный_A переписка_S следующий_A использование_S приложение_S переписка_S являться_V прерогатива_S молодой_A миллениал_S предпочитать_V мессенджер_S звонок_S почта_S отставать_V родитель_S поколение_S х_S практически_ADV вровень_ADV идти_V бебибумер_S рождаться_V мировой_A война_S покупатель_S любить_V пользоваться_V услуга_S обратный_A связь_S быстрый_A отклик_S спрашивать_V товар_S услуга_S местонахождение_S часы_S работа_S магазин_S взаимодействие_S онлайн_ADV магазин_S мессе

In [16]:
def _document_to_vw_line(document):
    """Format one document as a single Vowpal Wabbit line (no trailing newline).

    The line has the form
    ``<_id> |@title ... |@text ... |@bigrams ... |@trigrams ...``;
    the members of every n-gram are joined with ``!``.
    """
    text, bigrams, trigrams = [], [], []
    for para in document["t_body"][0]:
        for sent in para:
            bigrams.extend("!".join(b) for b in nltk.bigrams(sent))
            trigrams.extend("!".join(t) for t in nltk.trigrams(sent))
            text += sent

    # An empty or null title at any nesting level yields an empty @title
    # section instead of an IndexError / TypeError.
    title_blocks = document['t_title'][0] if document['t_title'] else []
    title = title_blocks[0][0] if title_blocks and title_blocks[0] else []

    parts = [f"{document['_id']}"]
    parts += ['|@title'] + title
    parts += ['|@text'] + text
    parts += ['|@bigrams'] + bigrams
    parts += ['|@trigrams'] + trigrams
    return ' '.join(parts)


def create_corpus_file(collection, path_folder):
    """Export the matching documents to a Vowpal Wabbit corpus file.

    Writes ``vowpal_wabbit_corpus.txt`` into ``experiments_path/path_folder``
    (the folder is not created here), one line per document.

    collection: MongoDB collection to read the documents from.
    path_folder: string, sub-folder of ``experiments_path`` to write into.
    """
    corpus_path = os.path.join(experiments_path, path_folder, 'vowpal_wabbit_corpus.txt')
    mongo_filter = {
        "t_title": {"$exists": True},
        "t_body": {"$exists": True},
        "edition_name": {"$in": ["РБК Журнал", "Российская газета"]},
        "words_count": {"$gt": 300}
    }
    projection = {"_id": 1, "t_title": 1, "t_body": 1}

    # The tokens are Cyrillic, so the encoding is fixed instead of relying on
    # the platform default.
    with open(corpus_path, 'w', encoding='utf-8') as the_file:
        cursor = collection.find(mongo_filter, projection, no_cursor_timeout=True)
        try:
            for document in tqdm(cursor):
                the_file.write(f"{_document_to_vw_line(document)}\n")
        finally:
            # A cursor opened with no_cursor_timeout is not reaped by the
            # server, so it has to be closed explicitly.
            cursor.close()

In [17]:
def preparing_batch(path_folder, batch_size=5000):
    """Regenerate the BigARTM batches and the text dictionary for a dataset folder.

    Existing ``*.batch`` files in ``experiments_path/path_folder/batches`` are
    removed first; then the batches are rebuilt from
    ``vowpal_wabbit_corpus.txt`` and the gathered dictionary is saved to
    ``dictionary.txt`` in the same dataset folder.

    path_folder: string, sub-folder of ``experiments_path`` with the corpus file.
    batch_size: integer, passed to ``artm.BatchVectorizer`` as ``batch_size``.
    """
    base_dir = os.path.join(experiments_path, path_folder)
    batches_dir = os.path.join(base_dir, 'batches')

    # A single leftover batch is already enough to require a clean-up.
    if glob.glob(os.path.join(batches_dir, '*.batch')):
        logging.info("Remove old batches.")
        pth = Path(batches_dir)
        for child in pth.glob('*'):
            if child.is_file():
                child.unlink()
        pth.rmdir()

    logging.info("Generate batches files...")
    batch_vectorizer = artm.BatchVectorizer(
        data_path=os.path.join(base_dir, 'vowpal_wabbit_corpus.txt'),
        data_format='vowpal_wabbit',
        batch_size=batch_size,
        target_folder=batches_dir,
    )
    logging.info("Gathering dictionary...")
    dictionary = batch_vectorizer.dictionary
    dictionary.save_text(dictionary_path=os.path.join(base_dir, 'dictionary.txt'))

In [18]:
# Sub-folder of experiments_path that receives the corpus file, the batches
# and the dictionary.
data_folder = "multiclass_model_test"

In [19]:
# Report the total number of documents and how many pass the edition / length
# filter. create_corpus_file additionally requires t_title and t_body to
# exist, so the exported corpus may be smaller than the second number.
mongo_filter = {
    "edition_name": {"$in": ["РБК Журнал", "Российская газета"]},
    "words_count": {"$gt": 300},
}
total_documents = collection.count_documents({})
logging.info(f"Original dataset size: {total_documents}")
working_documents = collection.count_documents(mongo_filter)
logging.info(f"Working dataset size: {working_documents}")

2020-03-30 20:14:15,132 : INFO : Original dataset size: 588456
2020-03-30 20:14:16,258 : INFO : Working dataset size: 23612


In [20]:
# Step 1: export the documents to a Vowpal Wabbit corpus file.
logging.info("Creating corpus file in vw format...")
Path(experiments_path, data_folder).mkdir(parents=True, exist_ok=True)
create_corpus_file(collection, data_folder)
logging.info("Done.")

# Step 2: turn the corpus file into batches and a dictionary.
logging.info("Preparing batches for artm model...")
preparing_batch(data_folder)
logging.info("Finish.")

2020-03-30 20:14:16,270 : INFO : Creating corpus file in vw format...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




2020-03-30 20:14:44,107 : INFO : Done.
2020-03-30 20:14:44,109 : INFO : Preparing batches for artm model...
2020-03-30 20:14:44,111 : INFO : Generate batches files...
2020-03-30 20:17:54,353 : INFO : Gathering dictionary...
2020-03-30 20:19:46,930 : INFO : Finish.


In [13]:
def load_batches(path_folder, min_df):
    """
    Return the batch vectorizer and the min_df-filtered dictionary of a dataset folder.

    When the ``batches`` sub-folder holds no ``*.batch`` files, they are
    generated from ``vowpal_wabbit_corpus.txt`` and the gathered dictionary is
    saved to ``dictionary.txt``; otherwise the existing batches and
    ``dictionary.txt`` are loaded.

    path_folder: string, folder with vowpal_wabbit_corpus.txt and batches/
    min_df: integer, minimal document frequency
    returns: tuple (batch vectorizer, dictionary filtered in place with min_df)
    """
    base_dir = os.path.join(experiments_path, path_folder)
    batches_dir = os.path.join(base_dir, 'batches')
    dictionary_file = os.path.join(base_dir, 'dictionary.txt')

    def _dictionary_size(current):
        # The size is read as the last integer in the dictionary's string form.
        # The pattern is a raw string so that \d is not parsed as a string escape.
        return int(re.findall(r"\d+", str(current))[-1])

    if len(glob.glob(os.path.join(batches_dir, '*.batch'))) < 1:
        logging.info("Generating batches files...")
        bv = artm.BatchVectorizer(
            data_path=os.path.join(base_dir, 'vowpal_wabbit_corpus.txt'),
            data_format='vowpal_wabbit',
            batch_size=2000,
            target_folder=batches_dir,
        )
        dictionary = bv.dictionary
        dictionary.save_text(dictionary_path=dictionary_file)
    else:
        logging.info("Loading batches files...")
        bv = artm.BatchVectorizer(
            data_path=batches_dir,
            data_format='batches',
        )
        dictionary = artm.Dictionary()
        dictionary.load_text(dictionary_path=dictionary_file)

    logging.info(f"Original dictionary size: {_dictionary_size(dictionary)} words.")
    dictionary.filter(min_df=min_df, inplace=True)
    logging.info(f"Filtered dictionary size: {_dictionary_size(dictionary)} words.")
    return bv, dictionary


  regex = "\d+"


In [14]:
def create_topic_names(topic_count=200, background_topic_count=20):
    """Build the topic name lists.

    Indices ``0 .. topic_count - background_topic_count - 1`` are named
    ``objective_topic_<index>``; the remaining indices up to ``topic_count - 1``
    are named ``background_topic_<index>``.

    Returns a tuple ``(all_topics, objective_topics, background_topics)`` in
    which ``all_topics`` is the objective names followed by the background names.
    """
    split_at = topic_count - background_topic_count
    objective_topics = [f"objective_topic_{index}" for index in range(split_at)]
    background_topics = [f"background_topic_{index}" for index in range(split_at, topic_count)]
    return objective_topics + background_topics, objective_topics, background_topics

In [15]:
def print_measures(model):
    """Log the most recent value of every tracked quality score of ``model``.

    Each score is read from ``model.score_tracker`` right before its line is
    logged, with three decimal places, in this order: Phi sparsity per
    modality, Theta sparsity, kernel contrast per modality, kernel purity per
    modality, perplexity.
    """
    # (log label, score name in the tracker, attribute holding the latest value)
    report = (
        ('Sparsity Title Phi', 'SparsityPhiTitleScore', 'last_value'),
        ('Sparsity Text Phi', 'SparsityPhiTextScore', 'last_value'),
        ('Sparsity Bigrams Phi', 'SparsityPhiBigramsScore', 'last_value'),
        ('Sparsity Trigrams Phi', 'SparsityPhiTrigramsScore', 'last_value'),
        ('Sparsity Theta', 'SparsityThetaScore', 'last_value'),
        ('Kernel title contrast', 'TopicKernelTitleScore', 'last_average_contrast'),
        ('Kernel text contrast', 'TopicKernelTextScore', 'last_average_contrast'),
        ('Kernel bigrams contrast', 'TopicKernelBigramsScore', 'last_average_contrast'),
        ('Kernel trigrams contrast', 'TopicKernelTrigramsScore', 'last_average_contrast'),
        ('Kernel title purity', 'TopicKernelTitleScore', 'last_average_purity'),
        ('Kernel text purity', 'TopicKernelTextScore', 'last_average_purity'),
        ('Kernel bigrams purity', 'TopicKernelBigramsScore', 'last_average_purity'),
        ('Kernel trigrams purity', 'TopicKernelTrigramsScore', 'last_average_purity'),
        ('Perplexity', 'PerplexityScore', 'last_value'),
    )
    for label, score_name, attribute in report:
        value = getattr(model.score_tracker[score_name], attribute)
        logging.info('{0}: {1:.3f}'.format(label, value))

In [16]:
def generate_name():
    """Return a newly generated version-1 UUID rendered as a string."""
    identifier = uuid.uuid1()
    return str(identifier)

In [17]:
def mlflow_log_metrics(model):
    """Send the current regularizer taus and score values of ``model`` to MLflow.

    The metrics are logged in one call, with ``model.num_phi_updates`` as the
    step. Keys are the five regularizer names, the sparsity scores, the kernel
    contrast / purity per modality and the perplexity.
    """
    modalities = ('Title', 'Text', 'Bigrams', 'Trigrams')
    metrics = {}

    # Regularization coefficients currently in effect.
    for regularizer_name in ('DeccorPhi', 'SmoothPhi', 'SmoothTheta', 'SparsePhi', 'SparseTheta'):
        metrics[regularizer_name] = model.regularizers[regularizer_name].tau

    # Sparsity of Phi per modality, followed by sparsity of Theta.
    for modality in modalities:
        score_name = f"SparsityPhi{modality}Score"
        metrics[score_name] = model.score_tracker[score_name].last_value
    metrics["SparsityThetaScore"] = model.score_tracker['SparsityThetaScore'].last_value

    # Kernel contrast and purity both come from the TopicKernel<modality>Score trackers.
    for modality in modalities:
        metrics[f"KernelContrast{modality}Score"] = (
            model.score_tracker[f"TopicKernel{modality}Score"].last_average_contrast
        )
    for modality in modalities:
        metrics[f"TopicPurity{modality}Score"] = (
            model.score_tracker[f"TopicKernel{modality}Score"].last_average_purity
        )

    metrics["PerplexityScore"] = model.score_tracker['PerplexityScore'].last_value

    mlflow.log_metrics(metrics, step=model.num_phi_updates)

In [18]:
def next_step(i, model, batch_vectorizer, step_size, dataset_name):
    """Run one training stage, log its metrics and persist the resulting model.

    The model is fitted ``step_size`` times with a single collection pass each,
    logging metrics to MLflow after every pass. The model is then dumped to
    ``experiments_path/models/<generated name>``, tagged in the active MLflow
    run, and its dump folder is uploaded as run artifacts.

    i: stage index, used in the MLflow tag name and the artifact sub-folder.
    model: model to train; must provide ``fit_offline`` and ``dump_artm_model``.
    batch_vectorizer: passed to ``model.fit_offline``.
    step_size: integer, number of single-pass ``fit_offline`` calls.
    dataset_name: not used by this function; kept so existing calls still work.
    """
    model_name = generate_name()
    logging.info(model_name)

    for _ in tqdm(range(step_size)):
        model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=1)
        mlflow_log_metrics(model)
    print_measures(model)

    models_root = os.path.join(experiments_path, 'models')
    Path(models_root).mkdir(parents=True, exist_ok=True)
    model_dir_name = os.path.join(models_root, f"{model_name}")
    model.dump_artm_model(model_dir_name)
    mlflow.set_tag(f"model_dump_{i}", model_dir_name)
    # Every stage uploads into its own artifact sub-folder; logging all dumps
    # to the artifact root would let same-named dump files from a later stage
    # replace those of an earlier one (NOTE(review): confirm the dump file
    # names are indeed identical across stages).
    mlflow.log_artifacts(model_dir_name, artifact_path=f"model_dump_{i}")

In [26]:
# Run configuration.
dataset_name = "multiclass_model_test"  # dataset folder under experiments_path and MLflow experiment name
min_df = 6  # minimal document frequency used to filter the dictionary
num_all_topics = 200  # total number of topics in the model
num_background_topics = 10  # how many of the topics are background topics
step_size = 20  # number of fit_offline passes per training stage

In [27]:
# Load (or generate) the batches and get the dictionary filtered by min_df.
bv, dictionary = load_batches(dataset_name, min_df)

2020-03-30 22:59:42,616 : INFO : Loading batches files...
2020-03-30 23:02:13,465 : INFO : Original dictionary size: 13350950 words.
2020-03-30 23:02:18,859 : INFO : Filtered dictionary size: 324844 words.


In [28]:
# Topic names: objective topics first, background topics last.
all_topics, objective_topics, background_topics = create_topic_names(num_all_topics, num_background_topics)

In [29]:
# (label used inside score names, class id of the modality)
_modalities = (
    ("Title", "@title"),
    ("Text", "@text"),
    ("Bigrams", "@bigrams"),
    ("Trigrams", "@trigrams"),
)

# Perplexity over all modalities.
scores_artm = [
    artm.PerplexityScore(
        name='PerplexityScore',
        dictionary=dictionary,
        class_ids=["@title", "@text", "@bigrams", "@trigrams"],
    ),
]
# Phi sparsity per modality, then Theta sparsity - objective topics only.
scores_artm += [
    artm.SparsityPhiScore(name=f'SparsityPhi{label}Score', topic_names=objective_topics, class_id=class_id)
    for label, class_id in _modalities
]
scores_artm.append(artm.SparsityThetaScore(name='SparsityThetaScore', topic_names=objective_topics))
# Top 20 tokens of every objective topic, per modality.
scores_artm += [
    artm.TopTokensScore(
        name=f'TopTokens{label}Score',
        num_tokens=20,
        topic_names=objective_topics,
        dictionary=dictionary,
        class_id=class_id,
    )
    for label, class_id in _modalities
]
# Topic kernel scores (contrast / purity are read from these), per modality.
scores_artm += [
    artm.TopicKernelScore(
        name=f'TopicKernel{label}Score',
        class_id=class_id,
        probability_mass_threshold=0.25,
        topic_names=objective_topics,
        dictionary=dictionary,
    )
    for label, class_id in _modalities
]

In [30]:
# Every regularizer is created with tau=0; the taus that are used get their
# values later, right before each training stage.
regularizers_artm = [
    # Decorrelation of the objective topics.
    artm.DecorrelatorPhiRegularizer(
        name='DeccorPhi',
        topic_names=objective_topics,
        gamma=0,
        tau=0,
    ),
    # Phi: one regularizer for the objective topics, one for the background topics.
    artm.SmoothSparsePhiRegularizer(
        name='SparsePhi',
        topic_names=objective_topics,
        dictionary=dictionary,
        gamma=0,
        tau=0,
    ),
    artm.SmoothSparsePhiRegularizer(
        name='SmoothPhi',
        topic_names=background_topics,
        dictionary=dictionary,
        gamma=0,
        tau=0,
    ),
    # Theta: same split between objective and background topics.
    artm.SmoothSparseThetaRegularizer(
        name='SparseTheta',
        topic_names=objective_topics,
        tau=0,
    ),
    artm.SmoothSparseThetaRegularizer(
        name='SmoothTheta',
        topic_names=background_topics,
        tau=0,
    ),
    artm.TopicSelectionThetaRegularizer(
        name='TopicSelectionTheta',
        topic_names=objective_topics,
        tau=0,
    ),
]

In [31]:
# Weight of each modality in the model.
modality_weights = {'@title': 3.0, '@text': 1.0, "@bigrams": 2.0, "@trigrams": 4.0}
# One less than the number of CPUs (presumably to keep one core free - confirm).
worker_count = mp.cpu_count() - 1

model = artm.ARTM(
    num_topics=num_all_topics,
    topic_names=all_topics,
    class_ids=modality_weights,
    num_processors=worker_count,
    num_document_passes=2,
    regularizers=regularizers_artm,
    scores=scores_artm,
    dictionary=dictionary,
    cache_theta=False,
    seed=42,
    show_progress_bars=False,
)

In [32]:
logging.info("Run learning...")
mlflow.set_experiment(dataset_name)

# Each entry: (stage index, regularizer taus applied right before the stage).
# Sign convention: sparsing taus are negative, smoothing taus are positive.
training_schedule = (
    # Stage 1 - strong decorrelation + smoothing.
    (1, (('DeccorPhi', 0.005), ('SmoothPhi', 0.4), ('SmoothTheta', 0.4))),
    (2, (('DeccorPhi', 0.015), ('SmoothPhi', 0.6), ('SmoothTheta', 0.6))),
    (3, (('DeccorPhi', 0.03), ('SmoothPhi', 0.8), ('SmoothTheta', 0.8))),
    # Stage 2 - switch on sparsing of the objective topics and increase it gradually.
    (4, (('SparsePhi', -0.0001), ('SparseTheta', -0.1))),
    (5, (('SparsePhi', -0.0002), ('SparseTheta', -0.2))),
    (6, (('SparsePhi', -0.0003), ('SparseTheta', -0.3))),
    # Stage 3 - the sparsing keeps growing.
    (7, (('SparsePhi', -0.0005), ('SparseTheta', -0.4))),
    (8, (('SparsePhi', -0.0006), ('SparseTheta', -0.5))),
    (9, (('SparsePhi', -0.0007), ('SparseTheta', -0.6))),
)

with mlflow.start_run():
    for stage_index, tau_updates in training_schedule:
        for regularizer_name, tau_value in tau_updates:
            model.regularizers[regularizer_name].tau = tau_value
        next_step(stage_index, model, bv, step_size, dataset_name)

logging.info("Finish")

2020-03-30 23:02:29,859 : INFO : Run learning...
2020-03-30 23:02:29,909 : INFO : 62bdf61e-72c1-11ea-971c-2532616bfce2


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

2020-03-30 23:47:34,668 : INFO : Sparsity Title Phi: 0.368
2020-03-30 23:47:34,669 : INFO : Sparsity Text Phi: 0.401
2020-03-30 23:47:34,670 : INFO : Sparsity Bigrams Phi: 0.516
2020-03-30 23:47:34,671 : INFO : Sparsity Trigrams Phi: 0.707
2020-03-30 23:47:34,672 : INFO : Sparsity Theta: 0.000
2020-03-30 23:47:34,697 : INFO : Kernel title contrast: 0.456





2020-03-30 23:47:35,023 : INFO : Kernel text contrast: 0.496
2020-03-30 23:47:36,443 : INFO : Kernel bigrams contrast: 0.484
2020-03-30 23:47:36,925 : INFO : Kernel trigrams contrast: 0.603
2020-03-30 23:47:37,089 : INFO : Kernel title purity: 0.306
2020-03-30 23:47:37,408 : INFO : Kernel text purity: 0.199
2020-03-30 23:47:38,855 : INFO : Kernel bigrams purity: 0.422
2020-03-30 23:47:39,327 : INFO : Kernel trigrams purity: 0.661
2020-03-30 23:47:39,452 : INFO : Perplexity: 16.159
2020-03-30 23:47:46,139 : INFO : b5bdd8ce-72c7-11ea-971c-2532616bfce2


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

2020-03-31 00:36:02,860 : INFO : Sparsity Title Phi: 0.880
2020-03-31 00:36:02,862 : INFO : Sparsity Text Phi: 0.801
2020-03-31 00:36:02,864 : INFO : Sparsity Bigrams Phi: 0.913
2020-03-31 00:36:02,866 : INFO : Sparsity Trigrams Phi: 0.953
2020-03-31 00:36:02,867 : INFO : Sparsity Theta: 0.000
2020-03-31 00:36:02,945 : INFO : Kernel title contrast: 0.469





2020-03-31 00:36:03,828 : INFO : Kernel text contrast: 0.506
2020-03-31 00:36:07,524 : INFO : Kernel bigrams contrast: 0.501
2020-03-31 00:36:08,874 : INFO : Kernel trigrams contrast: 0.631
2020-03-31 00:36:08,966 : INFO : Kernel title purity: 0.348
2020-03-31 00:36:09,860 : INFO : Kernel text purity: 0.246
2020-03-31 00:36:13,636 : INFO : Kernel bigrams purity: 0.475
2020-03-31 00:36:15,006 : INFO : Kernel trigrams purity: 0.720
2020-03-31 00:36:15,049 : INFO : Perplexity: 15.760
2020-03-31 00:36:20,839 : INFO : 7f09b6e8-72ce-11ea-971c-2532616bfce2


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

2020-03-31 01:25:57,477 : INFO : Sparsity Title Phi: 0.925
2020-03-31 01:25:57,478 : INFO : Sparsity Text Phi: 0.858
2020-03-31 01:25:57,480 : INFO : Sparsity Bigrams Phi: 0.941
2020-03-31 01:25:57,481 : INFO : Sparsity Trigrams Phi: 0.968
2020-03-31 01:25:57,482 : INFO : Sparsity Theta: 0.000
2020-03-31 01:25:57,564 : INFO : Kernel title contrast: 0.484





2020-03-31 01:25:58,933 : INFO : Kernel text contrast: 0.508
2020-03-31 01:26:04,824 : INFO : Kernel bigrams contrast: 0.509
2020-03-31 01:26:06,872 : INFO : Kernel trigrams contrast: 0.644
2020-03-31 01:26:06,982 : INFO : Kernel title purity: 0.367
2020-03-31 01:26:08,356 : INFO : Kernel text purity: 0.271
2020-03-31 01:26:14,275 : INFO : Kernel bigrams purity: 0.500
2020-03-31 01:26:16,365 : INFO : Kernel trigrams purity: 0.745
2020-03-31 01:26:16,406 : INFO : Perplexity: 15.695
2020-03-31 01:26:22,278 : INFO : 7c090848-72d5-11ea-971c-2532616bfce2


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

2020-03-31 02:17:47,430 : INFO : Sparsity Title Phi: 0.951
2020-03-31 02:17:47,431 : INFO : Sparsity Text Phi: 0.911
2020-03-31 02:17:47,433 : INFO : Sparsity Bigrams Phi: 0.958
2020-03-31 02:17:47,435 : INFO : Sparsity Trigrams Phi: 0.978
2020-03-31 02:17:47,437 : INFO : Sparsity Theta: 0.075
2020-03-31 02:17:47,547 : INFO : Kernel title contrast: 0.483





2020-03-31 02:17:49,352 : INFO : Kernel text contrast: 0.508
2020-03-31 02:17:57,485 : INFO : Kernel bigrams contrast: 0.512
2020-03-31 02:18:00,238 : INFO : Kernel trigrams contrast: 0.648
2020-03-31 02:18:00,382 : INFO : Kernel title purity: 0.373
2020-03-31 02:18:02,212 : INFO : Kernel text purity: 0.276
2020-03-31 02:18:10,269 : INFO : Kernel bigrams purity: 0.507
2020-03-31 02:18:13,023 : INFO : Kernel trigrams purity: 0.752
2020-03-31 02:18:13,065 : INFO : Perplexity: 15.602
2020-03-31 02:18:19,299 : INFO : bdeca36c-72dc-11ea-971c-2532616bfce2


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

2020-03-31 03:12:03,378 : INFO : Sparsity Title Phi: 0.953
2020-03-31 03:12:03,380 : INFO : Sparsity Text Phi: 0.916
2020-03-31 03:12:03,382 : INFO : Sparsity Bigrams Phi: 0.960
2020-03-31 03:12:03,384 : INFO : Sparsity Trigrams Phi: 0.979
2020-03-31 03:12:03,385 : INFO : Sparsity Theta: 0.166
2020-03-31 03:12:03,518 : INFO : Kernel title contrast: 0.482





2020-03-31 03:12:05,790 : INFO : Kernel text contrast: 0.509
2020-03-31 03:12:16,003 : INFO : Kernel bigrams contrast: 0.513
2020-03-31 03:12:19,459 : INFO : Kernel trigrams contrast: 0.650
2020-03-31 03:12:19,631 : INFO : Kernel title purity: 0.375
2020-03-31 03:12:21,927 : INFO : Kernel text purity: 0.276
2020-03-31 03:12:32,306 : INFO : Kernel bigrams purity: 0.511
2020-03-31 03:12:35,787 : INFO : Kernel trigrams purity: 0.758
2020-03-31 03:12:35,835 : INFO : Perplexity: 15.540
2020-03-31 03:12:42,424 : INFO : 56e5fd6e-72e4-11ea-971c-2532616bfce2


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

2020-03-31 04:08:29,040 : INFO : Sparsity Title Phi: 0.954
2020-03-31 04:08:29,042 : INFO : Sparsity Text Phi: 0.918
2020-03-31 04:08:29,044 : INFO : Sparsity Bigrams Phi: 0.961
2020-03-31 04:08:29,046 : INFO : Sparsity Trigrams Phi: 0.979
2020-03-31 04:08:29,048 : INFO : Sparsity Theta: 0.256





2020-03-31 04:08:29,206 : INFO : Kernel title contrast: 0.482
2020-03-31 04:08:31,951 : INFO : Kernel text contrast: 0.509
2020-03-31 04:08:44,304 : INFO : Kernel bigrams contrast: 0.514
2020-03-31 04:08:48,552 : INFO : Kernel trigrams contrast: 0.652
2020-03-31 04:08:48,759 : INFO : Kernel title purity: 0.377
2020-03-31 04:08:51,479 : INFO : Kernel text purity: 0.274
2020-03-31 04:09:03,933 : INFO : Kernel bigrams purity: 0.512
2020-03-31 04:09:08,170 : INFO : Kernel trigrams purity: 0.760
2020-03-31 04:09:08,223 : INFO : Perplexity: 15.491
2020-03-31 04:09:15,295 : INFO : 3d3509a2-72ec-11ea-971c-2532616bfce2


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

2020-03-31 05:07:18,804 : INFO : Sparsity Title Phi: 0.955
2020-03-31 05:07:18,807 : INFO : Sparsity Text Phi: 0.919
2020-03-31 05:07:18,809 : INFO : Sparsity Bigrams Phi: 0.961
2020-03-31 05:07:18,811 : INFO : Sparsity Trigrams Phi: 0.980
2020-03-31 05:07:18,814 : INFO : Sparsity Theta: 0.335





2020-03-31 05:07:18,995 : INFO : Kernel title contrast: 0.482
2020-03-31 05:07:22,189 : INFO : Kernel text contrast: 0.509
2020-03-31 05:07:36,818 : INFO : Kernel bigrams contrast: 0.514
2020-03-31 05:07:41,844 : INFO : Kernel trigrams contrast: 0.653
2020-03-31 05:07:42,065 : INFO : Kernel title purity: 0.374
2020-03-31 05:07:45,217 : INFO : Kernel text purity: 0.272
2020-03-31 05:07:59,931 : INFO : Kernel bigrams purity: 0.512
2020-03-31 05:08:04,935 : INFO : Kernel trigrams purity: 0.762
2020-03-31 05:08:04,983 : INFO : Perplexity: 15.449
2020-03-31 05:08:12,378 : INFO : 7978fc4a-72f4-11ea-971c-2532616bfce2


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

2020-03-31 06:08:51,631 : INFO : Sparsity Title Phi: 0.955
2020-03-31 06:08:51,634 : INFO : Sparsity Text Phi: 0.920
2020-03-31 06:08:51,635 : INFO : Sparsity Bigrams Phi: 0.961
2020-03-31 06:08:51,637 : INFO : Sparsity Trigrams Phi: 0.980
2020-03-31 06:08:51,639 : INFO : Sparsity Theta: 0.402





2020-03-31 06:08:51,849 : INFO : Kernel title contrast: 0.482
2020-03-31 06:08:55,383 : INFO : Kernel text contrast: 0.508
2020-03-31 06:09:12,871 : INFO : Kernel bigrams contrast: 0.514
2020-03-31 06:09:18,553 : INFO : Kernel trigrams contrast: 0.653
2020-03-31 06:09:18,803 : INFO : Kernel title purity: 0.374
2020-03-31 06:09:22,383 : INFO : Kernel text purity: 0.269
2020-03-31 06:09:39,868 : INFO : Kernel bigrams purity: 0.511
2020-03-31 06:09:45,521 : INFO : Kernel trigrams purity: 0.762
2020-03-31 06:09:45,569 : INFO : Perplexity: 15.413
2020-03-31 06:09:53,368 : INFO : 176f48c0-72fd-11ea-971c-2532616bfce2


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

2020-03-31 07:13:36,241 : INFO : Sparsity Title Phi: 0.955
2020-03-31 07:13:36,243 : INFO : Sparsity Text Phi: 0.920
2020-03-31 07:13:36,245 : INFO : Sparsity Bigrams Phi: 0.962
2020-03-31 07:13:36,248 : INFO : Sparsity Trigrams Phi: 0.980
2020-03-31 07:13:36,250 : INFO : Sparsity Theta: 0.460





2020-03-31 07:13:36,482 : INFO : Kernel title contrast: 0.482
2020-03-31 07:13:40,578 : INFO : Kernel text contrast: 0.507
2020-03-31 07:14:00,555 : INFO : Kernel bigrams contrast: 0.514
2020-03-31 07:14:06,943 : INFO : Kernel trigrams contrast: 0.653
2020-03-31 07:14:07,233 : INFO : Kernel title purity: 0.374
2020-03-31 07:14:11,253 : INFO : Kernel text purity: 0.265
2020-03-31 07:14:31,158 : INFO : Kernel bigrams purity: 0.510
2020-03-31 07:14:37,505 : INFO : Kernel trigrams purity: 0.762
2020-03-31 07:14:37,566 : INFO : Perplexity: 15.382
2020-03-31 07:14:45,642 : INFO : Finish


In [55]:
%%time

with open("myfile.txt","w") as f:
    topics_tokens = ""
    for i, topic_name in enumerate(model.topic_names[:190]):
        f.write(f"topic_name: {i+1}\n\n")

        titles = " ".join([word.split("_")[0] for word in model.score_tracker['TopTokensTitleScore'].last_tokens[topic_name]])
        f.write(f"title keywords: {titles}\n\n")
       
        texts = " ".join([word.split("_")[0] for word in model.score_tracker['TopTokensTextScore'].last_tokens[topic_name]])
        f.write(f"text keywords: {texts}\n\n")

        bigrams = []
        for bigram in model.score_tracker['TopTokensBigramsScore'].last_tokens[topic_name]:
            words = []
            for word in bigram.split("!"):
                words.append(word.split("_")[0])
            bigrams.append("_".join(words))
        f.write(f"bigram keywords: {' '.join(bigrams)}\n\n")
        
        trigrams = []
        for trigram in model.score_tracker['TopTokensTrigramsScore'].last_tokens[topic_name]:
            words = []
            for word in trigram.split("!"):
                words.append(word.split("_")[0])
            trigrams.append("_".join(words))
        f.write(f"trigram keywords: {' '.join(trigrams)}\n{'-'*100}\n\n\n\n")