In [3]:
import time
def timer(f):
    """Decorate ``f`` so each successful call appends its duration to ``./time_log.txt``.

    The wrapper forwards all positional and keyword arguments to ``f`` and
    returns its result unchanged. One line per call is appended to the log
    file, containing the function name and the elapsed time in seconds.
    Nothing is logged when ``f`` raises, because the exception propagates
    before the write.

    Args:
        f: the callable to time.

    Returns:
        A wrapper with the same call signature and metadata as ``f``.
    """
    from functools import wraps  # local import keeps this cell self-contained

    @wraps(f)  # preserve f.__name__ / f.__doc__ on the wrapper
    def tmp(*args, **kwargs):
        # perf_counter is monotonic, so the interval cannot be skewed by clock adjustments.
        t = time.perf_counter()
        res = f(*args, **kwargs)
        elapsed = time.perf_counter() - t
        # The message is Cyrillic: pin the encoding instead of relying on the platform default.
        with open('./time_log.txt', 'a+', encoding='utf-8') as log:
            log.write("Время выполнения функции {}: {}\n".format(f.__name__, elapsed))
        return res
    return tmp

In [4]:
import os

# NOTE(review): absolute, machine-specific path; this cell only runs where that folder exists.
root_path = '/Users/nikgerasimenko/Desktop/Учеба/2 семестр/Стрижов/Банкротство'

# Absolute paths of the subject folders. The last path component is later used as the subject label.
subject_paths = [
    os.path.join(root_path, relative_path)
    for relative_path in (
        'Арбитражные управляющие',
        'Особенности банкротства отдельных категорий должников',
        'Упрощенные процедуры банкротства/Банкротство ликвидируемого должника',
        'Упрощенные процедуры банкротства/Банкротство отсутствующего должника',
        'Процедуры банкротства/Мировое соглашение',
        'Процедуры банкротства/Общие положения',
        'Процедуры банкротства/Наблюдение',
        'Процедуры банкротства/Внешнее управление',
        'Процедуры банкротства/Конкурсное производство',
    )
]

# Despite its name this is a list of (subject label, file path) tuples, one per document folder.
# For every entry of a subject folder (except '.DS_Store') the first listed file whose name
# does not end with '_' is taken.
document_collection_dict = [
    (
        subject_path.split('/')[-1],
        os.path.join(
            subject_path,
            document_name,
            [entry
             for entry in os.listdir(os.path.join(subject_path, document_name))
             if entry[-1] != '_'][0],
        ),
    )
    for subject_path in subject_paths
    for document_name in os.listdir(subject_path)
    if document_name != '.DS_Store'
]
# File paths only, in the same order as document_collection_dict.
document_collection = [file_path for (_label, file_path) in document_collection_dict]

In [None]:
# from preprocess.to_UCI_or_VW_format import to_VW

# to_VW(document_collection, 'ld', cut_most_fr_procent=5)

In [5]:
%matplotlib inline
import glob
import os
import matplotlib.pyplot as plt

import artm

In [20]:
# Number of (subject label, file path) pairs collected from the subject folders.
len(document_collection_dict)

7937

In [6]:
# Create the BigARTM batch vectorizer from the Vowpal Wabbit file 'vw.ld.txt',
# with 'ld' passed as the target folder.
# NOTE(review): 'vw.ld.txt' is presumably produced by the commented-out to_VW(...) cell — confirm.
batch_vectorizer = artm.BatchVectorizer(data_path='vw.ld.txt',
                                        data_format='vowpal_wabbit',
                                        target_folder='ld')

In [None]:
# Baseline ARTM model with 50 topics.
num_topics = 50

# Scores tracked during fitting; print_measures reads all four by these names.
scores = [artm.PerplexityScore(name='PerplexityScore', dictionary=batch_vectorizer.dictionary),
          artm.SparsityPhiScore(name='SparsityPhiScore'),
          artm.SparsityThetaScore(name='SparsityThetaScore'),
          artm.TopicKernelScore(name='TopicKernelScore', probability_mass_threshold=0.3)]

# The regularizer names are the keys get_grid_values looks up in model_artm.regularizers
# ('DecorrelatorPhi', 'SmoothSparseTheta', 'SmoothSparsePhi'), so this model can be passed
# to the grid search without a KeyError.
regularizers = [artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=1e+8),
                artm.SmoothSparseThetaRegularizer(name='SmoothSparseTheta', tau=-2.5),
                artm.SmoothSparsePhiRegularizer(name='SmoothSparsePhi', tau=-0.5)]

# class_ids assigns weight 1.0 to '@default_class' and 5.0 to '@na_class'.
model_artm = artm.ARTM(num_topics=num_topics, cache_theta=True,
                       dictionary=batch_vectorizer.dictionary,
                       scores=scores,
                       class_ids={'@default_class': 1.0, '@na_class': 5.0},
                       regularizers=regularizers)

In [None]:
# Fit the model offline: num_document_passes is set to 1 and the collection is passed 15 times.
model_artm.num_document_passes = 1
model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=15)

In [None]:
def print_measures(model_artm):
    """Print the latest tracked scores of ``model_artm`` and plot its perplexity history.

    Reads the 'SparsityPhiScore', 'SparsityThetaScore', 'TopicKernelScore' and
    'PerplexityScore' entries of ``model_artm.score_tracker``. A missing entry
    raises at the point where it is first looked up, after any earlier lines
    have already been printed.

    Args:
        model_artm: a fitted model exposing ``score_tracker`` and ``num_phi_updates``.
    """
    tracker = model_artm.score_tracker

    # (output template, tracked score name, attribute holding the latest value)
    report = (
        ('Sparsity Phi: {}', 'SparsityPhiScore', 'last_value'),
        ('Sparsity Theta: {}', 'SparsityThetaScore', 'last_value'),
        ('Kernel contrast: {}', 'TopicKernelScore', 'last_average_contrast'),
        ('Kernel purity: {}', 'TopicKernelScore', 'last_average_purity'),
        ('Perplexity: {}', 'PerplexityScore', 'last_value'),
    )
    for template, score_name, attribute in report:
        print(template.format(getattr(tracker[score_name], attribute)))

    # Perplexity after each phi update, drawn as a dashed red line.
    iterations = range(model_artm.num_phi_updates)
    perplexity_history = tracker['PerplexityScore'].value
    plt.plot(iterations, perplexity_history, 'r--', linewidth=2)
    plt.xlabel('Iterations count')
    plt.ylabel('ARTM perplexity')
    plt.grid(True)
    plt.show()

print_measures(model_artm)

In [None]:
# Plot the history of the Phi and Theta sparsity scores over the phi updates,
# one figure per score.
for score_name, axis_label in (('SparsityPhiScore', 'Phi sparsity'),
                               ('SparsityThetaScore', 'Theta sparsity')):
    plt.plot(range(model_artm.num_phi_updates),
             model_artm.score_tracker[score_name].value, 'r--', linewidth=2)

    plt.xlabel('Iterations count')
    plt.ylabel(axis_label)
    plt.grid(True)
    plt.show()

In [3]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn import decomposition
from sklearn.metrics.cluster import adjusted_rand_score as ari
from sklearn.metrics.cluster import adjusted_mutual_info_score as ami

def get_score(matrix, n_clusters):
    """Score KMeans clusterings of ``matrix`` against the document subject labels.

    For every cluster count ``k`` from 10 to ``n_clusters`` (inclusive) a KMeans
    model is fitted on ``matrix``, and its predicted assignments are compared with
    the subject labels from the global ``document_collection_dict`` using adjusted
    mutual information.

    NOTE(review): a later cell defines ``get_score(matrix)`` (DBSCAN-based) under
    the same name; whichever cell ran last is the one other code calls.

    Args:
        matrix: samples to cluster, one row per document.
        n_clusters: largest cluster count to try; must be at least 10.

    Returns:
        Tuple ``(max, min, mean)`` of the adjusted mutual information scores
        over all tried cluster counts.

    Raises:
        ValueError: if ``n_clusters`` is below 10, since no cluster count would be tried.
    """
    if n_clusters < 10:
        raise ValueError(
            'n_clusters must be at least 10, got {!r}'.format(n_clusters))

    # The subject labels do not depend on the cluster count: build them once.
    doc_subjects = np.array([label for (label, _) in document_collection_dict])
    # NOTE(review): the last label is dropped, presumably because `matrix` has one
    # row fewer than there are collected documents — confirm against the batches.
    reference_labels = doc_subjects[:-1]

    res = []
    for i in range(10, n_clusters + 1):
        model = KMeans(n_clusters=i)
        model.fit(matrix)
        all_predictions = model.predict(matrix)
        res.append(ami(reference_labels, all_predictions))
    return max(res), min(res), sum(res) / len(res)

In [18]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from sklearn import datasets
from sklearn.cluster import DBSCAN   
from sklearn import decomposition
from sklearn.metrics.cluster import adjusted_rand_score as ari
from sklearn.metrics.cluster import adjusted_mutual_info_score as ami

def get_score(matrix):
    """Return the adjusted mutual information between a DBSCAN clustering and the subjects.

    A default-configured DBSCAN is fitted on ``matrix``; its ``labels_`` are compared
    with the subject labels taken from the global ``document_collection_dict``.

    This definition reuses the name of the KMeans-based ``get_score(matrix, n_clusters)``
    from the earlier cell and replaces it once this cell has run.

    Args:
        matrix: samples to cluster, one row per document.

    Returns:
        The adjusted mutual information score.
    """
    cluster_labels = DBSCAN().fit(matrix).labels_
    # Subject label of every collected document.
    subject_labels = np.array([subject for (subject, _path) in document_collection_dict])
    # NOTE(review): the last subject label is dropped, presumably to match the
    # number of rows in `matrix` — confirm.
    return ami(subject_labels[:-1], cluster_labels)

In [12]:
@timer
def get_grid_values(tau_m_space, sm_sp_theta_space, sm_sp_phi_space, decor_phi_space,
                    model_artm, num_collection_passes, batch_vectorizer,
                    n_clusters, phi_theta_scores):
    """Run a grid search over the '@na_class' weight and three regularizer coefficients.

    For every combination the model is configured and fitted offline, the
    collection is transformed, and the transposed result is scored with
    ``get_score``. Each result record is appended both to 'grid_log.txt' and
    to ``phi_theta_scores``.

    NOTE(review): the same ``model_artm`` object is refitted for every
    combination with no re-initialization visible here; confirm whether
    fit_offline continues from the previously fitted state.

    Args:
        tau_m_space: values for the '@na_class' entry of ``model_artm.class_ids``.
        sm_sp_theta_space: tau values for the 'SmoothSparseTheta' regularizer.
        sm_sp_phi_space: tau values for the 'SmoothSparsePhi' regularizer.
        decor_phi_space: tau values for the 'DecorrelatorPhi' regularizer.
        model_artm: model to configure and fit; it must contain the three
            regularizers under the names above.
        num_collection_passes: forwarded to ``fit_offline``.
        batch_vectorizer: forwarded to ``fit_offline`` and ``transform``.
        n_clusters: accepted but not used by this function.
        phi_theta_scores: list that receives one record dict per combination
            (mutated in place).

    Returns:
        None; results are delivered through ``phi_theta_scores`` and the log file.
    """
    for na_class_weight in tau_m_space:
        model_artm.class_ids = {'@default_class': 1.0, '@na_class': na_class_weight}
        for theta_tau in sm_sp_theta_space:
            model_artm.regularizers['SmoothSparseTheta'].tau = theta_tau
            for phi_tau in sm_sp_phi_space:
                model_artm.regularizers['SmoothSparsePhi'].tau = phi_tau
                for decorrelation_tau in decor_phi_space:
                    model_artm.regularizers['DecorrelatorPhi'].tau = decorrelation_tau
                    model_artm.fit_offline(num_collection_passes=num_collection_passes,
                                           batch_vectorizer=batch_vectorizer)

                    # Transposed so that get_score receives the transform output with axes swapped.
                    theta = model_artm.transform(batch_vectorizer=batch_vectorizer)
                    clustering_score = get_score(np.array(theta).T)

                    record = {
                        'tau_m': na_class_weight,
                        'sm_sp_theta': theta_tau,
                        'sm_sp_phi': phi_tau,
                        'decor_phi': decorrelation_tau,
                        'score': clustering_score,
                    }
                    # The log is reopened for every record, so finished combinations
                    # are already on disk if the run is interrupted.
                    with open('grid_log.txt', 'a+') as log_file:
                        log_file.write('\n' + str(record))
                    phi_theta_scores.append(record)

In [7]:
import artm

# Re-create the batch vectorizer from the Vowpal Wabbit file 'vw.ld.txt' with
# target folder 'ld' (same arguments as the earlier batch_vectorizer cell).
batch_vectorizer = artm.BatchVectorizer(data_path='vw.ld.txt',
                                        data_format='vowpal_wabbit',
                                        target_folder='ld')

In [10]:
# Model used for the grid search: 200 topics.
num_topics = 200

# Tracked scores. No TopicKernelScore is registered here, so print_measures
# (which reads 'TopicKernelScore') cannot be used with this model as-is.
scores = [artm.PerplexityScore(name='PerplexityScore', dictionary=batch_vectorizer.dictionary),
          artm.SparsityPhiScore(name='SparsityPhiScore'),
          artm.SparsityThetaScore(name='SparsityThetaScore')]

# The regularizer names are the keys get_grid_values looks up in model_artm.regularizers;
# the tau values given here are overwritten by the grid search.
regularizers = [artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=1e+8),
                artm.SmoothSparseThetaRegularizer(name='SmoothSparseTheta', tau=-2.5),
                artm.SmoothSparsePhiRegularizer(name='SmoothSparsePhi', tau=-0.5)]

# class_ids assigns weight 1.0 to '@default_class' and 5.0 to '@na_class'.
model_artm = artm.ARTM(num_topics=num_topics, cache_theta=True, 
                       dictionary=batch_vectorizer.dictionary,
                       scores=scores,
                       class_ids={'@default_class': 1.0, '@na_class': 5.0},
                       regularizers=regularizers)

In [19]:
# Hyper-parameter grid: 5 x 4 x 8 x 6 = 960 combinations, each fitted with
# 25 collection passes.
tau_m_space = np.linspace(5,15,5)               # '@na_class' weights
sm_sp_theta_space = np.linspace(-2,-0.5,4)      # 'SmoothSparseTheta' tau values
sm_sp_phi_space = np.linspace(0.25,2,8)         # 'SmoothSparsePhi' tau values
decor_phi_space = np.geomspace(1e+3,1e+8,6)     # 'DecorrelatorPhi' tau values, log-spaced
n_clusters = 30                                 # passed through, but get_grid_values does not use it
num_collection_passes = 25
phi_theta_scores = []                           # filled in place by get_grid_values

get_grid_values(tau_m_space, sm_sp_theta_space, sm_sp_phi_space, decor_phi_space,
                model_artm, num_collection_passes, batch_vectorizer,
                n_clusters, phi_theta_scores)

Exception ignored in: <generator object tqdm_notebook.__iter__ at 0x1294d7d00>
Traceback (most recent call last):
  File "/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm_notebook.py", line 226, in __iter__
    self.sp(bar_style='danger')
AttributeError: 'tqdm_notebook' object has no attribute 'sp'


KeyboardInterrupt: 

In [24]:
# Total number of hyper-parameter combinations in the grid.
# NOTE(review): with the spaces defined in the grid cell this is 5 * 4 * 8 * 6 = 960,
# so the saved output below appears to come from differently sized spaces — re-run to confirm.
len(tau_m_space) * len(sm_sp_theta_space) * len(sm_sp_phi_space) * len(decor_phi_space)

240

In [38]:
# Preview the decorrelator tau grid: six log-spaced values from 1e3 to 1e8.
np.geomspace(1e+3,1e+8,6)

array([1.e+03, 1.e+04, 1.e+05, 1.e+06, 1.e+07, 1.e+08])

In [None]:
# Remove (and display) the most recently appended grid-search record.
phi_theta_scores.pop(-1)

In [None]:
# Rebind phi_theta_scores to a fresh empty list.
phi_theta_scores=[]

In [None]:
# Print a separator line of 80 underscores.
print('_'*80)

In [None]:
# Wrap the current `scores` list in a single-element list.
scores_s= [scores]

In [None]:
# Display scores_s.
scores_s