### Steps to reproduce: <br>
1) Install packages <br>
2) Restart the notebook so packages can turn on properly <br>
3) Run installation again <br>
4) Add the file with combined interview texts ('combined.docx') and dataframe with lemmatized documents ('interview_lemmatized.xlsx') <br>
5) Click preprocessing and look at df of words frequencies and decide which words you want to remove. Add them to 'additional_stopwords' list in preprocessing block and run the cell again <br>
5) Run Topic model <br>
6) Choose the mode and run fit <br>

## Installation of packages

In [1]:
!pip install clusteval
!pip install python-docx
!pip install stop-words
!pip install -U sentence-transformers
!pip install chainer
!pip install pyLDAvis
!pip install -U pandas
!pip install -U openpyxl
!pip install hdbscan
!pip install umap-learn
!pip install --upgrade numpy



In [2]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
import numpy as np
import nltk
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import gensim
from IPython.display import display, HTML
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF
import gensim
from datetime import timedelta
import json
import itertools
import numpy as np
import stop_words
from gensim.models import CoherenceModel
import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
import time
import nltk.data
import docx
from gensim import corpora
from gensim.models import CoherenceModel
from tqdm.notebook import tqdm 
import keras
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.cluster import KMeans
from sklearn.model_selection import RandomizedSearchCV
import hdbscan
from sklearn.metrics import make_scorer
import umap.umap_ as umap

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import stop_words
rus = stop_words.get_stop_words('russian')
en = stop_words.get_stop_words('english')
all_sw = rus + en
additional = ['инт', 'инф']
all_sw += additional

In [4]:
import warnings
warnings.filterwarnings('ignore')

## After downloading packages, restart

## Preprocessing

In [9]:
def preprocess(text_file, lemmatized_excel_file, length_restrict, bigram_mincount, additional_stopwords):
  ''' length_restrict - the minimum length of the word to leave in the text
      bigram_mincount – Ignore all words and bigrams with total collected count lower than this value.'''

  print('Reading your transcripts...')

  def getText(text_file):
    doc = docx.Document(text_file)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return '\n'.join(fullText)

  def dataset_raw(text_file):
    text = getText(text_file)
    sentences = []
    tokenizer = nltk.data.load('tokenizers/punkt/russian.pickle')
    sentences = tokenizer.tokenize(text)
    paragraphs = text.split('\n')
    df = pd.DataFrame(paragraphs,
                    columns = ['paragraphs'],
                    index = range(1, len(paragraphs)+1))
    return df

  df_raw = dataset_raw('combined.docx')
  print('Raw dataset ready')
  #Provide excel file with lemmatized sentences
  print('Processing your lemmatized dataset...')

  df = pd.read_excel(lemmatized_excel_file, engine="openpyxl", index_col = 0)

  def text_to_array(length_restrict, lemmatized_df):
    ''' length_restrict - the minimum length of the word to leave in the text'''
    x_rus = []
    for i in range(len(lemmatized_df)):  
      string_spl = str(lemmatized_df['paragraphs'].iloc[i]).split()
      for i in string_spl:
        if i == 'nan' or len(i) < length_restrict+1:
          string_spl.remove(i)
      if len(string_spl) > length_restrict:
        x_rus.append(string_spl)
    united =  []
    for i in x_rus:
      for j in i:
        united.append(j)
    df_counts = pd.DataFrame({'text':united})
    df_counts = (df_counts['text'].str.split(expand=True)
                  .stack()
                  .value_counts()
                  .rename_axis('vals')
                  .reset_index(name='count'))
    
    return x_rus, df_counts

  x_rus, df_counts = text_to_array(length_restrict, df)


  def purification(additional_stopwords, array_to_clear, stop_words):
    stop_words += additional_stopwords
    x_rus_c = []
    for i in array_to_clear:
      for j in i:
        if j in stop_words:
          i.remove(j)
    for i in array_to_clear:
      x_rus_c.append(list(set(i)))
    return x_rus_c

  def make_corpus(clear_text_set, bigram_mincount):
    '''bigram_mincount – Ignore all words and bigrams with total collected count lower than this value.'''
    bigram = gensim.models.Phrases(clear_text_set, min_count=bigram_mincount, threshold=40)
    clear_text_set = [bigram[line] for line in clear_text_set]
    x_train_rus = [' '.join(i) for i in clear_text_set]
    dictionary = corpora.Dictionary(clear_text_set)
    corpus = [dictionary.doc2bow(text) for text in clear_text_set]
    return x_train_rus, dictionary, corpus

  print('Purifying the dataset with additional stop words...')
  x_rus_c = purification(additional_stopwords, x_rus, all_sw)
  print('Constructing the corpus...')
  x_train_rus, dictionary, corpus = make_corpus(x_rus_c, bigram_mincount)

  united =  []
  for i in x_rus:
    for j in i:
      united.append(j)
  df_counts_new = pd.DataFrame({'text':united})
  df_counts_new = (df_counts_new['text'].str.split(expand=True)
                  .stack()
                  .value_counts()
                  .rename_axis('vals')
                  .reset_index(name='count'))
  
  return df_raw, df_counts, df_counts_new, x_train_rus, x_rus, dictionary, corpus
      


In [10]:
additional_stopwords = ['понимать', 'думать', 'сделать', 'вообще', 'наверное', 'ничто', 'думать', 'полагать', 'вопрос', 'сделать',
              'знать', 'свой', 'например', 'все', 'таки', 'считать', 'самый', 'поэтому', 'происходить', 'вещь', 'бывать',
              'спрашивать', 'тип', 'насколько', 'точка', 'зрение', 'якобы']
all_sw += additional_stopwords

# additional_stopwords = []

df_raw, df_counts, df_counts_new, x_train_rus, x_rus, dictionary, corpus = preprocess('combined.docx', 'interview_lemmatized.xlsx', 2, 3, additional_stopwords)

Reading your transcripts...
Raw dataset ready
Processing your lemmatized dataset...
Purifying the dataset with additional stop words...
Constructing the corpus...


## df with frequencies

In [None]:
df_counts

Unnamed: 0,vals,count
0,включать,336
1,музыка,282
2,использовать,262
3,говорить,254
4,алиса,253
...,...,...
2596,обрываться,1
2597,пояснять,1
2598,ночной,1
2599,обусловливать,1


## Topic model

In [11]:
class Autoencoder:
    """
    Autoencoder for learning latent space representation
    architecture simplified for only one hidden layer
    """

    def __init__(self, latent_dim=32, activation='relu', epochs=200, batch_size=128):
        self.latent_dim = latent_dim
        self.activation = activation
        self.epochs = epochs
        self.batch_size = batch_size
        self.autoencoder = None
        self.encoder = None
        self.decoder = None
        self.his = None

    def _compile(self, input_dim):
        """
        compile the computational graph
        """
        input_vec = Input(shape=(input_dim,))
        encoded = Dense(self.latent_dim, activation=self.activation)(input_vec)
        decoded = Dense(input_dim, activation=self.activation)(encoded)
        self.autoencoder = Model(input_vec, decoded)
        self.encoder = Model(input_vec, encoded)
        encoded_input = Input(shape=(self.latent_dim,))
        decoder_layer = self.autoencoder.layers[-1]
        self.decoder = Model(encoded_input, self.autoencoder.layers[-1](encoded_input))
        self.autoencoder.compile(optimizer='adam', loss=keras.losses.mean_squared_error)

    def fit(self, X):
        if not self.autoencoder:
            self._compile(X.shape[1])
        X_train, X_test = train_test_split(X)
        self.his = self.autoencoder.fit(X_train, X_train,
                                        epochs=200,
                                        batch_size=128,
                                        shuffle=True,
                                        validation_data=(X_test, X_test), verbose=0)

In [12]:
class Topic_Model(object):
    def __init__(self, k=10, method='LDA'):
        """
        :param k: number of topics
        :param method: method chosen for the topic model
        """
        if method not in {'TFIDF', 'LDA', 'BERT', 'BERT_LDA', 'BERT_LDA_Kmeans', 'BERT_TFIDF_HDBSCAN', 'BERT_LDA_HDBSCAN', 'BERT_TFIDF_Kmeans'}:
            raise Exception('Invalid method!')
        print('Initialized')
        self.k = k
        self.dictionary = dictionary
        self.corpus = corpus
        self.stopwords = None
        self.cluster_model = None
        self.ldamodel = None
        self.gamma = 15  # parameter for reletive importance of lda
        self.vec = {}
        self.method = method
        self.AE = None
        self.sentences = x_train_rus
        np.random.seed(100)

    def vectorize(self, method=None):
        """Get vector representations from selected methods"""
        if method is None:
            method = self.method

        elif method == 'TFIDF':
          print('Getting vector representations for TF-IDF ...')
          tfidf = TfidfVectorizer()
          vec = tfidf.fit_transform(self.sentences)
          print('Getting vector representations for TF-IDF. Done!')
          return vec

        elif method == 'LDA':
            print('Getting vector representations for LDA ...')
            if not self.ldamodel:
                self.ldamodel = gensim.models.LdaMulticore(self.corpus, num_topics=self.k, 
                                       id2word = self.dictionary,
                                       workers = 2, passes=10,
                                       random_state=100,
                                       chunksize=100)
                def get_vec_lda(model, corpus, k):
                  n_doc = len(corpus)
                  vec_lda = np.zeros((n_doc, k))
                  for i in range(n_doc):
                      # get the distribution for the i-th document in corpus
                      for topic, prob in model.get_document_topics(corpus[i]):
                          vec_lda[i, topic] = prob
                  return vec_lda
                vec = get_vec_lda(self.ldamodel, self.corpus, self.k)
                print('Getting vector representations for LDA. Done!')
                return vec

        elif method == 'BERT':
            print('Getting vector representations for BERT ...')
            tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
            model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
            model= model.cuda() 
            x_train_rus_clear = []
            for i in x_train_rus:
              x_train_rus_clear.append(str(i))
            def embed_bert_cls(text, model, tokenizer):
              t = tokenizer(text, padding=True, truncation=False, return_tensors='pt')
              with torch.no_grad():
                  model_output = model(**{k: v.to(model.device) for k, v in t.items()})
              embeddings = model_output.last_hidden_state[:, 0, :]
              embeddings = torch.nn.functional.normalize(embeddings)
              return embeddings.cpu().numpy()
            class InterviewDataset(Dataset):
              def __init__(self, queries):
                  self.queries = queries
              def __len__(self):
                  return len(self.queries)
              def __getitem__(self, idx):
                  return str(self.queries[idx])
            data_loader = DataLoader(InterviewDataset(x_train_rus_clear), batch_size=1, shuffle=False)
            vecs = []
            for batch in enumerate(tqdm(data_loader)):
                batch_emdg = embed_bert_cls(batch[1], model, tokenizer)
                vecs.append(batch_emdg)
            vecs_bert = np.concatenate(vecs, axis=0 )
            print('Getting vector representations for BERT. Done!')
            return vecs_bert
            
        elif method == 'BERT_LDA':
          print('Getting vector representations')

          vec_lda = self.vectorize(method='LDA')
          vec_bert = self.vectorize(method='BERT')
          vec_ldabert = np.c_[vec_lda * self.gamma, vec_bert]
          self.vec['LDA_BERT_FULL'] = vec_ldabert
          if not self.AE:
              self.AE = Autoencoder()
              print('Fitting Autoencoder ...')
              self.AE.fit(vec_ldabert)
              print('Fitting Autoencoder Done!')
          vec = self.AE.encoder.predict(vec_ldabert)
          print('Getting vector representations. Done!')
          return vec

    def fit(self, corpus, dictionary, method=None, cluster_model=None):
        x_train_rus_clear = []
        for i in x_train_rus:
          x_train_rus_clear.append(str(i))        
        
        # Default method
        if method is None:
            method = self.method
        if method == 'LDA':
          if not self.ldamodel:
              print('Fitting LDA ...')
              self.ldamodel = gensim.models.LdaMulticore(self.corpus, num_topics=self.k, 
                                       id2word = self.dictionary,
                                       workers = 2, passes=10,
                                       random_state=100,
                                       chunksize=100)
              
              print('Fitting LDA Done!')
              for idx, topic in self.ldamodel.print_topics(-1):
                print('Topic: {} Word: {}'.format(idx, topic))

              for i in range(len(x_train_rus_clear)):
                  x_train_rus_clear[i] = str(x_train_rus_clear[i]).split(' ')
              processed_docs = np.array(x_train_rus_clear)
              
              def compute_coherence(lda_model, dictionary, coherence_metrics):
                coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs,
                                                dictionary=dictionary, coherence=coherence_metrics)
                return coherence_model_lda.get_coherence()
              num_topics = self.k
              topic_words = []
              for i in range(num_topics):
                  tt = self.ldamodel.get_topic_terms(i,20)
                  topic_words.append([dictionary[pair[0]] for pair in tt])
              def topic_diversity(topic_words):
                  topk = 10
                  if topic_words is None:
                      return 0
                  if topk > len(topic_words[0]):
                      raise Exception('Words in topics are less than ' + str(topk))
                  else:
                      unique_words = set()
                      for topic in topic_words:
                          unique_words = unique_words.union(set(topic[:topk]))
                      td = len(unique_words) / (topk * len(topic_words))
                      return td
              def _LOR(P, Q):
                  lor = 0
                  for v, w in zip(P, Q):
                      if v > 0 or w > 0:
                          lor = lor + np.abs(np.log(v) - np.log(w))
                  return lor / len(P)
              def Kullback_Leibler(ldamodel):
                  beta = ldamodel.get_topics()
                  kl_div = 0
                  count = 0
                  for i, j in itertools.combinations(range(len(beta)), 2):
                      kl_div += _LOR(beta[i], beta[j])
                      count += 1
                  return kl_div / count

              print('_________________________________________________________________')
              print('C_v coherence: ', compute_coherence(self.ldamodel, self.dictionary, 'c_v'))
              print('U_mass coherence: ', compute_coherence(self.ldamodel, self.dictionary, 'u_mass'))
              print('UCI : ', compute_coherence(self.ldamodel, self.dictionary, 'c_uci'))
              print('NPMI : ', compute_coherence(self.ldamodel, self.dictionary, 'c_npmi'))
              print('Topic_diversity :', topic_diversity(topic_words))
              print('Kullback-Leibler Divergence :', Kullback_Leibler(self.ldamodel))
                
        # Default clustering method
        elif cluster_model is None:
            cluster_model = self.cluster_model

        elif cluster_model == 'Kmeans':

          print('Clustering embeddings ...')
          cm = KMeans(self.k, random_state=100)
          self.vec[method] = self.vectorize(method)
          cm.fit(self.vec[method])
          print('Clustering embeddings. Done!')
          print('Getting topic words')
          def get_topic_words(token_lists, labels, k=None):
            """get top words within each topic from clustering results"""
            if k is None:
                k = len(np.unique(labels))
            topics = ['' for _ in range(k)]
            for i, c in enumerate(token_lists):
                topics[labels[i]] += (' ' + ' '.join(c))
            word_counts = list(map(lambda x: Counter(x.split()).items(), topics))
            # get sorted word counts
            word_counts = list(map(lambda x: sorted(x, key=lambda x: x[1], reverse=True), word_counts))
            # get topics
            topics = list(map(lambda x: list(map(lambda x: x[0], x[:10])), word_counts))
            return topics

          def get_coherence(labels, token_lists, measure):
            topics = get_topic_words(token_lists, labels)
            cm = CoherenceModel(topics=topics, texts=token_lists, corpus=self.corpus, dictionary=self.dictionary,
                                    coherence=measure)
            return cm.get_coherence()

          topics = get_topic_words(x_rus, cm.labels_)
          df_topic_keywords = pd.DataFrame(topics)
          df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
          df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
    
          def topic_diversity(topic_words):
              topk = 10
              if topic_words is None:
                  return 0
              if topk > len(topic_words[0]):
                  raise Exception('Words in topics are less than ' + str(topk))
              else:
                  unique_words = set()
                  for topic in topic_words:
                      unique_words = unique_words.union(set(topic[:topk]))
                  td = len(unique_words) / (topk * len(topic_words))
                  return td

          print('_________________________________________________________________')
          print('C_v coherence: ', get_coherence(cm.labels_, x_rus, measure='c_v'))
          print('U_mass coherence: ', get_coherence(cm.labels_, x_rus, measure='u_mass'))
          print('UCI : ', get_coherence(cm.labels_, x_rus, measure='c_uci'))
          print('NPMI : ', get_coherence(cm.labels_, x_rus, measure='c_npmi'))
          print('Topic_diversity : ', topic_diversity(topics))
          print('_________________________________________________________________')

          return df_topic_keywords

        elif cluster_model == 'hdbscan':
          self.vec[method] = self.vectorize(method)
          umap_embeddings = umap.UMAP(n_neighbors=15, 
                            n_components=5, 
                            metric='cosine').fit_transform(self.vec[method])
          cm = hdbscan.HDBSCAN(gen_min_span_tree=True, min_cluster_size=5, min_samples = 6,
                          metric='euclidean', cluster_selection_method='eom').fit(umap_embeddings)
          docs_df = pd.DataFrame(x_train_rus_clear, columns=["Doc"])
          docs_df['Topic'] = cm.labels_
          docs_df['Doc_ID'] = range(len(docs_df))
          docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})
          labels = list(docs_df['Topic'])

          def c_tf_idf(documents, m, ngram_range=(1, 1)):
            count = CountVectorizer(ngram_range=ngram_range, stop_words=all_sw).fit(documents)
            t = count.transform(documents).toarray()
            w = t.sum(axis=1)
            tf = np.divide(t.T, w)
            sum_t = t.sum(axis=0)
            idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
            tf_idf = np.multiply(tf, idf)

            return tf_idf, count
          
          tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(x_train_rus_clear))
          def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
            words = count.get_feature_names()
            labels = list(docs_per_topic.Topic)
            tf_idf_transposed = tf_idf.T
            indices = tf_idf_transposed.argsort()[:, -n:]
            top_n_words = {label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1] for i, label in enumerate(labels)}
            return top_n_words

          def extract_topic_sizes(df):
            topic_sizes = (df.groupby(['Topic'])
                            .Doc
                            .count()
                            .reset_index()
                            .rename({"Topic": "Topic", "Doc": "Size"}, axis='columns')
                            .sort_values("Size", ascending=False))
            return topic_sizes
          
          top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
          topic_sizes = extract_topic_sizes(docs_df) 
          
          #topic reduction
          wanted = self.k
          resize = len(topic_sizes) - (wanted-1)
          from sklearn.metrics.pairwise import cosine_similarity
          for i in range(resize):
            # Calculate cosine similarity
            similarities = cosine_similarity(tf_idf.T)
            np.fill_diagonal(similarities, 0)

            # Extract label to merge into and from where
            topic_sizes = docs_df.groupby(['Topic']).count().sort_values("Doc", ascending=False).reset_index()
            topic_to_merge = topic_sizes.iloc[-1].Topic
            topic_to_merge_into = np.argmax(similarities[topic_to_merge + 1]) - 1

            # Adjust topics
            docs_df.loc[docs_df.Topic == topic_to_merge, "Topic"] = topic_to_merge_into
            old_topics = docs_df.sort_values("Topic").Topic.unique()
            map_topics = {old_topic: index - 1 for index, old_topic in enumerate(old_topics)}
            docs_df.Topic = docs_df.Topic.map(map_topics)
            docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

            # Calculate new topic words
            m = len(x_train_rus_clear)
            tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m)
            top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
          
          topic_sizes = extract_topic_sizes(docs_df) 
          def get_topic_words(token_lists, labels, k=None):
              if k is None:
                  k = len(np.unique(labels))
              topics = ['' for _ in range(k)]
              for i, c in enumerate(token_lists):
                  topics[labels[i]] += (' ' + ' '.join(c))
              word_counts = list(map(lambda x: Counter(x.split()).items(), topics))
              word_counts = list(map(lambda x: sorted(x, key=lambda x: x[1], reverse=True), word_counts))
              topics = list(map(lambda x: list(map(lambda x: x[0], x[:10])), word_counts))
              return topics

          def get_coherence(labels, token_lists, measure='c_v'):
              topics = get_topic_words(token_lists, labels)
              cm = CoherenceModel(topics=topics, texts=token_lists, corpus=self.corpus, dictionary=self.dictionary,coherence=measure)
              return cm.get_coherence()
          
          topic_words = []
          for i in range(0, len(top_n_words)-1):
            topic = []
            for tupl in top_n_words[i]:
              topic.append(tupl[0])
            topic_words.append(topic)
          def topic_diversity(topic_words):
              topk = 10
              if topic_words is None:
                  return 0
              if topk > len(topic_words[0]):
                  raise Exception('Words in topics are less than ' + str(topk))
              else:
                  unique_words = set()
                  for topic in topic_words:
                      unique_words = unique_words.union(set(topic[:topk]))
                  td = len(unique_words) / (topk * len(topic_words))
                  return td    
          
          
          print('_________________________________________________________________')
          print('C_v coherence: ', get_coherence(list(docs_df['Topic']), x_rus, measure='c_v'))
          print('U_mass coherence: ', get_coherence(list(docs_df['Topic']), x_rus, measure='u_mass'))
          print('UCI : ', get_coherence(list(docs_df['Topic']), x_rus, measure='c_uci'))
          print('NPMI : ', get_coherence(list(docs_df['Topic']), x_rus, measure='c_npmi'))
          print('Topic_diversity : ', topic_diversity(topic_words))
          print('_________________________________________________________________')
          return top_n_words





## Fitting

BERT_HDBSCAN CHECK

Remember that hdbscan can't precisely make the model with the set number of topics

In [13]:
tm = Topic_Model(k = 10, method = 'BERT')

Initialized


In [14]:
tm.fit(corpus, dictionary, cluster_model='hdbscan')

Getting vector representations for BERT ...


Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.57M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/678M [00:00<?, ?B/s]

  0%|          | 0/1302 [00:00<?, ?it/s]

Getting vector representations for BERT. Done!
_________________________________________________________________
C_v coherence:  0.46711836398819073
U_mass coherence:  -5.243927009289497
UCI :  -2.2093662853200433
NPMI :  -0.014958780871574583
Topic_diversity :  0.8625
_________________________________________________________________


{-1: [('говорить', 0.029012087756061233),
  ('использовать', 0.028743774408192348),
  ('алиса', 0.027763832805770168),
  ('колонка', 0.02748825583900685),
  ('включать', 0.025266872501045805),
  ('музыка', 0.024912073249732228),
  ('ассистент', 0.02085359615174865),
  ('запрос', 0.020824875159947437),
  ('пользоваться', 0.02027920119464479),
  ('устройство', 0.01864932595013044),
  ('начинать', 0.01856224421132085),
  ('дом', 0.01817463464594161),
  ('телефон', 0.018129042359187608),
  ('получаться', 0.017362213856744527),
  ('делать', 0.016519073938965373),
  ('голосовой', 0.016233970202841504),
  ('ребенок', 0.015754110373979884),
  ('скоро', 0.015247839942347127),
  ('умный', 0.015065600080705292),
  ('основной', 0.014641764383304624)],
 0: [('целое', 0.31863114376505136),
  ('вежливый', 0.1650264145415674),
  ('идея', 0.14703959680332832),
  ('относиться', 0.12167278455401336),
  ('относиться_отношение', 0.11365980324381253),
  ('использовать', 0.10588961441984505),
  ('ассистент',

In [15]:
model_res = tm.fit(corpus, dictionary, cluster_model='hdbscan')

Getting vector representations for BERT ...


  0%|          | 0/1302 [00:00<?, ?it/s]

Getting vector representations for BERT. Done!
_________________________________________________________________
C_v coherence:  0.44770434595313513
U_mass coherence:  -4.982351291700264
UCI :  -1.9699330072057808
NPMI :  0.008762211086516945
Topic_diversity :  0.8625
_________________________________________________________________


## Graph

In [None]:
import json
import ast

In [129]:
nodes = []
links = []
for key, values in model_res.items():
  if key == -1:
    pass
  else:
    print(key, values)
    for tupl in values:
      nodes.append({'name': tupl[0]})
    # for tupl1 in range(len(values)-1):
    #   links.append({'source':0, 'target': tupl1})
nodes_str = []
for i in nodes:
  nodes_str.append(str(i))
nodes_str = list(set(nodes_str))
clear_nodes = []
for i in nodes_str:
  clear_nodes.append(ast.literal_eval(i))
node_namings_only = []
for node in clear_nodes:
  for key, node_name in node.items():
    node_namings_only.append(node_name)
links = []
for node in clear_nodes:
  for key, node_name in node.items():
    #print(node_name)
    for k,v in model_res.items():
      if k == -1:
        pass
      else:
        for tupl_id in range(len(v)):
          if  v[0][0] == node_name:
            links.append( {"source":node_namings_only.index(v[0][0]), 'target': node_namings_only.index(v[tupl_id][0])})         
#убираем одинаковые сурс и таргет
clear_links = []
for i in range(len(links)):
  compare = []
  for k,v in links[i].items():
    compare.append(v)
  if compare[0] == compare[1]:
    pass
  else:
    clear_links.append(links[i])

0 [('целое', 0.329270368599672), ('вежливый', 0.16050514291029158), ('идея', 0.14301111469912753), ('относиться', 0.11833928360732805), ('использовать', 0.11510482663365913), ('относиться_отношение', 0.11054583603165327), ('ассистент', 0.0957220441113344), ('форма_обращение', 0.09524347449202762), ('уместно', 0.09212153002637773), ('колонка', 0.07952937780569447), ('поболтать', 0.07817180122154878), ('вежливость', 0.06975637371354354), ('форма', 0.06669961273663655), ('умный', 0.06392905093607044), ('голосовой', 0.06168975060110656), ('окружение', 0.06026120685655799), ('норма', 0.05943865542789682), ('виртуальный', 0.05714608469521658), ('разряд', 0.055272918015826636), ('поиграть', 0.04585897214040724)]
1 [('подробно', 0.543535068262089), ('рассказывать', 0.4770527845407212), ('побуждать', 0.21497968171432566), ('возможность_функциональный', 0.19476286919097874), ('начинать', 0.1508187413821243), ('ассистент', 0.12422594169115399), ('голосовой', 0.12008938117015412), ('возможность', 

In [132]:
clear_nodes

[{'name': 'знакомый_практика'},
 {'name': 'конкретный'},
 {'name': 'вежливый'},
 {'name': 'различаться'},
 {'name': 'выключать'},
 {'name': 'яндекс'},
 {'name': 'открывать'},
 {'name': 'сторона'},
 {'name': 'телефон'},
 {'name': 'запрос'},
 {'name': 'ситуация'},
 {'name': 'просить'},
 {'name': 'самостоятельно'},
 {'name': 'сталкиваться'},
 {'name': 'относиться_отношение'},
 {'name': 'технология'},
 {'name': 'сложно'},
 {'name': 'отвечать'},
 {'name': 'понятно'},
 {'name': 'слышать'},
 {'name': 'серия'},
 {'name': 'надобность'},
 {'name': 'взаимодействие'},
 {'name': 'проживать'},
 {'name': 'алиса'},
 {'name': 'младший'},
 {'name': 'данный'},
 {'name': 'ошибка'},
 {'name': 'разум'},
 {'name': 'поиграть'},
 {'name': 'виртуальный'},
 {'name': 'подробно'},
 {'name': 'сиря'},
 {'name': 'решать'},
 {'name': 'проблема'},
 {'name': 'голосовой'},
 {'name': 'умный'},
 {'name': 'функциональный'},
 {'name': 'включать'},
 {'name': 'скажем'},
 {'name': 'сложность'},
 {'name': 'находиться'},
 {'name'

In [135]:
dict_json = {'links': clear_links, 'nodes': clear_nodes}

In [139]:
with open('graph.json', 'w', encoding='utf-8') as file:
    json.dump(dict_json, file, ensure_ascii=False)

LDA CHECK

In [None]:
tm1 = Topic_Model(k = 10, method = 'LDA')

Initialized


In [None]:
tm1.fit(corpus, dictionary)

Fitting LDA ...
Fitting LDA Done!
Topic: 0 Word: 0.012*"раздражать" + 0.012*"дело" + 0.012*"момент" + 0.012*"прямо" + 0.011*"большой" + 0.011*"интернет" + 0.011*"телевизор" + 0.010*"интересный" + 0.010*"телефон" + 0.009*"алекс"
Topic: 1 Word: 0.028*"музыка" + 0.020*"говорить" + 0.019*"включать" + 0.018*"проблема" + 0.015*"поговорить" + 0.014*"фраза" + 0.014*"колонка" + 0.013*"цель" + 0.012*"вызывать" + 0.012*"голос"
Topic: 2 Word: 0.042*"музыка" + 0.035*"включать" + 0.017*"настроение" + 0.015*"говорить" + 0.013*"нравиться" + 0.012*"дело" + 0.011*"утро" + 0.011*"использовать" + 0.011*"вечер" + 0.010*"допускать"
Topic: 3 Word: 0.039*"колонка" + 0.032*"ассистент" + 0.029*"использовать" + 0.026*"сталкиваться" + 0.025*"устройство" + 0.024*"использование" + 0.023*"подробно" + 0.019*"обращаться" + 0.018*"голосовой" + 0.013*"рассказывать"
Topic: 4 Word: 0.033*"алиса" + 0.023*"говорить" + 0.018*"разговаривать" + 0.015*"включать" + 0.015*"начинать" + 0.014*"погода" + 0.013*"запрос" + 0.011*"точн

BERT_KMEANS CHECK

In [None]:
tm2 = Topic_Model(k = 10, method = 'BERT')
tm2.fit(corpus, dictionary, cluster_model='Kmeans')

Initialized
Clustering embeddings ...
Getting vector representations for BERT ...


  0%|          | 0/1302 [00:00<?, ?it/s]

Getting vector representations for BERT. Done!
Clustering embeddings. Done!
Getting topic words
_________________________________________________________________
C_v coherence:  0.4029325251696475
U_mass coherence:  -3.074523991532285
UCI :  -0.828621006136968
NPMI :  -0.004317247693360652
Topic_diversity :  0.42
_________________________________________________________________


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,использовать,алиса,дом,колонка,ассистент,находиться,семья,включать,пользоваться,комната
Topic 1,музыка,включать,говорить,алиса,использовать,колонка,слушать,запрос,получаться,ребенок
Topic 2,включать,музыка,алиса,говорить,колонка,песня,либо,начинать,выключать,слушать
Topic 3,использовать,устройство,запрос,колонка,музыка,включать,ассистент,алиса,использование,ситуация
Topic 4,включать,вечер,утро,музыка,алиса,использовать,погода,свет,ужин,завтрак
Topic 5,ассистент,голосовой,говорить,использовать,телефон,колонка,запрос,музыка,целое,идеальный
Topic 6,использовать,дома,говорить,ассистент,устройство,самостоятельно,пользоваться,колонка,голосовой,музыка
Topic 7,ассистент,голосовой,использовать,говорить,запрос,колонка,телефон,алиса,начинать,пользоваться
Topic 8,проблема,говорить,сталкиваться,запрос,использование,ситуация,трудность,ассистент,недостаток,устройство
Topic 9,говорить,целое,скоро,колонка,ассистент,использовать,умный,пользоваться,итог,включать


BERT_LDA_KMEANS CHECK

In [None]:
tm3 = Topic_Model(k = 10, method = 'BERT_LDA')
tm3.fit(corpus, dictionary, cluster_model='Kmeans')

Initialized
Clustering embeddings ...
Getting vector representations
Getting vector representations for LDA ...
Getting vector representations for LDA. Done!
Getting vector representations for BERT ...


  0%|          | 0/1302 [00:00<?, ?it/s]

Getting vector representations for BERT. Done!
Fitting Autoencoder ...
Fitting Autoencoder Done!
Getting vector representations. Done!
Clustering embeddings. Done!
Getting topic words
_________________________________________________________________
C_v coherence:  0.4801940481979917
U_mass coherence:  -2.7254009025118493
UCI :  -0.5351680504703986
NPMI :  0.002609492583029728
Topic_diversity :  0.44
_________________________________________________________________


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,запрос,музыка,колонка,включать,поболтать,пробовать,реакция,говорить,погода,соответственно
Topic 1,запрос,колонка,ребенок,общий,ассистент,делать,использовать,целое,говорить,музыка
Topic 2,использовать,колонка,ассистент,подробно,сталкиваться,говорить,рассказывать,устройство,использование,начинать
Topic 3,говорить,момент,голос,включать,колонка,телефон,дело,музыка,находить,прямо
Topic 4,ассистент,колонка,использование,голосовой,умный,телефон,говорить,целое,алиса,дом
Topic 5,музыка,включать,алиса,говорить,колонка,рука,делать,ситуация,голос,послушать
Topic 6,использовать,ассистент,голосовой,пользоваться,колонка,алиса,телефон,дома,устройство,говорить
Topic 7,включать,говорить,музыка,алиса,запрос,просить,распознавать,либо,ситуация,реагировать
Topic 8,алиса,включать,музыка,говорить,запрос,разговаривать,погода,начинать,использовать,редко
Topic 9,включать,музыка,использовать,алиса,говорить,колонка,песня,дело,запрос,просить


BERT_LDA_HDBSCAN CHECK

In [None]:
tm4 = Topic_Model(k = 10, method = 'BERT_LDA')
tm4.fit(corpus, dictionary, cluster_model='hdbscan')

Initialized
Getting vector representations
Getting vector representations for LDA ...
Getting vector representations for LDA. Done!
Getting vector representations for BERT ...


  0%|          | 0/1302 [00:00<?, ?it/s]

Getting vector representations for BERT. Done!
Fitting Autoencoder ...
Fitting Autoencoder Done!
Getting vector representations. Done!
_________________________________________________________________
C_v coherence:  0.4476630284796845
U_mass coherence:  -3.827216020450319
UCI :  -1.4743960067842226
NPMI :  -0.012532944132612864
Topic_diversity :  0.7625
_________________________________________________________________


{-1: [('включать', 0.0340019151379548),
  ('музыка', 0.03280170828158372),
  ('говорить', 0.0273588715704079),
  ('колонка', 0.026158221774226977),
  ('запрос', 0.024661060076250973),
  ('алиса', 0.021672852078464193),
  ('использовать', 0.019399672521942902),
  ('слушать', 0.01714579522916201),
  ('делать', 0.016563958323650336),
  ('получаться', 0.016165861360786207),
  ('либо', 0.01608898274450433),
  ('основной', 0.015869491518602882),
  ('ассистент', 0.01496984377686056),
  ('допускать', 0.014920369793391952),
  ('просить', 0.014844846963177064),
  ('проблема', 0.014761337099280373),
  ('ребенок', 0.014610586577831248),
  ('нравиться', 0.014211792374004908),
  ('слышать', 0.013916681283651934),
  ('начинать', 0.01370292696184182)],
 0: [('подробно', 0.12001039248603587),
  ('ассистент', 0.09413494286182006),
  ('использовать', 0.08478269341082992),
  ('устройство', 0.07371394503906055),
  ('проживать', 0.07118372016428363),
  ('возможность', 0.06778157338855849),
  ('колонка', 0.0

## Черновик метрик

In [None]:
ldamodel = gensim.models.LdaMulticore(corpus, num_topics=10, 
                                       id2word = dictionary,
                                       workers = 2, passes=10,
                                       random_state=100,
                                       chunksize=100)

In [None]:
term_topic = ldamodel.get_topics()

In [None]:
term_topic.shape

(10, 2624)

In [None]:
def _LOR(P, Q):
    lor = 0
    for v, w in zip(P, Q):
        if v > 0 or w > 0:
            lor = lor + np.abs(np.log(v) - np.log(w))
    return lor / len(P)
def Kullback_Leibler(ldamodel):
    beta = ldamodel.get_topics()
    kl_div = 0
    count = 0
    for i, j in itertools.combinations(range(len(beta)), 2):
        kl_div += _LOR(beta[i], beta[j])
        count += 1
    return kl_div / count

In [None]:
Kullback_Leibler(ldamodel)

0.9005306353292816