# Topic Modeling with gensim

In [1]:
from __future__ import print_function
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import pickle

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [3]:
# gensim
import gensim
from gensim import corpora, models, similarities, matutils
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from gensim import corpora, models, similarities, matutils
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import pickle
import numpy as np
import pandas as pd

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


## Document Preprocessing
We'll need to generate a term-document matrix of word (token) counts for use in LDA.

We'll use `sklearn`'s `CountVectorizer` to generate our term-document matrix of counts. We'll make use of a few parameters to accomplish the following preprocessing of the text documents all within the `CountVectorizer`:
* `analyzer=word`: Tokenize by word
* `ngram_range=(1,2)`: Keep all 1 and 2-word grams
* `stop_words=english`: Remove all English stop words
* `token_pattern=\\b[a-z][a-z]+\\b`: Match all tokens with 2 or more (strictly) alphabet characters

In [22]:
#Open Corpus of Bitcoin News Articles
with open('../news_articles_processing/data_frames/bitcoin_news_df_processed_for_modeling.pickle', 'rb') as file:
     bitcoin_news_df = pickle.load(file)

### Prepare Bigram Text

In [7]:
bigram_text = Phrases(bitcoin_news_df.tokenized_text, min_count=2, threshold = 80);
bigram_mod_text = Phraser(bigram_text);

2019-06-12 01:28:48,031 : INFO : collecting all words and their counts
2019-06-12 01:28:48,032 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2019-06-12 01:28:51,990 : INFO : PROGRESS: at sentence #10000, processed 2608196 words and 1414836 word types
2019-06-12 01:28:54,755 : INFO : collected 2238480 word types from a corpus of 4473232 words (unigram + bigrams) and 17048 sentences
2019-06-12 01:28:54,755 : INFO : using 2238480 counts as vocab in Phrases<0 vocab, min_count=2, threshold=80, max_vocab_size=40000000>
2019-06-12 01:28:54,756 : INFO : source_vocab length 2238480
2019-06-12 01:29:13,208 : INFO : Phraser built with 25218 25218 phrasegrams


In [23]:
words_bigrams_news_text = pd.Series([bigram_mod_text[doc] for doc in bitcoin_news_df.tokenized_text])

In [25]:
news_extracted = words_bigrams_news_text.apply(lambda x: ' '.join(x))

## Convert to document-term matrix
Next, the raw documents are converted into document-term matrix, possibly as raw counts or in TF-IDF form.

In [28]:
#Bigrams
tf_vectorizer_bi = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)

dtm_tf_bi = tf_vectorizer_bi.fit_transform(news_extracted)
print(dtm_tf_bi.shape)

tfidf_vectorizer_bi = TfidfVectorizer(**tf_vectorizer_bi.get_params())
dtm_tfidf_bi = tfidf_vectorizer_bi.fit_transform(news_extracted)
print(dtm_tfidf_bi.shape)

(17048, 14468)




(17048, 14468)


## Fit Latent Dirichlet Allocation models
Finally, the LDA models are fitted.

In [29]:
#Bigrams
# for TF DTM
lda_tf_bi = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tf_bi.fit(dtm_tf_bi)

# for TFIDF DTM
lda_tfidf_bi = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tfidf_bi.fit(dtm_tfidf_bi)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=20, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

## Visualizing the models with pyLDAvis

In [30]:
bi_real = pyLDAvis.sklearn.prepare(lda_tf_bi, dtm_tf_bi, tf_vectorizer_bi)
bi_real

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
