In [54]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np
import pre_clean as clean
from numpy import array
import pyLDAvis
import pyLDAvis.sklearn
import warnings
warnings.filterwarnings('ignore')  # To ignore all warnings that arise here to enhance clarity
from sklearn.decomposition import LatentDirichletAllocation


%matplotlib inline

For using pyLDAvis visualization

In [32]:
%pwd
%ls ../data/

biology_flashcards.txt              full_corpus_cleaned.txt
biology_flashcards_cleaned.txt      history_flashcards.txt
datascience_flashcards.txt          history_flashcards_cleaned.txt
datascience_flashcards_cleaned.txt


Import data

In [34]:
data = '../data/datascience_flashcards.txt'
df_datascience = clean.read_cards(data)
#   biology cards
data = '../data/biology_flashcards.txt'
df_biology = clean.read_cards(data)
#   history cards
data = '../data/history_flashcards.txt'
df_history = clean.read_cards(data)

concat to build full corpus

In [36]:
frames = [df_datascience, df_biology, df_history]
corpus = pd.concat(frames)
len(corpus)

36188

Clean Corpus

In [37]:
# clean corpus
corpus_clean = clean.clean_dataframe(corpus)
corpus_collapsed = clean.collapse_df(corpus_clean)

79                                                     
79                                              neutral
79    tobacco. didn't have much food though. john sm...
Name: answer, dtype: object


Set up vectorizers

In [46]:
count_vectorizer = CountVectorizer(min_df=5, max_df=0.80, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.80, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')

Fit vectorizers

In [47]:
corpus_tfidf = tfidf_vectorizer.fit_transform(corpus_collapsed)
corpus_count = count_vectorizer.fit_transform(corpus_collapsed)

In [49]:
corpus_tfidf.shape

(36188, 9436)

Fit LDA mode

In [52]:
# With Count Vector, running with default 10 iterations
lda_corpus_count = LatentDirichletAllocation(n_topics=3, random_state=0)
lda_corpus_count.fit(corpus_count)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=3,
             perp_tol=0.1, random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [53]:
# With TF-IDF matrix, running with default 10 iterations
lda_corpus_tfidf = LatentDirichletAllocation(n_topics=3, random_state=0)
lda_corpus_tfidf.fit(corpus_tfidf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=3,
             perp_tol=0.1, random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

## Visualizing with pyLDAvis on Full Corpus (all topics)

### First on TF Vector

In [63]:
# First with Count Vector
pyl_data = pyLDAvis.sklearn.prepare(lda_corpus_count, corpus_count, count_vectorizer, R=15)
pyLDAvis.display(pyl_data)
# pyLDAvis.save_html(pyl_data, "../images/count_vect_topics.html")

### Now on TF-IDF Vector

In [64]:
pyl_data = pyLDAvis.sklearn.prepare(lda_corpus_tfidf, corpus_tfidf, tfidf_vectorizer, R=15)
pyLDAvis.display(pyl_data)
# pyLDAvis.save_html(pyl_data, "../images/tfidf_vect_topics.html")