In [54]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np
import pre_clean as clean
from numpy import array
import pyLDAvis
import pyLDAvis.sklearn
import warnings
warnings.filterwarnings('ignore')  # To ignore all warnings that arise here to enhance clarity
from sklearn.decomposition import LatentDirichletAllocation


%matplotlib inline

For using pyLDAvis visualization

In [32]:
%pwd
%ls ../data/

biology_flashcards.txt              full_corpus_cleaned.txt
biology_flashcards_cleaned.txt      history_flashcards.txt
datascience_flashcards.txt          history_flashcards_cleaned.txt
datascience_flashcards_cleaned.txt


**Import data**

In [34]:
data = '../data/datascience_flashcards.txt'
df_datascience = clean.read_cards(data)
#   biology cards
data = '../data/biology_flashcards.txt'
df_biology = clean.read_cards(data)
#   history cards
data = '../data/history_flashcards.txt'
df_history = clean.read_cards(data)

**Clean Each Data Set**

In [65]:
df_datascience_clean = clean.clean_dataframe(df_datascience)
df_biology_clean = clean.clean_dataframe(df_biology)
df_history_clean = clean.clean_dataframe(df_history)

 
neutral
tobacco. didn't have much food though. john smith led them, while john rolf got the tobacca with pocahonta (sp ).


**Collapse Each Dataset**

In [67]:
df_datascience_collapsed = clean.collapse_df(df_datascience_clean)
df_biology_collapsed = clean.collapse_df(df_biology_clean)
df_history_collapsed = clean.collapse_df(df_history_clean)

**Set Up Vectorizers**

In [81]:
# datascience
ds_count_vectorizer = CountVectorizer(min_df=5, max_df=0.80, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
ds_tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.80, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')

# biology
bio_count_vectorizer = CountVectorizer(min_df=5, max_df=0.80, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
bio_tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.80, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')

# history
his_count_vectorizer = CountVectorizer(min_df=5, max_df=0.80, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
his_tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.80, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')

**Fit Vectorizers**

In [82]:
# datascience
ds_tfidf = ds_tfidf_vectorizer.fit_transform(df_datascience_collapsed)
ds_count = ds_count_vectorizer.fit_transform(df_datascience_collapsed)

# biology
bio_tfidf = bio_tfidf_vectorizer.fit_transform(df_biology_collapsed)
bio_count = bio_count_vectorizer.fit_transform(df_biology_collapsed)

# history
his_tfidf = his_tfidf_vectorizer.fit_transform(df_history_collapsed)
his_count = his_count_vectorizer.fit_transform(df_history_collapsed)

**Fit LDA Models**

*Data Science*

In [83]:
# With Count Vector, running with default 10 iterations, 5 topics
lda_ds_count = LatentDirichletAllocation(n_topics=5, random_state=0)
lda_ds_count.fit(ds_count)

# With TF-IDF matrix, running with default 10 iterations
lda_ds_tfidf = LatentDirichletAllocation(n_topics=5, random_state=0)
lda_ds_tfidf.fit(ds_tfidf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=5,
             perp_tol=0.1, random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

*Biology*

In [72]:
# With Count Vector, running with default 10 iterations, 5 topics
lda_bio_count = LatentDirichletAllocation(n_topics=5, random_state=0)
lda_bio_count.fit(bio_count)

# With TF-IDF matrix, running with default 10 iterations
lda_bio_tfidf = LatentDirichletAllocation(n_topics=5, random_state=0)
lda_bio_tfidf.fit(bio_tfidf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=5,
             perp_tol=0.1, random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

*History*

In [73]:
# With Count Vector, running with default 10 iterations, 5 topics
lda_his_count = LatentDirichletAllocation(n_topics=5, random_state=0)
lda_his_count.fit(his_count)

# With TF-IDF matrix, running with default 10 iterations
lda_his_tfidf = LatentDirichletAllocation(n_topics=5, random_state=0)
lda_his_tfidf.fit(his_tfidf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=5,
             perp_tol=0.1, random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

## Visualizing with pyLDAvis on each topic separately


**Data Science Topic Vis**

In [93]:
# First with Count Vector on Data Science
pyl_ds_data_cv = pyLDAvis.sklearn.prepare(lda_ds_count, ds_count, ds_count_vectorizer, R=15)
pyLDAvis.display(pyl_ds_data_cv)
# pyLDAvis.save_html(pyl_ds_data_cv, "../images/datascience_count_vect_topics.html")

In [95]:
# Next with TFIDF Vector on Data Science
pyl_ds_data_tfidf = pyLDAvis.sklearn.prepare(lda_ds_tfidf, ds_tfidf, ds_tfidf_vectorizer, R=15)
pyLDAvis.display(pyl_ds_data_tfidf)
# pyLDAvis.save_html(pyl_ds_data_tfidf, "../images/datascience_tfidf_vect_topics.html")

**Biology Topic Vis**

In [97]:
# First with Count Vector on Biology
pyl_bio_data_cv = pyLDAvis.sklearn.prepare(lda_bio_count, bio_count, bio_count_vectorizer, R=15)
pyLDAvis.display(pyl_bio_data_cv)
# pyLDAvis.save_html(pyl_bio_data_cv, "../images/bio_count_vect_topics.html")

In [99]:
# Next with TFIDF Vector on Biology
pyl_bio_data_tfidf = pyLDAvis.sklearn.prepare(lda_bio_tfidf, bio_tfidf, bio_tfidf_vectorizer, R=15)
pyLDAvis.display(pyl_bio_data_tfidf)
# pyLDAvis.save_html(pyl_bio_data_tfidf, "../images/bio_tfidf_vect_topics.html")

**History Topic Vis**

In [101]:
# First with Count Vector on History
pyl_his_data_cv = pyLDAvis.sklearn.prepare(lda_his_count, his_count, his_count_vectorizer, R=15)
pyLDAvis.display(pyl_his_data_cv)
# pyLDAvis.save_html(pyl_his_data_cv, "../images/his_count_vect_topics.html")

In [103]:
# Next with TFIDF Vector on History
pyl_his_data_tfidf = pyLDAvis.sklearn.prepare(lda_his_tfidf, his_tfidf, his_tfidf_vectorizer, R=15)
pyLDAvis.display(pyl_his_data_tfidf)
# pyLDAvis.save_html(pyl_his_data_tfidf, "../images/his_tfidf_vect_topics.html")

## **concat to build full corpus**

In [105]:
frames = [df_datascience, df_biology, df_history]
corpus = pd.concat(frames)
len(corpus)

36188

**Clean Corpus**

In [106]:
# clean corpus
corpus_clean = clean.clean_dataframe(corpus)
corpus_collapsed = clean.collapse_df(corpus_clean)

79                                                     
79                                              neutral
79    tobacco. didn't have much food though. john sm...
Name: answer, dtype: object


**Set up vectorizers**

In [107]:
count_vectorizer = CountVectorizer(min_df=5, max_df=0.80, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.80, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')

**Fit vectorizers**

In [108]:
corpus_tfidf = tfidf_vectorizer.fit_transform(corpus_collapsed)
corpus_count = count_vectorizer.fit_transform(corpus_collapsed)

In [109]:
corpus_tfidf.shape

(36188, 9436)

**Fit LDA models**

In [110]:
# With Count Vector, running with default 10 iterations
lda_corpus_count = LatentDirichletAllocation(n_topics=3, random_state=0)
lda_corpus_count.fit(corpus_count)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=3,
             perp_tol=0.1, random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [111]:
# With TF-IDF matrix, running with default 10 iterations
lda_corpus_tfidf = LatentDirichletAllocation(n_topics=3, random_state=0)
lda_corpus_tfidf.fit(corpus_tfidf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=3,
             perp_tol=0.1, random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

## Visualizing with pyLDAvis on Full Corpus (all topics)

### First on TF Vector

In [112]:
# First with Count Vector
pyl_data_cv = pyLDAvis.sklearn.prepare(lda_corpus_count, corpus_count, count_vectorizer, R=15)
pyLDAvis.display(pyl_data_cv)
# pyLDAvis.save_html(pyl_data_cv, "../images/all_count_vect_topics.html")

### Now on TF-IDF Vector

In [113]:
pyl_data_tfidf = pyLDAvis.sklearn.prepare(lda_corpus_tfidf, corpus_tfidf, tfidf_vectorizer, R=15)
pyLDAvis.display(pyl_data_tfidf)
# pyLDAvis.save_html(pyl_data_tfidf, "../images/all_tfidf_vect_topics.html")