In [22]:
from __future__ import print_function
from time import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [23]:
# Load the review data set into a DataFrame. The first row of the CSV is the
# header, and the file is decoded as ISO-8859-1 (Latin-1).
reviews = pd.read_csv(
    'home_products_additional_features.csv',
    header=0,
    encoding="ISO-8859-1",
)

In [77]:
def run_nmf(nmf_features, nmf_topics, nmf_top_words, nmf_data_samples, nmf_max_df, nmf_min_df, nmf_alpha, nmf_l1_ratio):
    """Fit an NMF topic model on tf-idf features and print each topic's top words.

    Parameters
    ----------
    nmf_features : passed to ``TfidfVectorizer`` as ``max_features``.
    nmf_topics : passed to ``NMF`` as ``n_components``.
    nmf_top_words : number of words printed per topic by ``print_top_words``.
    nmf_data_samples : the documents handed to the vectorizer's ``fit_transform``.
    nmf_max_df : passed to ``TfidfVectorizer`` as ``max_df``.
    nmf_min_df : passed to ``TfidfVectorizer`` as ``min_df``.
    nmf_alpha : passed to ``NMF`` as ``alpha``.
    nmf_l1_ratio : passed to ``NMF`` as ``l1_ratio``.

    Returns
    -------
    None. The topics are written to stdout.
    """
    # Every setting below comes from this function's own parameters. Reading
    # un-prefixed names (max_df, n_topics, ...) would either raise NameError on
    # a fresh kernel or silently pick up stale notebook globals instead of the
    # values the caller passed in.
    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer(max_df=nmf_max_df, min_df=nmf_min_df,
                                       max_features=nmf_features,
                                       stop_words='english')

    tfidf = tfidf_vectorizer.fit_transform(nmf_data_samples)

    # random_state is fixed so repeated runs with the same inputs are comparable.
    # NOTE(review): `alpha=` and `get_feature_names()` are kept exactly as the
    # notebook wrote them; newer scikit-learn releases are believed to rename
    # both -- confirm against the installed version before upgrading.
    nmf = NMF(n_components=nmf_topics, random_state=1,
              alpha=nmf_alpha, l1_ratio=nmf_l1_ratio).fit(tfidf)

    print("\nTopics in NMF model:")
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    print_top_words(nmf, tfidf_feature_names, nmf_top_words)

In [80]:
def run_lda(lda_features, lda_topics, lda_top_words, lda_data_samples, lda_max_df, lda_min_df, lda_max_iter, lda_learning_offset):
    """Fit an online LDA topic model on term counts and print each topic's top words.

    Parameters
    ----------
    lda_features : passed to ``CountVectorizer`` as ``max_features``.
    lda_topics : passed to ``LatentDirichletAllocation`` as ``n_topics``.
    lda_top_words : number of words printed per topic by ``print_top_words``.
    lda_data_samples : the documents handed to the vectorizer's ``fit_transform``.
    lda_max_df : passed to ``CountVectorizer`` as ``max_df``.
    lda_min_df : passed to ``CountVectorizer`` as ``min_df``.
    lda_max_iter : passed to ``LatentDirichletAllocation`` as ``max_iter``.
    lda_learning_offset : passed to ``LatentDirichletAllocation`` as ``learning_offset``.

    Returns
    -------
    None. The topics are written to stdout.
    """
    # Build the document-term count matrix with English stop words removed.
    count_vectorizer = CountVectorizer(
        max_df=lda_max_df,
        min_df=lda_min_df,
        max_features=lda_features,
        stop_words='english',
    )
    term_counts = count_vectorizer.fit_transform(lda_data_samples)

    # NOTE(review): `n_topics=` and `get_feature_names()` are kept exactly as the
    # notebook wrote them; newer scikit-learn releases are believed to rename
    # both -- confirm against the installed version before upgrading.
    topic_model = LatentDirichletAllocation(
        n_topics=lda_topics,
        max_iter=lda_max_iter,
        learning_method='online',
        learning_offset=lda_learning_offset,
        random_state=0,
    )
    topic_model.fit(term_counts)

    print("\nTopics in LDA model:")
    vocabulary = count_vectorizer.get_feature_names()
    print_top_words(topic_model, vocabulary, lda_top_words)

In [25]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted feature names for every topic of a fitted model.

    For each row of ``model.components_`` a ``Topic #<index>:`` header is
    printed, followed by one line holding the ``n_top_words`` feature names with
    the largest weights in that row, space-separated and in descending weight
    order. A single blank line is printed after the last topic.

    Parameters
    ----------
    model : object exposing ``components_``; each row must support ``argsort()``.
    feature_names : sequence indexed by column position of ``components_``.
    n_top_words : how many names to print per topic.
    """
    for topic_number, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_number)
        # argsort() is ascending, so reverse it and keep the leading entries to
        # get the positions of the largest weights first.
        strongest = weights.argsort()[::-1][:n_top_words]
        print(" ".join(feature_names[position] for position in strongest))
    print()

In [83]:
# ---------------------- SET NMF PARAMETERS AND RUN MODEL ----------------------

# Documents for the NMF model, taken from one column of `reviews`.
# To run on a different data set, replace the column name below with exactly
# one of:
#   'Text', 'Title',
#   'text_and_title', 'double_title',
#   'text_and_title_no_stops', 'double_title_no_stops',
#   'text_and_title_negation', 'double_title_negation',
#   'text_and_title_negation_no_stops', 'double_title_negation_no_stops',
#   'lemma_text_title_no_stops', 'lemma_double_title_no_stops'
nmf_data_samples = list(reviews['double_title_no_stops'])

# Size of the vocabulary
nmf_features = 10000
# Number of topics
nmf_topics = 10
# Words to include in the topic
nmf_top_words = 20
# Ignore terms that have a doc frequency (percent or int) strictly higher than
# the given threshold
nmf_max_df = 0.95
# Ignore terms that have a doc frequency (percent or int) strictly lower than
# the given threshold
nmf_min_df = 2
# Constant that multiplies the regularization terms. Set to zero for no
# regularization.
nmf_alpha = .1
# Regularization mixing parameter.  0 <= l1_ratio <= 1
nmf_l1_ratio = .5

# Keyword arguments keep each setting visibly paired with the parameter it feeds.
run_nmf(
    nmf_features=nmf_features,
    nmf_topics=nmf_topics,
    nmf_top_words=nmf_top_words,
    nmf_data_samples=nmf_data_samples,
    nmf_max_df=nmf_max_df,
    nmf_min_df=nmf_min_df,
    nmf_alpha=nmf_alpha,
    nmf_l1_ratio=nmf_l1_ratio,
)

Extracting tf-idf features for NMF...

Topics in NMF model:
Topic #0:
clorox wipes convenient products use disinfect using house disinfects home like family cleaning germs trust especially know messes kids bleach
Topic #1:
stars price works work wipes glasses fast excellent job described great expected deal perfect time exactly worked shipping thank service
Topic #2:
great product works price work use cleaning excellent wipes recommend value cleans products job buy smells smell home deal house
Topic #3:
love wipes smell use using house absolutely convenience home products room smells kids scent em especially cleans product kitchen day
Topic #4:
good product price value smell really wipes job like buy works cleaning smells cleans stuff quality pretty screen deal work
Topic #5:
easy use convenient quick wipes kids cleanup fast cleaning super disinfect makes effective grab time clean make wipe mess messes
Topic #6:
disinfecting wipes lysol lemon cleaning scent pack great value convenient 

In [92]:
# ---------------------- SET LDA PARAMETERS AND RUN MODEL ----------------------

# Documents for the LDA model, taken from one column of `reviews`.
# To run on a different data set, replace the column name below with exactly
# one of:
#   'Text', 'Title',
#   'text_and_title', 'double_title',
#   'text_and_title_no_stops', 'double_title_no_stops',
#   'text_and_title_negation', 'double_title_negation',
#   'text_and_title_negation_no_stops', 'double_title_negation_no_stops',
#   'lemma_text_title_no_stops', 'lemma_double_title_no_stops'
lda_data_samples = list(reviews['lemma_double_title_no_stops'])

# Size of the vocabulary
lda_features = 12000
# Number of topics
lda_topics = 20
# Words to include in the topic
lda_top_words = 10
# Ignore terms that have a doc frequency (percent or int) strictly higher than
# the given threshold
lda_max_df = 0.95
# Ignore terms that have a doc frequency (percent or int) strictly lower than
# the given threshold
lda_min_df = 5
# Number of iterations to compute
lda_max_iter = 5
# A parameter that downweights early iterations in online learning. Should be > 1
lda_learning_offset = 50.

# Keyword arguments keep each setting visibly paired with the parameter it feeds.
run_lda(
    lda_features=lda_features,
    lda_topics=lda_topics,
    lda_top_words=lda_top_words,
    lda_data_samples=lda_data_samples,
    lda_max_df=lda_max_df,
    lda_min_df=lda_min_df,
    lda_max_iter=lda_max_iter,
    lda_learning_offset=lda_learning_offset,
)


Topics in LDA model:
Topic #0:
travel cover icloth equipment portable pinch ocd 24 pas kit
Topic #1:
great product good work value cleaning steel use stainless job
Topic #2:
star grease work cut good wipe love oil clean tool
Topic #3:
nice super baby puppy money waste softy generation seventh sturdy
Topic #4:
floor clean water ok hard scrub use dirt dirty tile
Topic #5:
excellent save quality silver product clear high subscribe expect regular
Topic #6:
wipe clorox clean love use easy disinfect bathroom kitchen house
Topic #7:
glass lens use clean effective live time day wipe year
Topic #8:
wipe clean work use screen good leave dust best job
Topic #9:
fast chemical smell review buy natural durable powerful skin green
Topic #10:
touchscreen im awsome received station hesitant luv thankful spic span
Topic #11:
lysol smell scent wipe lemon price fresh strong disinfect great
Topic #12:
flu office season cold especially shoe bug lifesaver bay pen
Topic #13:
spray towel surface paper bottle 