In [2]:
%matplotlib inline
import random
random.seed(1234)

import pandas as pd
import gzip
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# import nltk
# nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import spacy
import matplotlib.pyplot as plt
import pyLDAvis #python library for interactive topic model visualization
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# import warnings
# warnings.filterwarnings("ignore",category=DeprecationWarning)

import pickle
import numpy as np

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
# import pyLDAvis
# import pyLDAvis.sklearn
# import matplotlib.pyplot as plt
%matplotlib inline

from tqdm.notebook import tqdm as tqdm
tqdm.pandas()

# Example for detecting bigrams 
import math
import nltk
from collections import defaultdict

nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
from nltk.probability import FreqDist

from gensim.models.ldamulticore import LdaMulticore

# read data 

In [3]:
df = pd.read_csv('./all_df.csv', index_col = False)

# batch prepare data

In [4]:
df.loc[df.category_2 == 'Cats', ['category_3']].value_counts()

category_3                   
Beds & Furniture                 53303
Litter & Housebreaking           39361
Toys                             36078
Health Supplies                  34889
Food                             27843
Treats                           10143
Grooming                          8567
Feeding & Watering Supplies       7831
Cat Flaps, Steps, Nets & Pens     3965
Collars, Harnesses & Leashes      3082
Educational Repellents            2263
Carriers & Strollers              1284
Cages                              196
Memorials                          155
Apparel                            150
Medications                          8
dtype: int64

In [5]:
df1 = df.loc[(df.category_2 == 'Cats') & (df.category_3 == 'Toys')]
df2 = df.loc[(df.category_2 == 'Cats') & (df.category_3 == 'Food')].reset_index(drop = True) 
df3 = df.loc[(df.category_2 == 'Cats') & (df.category_3 == 'Beds & Furniture')]
df4 = df.loc[(df.category_2 == 'Cats') & (df.category_3 == 'Litter & Housebreaking')]
df5 = df.loc[(df.category_2 == 'Cats') & (df.category_3 == 'Health Supplies')]
df6 = df.loc[(df.category_2 == 'Cats') & (df.category_3 == 'Treats')]
df7 = df.loc[(df.category_2 == 'Cats') & (df.category_3 == 'Grooming')]
df8 = df.loc[(df.category_2 == 'Cats') & (df.category_3 == 'Feeding & Watering Supplies')]

In [6]:
def sent_to_words(sentences):
    '''
    Simplify and tokenize strings in an iterable.
    
            Parameters:
                    sentences (iterable): Review strings in a list or as a pandas.series.
                    
            Returns:
                    _ (generator): Simplify review tokens.  
    '''
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

def get_bigrams(data_words): 
    '''
    Form bigram.
    
        Parameters:
            data_words (iterable): Review tokens in a list or as a pandas.series.

        Returns:
           bigram_pmi (data frame): bigram and it's pmi score. 
    '''
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = nltk.collocations.BigramCollocationFinder.from_documents(data_words) #data_words_nostops
    finder.apply_freq_filter(20)
    bigram_scores = finder.score_ngrams(bigram_measures.pmi) 
    bigram_pmi = pd.DataFrame(bigram_scores)
    bigram_pmi.columns = ['bigram', 'pmi']
    bigram_pmi.sort_values(by='pmi', axis = 0, ascending = False, inplace = True)
    return finder, bigram_measures, bigram_pmi

def bigram_filter(bigram):
    """
    Filter bigram. 
        
        Parameters:
            bigram (string): a bigram.

        Returns:
           _ (boolean): decide whether or not to keep the bigram.
    """
    stop_words = stopwords.words('english')   
    tag = nltk.pos_tag(bigram)
    #if tag[0][1] not in ['JJ', 'NN', 'NNS'] and tag[1][1] not in ['NN', 'NNS']: #we only want adjective + noun
    if tag[0][1] not in ['JJ', 'NN'] and tag[1][1] not in ['NN']: #we only want adjective + noun
        return False
    if bigram[0] in stop_words or bigram[1] in stop_words: 
        return False
    if 'n' in bigram or 't' in bigram:
        return False
    if 'PRON' in bigram: #we don't want pronoun
        return False
    if len(bigram[0]) <= 2 or len(bigram[1]) <= 2:
        return False
    return True

def get_trigrams(data_words): 
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    finder = nltk.collocations.TrigramCollocationFinder.from_documents(data_words) #data_words_nostops
    finder.apply_freq_filter(20)
    trigram_scores = finder.score_ngrams(trigram_measures.pmi)

    trigram_pmi = pd.DataFrame(trigram_scores)
    trigram_pmi.columns = ['trigram', 'pmi']
    trigram_pmi.sort_values(by='pmi', axis = 0, ascending = False, inplace = True)
    return finder, trigram_measures, trigram_pmi

def trigram_filter(trigram):
    tag = nltk.pos_tag(trigram)
    stop_words = stopwords.words('english')
    #if tag[0][1] not in ['JJ', 'NN', 'NNS'] and tag[1][1] not in ['NN', 'NNS']: #we only want adjective + noun
    if tag[0][1] not in ['JJ', 'NN'] and tag[1][1] not in ['JJ','NN']:
        return False
    if trigram[0] in stop_words or trigram[-1] in stop_words or trigram[1] in stop_words:
        return False
    if 'n' in trigram or 't' in trigram:
        return False
    if 'PRON' in trigram:
        return False
    if len(trigram[0]) <= 2 or len(trigram[1]) <= 2 and len(trigram[2]) <= 2:
        return False
    return True 

def replace_ngram(x, bigrams, trigrams):
    '''
    Form ngram. 
        
        Parameters:
            bigram (string): a bigram.

        Returns:
           _ (boolean): decide whether or not to keep the bigram.
    '''
    for gram in trigrams:
        x = x.replace(gram, '_'.join(gram.split()))
    for gram in bigrams:
        x = x.replace(gram, '_'.join(gram.split()))
    return x


def remove_stopwords(texts, extra_stopwords = []):
    '''
    Remove stopwords in an iterable (string or tokens).

        Parameters:
                texts (iterable): Review strings in a list or as a pandas.series.

        Returns:
                _ (list): Simplify review stirngs without stopwords.  
    '''
    stop_words = stopwords.words('english') + extra_stopwords
    return [[word for word in doc.split() if word not in stop_words] for doc in texts]

def lemmatize_skip_ngrams(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    '''feed tokens'''
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
    texts_out = []
    for sent in texts:
        doc = nlp(' '.join(sent)) 
        if len(allowed_postags) > 0: 
            returned_doc = []
            for token in doc:
                if '_' in token.text:
                    returned_token = token.text
                if token.pos_ in allowed_postags and '_' not in token.text:
                    returned_token = token.lemma_
                if token.pos_ not in allowed_postags and '_' not in token.text:
                    continue
                returned_doc.append(returned_token)
            texts_out.append(returned_doc)
                    
        else: 
            returned_doc = []
            for token in doc:
                if '_' in token.text:
                    returned_token = token.text
                else: 
                    returned_token = token.lemma_
                returned_doc.append(returned_token)
            texts_out.append(returned_doc)
    return texts_out

def preprocess_review(reviews, extra_stopwords = ['cat', 'cats'], allowed_postags=[]): 
    
    data_words = list(sent_to_words(tqdm(reviews.tolist())))
    bigram_finder, bigram_measures, bigram_pmi = get_bigrams(tqdm(data_words))
    trigram_finder, trigram_measures, trigram_pmi = get_trigrams(tqdm(data_words))
    filtered_bigram = bigram_pmi[bigram_pmi.progress_apply(lambda bigram:\
                                                  bigram_filter(bigram['bigram'])\
                                                  and bigram.pmi > 5, axis = 1)][:500]

    filtered_trigram = trigram_pmi[trigram_pmi.progress_apply(lambda trigram: \
                                                     trigram_filter(trigram['trigram'])\
                                                     and trigram.pmi > 5, axis = 1)][:500]
    bigrams = [' '.join(x) for x in filtered_bigram.bigram.values]
    trigrams = [' '.join(x) for x in filtered_trigram.trigram.values]
    reviews_ngrams = pd.DataFrame([' '.join(sen) for sen in data_words], columns = ['reviewText'])
    reviews_ngrams.reviewText = reviews_ngrams.reviewText.map(lambda x: replace_ngram(x, bigrams, trigrams))
    cleaned_reviews_ngrams = remove_stopwords(tqdm(reviews_ngrams.reviewText), extra_stopwords = extra_stopwords)
    lemmatized_reviews_ngrams = lemmatize_skip_ngrams(tqdm(cleaned_reviews_ngrams), allowed_postags = allowed_postags)
    return filtered_bigram, filtered_trigram, lemmatized_reviews_ngrams

In [7]:
df8.shape

(7831, 23)

In [None]:
cats_toys_bigram, cats_toys_trigram, cats_toys_reviews = preprocess_review(df1.reviewText, extra_stopwords = ['cat', 'cats', 'toy', 'toys'])
cats_food_bigram, cats_food_trigram, cats_food_reviews = preprocess_review(df2.reviewText, extra_stopwords = ['cat', 'cats', 'food'])
cats_beds_bigram, cats_beds_trigram, cats_beds_reviews = preprocess_review(df3.reviewText, extra_stopwords = ['cat', 'cats', 'bed', 'beds'] )
cats_litter_bigram, cats_litter_trigram, cats_litter_reviews = preprocess_review(df4.reviewText, extra_stopwords = ['cat', 'cats', 'litters', 'litter'])
cats_health_supplies_bigram, cats_health_supplies_trigram, cats_health_supplies_reviews = preprocess_review(df5.reviewText, extra_stopwords = ['cat', 'cats'])
cats_treats_bigram, cats_treats_trigram, cats_treats_reviews = preprocess_review(df6.reviewText, extra_stopwords = ['cat', 'cats', 'treat', 'treats'])
cats_grooming_bigram, cats_grooming_trigram, cats_grooming_reviews = preprocess_review(df7.reviewText, extra_stopwords = ['cat', 'cats'])
cats_feeding_supplies_bigram, cats_feeding_suppliess_trigram, cats_feeding_supplies_reviews = preprocess_review(df8.reviewText, extra_stopwords = ['cat', 'cats'])

  0%|          | 0/36078 [00:00<?, ?it/s]

  0%|          | 0/36078 [00:00<?, ?it/s]

  0%|          | 0/36078 [00:00<?, ?it/s]

  0%|          | 0/14874 [00:00<?, ?it/s]

  0%|          | 0/9235 [00:00<?, ?it/s]

  0%|          | 0/36078 [00:00<?, ?it/s]

  0%|          | 0/36078 [00:00<?, ?it/s]

In [None]:
cats_toys_bigram, cats_toys_trigram, cats_toys_reviews2 = preprocess_review(df1.reviewText, extra_stopwords = ['cat', 'cats', 'toy', 'toys'],
                                                                          allowed_postags = ['ADJ', 'VERB', 'ADV'])
cats_food_bigram, cats_food_trigram, cats_food_reviews2 = preprocess_review(df2.reviewText, extra_stopwords = ['cat', 'cats', 'food'],
                                                                          allowed_postags = ['ADJ', 'VERB', 'ADV'])
cats_beds_bigram, cats_beds_trigram, cats_beds_reviews2 = preprocess_review(df3.reviewText, extra_stopwords = ['cat', 'cats', 'bed', 'beds'],
                                                                          allowed_postags = ['ADJ', 'VERB', 'ADV'])
cats_litter_bigram, cats_litter_trigram, cats_litter_reviews2 = preprocess_review(df4.reviewText, extra_stopwords = ['cat', 'cats', 'litters', 'litter'],
                                                                                allowed_postags = ['ADJ', 'VERB', 'ADV'])
cats_health_supplies_bigram, cats_health_supplies_trigram, cats_health_supplies_reviews2 = preprocess_review(df5.reviewText, extra_stopwords = ['cat', 'cats'],
                                                                                                           allowed_postags = ['ADJ', 'VERB', 'ADV'])
cats_treats_bigram, cats_treats_trigram, cats_treats_reviews2 = preprocess_review(df6.reviewText, extra_stopwords = ['cat', 'cats', 'treat', 'treats'],
                                                                                allowed_postags = ['ADJ', 'VERB', 'ADV'])
cats_grooming_bigram, cats_grooming_trigram, cats_grooming_reviews2 = preprocess_review(df7.reviewText, extra_stopwords = ['cat', 'cats'],
                                                                                      allowed_postags = ['ADJ', 'VERB', 'ADV'])
cats_feeding_supplies_bigram, cats_feeding_suppliess_trigram, cats_feeding_supplies_reviews2 = preprocess_review(df8.reviewText, extra_stopwords = ['cat', 'cats'],
                                                                                                               allowed_postags = ['ADJ', 'VERB', 'ADV'])

In [None]:
cats_health_supplies_bigram[:20]

# modeling

In [12]:
def vectorize(texts):
    dictionary = corpora.Dictionary(tqdm(texts))
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in tqdm(texts)]
    return dictionary, doc_term_matrix

In [81]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        print('Round: '+ str(num_topics))
        model = LdaMulticore(corpus=corpus, 
                                num_topics=num_topics, 
                                id2word=dictionary,
                                passes=40,
                                iterations=10, 
                                chunksize = 100, 
                                eval_every = None,
                                random_state = 1234)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, 
                                        texts=texts, 
                                        dictionary=dictionary, 
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

def get_keywords(doc_term_matrix, model): #top 20
    keywords_dict = {}
    for k in range(len(model.top_topics(doc_term_matrix))):
        tu = model.top_topics(doc_term_matrix)[k]
        keywords_lst = []
        for tup in tu[0]:
            keywords_lst.append(tup[1])
        keywords_dict[k] = keywords_lst
    return keywords_dict

# Interpret the results

## 1. cats feeding supplies

In [89]:
cats_feeding_supplies_dictionary, cats_feeding_supplies_doc_term_matrix = vectorize(cats_feeding_supplies_reviews2)
# cats_feeding_supplies_model_list, cats_feeding_supplies_coherence_values = compute_coherence_values(
#                                                     dictionary=cats_feeding_supplies_dictionary, 
#                                                     corpus=cats_feeding_supplies_doc_term_matrix, 
#                                                     texts=cats_feeding_supplies_reviews2, 
#                                                     start=2, 
#                                                     limit=20, 
#                                                     step=2)

# limit=20; start=2; step=2;
# x = range(start, limit, step)
# plt.plot(x, cats_feeding_supplies_coherence_values)
# plt.xlabel("Num Topics")
# plt.ylabel("Coherence score")
# plt.legend(("coherence_values"), loc='best')
# plt.show()
# for m, cv in zip(x, cats_feeding_supplies_coherence_values):
#     print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
    
# with open ('./cat_feeding_supplies_model_list.pickle', 'wb') as handle:
#     pickle.dump(cats_feeding_supplies_model_list, handle)

  0%|          | 0/1093 [00:00<?, ?it/s]

  0%|          | 0/1093 [00:00<?, ?it/s]

In [82]:
with open ('./cat_feeding_supplies_model_list.pickle', 'rb') as handle:
    cat_feeding_supplies_model_list = pickle.load(handle)

In [86]:
cats_feeding_supplies_optimal_model = cat_feeding_supplies_model_list[2]
model_topics = cats_feeding_supplies_optimal_model.show_topics(formatted=False)

In [87]:
cats_feeding_supplies_optimal_model.print_topics(num_topics=6, num_words=30) 

[(0,
  '0.040*"get" + 0.024*"eat" + 0.014*"put" + 0.013*"little" + 0.013*"come" + 0.012*"take" + 0.012*"try" + 0.012*"open" + 0.012*"use" + 0.011*"work" + 0.011*"small" + 0.011*"make" + 0.011*"great" + 0.010*"go" + 0.010*"really" + 0.010*"also" + 0.009*"find" + 0.008*"much" + 0.008*"seem" + 0.008*"even" + 0.008*"good" + 0.008*"give" + 0.007*"buy" + 0.007*"think" + 0.007*"want" + 0.007*"easy" + 0.007*"need" + 0.007*"keep" + 0.007*"big" + 0.006*"well"'),
 (1,
  '0.038*"get" + 0.021*"keep" + 0.017*"drink" + 0.016*"great" + 0.015*"little" + 0.014*"use" + 0.014*"clean" + 0.013*"make" + 0.012*"love" + 0.011*"well" + 0.011*"much" + 0.009*"large" + 0.009*"fresh" + 0.008*"recommend" + 0.008*"easy" + 0.008*"come" + 0.008*"old" + 0.008*"buy" + 0.008*"small" + 0.007*"really" + 0.007*"go" + 0.007*"even" + 0.007*"seem" + 0.007*"happy" + 0.007*"find" + 0.006*"want" + 0.006*"hard" + 0.006*"work" + 0.006*"still" + 0.005*"extra"'),
 (2,
  '0.036*"clean" + 0.017*"easy" + 0.016*"use" + 0.015*"take" + 0.01

In [91]:
pyLDAvis.enable_notebook()
topic_data =  pyLDAvis.gensim_models.prepare(cats_feeding_supplies_optimal_model, 
                                             cats_feeding_supplies_doc_term_matrix, 
                                             cats_feeding_supplies_dictionary, 
                                             mds = 'pcoa')
pyLDAvis.display(topic_data)

  default_term_info = default_term_info.sort_values(


In [90]:
pyLDAvis.enable_notebook()
topic_data =  pyLDAvis.gensim_models.prepare(cats_feeding_supplies_optimal_model, 
                                             cats_feeding_supplies_doc_term_matrix, 
                                             cats_feeding_supplies_dictionary, 
                                             mds = 'tsne')
pyLDAvis.display(topic_data)

# pcoa:Principal Coordinate Analysis(aka Classical Multidimensional Scaling)
# mmds:Metric Multidimensional Scaling
# tsne:t-distributed Stochastic Neighbor Embedding

  default_term_info = default_term_info.sort_values(


In [216]:
keywords_dict = get_keywords(cats_feeding_supplies_doc_term_matrix, model = cats_feeding_supplies_optimal_model)

In [217]:
[w for w in keywords_dict[0] if w not in  keywords_dict[1] and w not in  keywords_dict[2] and w not in  keywords_dict[3]]

['eat', 'open', 'find', 'seem']

In [218]:
[w for w in keywords_dict[1] if w not in  keywords_dict[0] and w not in  keywords_dict[2] and w not in  keywords_dict[3]]

['still', 'different', 'think', 'sure', 'set', 'see']

In [219]:
[w for w in keywords_dict[2] if w not in  keywords_dict[1] and w not in  keywords_dict[0] and w not in  keywords_dict[3]]

['keep', 'love', 'large', 'fresh', 'recommend', 'old']

In [220]:
[w for w in keywords_dict[3] if w not in  keywords_dict[1] and w not in  keywords_dict[2] and w not in  keywords_dict[0]]

['quiet', 'good', 'first', 'new', 'add', 'give', 'run', 'often']

In [377]:
pyLDAvis.enable_notebook()
topic_data =  pyLDAvis.gensim_models.prepare(cat_feeding_supplies_optimal_model, 
                                             cats_feeding_supplies_doc_term_matrix2, 
                                             cats_feeding_supplies_dictionary2, 
                                             mds = 'tsne')
pyLDAvis.display(topic_data)

  default_term_info = default_term_info.sort_values(


## 2. cat food (4/4)

In [22]:
# cats_food_bigram, cats_food_trigram, cats_food_reviews2 = preprocess_review(df2.reviewText, 
#                                                          extra_stopwords = ['cat', 'cats', 'food'],
#                                                          allowed_postags = ['ADJ', 'VERB', 'ADV'])

  0%|          | 0/5999 [00:00<?, ?it/s]

  0%|          | 0/5999 [00:00<?, ?it/s]

  0%|          | 0/5999 [00:00<?, ?it/s]

  0%|          | 0/3561 [00:00<?, ?it/s]

  0%|          | 0/1195 [00:00<?, ?it/s]

  0%|          | 0/5999 [00:00<?, ?it/s]

  0%|          | 0/5999 [00:00<?, ?it/s]

In [13]:
cats_food_dictionary, cats_food_doc_term_matrix = vectorize(cats_food_reviews2)
# cats_food_model_list, cats_food_coherence_values = compute_coherence_values(dictionary=cats_food_dictionary, 
#                                                     corpus=cats_food_doc_term_matrix, 
#                                                     texts=cats_food_reviews2, 
#                                                     start=2, 
#                                                     limit=20, 
#                                                     step=2)

# limit=20; start=2; step=2;
# x = range(start, limit, step)
# plt.plot(x, cats_food_coherence_values)
# plt.xlabel("Num Topics")
# plt.ylabel("Coherence score")
# plt.legend(("coherence_values"), loc='best')
# plt.show()
# for m, cv in zip(x, cats_food_coherence_values):
#     print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

  0%|          | 0/5999 [00:00<?, ?it/s]

  0%|          | 0/5999 [00:00<?, ?it/s]

In [24]:
# with open ('./cats_food_model_list.pickle', 'wb') as handle:
#     pickle.dump(cats_food_model_list, handle)

In [14]:
with open ('./cats_food_model_list.pickle', 'rb') as handle:
    cats_food_model_list = pickle.load(handle)

In [121]:
CoherenceModel(model=cats_food_model_list[1], 
               texts=cats_food_reviews2, 
               dictionary=cats_food_dictionary, 
               coherence='c_v').get_coherence()

0.5972780790966179

In [15]:
cats_food_optimal_model = cats_food_model_list[1]
model_topics = cats_food_optimal_model.show_topics(formatted=False)

In [108]:
cats_food_optimal_model.print_topics(num_topics=4, num_words=30)

[(0,
  '0.059*"eat" + 0.024*"get" + 0.019*"try" + 0.017*"seem" + 0.016*"go" + 0.013*"much" + 0.013*"give" + 0.012*"good" + 0.011*"look" + 0.011*"really" + 0.011*"well" + 0.010*"think" + 0.010*"even" + 0.009*"make" + 0.009*"say" + 0.009*"smell" + 0.009*"open" + 0.009*"come" + 0.009*"know" + 0.009*"first" + 0.009*"love" + 0.008*"see" + 0.008*"like" + 0.008*"put" + 0.008*"buy" + 0.008*"never" + 0.008*"old" + 0.007*"want" + 0.007*"picky" + 0.007*"little"'),
 (1,
  '0.022*"well" + 0.022*"good" + 0.019*"get" + 0.017*"love" + 0.016*"healthy" + 0.014*"buy" + 0.013*"much" + 0.013*"try" + 0.012*"give" + 0.012*"find" + 0.012*"recommend" + 0.012*"seem" + 0.011*"feed" + 0.011*"also" + 0.010*"switch" + 0.010*"expensive" + 0.009*"eat" + 0.009*"go" + 0.009*"less" + 0.009*"make" + 0.009*"grain_free" + 0.008*"really" + 0.008*"use" + 0.008*"old" + 0.007*"great" + 0.007*"even" + 0.007*"cheap" + 0.007*"start" + 0.007*"little" + 0.007*"dry"'),
 (2,
  '0.035*"dry" + 0.026*"fancy_feast" + 0.025*"eat" + 0.022*

In [207]:
keywords_dict = get_keywords(cats_food_doc_term_matrix, model = cats_food_optimal_model)

In [212]:
[w for w in keywords_dict[1] if w not in  keywords_dict[0] and w not in  keywords_dict[2] and w not in keywords_dict[3]] #scent

['think', 'smell', 'open', 'come', 'first']

In [213]:
[w for w in keywords_dict[0] if w not in  keywords_dict[1] and w not in  keywords_dict[2] and w not in keywords_dict[3]] #value of money

['buy', 'find', 'recommend', 'switch', 'expensive', 'less']

In [214]:
[w for w in keywords_dict[2] if w not in  keywords_dict[1] and w not in  keywords_dict[0] and w not in keywords_dict[3]] #ingredient 

['natural', 'pet', 'contain', 'high', 'low', 'include', 'use', 'see', 'many']

In [215]:
[w for w in keywords_dict[2] if w not in  keywords_dict[1] and w not in  keywords_dict[0] and w not in keywords_dict[3]] #flavor

['fancy_feast',
 'wet',
 'can',
 'keep',
 'enjoy',
 'like',
 'happy',
 'prefer',
 'favorite',
 'great']

In [107]:
pyLDAvis.enable_notebook()
topic_data =  pyLDAvis.gensim_models.prepare(cats_food_optimal_model, 
                                             cats_food_doc_term_matrix, 
                                             cats_food_dictionary, 
                                             mds = 'tsne')
pyLDAvis.display(topic_data)



In [76]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
            row = sorted(row, key=lambda x: (x[1]), reverse=True)
            # Get the Dominant topic, Perc Contribution and Keywords for each document
            for j, (topic_num, prop_topic) in enumerate(row):
                #if j == 0:  # => dominant topic (topic_num, prop_topic) sorted descending in the row
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([i, int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
    sent_topics_df.columns = ['index', 'Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts).reset_index()
    contents.columns = ['index', 'Tokens']
    sent_topics_df = contents.merge(sent_topics_df, left_on = 'index', right_on = 'index')
    return(sent_topics_df)

In [77]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=cats_food_optimal_model, 
                                                  corpus=cats_food_doc_term_matrix[:2], 
                                                  texts=cats_food_reviews2[ :2])
wide_df_topic_sents_keywords = df_topic_sents_keywords.pivot(index=['index'], columns='Topic', values='Perc_Contribution')

In [78]:
wide_df_topic_sents_keywords

Topic,0.0,1.0,2.0,3.0
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.6181,0.2918,0.0832,
1,0.1848,0.7847,0.0153,0.0151


In [82]:
#df_topic_sents_keywords['Dominant_Topic'].value_counts(normalize = True)

In [80]:
# # Number of Documents for Each Topic
# topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# # Percentage of Documents for Each Topic
# topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# # Topic Number and Keywords
# topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# # Concatenate Column wise
# df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# # Change Column names
# df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# # Show
# df_dominant_topics

## 3. cat toys (0/4)

In [124]:
cats_toys_dictionary, cats_toys_doc_term_matrix = vectorize(cats_toys_reviews2)
# cats_toys_model_list, cats_toys_coherence_values = compute_coherence_values(dictionary=cats_toys_dictionary, 
#                                                     corpus=cats_toys_doc_term_matrix, 
#                                                     texts=cats_toys_reviews2, 
#                                                     start=2, 
#                                                     limit=20, 
#                                                     step=2)

# limit=20; start=2; step=2;
# x = range(start, limit, step)
# plt.plot(x, cats_toys_coherence_values)
# plt.xlabel("Num Topics")
# plt.ylabel("Coherence score")
# plt.legend(("coherence_values"), loc='best')
# plt.show()
# for m, cv in zip(x, cats_toys_coherence_values):
#     print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

  0%|          | 0/7893 [00:00<?, ?it/s]

  0%|          | 0/7893 [00:00<?, ?it/s]

In [28]:
# with open ('./cats_toys_model_list.pickle', 'wb') as handle:
#     pickle.dump(cats_toys_model_list, handle)

In [113]:
with open ('./cats_toys_model_list.pickle', 'rb') as handle:
     cats_toys_model_list = pickle.load(handle)

In [128]:
CoherenceModel(model=cats_toys_model_list[1], 
               texts=cats_toys_reviews2, 
               dictionary=cats_toys_dictionary, 
               coherence='c_v').get_coherence()

0.6224511640648648

In [129]:
CoherenceModel(model=cats_toys_model_list[2], 
               texts=cats_toys_reviews2, 
               dictionary=cats_toys_dictionary, 
               coherence='c_v').get_coherence()

0.6136577381694276

In [130]:
cats_toys_optimal_model = cats_toys_model_list[1]
model_topics = cats_toys_optimal_model.show_topics(formatted=False)
keywords_dict = get_keywords(cats_toys_doc_term_matrix, model = cats_toys_optimal_model)
cats_toys_optimal_model.print_topics(num_topics=4, num_words=30)

[(0,
  '0.042*"buy" + 0.029*"love" + 0.024*"get" + 0.016*"play" + 0.016*"old" + 0.015*"catnip" + 0.014*"recommend" + 0.013*"good" + 0.013*"well" + 0.012*"little" + 0.011*"find" + 0.011*"great" + 0.011*"still" + 0.010*"make" + 0.010*"even" + 0.009*"new" + 0.009*"happy" + 0.009*"purchase" + 0.008*"keep" + 0.008*"small" + 0.008*"last" + 0.008*"highly" + 0.008*"come" + 0.007*"use" + 0.007*"always" + 0.006*"also" + 0.006*"carry" + 0.006*"much" + 0.006*"lose" + 0.006*"definitely"'),
 (1,
  '0.048*"love" + 0.040*"go" + 0.036*"play" + 0.029*"get" + 0.017*"make" + 0.017*"much" + 0.016*"crazy" + 0.013*"great" + 0.011*"favorite" + 0.011*"really" + 0.011*"never" + 0.011*"little" + 0.010*"well" + 0.010*"know" + 0.010*"think" + 0.010*"even" + 0.010*"chase" + 0.009*"give" + 0.009*"see" + 0.008*"look" + 0.008*"around" + 0.008*"say" + 0.007*"also" + 0.007*"absolutely" + 0.006*"good" + 0.006*"worth" + 0.006*"fly" + 0.006*"keep" + 0.005*"carry" + 0.005*"nuts"'),
 (2,
  '0.026*"seem" + 0.025*"make" + 0.02

In [131]:
[w for w in keywords_dict[0] if w not in  keywords_dict[1] and w not in  keywords_dict[2] and w not in  keywords_dict[3]]

['put', 'try', 'back', 'take', 'away', 'use', 'first', 'want', 'together']

In [132]:
[w for w in keywords_dict[1] if w not in  keywords_dict[0] and w not in  keywords_dict[2] and w not in  keywords_dict[3]]

['crazy', 'favorite', 'never', 'know', 'chase', 'look']

In [133]:
[w for w in keywords_dict[2] if w not in  keywords_dict[0] and w not in  keywords_dict[1] and w not in  keywords_dict[3]]

['buy',
 'old',
 'catnip',
 'recommend',
 'good',
 'find',
 'new',
 'happy',
 'purchase']

In [134]:
[w for w in keywords_dict[3] if w not in  keywords_dict[0] and w not in  keywords_dict[2] and w not in  keywords_dict[1]]

['seem', 'like', 'long', 'also', 'work', 'nice', 'enjoy', 'break', 'maybe']

In [119]:
pyLDAvis.enable_notebook()
topic_data =  pyLDAvis.gensim_models.prepare(cats_toys_optimal_model, 
                                             cats_toys_doc_term_matrix, 
                                             cats_toys_dictionary, 
                                             mds = 'tsne')
pyLDAvis.display(topic_data)

  default_term_info = default_term_info.sort_values(


## 4. cats beds (2/4)

In [135]:
cats_beds_dictionary, cats_beds_doc_term_matrix = vectorize(cats_beds_reviews2)
# cats_beds_model_list, cats_beds_coherence_values = compute_coherence_values(dictionary=cats_beds_dictionary, 
#                                                     corpus=cats_beds_doc_term_matrix, 
#                                                     texts=cats_beds_reviews2, 
#                                                     start=2, 
#                                                     limit=20, 
#                                                     step=2)

# limit=20; start=2; step=2;
# x = range(start, limit, step)
# plt.plot(x, cats_beds_coherence_values)
# plt.xlabel("Num Topics")
# plt.ylabel("Coherence score")
# plt.legend(("coherence_values"), loc='best')
# plt.show()

  0%|          | 0/5840 [00:00<?, ?it/s]

  0%|          | 0/5840 [00:00<?, ?it/s]

In [31]:
# for m, cv in zip(x, cats_beds_coherence_values):
#     print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

Num Topics = 2  has Coherence Value of 0.5773
Num Topics = 4  has Coherence Value of 0.6171
Num Topics = 6  has Coherence Value of 0.6064
Num Topics = 8  has Coherence Value of 0.593
Num Topics = 10  has Coherence Value of 0.6091
Num Topics = 12  has Coherence Value of 0.5758
Num Topics = 14  has Coherence Value of 0.5343
Num Topics = 16  has Coherence Value of 0.546
Num Topics = 18  has Coherence Value of 0.55


In [33]:
# with open ('./cats_beds_model_list.pickle', 'wb') as handle:
#     pickle.dump(cats_beds_model_list, handle)

In [136]:
with open ('./cats_beds_model_list.pickle', 'rb') as handle:
     cats_beds_model_list = pickle.load(handle)

In [144]:
cats_beds_optimal_model = cats_beds_model_list[2]
model_topics = cats_beds_optimal_model.show_topics(formatted=False)
keywords_dict = get_keywords(cats_beds_doc_term_matrix, model = cats_beds_optimal_model)

cats_beds_optimal_model.print_topics(num_topics=6, num_words=30)

[(0,
  '0.031*"well" + 0.028*"easy" + 0.026*"make" + 0.022*"together" + 0.022*"put" + 0.020*"sturdy" + 0.017*"cat_tree" + 0.016*"good" + 0.014*"large" + 0.013*"look" + 0.012*"take" + 0.012*"love" + 0.011*"really" + 0.011*"little" + 0.011*"buy" + 0.011*"even" + 0.010*"assemble" + 0.010*"recommend" + 0.009*"top" + 0.009*"hold" + 0.009*"nice" + 0.009*"enough" + 0.009*"get" + 0.008*"great" + 0.008*"big" + 0.008*"also" + 0.007*"fit" + 0.007*"high" + 0.007*"seem" + 0.007*"go"'),
 (1,
  '0.033*"get" + 0.023*"warm" + 0.019*"love" + 0.018*"soft" + 0.017*"keep" + 0.016*"buy" + 0.015*"nice" + 0.014*"great" + 0.014*"put" + 0.014*"well" + 0.012*"really" + 0.011*"small" + 0.011*"sleep" + 0.010*"comfortable" + 0.010*"fit" + 0.009*"cold" + 0.009*"also" + 0.009*"make" + 0.008*"seem" + 0.008*"good" + 0.008*"take" + 0.008*"come" + 0.007*"little" + 0.007*"go" + 0.007*"perfect" + 0.007*"feel" + 0.007*"wash" + 0.007*"large" + 0.007*"enough" + 0.007*"purchase"'),
 (2,
  '0.029*"scratch" + 0.023*"look" + 0.01

### num_topics 6 (**check**) v.s. 4

In [145]:
[w for w in keywords_dict[0] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[2]\
                                and w not in  keywords_dict[3]\
                                and w not in  keywords_dict[4]\
                                and w not in  keywords_dict[5]]

['play', 'old']

In [146]:
[w for w in keywords_dict[1] if w not in  keywords_dict[0]\
                                and w not in  keywords_dict[2]\
                                and w not in  keywords_dict[3]\
                                and w not in  keywords_dict[4]\
                                and w not in  keywords_dict[5]] 

['think', 'like', 'new', 'know', 'sit']

In [147]:
[w for w in keywords_dict[2] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[0]\
                                and w not in  keywords_dict[3]\
                                and w not in  keywords_dict[4]\
                                and w not in  keywords_dict[5]] #sturdy/assemble/hold

['together', 'sturdy', 'cat_tree', 'large', 'assemble', 'recommend', 'hold']

In [148]:
[w for w in keywords_dict[3] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[2]\
                                and w not in  keywords_dict[0]\
                                and w not in  keywords_dict[4]\
                                and w not in  keywords_dict[5]]

['big', 'set', 'attach', 'open']

In [149]:
[w for w in keywords_dict[4] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[2]\
                                and w not in  keywords_dict[0]\
                                and w not in  keywords_dict[3]\
                                and w not in  keywords_dict[5]] #warm/soft/comfortable

['warm', 'soft', 'nice', 'sleep', 'comfortable', 'fit', 'cold']

In [150]:
[w for w in keywords_dict[5] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[2]\
                                and w not in  keywords_dict[0]\
                                and w not in  keywords_dict[4]\
                                and w not in  keywords_dict[3]] 

['apply', 'work', 'protect', 'pet', 'pull', 'cut']

In [152]:
cats_beds_optimal_model = cats_beds_model_list[1]
model_topics = cats_beds_optimal_model.show_topics(formatted=False)
keywords_dict = get_keywords(cats_beds_doc_term_matrix, model = cats_beds_optimal_model)

cats_beds_optimal_model.print_topics(num_topics=4, num_words=30)

[(0,
  '0.022*"well" + 0.021*"easy" + 0.020*"make" + 0.020*"put" + 0.019*"together" + 0.015*"sturdy" + 0.014*"cat_tree" + 0.012*"take" + 0.012*"small" + 0.012*"large" + 0.012*"top" + 0.011*"look" + 0.011*"get" + 0.011*"little" + 0.011*"big" + 0.011*"good" + 0.010*"love" + 0.010*"really" + 0.009*"even" + 0.009*"assemble" + 0.009*"also" + 0.008*"hold" + 0.008*"great" + 0.007*"high" + 0.007*"use" + 0.007*"recommend" + 0.007*"go" + 0.007*"fit" + 0.007*"nice" + 0.006*"buy"'),
 (1,
  '0.034*"get" + 0.021*"love" + 0.020*"warm" + 0.018*"buy" + 0.015*"soft" + 0.015*"keep" + 0.015*"great" + 0.014*"small" + 0.013*"put" + 0.013*"nice" + 0.013*"sleep" + 0.012*"well" + 0.011*"really" + 0.010*"fit" + 0.009*"make" + 0.008*"also" + 0.008*"comfortable" + 0.008*"use" + 0.008*"cold" + 0.008*"little" + 0.007*"go" + 0.007*"perfect" + 0.007*"seem" + 0.007*"take" + 0.007*"purchase" + 0.007*"good" + 0.007*"feel" + 0.006*"lay" + 0.006*"much" + 0.006*"come"'),
 (2,
  '0.033*"use" + 0.030*"get" + 0.023*"love" + 0

In [153]:
[w for w in keywords_dict[1] if w not in  keywords_dict[0]\
                                and w not in  keywords_dict[2]\
                                and w not in  keywords_dict[3]] 

['scratch', 'much', 'still', 'play', 'old', 'like']

In [154]:
[w for w in keywords_dict[0] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[2]\
                                and w not in  keywords_dict[3]] #sturdy, assemble

['easy', 'together', 'sturdy', 'cat_tree', 'large', 'top', 'big', 'assemble']

In [155]:
[w for w in keywords_dict[2] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[0]\
                                and w not in  keywords_dict[3]] #warm/soft/comofortable

['warm',
 'soft',
 'keep',
 'great',
 'nice',
 'sleep',
 'fit',
 'comfortable',
 'cold']

In [156]:
[w for w in keywords_dict[3] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[2]\
                                and w not in  keywords_dict[0]]  

['come', 'back', 'see', 'want', 'open', 'pull', 'work', 'give', 'however']

In [157]:
pyLDAvis.enable_notebook()
topic_data =  pyLDAvis.gensim_models.prepare(cats_beds_optimal_model, 
                                             cats_beds_doc_term_matrix, 
                                             cats_beds_dictionary, 
                                             mds = 'tsne')
pyLDAvis.display(topic_data)

  default_term_info = default_term_info.sort_values(


## 5. cats litter (1/4)

In [158]:
cats_litter_dictionary, cats_litter_doc_term_matrix = vectorize(cats_litter_reviews2)
# cats_litter_model_list, cats_litter_coherence_values = compute_coherence_values(dictionary=cats_litter_dictionary, 
#                                                     corpus=cats_litter_doc_term_matrix, 
#                                                     texts=cats_litter_reviews2, 
#                                                     start=2, 
#                                                     limit=20, 
#                                                     step=2)

# limit=20; start=2; step=2;
# x = range(start, limit, step)
# plt.plot(x, cats_litter_coherence_values)
# plt.xlabel("Num Topics")
# plt.ylabel("Coherence score")
# plt.legend(("coherence_values"), loc='best')
# plt.show()
# for m, cv in zip(x, cats_litter_coherence_values):
#     print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
    
# with open ('./cats_litter_model_list.pickle', 'wb') as handle:
#      pickle.dump(cats_litter_model_list, handle)

  0%|          | 0/4893 [00:00<?, ?it/s]

  0%|          | 0/4893 [00:00<?, ?it/s]

In [159]:
with open ('./cats_litter_model_list.pickle', 'rb') as handle:
     cats_litter_model_list = pickle.load(handle)

In [161]:
cats_litter_optimal_model = cats_litter_model_list[1]
model_topics = cats_litter_optimal_model.show_topics(formatted=False)
pyLDAvis.enable_notebook()
topic_data =  pyLDAvis.gensim_models.prepare(cats_litter_optimal_model, 
                                             cats_litter_doc_term_matrix, 
                                             cats_litter_dictionary, 
                                             mds = 'tsne')
pyLDAvis.display(topic_data)

  default_term_info = default_term_info.sort_values(


In [163]:
keywords_dict = get_keywords(cats_litter_doc_term_matrix, model = cats_litter_optimal_model)
cats_litter_optimal_model.print_topics(num_topics=4, num_words=30)

[(0,
  '0.033*"well" + 0.033*"use" + 0.021*"good" + 0.020*"try" + 0.018*"get" + 0.014*"seem" + 0.013*"smell" + 0.012*"go" + 0.011*"say" + 0.010*"little" + 0.010*"really" + 0.010*"far" + 0.009*"think" + 0.009*"work" + 0.009*"scoop" + 0.008*"clump" + 0.008*"less" + 0.008*"even" + 0.008*"much" + 0.008*"make" + 0.008*"also" + 0.008*"find" + 0.007*"bad" + 0.007*"regular" + 0.006*"last" + 0.006*"odor_control" + 0.006*"fine" + 0.006*"still" + 0.005*"natural" + 0.005*"expensive"'),
 (1,
  '0.028*"use" + 0.025*"get" + 0.023*"go" + 0.017*"take" + 0.012*"clean" + 0.010*"buy" + 0.010*"work" + 0.010*"even" + 0.009*"try" + 0.009*"still" + 0.009*"old" + 0.009*"find" + 0.009*"back" + 0.009*"know" + 0.008*"first" + 0.008*"start" + 0.008*"put" + 0.008*"new" + 0.008*"give" + 0.008*"want" + 0.007*"think" + 0.007*"smell" + 0.007*"right" + 0.007*"never" + 0.007*"come" + 0.007*"really" + 0.007*"also" + 0.006*"long" + 0.006*"keep" + 0.006*"away"'),
 (2,
  '0.026*"use" + 0.023*"get" + 0.022*"make" + 0.017*"eas

In [167]:
[w for w in keywords_dict[1] if w not in  keywords_dict[2]\
                                and w not in  keywords_dict[0]\
                                and w not in  keywords_dict[3]] 

['great', 'large', 'nice', 'recommend', 'track', 'look']

In [166]:
[w for w in keywords_dict[2] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[0]\
                                and w not in  keywords_dict[3]] 

['still',
 'old',
 'find',
 'back',
 'know',
 'first',
 'start',
 'new',
 'give',
 'want']

In [168]:
[w for w in keywords_dict[3] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[0]\
                                and w not in  keywords_dict[2]] 

['small', 'top', 'empty', 'come', 'open']

In [169]:
[w for w in keywords_dict[0] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[2]\
                                and w not in  keywords_dict[3]]  #odor_control/smell -> scent

['smell', 'say', 'far', 'think', 'clump', 'less']

## 6. cats health supplies (0/4)

In [170]:
cats_health_supplies_dictionary, cats_health_supplies_doc_term_matrix = vectorize(cats_health_supplies_reviews2)
# cats_health_supplies_model_list, cats_health_supplies_coherence_values = compute_coherence_values(dictionary=cats_health_supplies_dictionary, 
#                                                     corpus=cats_health_supplies_doc_term_matrix, 
#                                                     texts=cats_health_supplies_reviews2, 
#                                                     start=2, 
#                                                     limit=20, 
#                                                     step=2)

# limit=20; start=2; step=2;
# x = range(start, limit, step)
# plt.plot(x, cats_health_supplies_coherence_values)
# plt.xlabel("Num Topics")
# plt.ylabel("Coherence score")
# plt.legend(("coherence_values"), loc='best')
# plt.show()
# for m, cv in zip(x, cats_health_supplies_coherence_values):
#     print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
    
# with open ('./cats_health_supplies_model_list.pickle', 'wb') as handle:
#      pickle.dump(cats_health_supplies_model_list, handle)

  0%|          | 0/4294 [00:00<?, ?it/s]

  0%|          | 0/4294 [00:00<?, ?it/s]

In [171]:
with open ('./cats_health_supplies_model_list.pickle', 'rb') as handle:
     cats_health_supplies_model_list = pickle.load(handle)

In [172]:
cats_health_supplies_optimal_model = cats_health_supplies_model_list[1]
model_topics = cats_health_supplies_optimal_model.show_topics(formatted=False)

keywords_dict = get_keywords(cats_health_supplies_doc_term_matrix, model = cats_health_supplies_optimal_model)
cats_health_supplies_optimal_model.print_topics(num_topics=4, num_words=30)

[(0,
  '0.021*"calm" + 0.020*"get" + 0.015*"seem" + 0.015*"try" + 0.014*"spray" + 0.014*"feliway" + 0.014*"use" + 0.013*"go" + 0.013*"help" + 0.012*"new" + 0.010*"think" + 0.009*"come" + 0.009*"also" + 0.009*"see" + 0.009*"old" + 0.008*"still" + 0.008*"well" + 0.008*"work" + 0.008*"start" + 0.008*"make" + 0.007*"say" + 0.007*"buy" + 0.007*"really" + 0.007*"plug" + 0.006*"much" + 0.006*"even" + 0.006*"keep" + 0.006*"take" + 0.006*"stop" + 0.006*"little"'),
 (1,
  '0.039*"eat" + 0.025*"give" + 0.013*"try" + 0.013*"good" + 0.013*"take" + 0.012*"seem" + 0.010*"daily" + 0.010*"treat" + 0.010*"even" + 0.010*"love" + 0.008*"small" + 0.008*"think" + 0.008*"well" + 0.007*"get" + 0.007*"little" + 0.007*"recommend" + 0.007*"use" + 0.007*"say" + 0.007*"make" + 0.007*"much" + 0.007*"also" + 0.007*"help" + 0.006*"want" + 0.006*"healthy" + 0.006*"see" + 0.005*"look" + 0.005*"picky" + 0.005*"dry" + 0.005*"open" + 0.005*"soft"'),
 (2,
  '0.045*"get" + 0.039*"work" + 0.031*"use" + 0.031*"well" + 0.023*"

In [178]:
[w for w in keywords_dict[0] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[2]\
                                and w not in  keywords_dict[3]]  

['calm', 'spray', 'feliway', 'new', 'old']

In [174]:
[w for w in keywords_dict[1] if w not in  keywords_dict[0]\
                                and w not in  keywords_dict[2]\
                                and w not in  keywords_dict[3]]  

['eat', 'daily', 'treat', 'love', 'small', 'little']

In [175]:
[w for w in keywords_dict[2] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[0]\
                                and w not in  keywords_dict[3]]  

['put', 'find', 'back', 'need', 'never']

In [176]:
[w for w in keywords_dict[3] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[2]\
                                and w not in  keywords_dict[0]]  #clean

['really', 'great', 'sure', 'keep', 'know', 'clean', 'easy']

In [177]:
pyLDAvis.enable_notebook()
topic_data =  pyLDAvis.gensim_models.prepare(cats_health_supplies_optimal_model, 
                                             cats_health_supplies_doc_term_matrix, 
                                             cats_health_supplies_dictionary, 
                                             mds = 'tsne')
pyLDAvis.display(topic_data)

  default_term_info = default_term_info.sort_values(


## 7. cats treats

In [179]:
cats_treats_dictionary, cats_treats_doc_term_matrix = vectorize(cats_treats_reviews2)
# cat_treats_model_list, cats_treats_coherence_values = compute_coherence_values(dictionary=cats_treats_dictionary, 
#                                                     corpus=cats_treats_doc_term_matrix, 
#                                                     texts=cats_treats_reviews2, 
#                                                     start=2, 
#                                                     limit=20, 
#                                                     step=2)

# limit=20; start=2; step=2;
# x = range(start, limit, step)
# plt.plot(x, cats_treats_coherence_values)
# plt.xlabel("Num Topics")
# plt.ylabel("Coherence score")
# plt.legend(("coherence_values"), loc='best')
# plt.show()
# for m, cv in zip(x, cats_treats_coherence_values):
#     print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

  0%|          | 0/2237 [00:00<?, ?it/s]

  0%|          | 0/2237 [00:00<?, ?it/s]

In [41]:
# with open ('./cat_treats_model_list.pickle', 'wb') as handle:
#      pickle.dump(cat_treats_model_list, handle)

In [180]:
with open ('./cat_treats_model_list.pickle', 'rb') as handle:
     cat_treats_model_list = pickle.load(handle)

In [189]:
cat_treats_optimal_model = cat_treats_model_list[1]
model_topics = cat_treats_optimal_model.show_topics(formatted=False)
cat_treats_optimal_model.print_topics(num_topics=4, num_words=30)

[(0,
  '0.049*"eat" + 0.027*"get" + 0.019*"try" + 0.016*"give" + 0.016*"use" + 0.016*"make" + 0.013*"take" + 0.013*"easy" + 0.012*"great" + 0.012*"buy" + 0.011*"much" + 0.010*"good" + 0.009*"work" + 0.009*"first" + 0.009*"even" + 0.009*"little" + 0.009*"well" + 0.008*"like" + 0.008*"pill_pockets" + 0.008*"put" + 0.008*"really" + 0.007*"say" + 0.007*"want" + 0.007*"keep" + 0.007*"actually" + 0.006*"also" + 0.006*"need" + 0.006*"come" + 0.006*"sure" + 0.006*"love"'),
 (1,
  '0.023*"go" + 0.020*"try" + 0.018*"seem" + 0.018*"love" + 0.017*"get" + 0.016*"buy" + 0.015*"well" + 0.014*"really" + 0.014*"much" + 0.013*"grow" + 0.013*"good" + 0.011*"enjoy" + 0.011*"come" + 0.010*"make" + 0.008*"give" + 0.008*"also" + 0.008*"put" + 0.008*"small" + 0.008*"like" + 0.008*"think" + 0.008*"different" + 0.007*"away" + 0.007*"happy" + 0.007*"many" + 0.006*"scratch" + 0.006*"keep" + 0.006*"catnip" + 0.006*"crazy" + 0.006*"take" + 0.006*"old"'),
 (2,
  '0.050*"get" + 0.041*"love" + 0.028*"give" + 0.019*"go

In [190]:
keywords_dict = get_keywords(cats_treats_doc_term_matrix, model = cat_treats_optimal_model)

In [191]:
[w for w in keywords_dict[3] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[2]\
                                and w not in  keywords_dict[0]]  

['open', 'come_running', 'recommend', 'want', 'keep', 'say', 'highly', 'sure']

In [192]:
[w for w in keywords_dict[1] if w not in  keywords_dict[3]\
                                and w not in  keywords_dict[2]\
                                and w not in  keywords_dict[0]]  

['eat', 'take', 'easy', 'great', 'work', 'first', 'pill_pockets']

In [193]:
[w for w in keywords_dict[2] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[0]\
                                and w not in  keywords_dict[3]]  

['find', 'cheap', 'natural', 'long', 'healthy', 'still']

In [194]:
[w for w in keywords_dict[0] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[2]\
                                and w not in  keywords_dict[3]]  

['grow', 'enjoy']

In [182]:
pyLDAvis.enable_notebook()
topic_data =  pyLDAvis.gensim_models.prepare(cat_treats_optimal_model, 
                                             cats_treats_doc_term_matrix, 
                                             cats_treats_dictionary, 
                                             mds = 'tsne')
pyLDAvis.display(topic_data)

  default_term_info = default_term_info.sort_values(


## 8. cats grooming

In [195]:
cats_grooming_dictionary, cats_grooming_doc_term_matrix = vectorize(cats_grooming_reviews2)
# cats_grooming_model_list, cats_grooming_coherence_values = compute_coherence_values(dictionary=cats_grooming_dictionary, 
#                                                     corpus=cats_grooming_doc_term_matrix, 
#                                                     texts=cats_grooming_reviews2, 
#                                                     start=2, 
#                                                     limit=20, 
#                                                     step=2)

# limit=20; start=2; step=2;
# x = range(start, limit, step)
# plt.plot(x, cats_grooming_coherence_values)
# plt.xlabel("Num Topics")
# plt.ylabel("Coherence score")
# plt.legend(("coherence_values"), loc='best')
# plt.show()
# for m, cv in zip(x, cats_grooming_coherence_values):
#     print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
    
    
# with open ('./cats_grooming_model_list.pickle', 'wb') as handle:
#      pickle.dump(cats_grooming_model_list, handle)

  0%|          | 0/1150 [00:00<?, ?it/s]

  0%|          | 0/1150 [00:00<?, ?it/s]

In [196]:
with open ('./cats_grooming_model_list.pickle', 'rb') as handle:
     cats_grooming_model_list = pickle.load(handle)

In [199]:
cats_grooming_optimal_model = cats_grooming_model_list[1]
model_topics = cats_grooming_optimal_model.show_topics(formatted=False)

cats_grooming_optimal_model.print_topics(num_topics=4, num_words=30)

[(0,
  '0.031*"get" + 0.020*"well" + 0.019*"use" + 0.017*"make" + 0.014*"put" + 0.012*"small" + 0.010*"go" + 0.010*"seem" + 0.010*"good" + 0.009*"big" + 0.008*"try" + 0.008*"take" + 0.007*"still" + 0.007*"first" + 0.007*"clean" + 0.007*"let" + 0.007*"really" + 0.007*"enough" + 0.007*"fit" + 0.007*"much" + 0.006*"come" + 0.006*"easy" + 0.006*"see" + 0.006*"love" + 0.006*"large" + 0.006*"say" + 0.006*"give" + 0.005*"soft" + 0.005*"know" + 0.005*"quite"'),
 (1,
  '0.025*"use" + 0.020*"get" + 0.017*"love" + 0.016*"well" + 0.013*"much" + 0.013*"recommend" + 0.012*"long" + 0.012*"work" + 0.012*"buy" + 0.012*"also" + 0.011*"seem" + 0.011*"even" + 0.010*"try" + 0.010*"great" + 0.010*"make" + 0.010*"pet" + 0.009*"groom" + 0.009*"brush" + 0.009*"remove" + 0.009*"find" + 0.008*"look" + 0.008*"good" + 0.007*"see" + 0.007*"highly" + 0.007*"come" + 0.007*"never" + 0.007*"go" + 0.007*"say" + 0.007*"give" + 0.006*"little"'),
 (2,
  '0.033*"use" + 0.027*"get" + 0.022*"well" + 0.021*"easy" + 0.018*"grea

In [200]:
keywords_dict = get_keywords(cats_grooming_doc_term_matrix, model = cats_grooming_optimal_model)

In [202]:
[w for w in keywords_dict[3] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[2]\
                                and w not in  keywords_dict[0]]  

['easy',
 'cut',
 'trim',
 'sharp',
 'scratch',
 'hold',
 'soft_claws',
 'keep',
 'clip']

In [203]:
[w for w in keywords_dict[2] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[3]\
                                and w not in  keywords_dict[0]]  

['put', 'small', 'big', 'take', 'first', 'clean', 'let', 'enough', 'fit']

In [204]:
[w for w in keywords_dict[0] if w not in  keywords_dict[1]\
                                and w not in  keywords_dict[2]\
                                and w not in  keywords_dict[3]]  

['think', 'shed', 'come', 'pull']

In [205]:
[w for w in keywords_dict[1] if w not in  keywords_dict[3]\
                                and w not in  keywords_dict[2]\
                                and w not in  keywords_dict[0]]  

['recommend', 'even', 'pet', 'groom', 'remove', 'find']

In [206]:

pyLDAvis.enable_notebook()
topic_data =  pyLDAvis.gensim_models.prepare(cats_grooming_optimal_model, 
                                             cats_grooming_doc_term_matrix, 
                                             cats_grooming_dictionary, 
                                             mds = 'tsne')
pyLDAvis.display(topic_data)

  default_term_info = default_term_info.sort_values(
