In [1]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
plt.style.use ('ggplot')
import seaborn as sns
%matplotlib inline
sns.set_style(style="whitegrid")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.feature_extraction import text

from bs4 import BeautifulSoup

import nltk, string, contractions, random
from nltk.stem.snowball import SnowballStemmer

from gensim.corpora.dictionary import Dictionary
from gensim.models.nmf import Nmf
from gensim.models.coherencemodel import CoherenceModel

from operator import itemgetter

In [2]:
# Read the three "modeling ready" microwave review files into separate frames.
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/modeling_ready_microwave1',
        'data/modeling_ready_microwave2',
        'data/modeling_ready_microwave3',
    )
)

In [3]:
# Remove every row that contains a missing value, modifying each frame in place.
# NOTE(review): the index is not reset here, so row labels may have gaps
# afterwards -- confirm that later label-based lookups expect that.
for _frame in (df1, df2, df3):
    _frame.dropna(inplace=True)

In [4]:
# Convert the review_date column of each frame to pandas datetimes.
for _frame in (df1, df2, df3):
    _frame['review_date'] = pd.to_datetime(_frame['review_date'])

In [5]:
# Words from sklearn's built-in English stop-word list that should NOT be
# added to the stop list (mostly negations, plus 'fire' and 'off').
keep_words = [
    'not', 'ain', 'aren', "aren't", 'couldn', "couldn't",
    'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't",
    'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
    'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn',
    "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
    'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
    'wouldn', "wouldn't", 'fire', 'off',
]

# Scratch list for stop words that are being tried out; appended last.
test_stop_words = []

# Hand-picked stop words for this review corpus.
# NOTE(review): 'not' appears here as well as in keep_words, so it still ends
# up in the final stop list -- confirm which behaviour is intended.
# NOTE(review): the multi-word entries presumably never match, assuming stop
# words are compared against single tokens before n-grams are built -- verify.
review_stop_words = [
    'panasonic', 'really', 'husband', 'thanks', 'thank', 'ha',
    'just', 'thing', 'did', 'nn', 'wa', 'yr', 'u', 'say', 'doe',
    'mom', 'christmas', 'gift', 'got', 'way', 'le', 'daughter',
    'e', 'not', 'love', 'good', 'bought', 'great microwave', 'micro',
    'great oven', 'microwave', 'product', 'work great', 'nice work',
    'work great use', 'work great love', 'feature work great',
    'unit work great', 'oven work great', 'easy use love', 'old oven',
    'old old', '20 year old', 'unit', 'not', 'work', 'amazon', 'com',
    'old', 'wife', 'highly', 'recommend', 'like', 'charm', '20',
    'easy', 'oven', 'use', 'year', 'lot', 'pleased', 'happy', 'hope',
    'review', 'buy', 'far', 'day', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', '10', '20', 'great', 'excellent', 'feature', 'nice', 'old',
]

# Add every built-in English stop word except the ones listed in keep_words.
review_stop_words.extend(
    word for word in text.ENGLISH_STOP_WORDS if word not in keep_words
)
# Finally append any experimental stop words.
review_stop_words.extend(test_stop_words)
    
# Shared NLTK helpers: split on whitespace, then reduce each token to its lemma.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_text(text):
    """Return the WordNet lemma of every whitespace-separated token in ``text``.

    The parameter name shadows the ``sklearn.feature_extraction.text`` module
    inside this function only.
    """
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def vectorize_this(df, max_features, min_df, max_df, ngram_max):
    """Fit a TF-IDF vectorizer on ``df['model_ready']``.

    Tokens come from ``lemmatize_text`` and the stop list is the module-level
    ``review_stop_words``. N-grams from length 1 up to ``ngram_max`` are used.

    Returns
    -------
    tuple
        ``(X, vectorizer)``: the result of ``fit_transform`` on the column and
        the fitted ``TfidfVectorizer``.
    """
    tfidf_options = dict(
        tokenizer=lemmatize_text,
        stop_words=review_stop_words,
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        ngram_range=(1, ngram_max),
    )
    vectorizer = TfidfVectorizer(**tfidf_options)
    document_term_matrix = vectorizer.fit_transform(df['model_ready'])
    return document_term_matrix, vectorizer

In [298]:
# Settings for the first dataset's topic model.
topics1, grams1 = 14, 2    # number of NMF components; longest n-gram length
mindf1, maxdf1 = 2, .95    # min_df / max_df bounds handed to the vectorizer

In [296]:
# topics2 = 7
# mindf2 = 2
# maxdf2 = .99
# grams2 = 3

In [297]:
# topics3 = 7
# mindf3 = 2
# maxdf3 = .99
# grams3 = 3

In [385]:
# TF-IDF matrix and fitted vectorizer for the first dataset (1000 features max).
X1, vectorizer1 = vectorize_this(
    df1,
    max_features=1000,
    min_df=mindf1,
    max_df=maxdf1,
    ngram_max=grams1,
)

In [386]:
# X2, vectorizer2 = vectorize_this(df2, 10000, mindf2, maxdf2, grams2)

In [387]:
# X3, vectorizer3 = vectorize_this(df3, 10000, mindf3, maxdf3, grams3)

In [388]:
def make_nmf(n_components, alpha, X):
    """Fit an NMF topic model on ``X`` and return the model with both factors.

    Parameters
    ----------
    n_components : int
        Number of topics (components) to extract.
    alpha : float
        Regularisation strength, passed to ``NMF`` as ``alpha``.
        NOTE(review): newer scikit-learn releases replace ``alpha`` with
        ``alpha_W`` / ``alpha_H`` -- confirm against the installed version.
    X : array-like or sparse matrix
        Matrix to factorise (here: the TF-IDF document-term matrix).

    Returns
    -------
    tuple
        ``(nmf, W, H)`` where ``W`` is the output of ``fit_transform(X)`` and
        ``H`` is ``nmf.components_``.
    """
    nmf = NMF(
        n_components=n_components,
        init='nndsvd',
        random_state=12345,
        alpha=alpha,
    )

    # fit_transform both fits the model and returns W, so a separate .fit(X)
    # beforehand would only repeat the whole factorisation.
    W = nmf.fit_transform(X)
    H = nmf.components_
    return nmf, W, H

In [520]:
# Fit the topic model for the first dataset.
nmf1, W1, H1 = make_nmf(n_components=topics1, alpha=.1, X=X1)

In [390]:
# nmf2, W2, H2, = make_nmf(topics2, .1, X2)

In [391]:
# nmf3, W3, H3, = make_nmf(topics3, .1, X3)

In [521]:
def topic_keywords(nmf_model, vectorizer, n_words=10):
    """Return the ``n_words`` highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    nmf_model : object
        Fitted model exposing ``components_`` (one row of term weights per topic).
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or the older
        ``get_feature_names()``.
    n_words : int, default 10
        Number of terms to keep per topic.

    Returns
    -------
    list of numpy.ndarray
        One array of terms per topic, ordered from heaviest to lightest weight.
    """
    # Newer scikit-learn releases only provide get_feature_names_out(); older
    # ones only get_feature_names(). Support both.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    # Going through a list keeps the unicode dtype the original list-based API gave.
    keywords = np.array(list(feature_names))

    keywords_per_topic = []
    for topic_weights in nmf_model.components_:
        # Negating the weights makes argsort return the largest weights first.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        keywords_per_topic.append(keywords.take(top_keyword_locs))
    return keywords_per_topic

In [522]:
# Up to 500 vocabulary terms for topic 6, heaviest weight first.
_topic6_order = (-nmf1.components_[6]).argsort()[:500]
list1 = np.array(vectorizer1.get_feature_names()).take(_topic6_order).tolist()
list1

['button',
 'open',
 'push',
 'push button',
 'open door',
 'press',
 'minute',
 'door',
 'button open',
 'quick',
 'fine',
 'start',
 'door open',
 'push open',
 'hand',
 'used',
 'pull',
 'make',
 'quick minute',
 'feel',
 'minute button',
 'release',
 'second',
 'hit',
 'little',
 'quality',
 'potato',
 'stick',
 'cheap',
 'wish',
 'plastic',
 'previous',
 '30',
 'solid',
 'hard',
 'basic',
 'control',
 'easily',
 'touch',
 'start button',
 'expected',
 'inside',
 'reheat button',
 'opening',
 'prefer',
 'slide',
 'light',
 '30 second',
 'want',
 'pushing',
 'going',
 'using',
 'disappointed',
 'know',
 'big',
 'different',
 'having',
 'worked fine',
 'press start',
 'plate',
 'liked',
 'pushing button',
 'door release',
 'handle',
 'door opening',
 'user',
 'rest',
 'using month',
 'easier',
 'choose',
 'real',
 'min',
 'panel',
 'add',
 'popcorn button',
 'step',
 'hard open',
 'heating',
 'compared',
 'pressing',
 'job',
 'bit',
 'complaint',
 'reheating',
 'counter',
 'extra',
 

In [523]:
# Whitespace-separated tokens of the model_ready text stored under label 369.
list2 = df1['model_ready'][369].split()
list2

['purchased',
 'replace',
 'sharp',
 'nicely',
 'soft',
 'start',
 'inverter',
 'technology',
 'operates',
 'reduced',
 'power',
 'level',
 'user',
 'selects',
 'lower',
 'power',
 'setting',
 'instead',
 'constantly',
 'cycling',
 'off',
 'limited',
 'number',
 'cycle',
 'lifetime',
 'expect',
 'significantly',
 'longer']

In [519]:
((-nmf1.components_[1]).argsort()[:10])

array([263, 468, 159, 267, 585, 675, 807, 389, 820, 264])

In [524]:
# Print each term of list1 that also occurs in list2, with its position in list1.
for rank, term in enumerate(list1):
    if term not in list2:
        continue
    print(term, rank)



start 11
user 65
instead 154
off 254
lower 291
reduced 385
sharp 400
nicely 439


In [525]:
# Top keywords per topic for the first dataset.
topic_keywords1 = topic_keywords(nmf_model=nmf1, vectorizer=vectorizer1)

In [526]:
topic_keywords1

[array(['power', 'level', 'power level', 'inverter', 'power setting',
        'cooking', 'setting', 'off', 'set', 'lower'], dtype='<U19'),
 array(['door', 'latch', 'close', 'door latch', 'open', 'problem', 'shut',
        'hard', 'slam', 'door close'], dtype='<U19'),
 array(['month', 'lasted', 'warranty', 'service', 'died', 'worked',
        'repair', 'purchased', 'customer', 'cost'], dtype='<U19'),
 array(['large', 'fit', 'big', 'space', 'size', 'powerful', 'kitchen',
        'counter', 'need', 'counter space'], dtype='<U19'),
 array(['sensor', 'reheat', 'sensor reheat', 'function', 'defrost', 'food',
        'reheat function', 'sensor cook', 'turbo', 'setting'], dtype='<U19'),
 array(['model', 'new', 'replace', 'previous', 'replaced', 'new model',
        'older', 'similar', 'died', 'purchased'], dtype='<U19'),
 array(['button', 'open', 'push', 'push button', 'open door', 'press',
        'minute', 'door', 'button open', 'quick'], dtype='<U19'),
 array(['cook', 'fast', 'cook fast', '

In [527]:
W1.shape, H1.shape

((1848, 14), (14, 1000))

In [528]:
nmf1.components_.shape

(14, 1000)

In [529]:
# Top keywords per topic for the first dataset.
topic_keywords1 = topic_keywords(nmf_model=nmf1, vectorizer=vectorizer1)

In [530]:
# topic_keywords2 = topic_keywords(nmf2, vectorizer2)

In [531]:
# topic_keywords3 = topic_keywords(nmf3, vectorizer3)

In [532]:
topic_keywords1

[array(['power', 'level', 'power level', 'inverter', 'power setting',
        'cooking', 'setting', 'off', 'set', 'lower'], dtype='<U19'),
 array(['door', 'latch', 'close', 'door latch', 'open', 'problem', 'shut',
        'hard', 'slam', 'door close'], dtype='<U19'),
 array(['month', 'lasted', 'warranty', 'service', 'died', 'worked',
        'repair', 'purchased', 'customer', 'cost'], dtype='<U19'),
 array(['large', 'fit', 'big', 'space', 'size', 'powerful', 'kitchen',
        'counter', 'need', 'counter space'], dtype='<U19'),
 array(['sensor', 'reheat', 'sensor reheat', 'function', 'defrost', 'food',
        'reheat function', 'sensor cook', 'turbo', 'setting'], dtype='<U19'),
 array(['model', 'new', 'replace', 'previous', 'replaced', 'new model',
        'older', 'similar', 'died', 'purchased'], dtype='<U19'),
 array(['button', 'open', 'push', 'push button', 'open door', 'press',
        'minute', 'door', 'button open', 'quick'], dtype='<U19'),
 array(['cook', 'fast', 'cook fast', '

In [533]:
# def topic_featuring(X, topic_keywords, n_components=10, n_words=10):
    
#     nmf, W, H = make_nmf(n_components, .1, X)
    
#     # Topic - Keywords Dataframe
#     df_topic_keywords = pd.DataFrame(topic_keywords)
#     df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
#     df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]

#     Topics_theme = range(n_components)
#     df_topic_keywords['topic_theme'] = Topics_theme
#     df_topic_keywords.set_index('topic_theme', inplace=True)
#     return df_topic_keywords.T

In [534]:
# topic_featuring(X1, topic_keywords1, topics1)

In [535]:
# topic_featuring(X2, topic_keywords2, topics2)

In [536]:
# topic_featuring(X3, topic_keywords3, topics3)

In [537]:
def df_featurizer(df, topic_keywords, topic_labels=None, n_components=10, column_names_known = 'n'):
    """Build a table with one column per topic and one row per keyword rank.

    Parameters
    ----------
    df : object
        Unused; kept so existing calls keep working.
    topic_keywords : sequence of sequences
        Keywords for each topic, e.g. the output of ``topic_keywords``.
    topic_labels : sequence, optional
        Column names to use when ``column_names_known`` is ``'Y'``.
    n_components : int, default 10
        Kept for backward compatibility. The number of topics is taken from
        ``topic_keywords`` itself, so a mismatched value no longer raises.
    column_names_known : {'n', 'Y'}, default 'n'
        ``'n'`` numbers the topics ``0..k-1``; ``'Y'`` uses ``topic_labels``.
        The flag is compared case-insensitively.

    Returns
    -------
    pandas.DataFrame
        Rows ``'Word 0'``, ``'Word 1'``, ...; columns named by topic.

    Raises
    ------
    ValueError
        If the flag is not recognised, or ``'Y'`` is given without labels.
    """
    # Topic - Keywords Dataframe
    df_topic_keywords = pd.DataFrame(topic_keywords)
    n_topics, n_words = df_topic_keywords.shape
    df_topic_keywords.columns = ['Word '+str(i) for i in range(n_words)]
    df_topic_keywords.index = ['Topic '+str(i) for i in range(n_topics)]

    flag = str(column_names_known).lower()
    if flag == 'n':
        # Number the topics from the data so the labels always fit the frame.
        Topics_theme = range(n_topics)
    elif flag == 'y':
        if topic_labels is None:
            raise ValueError("topic_labels is required when column_names_known is 'Y'")
        Topics_theme = list(topic_labels)
    else:
        raise ValueError(
            "column_names_known must be 'n' or 'Y', got %r" % (column_names_known,)
        )
    df_topic_keywords['topic_theme'] = Topics_theme
    df_topic_keywords.set_index('topic_theme', inplace=True)
    return df_topic_keywords.T

In [538]:
df_featurizer(df1, topic_keywords1, n_components=topics1)

topic_theme,0,1,2,3,4,5,6,7,8,9,10,11,12,13
Word 0,power,door,month,large,sensor,model,button,cook,price,time,food,working,popcorn,watt
Word 1,level,latch,lasted,fit,reheat,new,open,fast,best,used,heat,stopped,setting,1250
Word 2,power level,close,warranty,big,sensor reheat,replace,push,cook fast,size,long,evenly,stopped working,bag,1250 watt
Word 3,inverter,door latch,service,space,function,previous,push button,cook evenly,expected,cooking,heat food,quit working,popcorn setting,watt power
Word 4,power setting,open,died,size,defrost,replaced,open door,evenly,store,item,quickly,quit,pop,little
Word 5,cooking,problem,worked,powerful,food,new model,press,sensor cook,right,second,heating,month,popcorn button,cubic
Word 6,setting,shut,repair,kitchen,reheat function,older,minute,minute,better,long time,cook food,completely,burn,cubic foot
Word 7,off,hard,purchased,counter,sensor cook,similar,door,larger,needed,cooking time,heat evenly,week,size,foot
Word 8,set,slam,customer,need,turbo,died,button open,cook food,delivery,time used,food evenly,stop,button,watt sensor
Word 9,lower,door close,cost,counter space,setting,purchased,quick,potato,value,arrived,make,color,popping,cuft


In [539]:
# nmf_featurizer(df2, max_df=maxdf2, ngram_max=grams2, n_components=topics2)

In [540]:
# nmf_featurizer(df3, max_df=maxdf3, ngram_max=grams3, n_components=topics3)

In [541]:
# Vocabulary learned by the vectorizer, held as an array so it can be indexed
# with arrays of positions.
vocabulary1 = np.asarray(vectorizer1.get_feature_names())
vocabulary1

array(['0', '00', '100', '100 power', '1000', '1000 watt', '11', '1100',
       '12', '1200', '1200 watt', '1250', '1250 watt', '13', '14', '15',
       '16', '17', '18', '2009', '2010', '2011', '2012', '2013', '2014',
       '21', '24', '25', '2nd', '30', '30 second', '40', '50', '60', '70',
       '80', '90', 'ability', 'able', 'absolutely', 'accurate', 'actual',
       'actually', 'add', 'added', 'addition', 'adjust', 'advertised',
       'age', 'ago', 'allow', 'allows', 'amana', 'amazing', 'amp',
       'annoying', 'anymore', 'apparently', 'appears', 'appliance',
       'april', 'area', 'arrived', 'asin', 'ask', 'asked', 'attention',
       'attractive', 'authorized', 'auto', 'automatic', 'automatically',
       'available', 'average', 'avoid', 'aware', 'away', 'awesome',
       'bacon', 'bad', 'bag', 'bag popcorn', 'baked', 'baked potato',
       'baking', 'barely', 'based', 'basic', 'beautiful', 'beautifully',
       'beep', 'beeper', 'began', 'believe', 'bell', 'bell whistle',
 

In [542]:
# vocabulary2 = np.array(vectorizer2.get_feature_names())

In [543]:
# vocabulary3 = np.array(vectorizer3.get_feature_names())

In [544]:
def label_topics(H, vocabulary, n_words=12):
    '''
    Print the most influential words of each latent topic, and prompt the user
    to label each topic. The user should use their humanness to figure out what
    each latent topic is capturing.

    Parameters
    ----------
    H : iterable of 1-D arrays
        One row of term weights per topic.
    vocabulary : array-like of str
        Term for each column of ``H``; converted to an array so plain lists work.
    n_words : int, default 12
        Number of top terms printed for each topic.

    Returns
    -------
    list of str
        The labels typed by the user, with surrounding whitespace removed so an
        accidental trailing space does not end up in column names.
    '''
    vocabulary = np.asarray(vocabulary)
    topic_labels = []
    for i, row in enumerate(H):
        # argsort is ascending, so reverse it to put the largest weights first.
        top_terms = np.argsort(row)[::-1][:n_words]
        print('topic', i)
        print('-->', ' '.join(vocabulary[top_terms]))
        label = input('please label this topic: ').strip()
        topic_labels.append(label)
        print()
    return topic_labels

In [546]:
# Interactively name each topic of the first dataset.
topic_labels1 = label_topics(H=H1, vocabulary=vocabulary1)

topic 0
--> power level power level inverter power setting cooking setting off set lower technology lower power
please label this topic: power

topic 1
--> door latch close door latch open problem shut hard slam door close broke door open
please label this topic: door close

topic 2
--> month lasted warranty service died worked repair purchased customer cost ago center
please label this topic: month 

topic 3
--> large fit big space size powerful kitchen counter need counter space dish inside
please label this topic: size

topic 4
--> sensor reheat sensor reheat function defrost food reheat function sensor cook turbo setting reheat sensor warm
please label this topic: sensor

topic 5
--> model new replace previous replaced new model older similar died purchased needed replacement
please label this topic: model

topic 6
--> button open push push button open door press minute door button open quick fine start
please label this topic: button

topic 7
--> cook fast cook fast cook evenly ev

In [547]:
# topic_labels2 = label_topics(H2,vocabulary2)

In [548]:
# topic_labels3 = label_topics(H3,vocabulary3)

In [549]:
df_featurizer(df1, topic_keywords1, topic_labels1, n_components=topics1, column_names_known = 'Y')

topic_theme,power,door close,month,size,sensor,model,button,cook,price,time,food,working,popcorn,wattage
Word 0,power,door,month,large,sensor,model,button,cook,price,time,food,working,popcorn,watt
Word 1,level,latch,lasted,fit,reheat,new,open,fast,best,used,heat,stopped,setting,1250
Word 2,power level,close,warranty,big,sensor reheat,replace,push,cook fast,size,long,evenly,stopped working,bag,1250 watt
Word 3,inverter,door latch,service,space,function,previous,push button,cook evenly,expected,cooking,heat food,quit working,popcorn setting,watt power
Word 4,power setting,open,died,size,defrost,replaced,open door,evenly,store,item,quickly,quit,pop,little
Word 5,cooking,problem,worked,powerful,food,new model,press,sensor cook,right,second,heating,month,popcorn button,cubic
Word 6,setting,shut,repair,kitchen,reheat function,older,minute,minute,better,long time,cook food,completely,burn,cubic foot
Word 7,off,hard,purchased,counter,sensor cook,similar,door,larger,needed,cooking time,heat evenly,week,size,foot
Word 8,set,slam,customer,need,turbo,died,button open,cook food,delivery,time used,food evenly,stop,button,watt sensor
Word 9,lower,door close,cost,counter space,setting,purchased,quick,potato,value,arrived,make,color,popping,cuft


In [550]:
# df_featurizer(df2, topic_keywords2, topic_labels2, n_components=topics2, column_names_known = 'Y')

In [551]:
# df_featurizer(df3, topic_keywords3, topic_labels3, n_components=topics3, column_names_known = 'Y')

In [552]:
def softmax(v, temperature=1.0):
    '''
    A heuristic to convert arbitrary values into probabilities.
    See: https://en.wikipedia.org/wiki/Softmax_function

    Parameters
    ----------
    v : array-like
        Scores; the normalisation runs over all elements.
    temperature : float, default 1.0
        Divides the scores before exponentiation; small values sharpen the
        distribution.

    Returns
    -------
    numpy.ndarray
        Non-negative values with the same shape as ``v`` that sum to 1.
        An empty input yields an empty array.
    '''
    scaled = np.asarray(v, dtype=float) / temperature
    if scaled.size == 0:
        return scaled
    # Subtracting the maximum leaves the result mathematically unchanged but
    # keeps np.exp from overflowing to inf (and the ratio from becoming nan)
    # when the temperature is small.
    expv = np.exp(scaled - np.max(scaled))
    return expv / np.sum(expv)

In [553]:
def analyze_reviews(W, topic_labels):
    '''
    Turn the document-topic weights in W into a table of per-topic scores.

    Each row of W is passed through softmax with temperature .01 and every
    resulting value is rounded to 5 decimals. The returned DataFrame has one
    column per entry of topic_labels (the hand-assigned labels) and one row
    per row of W.
    '''
    per_topic_scores = [[] for _ in range(len(topic_labels))]

    for row_number, _ in enumerate(W):
        probs = softmax(W[row_number], temperature=.01)
        # Pairing with topic_labels caps the walk at the number of labels.
        for scores, (prob, _label) in zip(per_topic_scores, zip(probs, topic_labels)):
            scores.append(round(prob, 5))

    return pd.DataFrame(dict(zip(topic_labels, per_topic_scores)))

In [554]:
# Per-review topic scores for the first dataset.
top_df1 = analyze_reviews(W=W1, topic_labels=topic_labels1)

In [555]:
# top_df2 = analyze_reviews(W2, topic_labels2)

In [556]:
# top_df3 = analyze_reviews(W3, topic_labels3)

In [557]:
top_df1

Unnamed: 0,power,door close,month,size,sensor,model,button,cook,price,time,food,working,popcorn,wattage
0,0.00000,0.99999,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00001,0.00000,0.00000,0.00000,0.00000
1,0.00000,0.00000,0.00000,0.00001,0.00000,0.00000,0.00000,0.99999,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
2,0.00001,0.00001,0.00001,0.99983,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001
3,0.01001,0.01428,0.76334,0.01190,0.01001,0.08203,0.01001,0.01001,0.01381,0.01001,0.01006,0.01218,0.01016,0.03220
4,0.00000,1.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1843,0.00030,0.00013,0.00013,0.28535,0.00015,0.00089,0.00013,0.00013,0.00048,0.24919,0.46275,0.00013,0.00013,0.00013
1844,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.01272,0.00000,0.00000,0.98727,0.00000,0.00000,0.00000
1845,0.00000,0.00000,0.00000,0.00023,0.00000,0.00000,0.00000,0.00002,0.00002,0.00000,0.05836,0.00000,0.94135,0.00000
1846,0.00001,0.00001,0.00003,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.99984,0.00002,0.00001,0.00001,0.00001


In [558]:
top_df1.loc[0]

power         0.00000
door close    0.99999
month         0.00000
size          0.00000
sensor        0.00000
model         0.00000
button        0.00000
cook          0.00000
price         0.00000
time          0.00001
food          0.00000
working       0.00000
popcorn       0.00000
wattage       0.00000
Name: 0, dtype: float64

In [559]:
def feature_score(top_df):
    """Return ``{column: mean}`` for every column of ``top_df``.

    Each mean is the column's sum divided by the number of rows, rounded to
    5 decimals.
    """
    return {
        label: round(sum(top_df[label]) / len(top_df), 5)
        for label in top_df.columns
    }

In [560]:
# Average score per topic, listed from highest to lowest.
feature_score1 = feature_score(top_df1)
sorted(feature_score1.items(), key=itemgetter(1), reverse=True)

[('month ', 0.12697),
 ('size', 0.11984),
 ('door close', 0.09322),
 ('model', 0.07896),
 ('time', 0.07445),
 ('food', 0.06337),
 ('price', 0.06265),
 ('button', 0.06171),
 ('sensor', 0.06114),
 ('cook', 0.06087),
 ('power', 0.06047),
 ('working', 0.04711),
 ('wattage', 0.04477),
 ('popcorn', 0.04447)]

In [561]:
# feature_score2 = feature_score(top_df2)
# sorted(feature_score2.items(), key= lambda x: x[1], reverse=True)

In [562]:
# feature_score3 = feature_score(top_df3)
# sorted(feature_score3.items(), key= lambda x: x[1], reverse=True)

In [563]:
topic_words_df1 = df_featurizer(
    df1,
    topic_keywords1,
    topic_labels=topic_labels1,
    column_names_known='Y',
)
# topic_words_df2 = df_featurizer(df2, topic_keywords2, topic_labels2, column_names_known ='Y')
# topic_words_df3 = df_featurizer(df3, topic_keywords3, topic_labels3, column_names_known ='Y')


In [564]:
topic_words_df1

topic_theme,power,door close,month,size,sensor,model,button,cook,price,time,food,working,popcorn,wattage
Word 0,power,door,month,large,sensor,model,button,cook,price,time,food,working,popcorn,watt
Word 1,level,latch,lasted,fit,reheat,new,open,fast,best,used,heat,stopped,setting,1250
Word 2,power level,close,warranty,big,sensor reheat,replace,push,cook fast,size,long,evenly,stopped working,bag,1250 watt
Word 3,inverter,door latch,service,space,function,previous,push button,cook evenly,expected,cooking,heat food,quit working,popcorn setting,watt power
Word 4,power setting,open,died,size,defrost,replaced,open door,evenly,store,item,quickly,quit,pop,little
Word 5,cooking,problem,worked,powerful,food,new model,press,sensor cook,right,second,heating,month,popcorn button,cubic
Word 6,setting,shut,repair,kitchen,reheat function,older,minute,minute,better,long time,cook food,completely,burn,cubic foot
Word 7,off,hard,purchased,counter,sensor cook,similar,door,larger,needed,cooking time,heat evenly,week,size,foot
Word 8,set,slam,customer,need,turbo,died,button open,cook food,delivery,time used,food evenly,stop,button,watt sensor
Word 9,lower,door close,cost,counter space,setting,purchased,quick,potato,value,arrived,make,color,popping,cuft


In [565]:
# Columns to keep: the raw review text followed by one column per topic label.
review_df_columns1 = ['review_body', *topic_labels1]
# review_df_columns2 = (['review_body'] + topic_labels2)
# review_df_columns3 = (['review_body'] + topic_labels3)

In [566]:
# top_df1 is built row by row from W1, so row k holds the scores for the k-th
# row of df1 and carries a plain 0..n-1 index. df1 keeps its original labels
# after dropna, so reindex() would match scores by label and attach them to
# the wrong reviews wherever labels were dropped. Give the scores df1's index
# by position instead; set_index raises if the two lengths differ.
review_df1 = pd.concat([df1, top_df1.set_index(df1.index)], axis=1)[review_df_columns1]
# review_df2 = pd.concat([df2, top_df2.reindex(df2.index)], axis=1)[review_df_columns2]
# review_df3 = pd.concat([df3, top_df3.reindex(df3.index)], axis=1)[review_df_columns3]

In [567]:
def review_checker(review_df, num_samples):
    """Print ``num_samples`` randomly chosen reviews with their topic scores.

    Rows are drawn (with replacement) from the labels that actually exist in
    ``review_df.index``, so a frame whose index has gaps or does not start at 0
    can be sampled without a ``KeyError``. For every draw the function prints
    the row label, the ``review_body`` text and the remaining columns of the row.

    Parameters
    ----------
    review_df : pandas.DataFrame
        Frame whose first column is ``review_body``, followed by topic scores.
    num_samples : int
        Number of reviews to print.

    Returns
    -------
    None
        Nothing is printed when ``review_df`` is empty.
    """
    n_rows = len(review_df)
    if n_rows == 0:
        return

    # randrange excludes the upper bound, so every position is a valid row.
    sampled_labels = [
        review_df.index[random.randrange(n_rows)] for _ in range(num_samples)
    ]

    for label in sampled_labels:
        row = review_df.loc[label]
        print('Review ' + str(label))
        print(' ')
        print(row['review_body'])
        print(' ')
        # Everything after the first column (the review text) is a topic score.
        print(row[1:])
        print(' ')
        print('___________________________________________________')
        print(' ')
    

In [568]:
len(W1)

1848

In [569]:
W1[:,1].argsort()[-20:]

array([ 229,  621,  598,  654,  324,   14,  161,  155,  891,  128,   34,
        133,  601, 1340,  454, 1280,   31,  554, 1426,   60])

In [570]:
H1[:,0].argsort()[:]

array([ 1,  3,  5,  8, 10,  0, 11, 12,  4,  9,  2,  7,  6, 13])

In [571]:
df1.review_body[554]

'This microwave is AWESOME!!! I love the automatic defrost, automatic reheat, Automatic Cook, one touch button functions! They take ALL of the guess work out of microwaving ANYTHING. This thing is huge and has a turntable to match. It looks beautiful, works beautiful and anyone will love it. BUY THIS!'

In [572]:
np.array(vectorizer1.get_feature_names()).take(nmf1.components_[1].argsort()[:])[-100:]

array(['pushed', 'decent', 'noise', 'order', 'replacement', 'flaw',
       'trouble', 'happened', 'screw', 'think', 'paid', 'door opening',
       'try', 'definitely', 'negative', 'stopped', 'hard open', 'safety',
       'difficult', 'pushing', 'lever', 'half', 'need', 'guess',
       'release', 'noticed', 'handle', 'finally', 'star', 'opened',
       'door time', 'hand', 'repair', 'poor', 'spring', 'opening closing',
       'eventually', 'fix', 'make', 'month door', 'break', 'stop',
       'buying', 'unless', 'pull', 'bit', 'quality', 'read', 'place',
       'worked', 'stick', 'force', 'started', 'complaint', 'stay',
       'right', 'open close', 'completely', 'latch mechanism', 'slamming',
       'push', 'light', 'door shut', 'longer', 'ago', 'little', 'plastic',
       'problem door', 'broken', 'closing door', 'door hard',
       'door closed', 'start', 'issue', 'design', 'loud', 'switch',
       'opening', 'properly', 'hard close', 'off', 'open door',
       'close door', 'turn', '

In [573]:

# Print where row 368 falls in the ascending ordering of topic-2 weights.
_topic2_order = W1[:, 2].argsort()
for position, doc_row in enumerate(_topic2_order):
    if doc_row == 368:
        print(position)

618


In [574]:
df1.review_body[369]

'Purchased this to replace  a (only three year old) Sharp Microwave.  Works very nicely, with a soft start.  Because of the inverter technology, it operates at reduced power levels when the user selects a lower power setting, instead of constantly cycling the unit off and on.  Most everything has a limited number cycles in their lifetime...I expect this to last significantly longer'

In [575]:
review_df1.loc[369][1:]

power         0.00051
door close    0.00011
month         0.00014
size          0.01749
sensor        0.00088
model         0.00011
button        0.97983
cook          0.00011
price         0.00011
time          0.00011
food          0.00011
working       0.00019
popcorn       0.00011
wattage       0.00021
Name: 369, dtype: object

In [478]:
review_checker(review_df1, 5)

Review 1199
 
We were looking for a microwave similar to our old one, a Kenmore, and this one is does it. We especially like the defrost mode.
 
power         0
door latch    0
3             0
4             0
5             1
6             0
7             0
8             0
9             0
10            0
11            0
12            0
13            0
14            0
Name: 1199, dtype: object
 
___________________________________________________
 
Review 1454
 
I love this microwave. Powerful. Alot of Auto settings available. Its a little larger than I expected but still fits perfectly on the countertop.
 
power         0.00037
door latch     0.0001
3              0.0001
4             0.36818
5             0.00032
6              0.0001
7              0.0001
8              0.0001
9              0.0001
10            0.00106
11            0.62874
12             0.0001
13             0.0001
14             0.0005
Name: 1454, dtype: object
 
___________________________________________________