In [199]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
plt.style.use ('ggplot')
import seaborn as sns
%matplotlib inline
sns.set_style(style="whitegrid")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.feature_extraction import text

from bs4 import BeautifulSoup

import nltk, string, contractions

from gensim.corpora.dictionary import Dictionary
from gensim.models.nmf import Nmf
from gensim.models.coherencemodel import CoherenceModel

from operator import itemgetter

In [204]:
df = pd.read_csv('data/modeling_ready_microwave1')

In [205]:
df.dropna(inplace=True)

In [206]:
df.review_date = pd.to_datetime(df.review_date)

In [207]:
def vectorize_this(max_features, min_df, max_df, ngram_max):
    vectorizer = TfidfVectorizer(tokenizer = lemmatize_text,
                             stop_words= review_stop_words,
                             max_features = max_features,
                             min_df = min_df,
                             max_df =  max_df,
                             ngram_range=(1, ngram_max)
                            )
    X = df['model_ready']
    X = vectorizer.fit_transform(X)
    return X, vectorizer

In [248]:
X, vectorizer = vectorize_this(10000, 2, .9, 3)

In [249]:
def make_nmf(n_components, alpha, X):
    nmf = NMF(
            n_components=n_components,
            init='nndsvd',
            random_state=12345,
            alpha = alpha
            ).fit(X)

    W = nmf.fit_transform(X)
    H = nmf.components_
    return nmf, W, H

In [250]:
nmf, W, H, = make_nmf(16, .1, X)

In [252]:
def topic_keywords(vectorizer=vectorizer, lda_model=nmf, n_words=10):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

In [253]:
topic_keywords = topic_keywords()        

In [254]:
topic_keywords

[array(['time', 'cooking', 'cooking time', 'long', 'long time', 'item',
        'second', 'arrived time', 'run', 'minute'], dtype='<U31'),
 array(['door', 'open', 'latch', 'close', 'door latch', 'door open',
        'push', 'hard', 'problem', 'slam'], dtype='<U31'),
 array(['month', 'service', 'warranty', 'repair', 'customer', 'died',
        'problem', 'center', 'cost', 'worked'], dtype='<U31'),
 array(['large', 'fit', 'space', 'size', 'counter', 'kitchen', 'need',
        'counter space', 'larger', 'needed'], dtype='<U31'),
 array(['sensor', 'reheat', 'sensor reheat', 'function', 'defrost', 'food',
        'reheat function', 'sensor reheat function', 'turbo', 'hot'],
       dtype='<U31'),
 array(['big', 'turkey', 'expected', 'big powerful', 'big inside',
        'inside', 'job', 'big need', 'big big', 'need'], dtype='<U31'),
 array(['power', 'level', 'power level', 'inverter', 'setting',
        'power setting', 'cooking', 'set', 'off', 'lower'], dtype='<U31'),
 array(['working', 'st

In [255]:
def topic_featuring(n_components=10, n_words=10):
    
    nmf, W, H = make_nmf(n_components, .1, X)
    

    # Topic - Keywords Dataframe
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]

    Topics_theme = range(n_components)
    df_topic_keywords['topic_theme'] = Topics_theme
    df_topic_keywords.set_index('topic_theme', inplace=True)
    return df_topic_keywords.T

In [256]:
topic_featuring(16)

topic_theme,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
Word 0,time,door,month,large,sensor,big,power,working,lasted,price,model,watt,powerful,heat,popcorn,used
Word 1,cooking,open,service,fit,reheat,turkey,level,stopped,second,best,new,1250,powerful large,food,button,look
Word 2,cooking time,latch,warranty,space,sensor reheat,expected,power level,stopped working,going,store,replace,1250 watt,room,evenly,setting,little
Word 3,long,close,repair,size,function,big powerful,inverter,quit working,performance,right,replaced,watt power,loud,heat food,bag,long used
Word 4,long time,door latch,customer,counter,defrost,big inside,setting,quit,died,delivery,previous,1250 watt power,big powerful,quickly,minute,time used
Word 5,item,door open,died,kitchen,food,inside,power setting,month working,lasted 10,value,new model,fast,large,fast,popcorn button,getting
Word 6,second,push,problem,need,reheat function,job,cooking,month,owned,needed,older,little,spacious,food evenly,pop,think
Word 7,arrived time,hard,center,counter space,sensor reheat function,big need,set,given,10,size,similar,1000,hot,heat evenly,popcorn setting,daily
Word 8,run,problem,cost,larger,turbo,big big,off,color,month,best price,older model,1000 watt,spacious powerful,heating,push,used daily
Word 9,minute,slam,worked,needed,hot,need,lower,working month,long,arrived,died,watt sensor,simple,food quickly,make,compact


In [257]:
def nmf_featurizer(max_features=10000, min_df=4, max_df=.8, ngram_max = 2, n_components=10, n_words=10, column_names_known = 'n'):
    X, vectorizer = vectorize_this(max_features, min_df, max_df, ngram_max)
    
    nmf, W, H = make_nmf(n_components, .1, X)
    
    # Topic - Keywords Dataframe
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
    
    if column_names_known == 'n':
        Topics_theme = range(n_components)
    elif column_names_known == 'Y':
        Topics_theme = topic_labels
    df_topic_keywords['topic_theme'] = Topics_theme
    df_topic_keywords.set_index('topic_theme', inplace=True)
    return df_topic_keywords.T

In [258]:
nmf_featurizer(max_df=.9, ngram_max=3, n_components=16)

topic_theme,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
Word 0,time,door,month,large,sensor,big,power,working,lasted,price,model,watt,powerful,heat,popcorn,used
Word 1,cooking,open,service,fit,reheat,turkey,level,stopped,second,best,new,1250,powerful large,food,button,look
Word 2,cooking time,latch,warranty,space,sensor reheat,expected,power level,stopped working,going,store,replace,1250 watt,room,evenly,setting,little
Word 3,long,close,repair,size,function,big powerful,inverter,quit working,performance,right,replaced,watt power,loud,heat food,bag,long used
Word 4,long time,door latch,customer,counter,defrost,big inside,setting,quit,died,delivery,previous,1250 watt power,big powerful,quickly,minute,time used
Word 5,item,door open,died,kitchen,food,inside,power setting,month working,lasted 10,value,new model,fast,large,fast,popcorn button,getting
Word 6,second,push,problem,need,reheat function,job,cooking,month,owned,needed,older,little,spacious,food evenly,pop,think
Word 7,arrived time,hard,center,counter space,sensor reheat function,big need,set,given,10,size,similar,1000,hot,heat evenly,popcorn setting,daily
Word 8,run,problem,cost,larger,turbo,big big,off,color,month,best price,older model,1000 watt,spacious powerful,heating,push,used daily
Word 9,minute,slam,worked,needed,hot,need,lower,working month,long,arrived,died,watt sensor,simple,food quickly,make,compact


In [259]:
vocabulary = np.array(vectorizer.get_feature_names())

In [None]:
def label_topics(H, vocabulary):
    '''
    Print the most influential words of each latent topic, and prompt the user
    to label each topic. The user should use their humanness to figure out what
    each latent topic is capturing.
    '''
    topic_labels = []
    for i, row in enumerate(H):
        top_five = np.argsort(row)[::-1][:12]
        print('topic', i)
        print('-->', ' '.join(vocabulary[top_five]))
        label = input('please label this topic: ')
        topic_labels.append(label)
        print()
    return topic_labels

In [260]:
nmf_featurizer(max_df=.9, ngram_max=3, n_components=16)

topic_theme,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
Word 0,time,door,month,large,sensor,big,power,working,lasted,price,model,watt,powerful,heat,popcorn,used
Word 1,cooking,open,service,fit,reheat,turkey,level,stopped,second,best,new,1250,powerful large,food,button,look
Word 2,cooking time,latch,warranty,space,sensor reheat,expected,power level,stopped working,going,store,replace,1250 watt,room,evenly,setting,little
Word 3,long,close,repair,size,function,big powerful,inverter,quit working,performance,right,replaced,watt power,loud,heat food,bag,long used
Word 4,long time,door latch,customer,counter,defrost,big inside,setting,quit,died,delivery,previous,1250 watt power,big powerful,quickly,minute,time used
Word 5,item,door open,died,kitchen,food,inside,power setting,month working,lasted 10,value,new model,fast,large,fast,popcorn button,getting
Word 6,second,push,problem,need,reheat function,job,cooking,month,owned,needed,older,little,spacious,food evenly,pop,think
Word 7,arrived time,hard,center,counter space,sensor reheat function,big need,set,given,10,size,similar,1000,hot,heat evenly,popcorn setting,daily
Word 8,run,problem,cost,larger,turbo,big big,off,color,month,best price,older model,1000 watt,spacious powerful,heating,push,used daily
Word 9,minute,slam,worked,needed,hot,need,lower,working month,long,arrived,died,watt sensor,simple,food quickly,make,compact


In [261]:
topic_labels = label_topics(H,vocabulary)

topic 0
--> time cooking cooking time long long time item second arrived time run minute time used tell
please label this topic: cooking time

topic 1
--> door open latch close door latch door open push hard problem slam open door shut
please label this topic: door

topic 2
--> month service warranty repair customer died problem center cost worked week purchased
please label this topic: customer service & support

topic 3
--> large fit space size counter kitchen need counter space larger needed wanted dish
please label this topic: external size

topic 4
--> sensor reheat sensor reheat function defrost food reheat function sensor reheat function turbo hot warm reheat sensor
please label this topic: pre-programmed functions

topic 5
--> big turkey expected big powerful big inside inside job big need big big need room big fit
please label this topic: internal size

topic 6
--> power level power level inverter setting power setting cooking set off lower technology lower power
please label 

In [262]:
nmf_featurizer(max_df=.9, ngram_max=3, n_components=16, column_names_known = 'Y')

topic_theme,cooking time,door,customer service & support,external size,pre-programmed functions,internal size,power controls,short product lifetime,product lifetime,perceived value,replacement for older microwave,power and wattage,power,cooks evenly,popcorn,regular use
Word 0,time,door,month,large,sensor,big,power,working,lasted,price,model,watt,powerful,heat,popcorn,used
Word 1,cooking,open,service,fit,reheat,turkey,level,stopped,second,best,new,1250,powerful large,food,button,look
Word 2,cooking time,latch,warranty,space,sensor reheat,expected,power level,stopped working,going,store,replace,1250 watt,room,evenly,setting,little
Word 3,long,close,repair,size,function,big powerful,inverter,quit working,performance,right,replaced,watt power,loud,heat food,bag,long used
Word 4,long time,door latch,customer,counter,defrost,big inside,setting,quit,died,delivery,previous,1250 watt power,big powerful,quickly,minute,time used
Word 5,item,door open,died,kitchen,food,inside,power setting,month working,lasted 10,value,new model,fast,large,fast,popcorn button,getting
Word 6,second,push,problem,need,reheat function,job,cooking,month,owned,needed,older,little,spacious,food evenly,pop,think
Word 7,arrived time,hard,center,counter space,sensor reheat function,big need,set,given,10,size,similar,1000,hot,heat evenly,popcorn setting,daily
Word 8,run,problem,cost,larger,turbo,big big,off,color,month,best price,older model,1000 watt,spacious powerful,heating,push,used daily
Word 9,minute,slam,worked,needed,hot,need,lower,working month,long,arrived,died,watt sensor,simple,food quickly,make,compact


In [226]:
def softmax(v, temperature=1.0):
    '''
    A heuristic to convert arbitrary positive values into probabilities.
    See: https://en.wikipedia.org/wiki/Softmax_function
    '''
    expv = np.exp(v / temperature)
    s = np.sum(expv)
    return expv / s

In [227]:
def analyze_reviews(W, topic_labels):
    '''
    Print an analysis of a single Amazon review, including the review ID
    and a summary of which topics it represents. The topics are identified
    via the hand-labels which were assigned by the user.
    '''
    topic_count = len(topic_labels)
    topic_percentages = [[] for i in range(0, topic_count)]

    for idx, i in enumerate(W):  
        probs = softmax(W[idx], temperature=0.01)
        topic_list_counter = 0
        for prob, label in zip(probs, topic_labels):
            topic_percentages[topic_list_counter].append(round(prob, 5))
            topic_list_counter += 1
        
    return pd.DataFrame(dict(zip(topic_labels, topic_percentages)))

In [263]:
top_df = analyze_reviews(W, topic_labels)

In [264]:
top_df

Unnamed: 0,cooking time,door,customer service & support,external size,pre-programmed functions,internal size,power controls,short product lifetime,product lifetime,perceived value,replacement for older microwave,power and wattage,power,cooks evenly,popcorn,regular use
0,0.00039,0.99950,0.00004,0.00000,0.00001,0.00000,0.00001,0.00000,0.00001,0.00000,0.00000,0.00000,0.00000,0.00001,0.00000,0.00000
1,0.00042,0.00044,0.00042,0.09469,0.00042,0.00173,0.00042,0.00042,0.00042,0.00042,0.00042,0.00064,0.00585,0.89231,0.00042,0.00052
2,0.00000,0.00000,0.00000,0.99997,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00001,0.00000,0.00000,0.00000
3,0.00822,0.00822,0.82180,0.00822,0.00822,0.00891,0.00822,0.00832,0.00822,0.01116,0.04384,0.02246,0.00951,0.00822,0.00822,0.00822
4,0.00000,0.99999,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1843,0.00018,0.00000,0.00000,0.00001,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.99979,0.00001,0.00000,0.00000
1844,0.00000,0.00000,0.00000,0.00001,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.99997,0.00000,0.00000
1845,0.00000,0.00000,0.00000,0.00123,0.00000,0.00000,0.00000,0.00000,0.00000,0.00001,0.00000,0.00000,0.00000,0.95031,0.04840,0.00000
1846,0.99982,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00006,0.00001,0.00001


In [265]:
top_df.loc[0]

cooking time                       0.00039
door                               0.99950
customer service & support         0.00004
external size                      0.00000
pre-programmed functions           0.00001
internal size                      0.00000
power controls                     0.00001
short product lifetime             0.00000
product lifetime                   0.00001
perceived value                    0.00000
replacement for older microwave    0.00000
power and wattage                  0.00000
power                              0.00000
cooks evenly                       0.00001
popcorn                            0.00000
regular use                        0.00000
Name: 0, dtype: float64

In [266]:
feature_score = {}
for _ in range(len(top_df.columns)):
    key = top_df.columns[_]
    feature_score[key] = round(sum(top_df[key])/(len(top_df)),5)

In [267]:
feature_score

{'cooking time': 0.05893,
 'door': 0.10089,
 'customer service & support': 0.11456,
 'external size': 0.10117,
 'pre-programmed functions': 0.06478,
 'internal size': 0.0396,
 'power controls': 0.06383,
 'short product lifetime': 0.04231,
 'product lifetime': 0.04238,
 'perceived value': 0.0515,
 'replacement for older microwave': 0.07272,
 'power and wattage': 0.04117,
 'power': 0.03404,
 'cooks evenly': 0.06591,
 'popcorn': 0.06207,
 'regular use': 0.04413}

In [268]:
sorted(feature_score.items(), key= lambda x: x[1])

[('power', 0.03404),
 ('internal size', 0.0396),
 ('power and wattage', 0.04117),
 ('short product lifetime', 0.04231),
 ('product lifetime', 0.04238),
 ('regular use', 0.04413),
 ('perceived value', 0.0515),
 ('cooking time', 0.05893),
 ('popcorn', 0.06207),
 ('power controls', 0.06383),
 ('pre-programmed functions', 0.06478),
 ('cooks evenly', 0.06591),
 ('replacement for older microwave', 0.07272),
 ('door', 0.10089),
 ('external size', 0.10117),
 ('customer service & support', 0.11456)]

In [269]:
topic_words_df = nmf_featurizer(column_names_known ='Y')
topic_words_df

topic_theme,cooking time,door,customer service & support,external size,pre-programmed functions,internal size,power controls,short product lifetime,product lifetime,perceived value,replacement for older microwave,power and wattage,power,cooks evenly,popcorn,regular use
Word 0,time,door,month,large,sensor,big,power,working,lasted,price,model,watt,powerful,heat,popcorn,used
Word 1,cooking,open,service,fit,reheat,turkey,level,stopped,second,best,new,1250,powerful large,food,button,look
Word 2,cooking time,latch,warranty,space,sensor reheat,expected,power level,stopped working,going,store,replace,1250 watt,room,evenly,setting,little
Word 3,long,close,repair,size,function,big powerful,inverter,quit working,performance,right,replaced,watt power,loud,heat food,bag,long used
Word 4,long time,door latch,customer,counter,defrost,big inside,setting,quit,died,delivery,previous,1250 watt power,big powerful,quickly,minute,time used
Word 5,item,door open,died,kitchen,food,inside,power setting,month working,lasted 10,value,new model,fast,large,fast,popcorn button,getting
Word 6,second,push,problem,need,reheat function,job,cooking,month,owned,needed,older,little,spacious,food evenly,pop,think
Word 7,arrived time,hard,center,counter space,sensor reheat function,big need,set,given,10,size,similar,1000,hot,heat evenly,popcorn setting,daily
Word 8,run,problem,cost,larger,turbo,big big,off,color,month,best price,older model,1000 watt,spacious powerful,heating,push,used daily
Word 9,minute,slam,worked,needed,hot,need,lower,working month,long,arrived,died,watt sensor,simple,food quickly,make,compact


In [271]:
review_df = pd.concat([df, top_df.reindex(df.index)], axis=1)[['review_body',
                                                               'cooking time', 'door', 
                                                               'customer service & support',
                                                               'external size', 
                                                               'pre-programmed functions',
                                                               'internal size', 'power controls', 
                                                               'short product lifetime',
                                                               'product lifetime', 
                                                               'perceived value', 
                                                               'replacement for older microwave',
                                                               'power and wattage', 'power',
                                                               'cooks evenly', 'popcorn', 
                                                               'regular use']]

In [272]:
print(review_df.review_body[0])
print(review_df.loc[0])

I have had this microwave for just over 3 years and the door won't close, or the microwave doesn't know that the door is closed so it won't turn on. It is rather irritating to be having to keep slamming and pressing and shaking the door to make it start. Many a times we end up heating/cooking on the stove. I do not think it is worth the over $100 we have to pay for it and throw it after 2-3 years. I see that many others have mentioned this problem.
review_body                        I have had this microwave for just over 3 year...
cooking time                                                                 0.00039
door                                                                          0.9995
customer service & support                                                     4e-05
external size                                                                      0
pre-programmed functions                                                       1e-05
internal size                        

In [273]:
print(review_df.review_body[42])
print(review_df.loc[42])

STAY AWAY FROM PANASONIC MICROWAVES!!! I guess I should be happy, it worked fine for 24 months, which for today's microwaves is probably great. BUT...yesterday it almost burnt down my house!!! Put in 3 frozen microwave pancakes on a microwavable plate, set time for 1:00 minute (I checked to make sure I set timer right), pressed START, and went upstairs for a few minutes. Smelled smoke and ran back down to see my entire first floor engulfed in smoke from fire inside microwave. Unplugged and carefully carried unit outside. Researched this model and I've seen many reports/reviews of arcing and FIRE, just like what happened to me. DANGEROUS!!! If I hadn't come back down, might have set the kitchen and house on fire.
review_body                        STAY AWAY FROM PANASONIC MICROWAVES!!! I guess...
cooking time                                                                 0.00847
door                                                                         0.21134
customer service & supp

In [274]:
print(review_df.review_body[420])
print(review_df.loc[420])

Works as expected.  Nice big size. What else can I say. Was a present for my daughter. Good price. .
review_body                        Works as expected.  Nice big size. What else c...
cooking time                                                                 0.00018
door                                                                          0.0002
customer service & support                                                   0.00129
external size                                                                0.01156
pre-programmed functions                                                     0.94239
internal size                                                                0.00017
power controls                                                               0.00093
short product lifetime                                                       0.00017
product lifetime                                                             0.00019
perceived value                                  

In [275]:
print(review_df.review_body[666])
print(review_df.loc[666])

I wont' go into a ton of detail on this product.  The price was the selling point for me and the good avg reviews.  I will comment on the &#34;sensor reheat&#34; and &#34;sensor cook&#34; functions.  They actually work.  I was actually amazed at that.  After Thanksgiving I put 6 containers in the microwave at once and hit &#34;sensor reheat&#34;.  The containers were:  Turkey, Mashed Potatos, Stuffing, green beans,...hrm, can't remember the last two.  Regardless, of all the containers that came out only the mashed potatos required any further heating.<br /><br />On the sensor cook option it does a nice job overall but it did leave frozen chicken a bit undercooked near the middle of the leg sections.  But yes, I put those sections into it frozen to start with.  I'd count that a win.  (In case you're wondering I was planning on battering/frying that chicken anyhow so a little underdone was perfect.)
review_body                        I wont' go into a ton of detail on this produc...
cook