In [199]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
plt.style.use ('ggplot')
import seaborn as sns
%matplotlib inline
sns.set_style(style="whitegrid")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.feature_extraction import text

from bs4 import BeautifulSoup

import nltk, string, contractions

from gensim.corpora.dictionary import Dictionary
from gensim.models.nmf import Nmf
from gensim.models.coherencemodel import CoherenceModel

from operator import itemgetter

In [204]:
df = pd.read_csv('data/modeling_ready_microwave1')

In [205]:
df.dropna(inplace=True)

In [206]:
df.review_date = pd.to_datetime(df.review_date)

In [207]:
def vectorize_this(max_features, min_df, max_df, ngram_max):
    vectorizer = TfidfVectorizer(tokenizer = lemmatize_text,
                             stop_words= review_stop_words,
                             max_features = max_features,
                             min_df = min_df,
                             max_df =  max_df,
                             ngram_range=(1, ngram_max)
                            )
    X = df['model_ready']
    X = vectorizer.fit_transform(X)
    return X, vectorizer

In [208]:
X, vectorizer = vectorize_this(10000, 4, .95, 3)

In [209]:
def make_nmf(n_components, alpha, X):
    nmf = NMF(
            n_components=n_components,
            init='nndsvd',
            random_state=12345,
            alpha = alpha
            ).fit(X)

    W = nmf.fit_transform(X)
    H = nmf.components_
    return nmf, W, H

In [210]:
nmf, W, H, = make_nmf(8, .1, X)

In [211]:
def topic_keywords(vectorizer=vectorizer, lda_model=nmf, n_words=10):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

In [212]:
topic_keywords = topic_keywords()        

In [213]:
topic_keywords

[array(['popcorn', 'button', 'setting', 'bag', 'minute', 'popcorn button',
        'pop', 'push', 'popcorn setting', 'make'], dtype='<U26'),
 array(['door', 'open', 'latch', 'close', 'door latch', 'door open',
        'push', 'hard', 'problem', 'slam'], dtype='<U26'),
 array(['month', 'lasted', 'working', 'warranty', 'service', 'repair',
        'died', 'worked', '3', 'stopped'], dtype='<U26'),
 array(['large', 'fit', 'price', 'big', 'size', 'space', 'powerful',
        'counter', 'kitchen', 'need'], dtype='<U26'),
 array(['sensor', 'reheat', 'food', 'sensor reheat', 'defrost', 'heat',
        'function', 'hot', 'heat food', 'evenly'], dtype='<U26'),
 array(['model', 'new', 'replace', 'previous', 'replaced', 'new model',
        'died', 'older', 'similar', 'purchased'], dtype='<U26'),
 array(['power', 'level', 'power level', 'watt', 'inverter', '1250',
        '1250 watt', 'cooking', 'setting', 'power setting'], dtype='<U26'),
 array(['time', 'used', 'long', 'cooking', 'second', 'cooki

In [214]:
def topic_featuring(n_components=10, n_words=10):
    
    nmf, W, H = make_nmf(n_components, .1, X)
    

    # Topic - Keywords Dataframe
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]

    Topics_theme = range(n_components)
    df_topic_keywords['topic_theme'] = Topics_theme
    df_topic_keywords.set_index('topic_theme', inplace=True)
    return df_topic_keywords.T

In [215]:
topic_featuring(8)

topic_theme,0,1,2,3,4,5,6,7
Word 0,popcorn,door,month,large,sensor,model,power,time
Word 1,button,open,lasted,fit,reheat,new,level,used
Word 2,setting,latch,working,price,food,replace,power level,long
Word 3,bag,close,warranty,big,sensor reheat,previous,watt,cooking
Word 4,minute,door latch,service,size,defrost,replaced,inverter,second
Word 5,popcorn button,door open,repair,space,heat,new model,1250,cooking time
Word 6,pop,push,died,powerful,function,died,1250 watt,heat
Word 7,push,hard,worked,counter,hot,older,cooking,item
Word 8,popcorn setting,problem,3,kitchen,heat food,similar,setting,long time
Word 9,make,slam,stopped,need,evenly,purchased,power setting,time used


In [216]:
def nmf_featurizer(max_features=10000, min_df=4, max_df=.8, ngram_max = 2, n_components=10, n_words=10, column_names_known = 'n'):
    X, vectorizer = vectorize_this(max_features, min_df, max_df, ngram_max)
    
    nmf, W, H = make_nmf(n_components, .1, X)
    
    # Topic - Keywords Dataframe
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
    
    if column_names_known == 'n':
        Topics_theme = range(n_components)
    elif column_names_known == 'Y':
        Topics_theme = topic_labels
    df_topic_keywords['topic_theme'] = Topics_theme
    df_topic_keywords.set_index('topic_theme', inplace=True)
    return df_topic_keywords.T

In [217]:
nmf_featurizer(max_df=.95, ngram_max=3, n_components=8)

topic_theme,0,1,2,3,4,5,6,7
Word 0,popcorn,door,month,large,sensor,model,power,time
Word 1,button,open,lasted,fit,reheat,new,level,used
Word 2,setting,latch,working,price,food,replace,power level,long
Word 3,bag,close,warranty,big,sensor reheat,previous,watt,cooking
Word 4,minute,door latch,service,size,defrost,replaced,inverter,second
Word 5,popcorn button,door open,repair,space,heat,new model,1250,cooking time
Word 6,pop,push,died,powerful,function,died,1250 watt,heat
Word 7,push,hard,worked,counter,hot,older,cooking,item
Word 8,popcorn setting,problem,3,kitchen,heat food,similar,setting,long time
Word 9,make,slam,stopped,need,evenly,purchased,power setting,time used


In [218]:
vocabulary = np.array(vectorizer.get_feature_names())

In [None]:
def label_topics(H, vocabulary):
    '''
    Print the most influential words of each latent topic, and prompt the user
    to label each topic. The user should use their humanness to figure out what
    each latent topic is capturing.
    '''
    topic_labels = []
    for i, row in enumerate(H):
        top_five = np.argsort(row)[::-1][:12]
        print('topic', i)
        print('-->', ' '.join(vocabulary[top_five]))
        label = input('please label this topic: ')
        topic_labels.append(label)
        print()
    return topic_labels

In [223]:
nmf_featurizer(max_df=.95, ngram_max=3, n_components=8)

topic_theme,0,1,2,3,4,5,6,7
Word 0,popcorn,door,month,large,sensor,model,power,time
Word 1,button,open,lasted,fit,reheat,new,level,used
Word 2,setting,latch,working,price,food,replace,power level,long
Word 3,bag,close,warranty,big,sensor reheat,previous,watt,cooking
Word 4,minute,door latch,service,size,defrost,replaced,inverter,second
Word 5,popcorn button,door open,repair,space,heat,new model,1250,cooking time
Word 6,pop,push,died,powerful,function,died,1250 watt,heat
Word 7,push,hard,worked,counter,hot,older,cooking,item
Word 8,popcorn setting,problem,3,kitchen,heat food,similar,setting,long time
Word 9,make,slam,stopped,need,evenly,purchased,power setting,time used


In [224]:
topic_labels = label_topics(H,vocabulary)

topic 0
--> popcorn button setting bag minute popcorn button pop push popcorn setting make quick press
please label this topic: popcorn

topic 1
--> door open latch close door latch door open push hard problem slam open door shut
please label this topic: door

topic 2
--> month lasted working warranty service repair died worked 3 stopped week purchased
please label this topic: product lifetime

topic 3
--> large fit price big size space powerful counter kitchen need needed wanted
please label this topic: size

topic 4
--> sensor reheat food sensor reheat defrost heat function hot heat food evenly reheat function turbo
please label this topic: programmed functions

topic 5
--> model new replace previous replaced new model died older similar purchased 10 needed
please label this topic: replaced old model

topic 6
--> power level power level watt inverter 1250 1250 watt cooking setting power setting watt power technology
please label this topic: power

topic 7
--> time used long cooking s

In [None]:
nmf_featurizer(max_df=.95, ngram_max=3, n_components=8, column_names_known = 'Y')

In [27]:
def softmax(v, temperature=1.0):
    '''
    A heuristic to convert arbitrary positive values into probabilities.
    See: https://en.wikipedia.org/wiki/Softmax_function
    '''
    expv = np.exp(v / temperature)
    s = np.sum(expv)
    return expv / s

In [28]:
def analyze_reviews(W, topic_labels):
    '''
    Print an analysis of a single Amazon review, including the review ID
    and a summary of which topics it represents. The topics are identified
    via the hand-labels which were assigned by the user.
    '''
    topic_count = len(topic_labels)
    topic_percentages = [[] for i in range(0, topic_count)]

    for idx, i in enumerate(W):  
        probs = softmax(W[idx], temperature=0.01)
        topic_list_counter = 0
        for prob, label in zip(probs, topic_labels):
            topic_percentages[topic_list_counter].append(round(prob, 5))
            topic_list_counter += 1
        
    return pd.DataFrame(dict(zip(topic_labels, topic_percentages)))

In [112]:
top_df = analyze_reviews(W, topic_labels)

In [120]:
top_df

Unnamed: 0,cooking time,door latch / shutting,customer service and support,external size,programmed functions,SOUP,cooking power / wattage,door open button,perceived value,cooks food evenly,SOUP (cook time),product lifetime,popcorn feature,short product lifetime
0,0.00001,0.99999,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
1,0.00000,0.00000,0.00000,0.00009,0.00000,0.00002,0.00000,0.00000,0.00000,0.00003,0.99983,0.00000,0.00000,0.00000
2,0.00000,0.00000,0.00000,0.99993,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
3,0.00278,0.00326,0.91190,0.00286,0.00278,0.00604,0.00278,0.00278,0.00278,0.00282,0.00278,0.00278,0.00278,0.03392
4,0.00000,0.99998,0.00000,0.00000,0.00000,0.00000,0.00000,0.00002,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1870,0.00107,0.00000,0.00000,0.00002,0.00000,0.99859,0.00000,0.00000,0.00000,0.00029,0.00000,0.00000,0.00000,0.00000
1871,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.99832,0.00168,0.00000,0.00000,0.00000
1872,0.00000,0.00000,0.00000,0.00150,0.00000,0.00000,0.00000,0.00000,0.00001,0.04877,0.00005,0.00000,0.94964,0.00000
1873,0.99960,0.00001,0.00002,0.00001,0.00001,0.00001,0.00001,0.00001,0.00001,0.00007,0.00001,0.00002,0.00001,0.00003


In [113]:
top_df.iloc[0]

cooking time                    0.00001
door latch / shutting           0.99999
customer service and support    0.00000
external size                   0.00000
programmed functions            0.00000
SOUP                            0.00000
cooking power / wattage         0.00000
door open button                0.00000
perceived value                 0.00000
cooks food evenly               0.00000
SOUP (cook time)                0.00000
product lifetime                0.00000
popcorn feature                 0.00000
short product lifetime          0.00000
Name: 0, dtype: float64

In [116]:
feature_score = {}
for _ in range(len(top_df.columns)):
    key = top_df.columns[_]
    feature_score[key] = round(sum(top_df[key])/(len(top_df)),5)

In [117]:
feature_score

{'cooking time': 0.05757,
 'door latch / shutting': 0.0695,
 'customer service and support': 0.08676,
 'external size': 0.09258,
 'programmed functions': 0.06032,
 'SOUP': 0.04082,
 'cooking power / wattage': 0.068,
 'door open button': 0.0575,
 'perceived value': 0.05023,
 'cooks food evenly': 0.05713,
 'SOUP (cook time)': 0.05456,
 'product lifetime': 0.04173,
 'popcorn feature': 0.04421,
 'short product lifetime': 0.0565}

In [118]:
sorted(feature_score.items(), key= lambda x: x[1])

[('SOUP', 0.04082),
 ('product lifetime', 0.04173),
 ('popcorn feature', 0.04421),
 ('perceived value', 0.05023),
 ('SOUP (cook time)', 0.05456),
 ('short product lifetime', 0.0565),
 ('cooks food evenly', 0.05713),
 ('door open button', 0.0575),
 ('cooking time', 0.05757),
 ('programmed functions', 0.06032),
 ('cooking power / wattage', 0.068),
 ('door latch / shutting', 0.0695),
 ('customer service and support', 0.08676),
 ('external size', 0.09258)]

In [133]:
topic_words_df = nmf_featurizer(column_names_known ='Y')
topic_words_df

topic_theme,cooking time,door latch / shutting,customer service and support,external size,programmed functions,SOUP,cooking power / wattage,door open button,perceived value,cooks food evenly,SOUP (cook time),product lifetime,SOUP.1,product lifetime.1,popcorn feature,SOUP.2,short product lifetime
Word 0,time,door,service,large,sensor,model,power,open,price,food,cook,lasted,used,working,popcorn,powerful,month
Word 1,second,latch,repair,fit,reheat,new,level,button,store,heat,fast,second,little,stopped,setting,big,buy
Word 2,cooking,close,day,space,sensor reheat,replace,power level,door,best,evenly,cook fast,died,far,stopped working,bag,big powerful,worked
Word 3,cooking time,door latch,warranty,size,defrost,new model,watt,push,size,heat food,cook evenly,lasted 10,long,quit working,popcorn setting,room,fine
Word 4,long,problem,customer,counter,function,previous,inverter,door open,right,quickly,evenly,10,look,quit,button,powerful large,6 month
Word 5,long time,slam,customer service,kitchen,food,died,1250,open door,delivery,cook food,sensor cook,owned,expected,far,pop,sure,6
Word 6,item,shut,center,counter space,reheat function,older,1250 watt,push button,needed,heating,cook food,previous lasted,cooking,month working,popcorn button,loud,died
Word 7,arrived time,door close,shipping,need,turbo,replaced,power setting,button open,arrived,food evenly,potato,long,think,working far,burn,spacious,warranty
Word 8,minute,hard,called,needed,sensor cook,similar,watt power,hand,expected,food quickly,larger,going,bit,completely stopped,size,inside,ago
Word 9,start,slam door,purchase,dish,warm,older model,cooking,pull,value,heat evenly,minute,previous,getting,color,make,second,month ago


In [104]:
review_df = pd.concat([df, top_df.reindex(df.index)], axis=1)[['review_body', '1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17']]

In [110]:
print(review_df.review_body[0])
print(review_df.iloc[0])

I have had this microwave for just over 3 years and the door won't close, or the microwave doesn't know that the door is closed so it won't turn on. It is rather irritating to be having to keep slamming and pressing and shaking the door to make it start. Many a times we end up heating/cooking on the stove. I do not think it is worth the over $100 we have to pay for it and throw it after 2-3 years. I see that many others have mentioned this problem.
review_body    I have had this microwave for just over 3 year...
1                                                          1e-05
2                                                        0.99999
3                                                              0
4                                                              0
5                                                              0
6                                                              0
7                                                              0
8                          

TypeError: 'DataFrame' object is not callable

In [None]:
(10000, 4,.8) 8
(10000, 4,.85) 8
(10000, 4,.9) 8
(10000, 4,.95) 8


(10000, 6,.8) 9
(10000, 6,.85) 9
(10000, 6,.9) 9
(10000, 6,.95) 9