In [130]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
plt.style.use ('ggplot')
import seaborn as sns
%matplotlib inline
sns.set_style(style="whitegrid")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.feature_extraction import text

from bs4 import BeautifulSoup

import nltk, string, contractions, random
from nltk.stem.snowball import SnowballStemmer

from gensim.corpora.dictionary import Dictionary
from gensim.models.nmf import Nmf
from gensim.models.coherencemodel import CoherenceModel

from operator import itemgetter

In [126]:
df = pd.read_csv('data/modeling_ready_microwave1')

In [127]:
df.dropna(inplace=True)

In [128]:
df.review_date = pd.to_datetime(df.review_date)

In [131]:
keep_words = ['not', 'ain', 'aren', "aren't", 'couldn', "couldn't", 
              'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 
              'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 
              'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn',
              "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 
              'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 
              'wouldn', "wouldn't", 'fire', 'off']
test_stop_words = ['great', 'excellent', '1', '2', 'feature', 'nice', 'old']
review_stop_words = ['panasonic', 'really', 'husband', 'thanks', 'thank', 'ha', 
              'just', 'thing', 'did', 'nn', 'wa', 'yr', 'u', 'say', 'doe',
              'mom', 'christmas', 'gift', 'got', 'way', 'le', 'daughter',
              'e','not','love','good','bought','great microwave','micro',
              'great oven','microwave','product','work great','nice work',
              'work great use','work great love','feature work great',
              'unit work great','oven work great','easy use love','old oven',
              'old old','20 year old','unit', 'not', 'work', 'amazon', 'com',
              'old', 'wife', 'highly', 'recommend', 'like', 'charm', '20', 
              'easy', 'oven', 'use', 'year', 'lot', 'pleased', 'happy', 'hope',
              'review', 'buy', 'far', 'day', '1', '2', '3', '4', '5', '6', '7',
              '8', '9', '10', '20']
for _ in text.ENGLISH_STOP_WORDS:
    if _ in keep_words:
        pass
    else:
        review_stop_words.append(_)
for _ in test_stop_words:
    review_stop_words.append(_)

stemmer = SnowballStemmer("english")    
    
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()    
    
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

def vectorize_this(max_features, min_df, max_df, ngram_max):
    vectorizer = TfidfVectorizer(tokenizer = lemmatize_text,
                             stop_words= review_stop_words,
                             max_features = max_features,
                             min_df = min_df,
                             max_df =  max_df,
                             ngram_range=(1, ngram_max)
                            )
    X = df['model_ready']
    X = vectorizer.fit_transform(X)
    return X, vectorizer

In [162]:
topics = 4
mindf = 1
maxdf = .95
grams = 3

In [163]:
X, vectorizer = vectorize_this(10000, mindf, maxdf, grams)

In [164]:
def make_nmf(n_components, alpha, X):
    nmf = NMF(
            n_components=n_components,
            init='nndsvd',
            random_state=12345,
            alpha = alpha
            ).fit(X)

    W = nmf.fit_transform(X)
    H = nmf.components_
    return nmf, W, H

In [165]:
nmf, W, H, = make_nmf(topics, .1, X)

In [166]:
def topic_keywords(vectorizer=vectorizer, lda_model=nmf, n_words=10):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

In [167]:
topic_keywords = topic_keywords()        

In [168]:
topic_keywords

[array(['power', 'cook', 'time', 'food', 'sensor', 'reheat', 'button',
        'popcorn', 'cooking', 'setting'], dtype='<U32'),
 array(['door', 'open', 'latch', 'close', 'button', 'door latch', 'push',
        'door open', 'hard', 'problem'], dtype='<U32'),
 array(['lasted', 'month', 'working', 'warranty', 'service', 'repair',
        'died', 'new', 'model', 'worked'], dtype='<U32'),
 array(['price', 'large', 'fit', 'big', 'size', 'space', 'powerful',
        'need', 'counter', 'kitchen'], dtype='<U32')]

In [169]:
def topic_featuring(n_components=10, n_words=10):
    
    nmf, W, H = make_nmf(n_components, .1, X)
    

    # Topic - Keywords Dataframe
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]

    Topics_theme = range(n_components)
    df_topic_keywords['topic_theme'] = Topics_theme
    df_topic_keywords.set_index('topic_theme', inplace=True)
    return df_topic_keywords.T

In [170]:
topic_featuring(topics)

topic_theme,0,1,2,3
Word 0,power,door,lasted,price
Word 1,cook,open,month,large
Word 2,time,latch,working,fit
Word 3,food,close,warranty,big
Word 4,sensor,button,service,size
Word 5,reheat,door latch,repair,space
Word 6,button,push,died,powerful
Word 7,popcorn,door open,new,need
Word 8,cooking,hard,model,counter
Word 9,setting,problem,worked,kitchen


In [171]:
def nmf_featurizer(max_features=10000, min_df=4, max_df=.8, ngram_max = 2, n_components=10, n_words=10, column_names_known = 'n'):
    X, vectorizer = vectorize_this(max_features, min_df, max_df, ngram_max)
    
    nmf, W, H = make_nmf(n_components, .1, X)
    
    # Topic - Keywords Dataframe
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
    
    if column_names_known == 'n':
        Topics_theme = range(n_components)
    elif column_names_known == 'Y':
        Topics_theme = topic_labels
    df_topic_keywords['topic_theme'] = Topics_theme
    df_topic_keywords.set_index('topic_theme', inplace=True)
    return df_topic_keywords.T

In [172]:
nmf_featurizer(max_df=maxdf, ngram_max=grams, n_components=topics)

topic_theme,0,1,2,3
Word 0,power,door,lasted,price
Word 1,cook,open,month,large
Word 2,time,latch,working,fit
Word 3,food,close,warranty,big
Word 4,sensor,button,service,size
Word 5,reheat,door latch,repair,space
Word 6,button,push,died,powerful
Word 7,popcorn,door open,new,need
Word 8,cooking,hard,model,counter
Word 9,setting,problem,worked,kitchen


In [173]:
vocabulary = np.array(vectorizer.get_feature_names())

In [174]:
def label_topics(H, vocabulary):
    '''
    Print the most influential words of each latent topic, and prompt the user
    to label each topic. The user should use their humanness to figure out what
    each latent topic is capturing.
    '''
    topic_labels = []
    for i, row in enumerate(H):
        top_five = np.argsort(row)[::-1][:12]
        print('topic', i)
        print('-->', ' '.join(vocabulary[top_five]))
        label = input('please label this topic: ')
        topic_labels.append(label)
        print()
    return topic_labels

In [175]:
nmf_featurizer(max_df=maxdf, ngram_max=grams, n_components=topics)

topic_theme,0,1,2,3
Word 0,power,door,lasted,price
Word 1,cook,open,month,large
Word 2,time,latch,working,fit
Word 3,food,close,warranty,big
Word 4,sensor,button,service,size
Word 5,reheat,door latch,repair,space
Word 6,button,push,died,powerful
Word 7,popcorn,door open,new,need
Word 8,cooking,hard,model,counter
Word 9,setting,problem,worked,kitchen


In [176]:
topic_labels = label_topics(H,vocabulary)

topic 0
--> power cook time food sensor reheat button popcorn cooking setting used defrost
please label this topic: programmed functions

topic 1
--> door open latch close button door latch push door open hard problem open door slam
please label this topic: door

topic 2
--> lasted month working warranty service repair died new model worked purchased stopped
please label this topic: lifetime

topic 3
--> price large fit big size space powerful need counter kitchen needed wanted
please label this topic: size



In [177]:
nmf_featurizer(max_df=maxdf, ngram_max=grams, n_components=topics, column_names_known = 'Y')

topic_theme,programmed functions,door,lifetime,size
Word 0,power,door,lasted,price
Word 1,cook,open,month,large
Word 2,time,latch,working,fit
Word 3,food,close,warranty,big
Word 4,sensor,button,service,size
Word 5,reheat,door latch,repair,space
Word 6,button,push,died,powerful
Word 7,popcorn,door open,new,need
Word 8,cooking,hard,model,counter
Word 9,setting,problem,worked,kitchen


In [178]:
def softmax(v, temperature=1.0):
    '''
    A heuristic to convert arbitrary positive values into probabilities.
    See: https://en.wikipedia.org/wiki/Softmax_function
    '''
    expv = np.exp(v / temperature)
    s = np.sum(expv)
    return expv / s

In [179]:
def analyze_reviews(W, topic_labels):
    '''
    Print an analysis of a single Amazon review, including the review ID
    and a summary of which topics it represents. The topics are identified
    via the hand-labels which were assigned by the user.
    '''
    topic_count = len(topic_labels)
    topic_percentages = [[] for i in range(0, topic_count)]

    for idx, i in enumerate(W):  
        probs = softmax(W[idx], temperature=0.01)
        topic_list_counter = 0
        for prob, label in zip(probs, topic_labels):
            topic_percentages[topic_list_counter].append(round(prob, 5))
            topic_list_counter += 1
        
    return pd.DataFrame(dict(zip(topic_labels, topic_percentages)))

In [180]:
top_df = analyze_reviews(W, topic_labels)

In [181]:
top_df

Unnamed: 0,programmed functions,door,lifetime,size
0,0.00003,0.99996,0.00001,0.00000
1,0.07972,0.00281,0.00281,0.91467
2,0.00009,0.00009,0.00009,0.99972
3,0.01273,0.01273,0.95088,0.02365
4,0.00001,0.99999,0.00000,0.00000
...,...,...,...,...
1843,0.11868,0.00285,0.00285,0.87563
1844,0.95531,0.00084,0.00084,0.04301
1845,0.13805,0.00088,0.00088,0.86019
1846,0.51986,0.02807,0.42400,0.02807


In [182]:
top_df.loc[0]

programmed functions    0.00003
door                    0.99996
lifetime                0.00001
size                    0.00000
Name: 0, dtype: float64

In [183]:
feature_score = {}
for _ in range(len(top_df.columns)):
    key = top_df.columns[_]
    feature_score[key] = round(sum(top_df[key])/(len(top_df)),5)

In [184]:
feature_score

{'programmed functions': 0.31233,
 'door': 0.15698,
 'lifetime': 0.25422,
 'size': 0.27647}

In [185]:
sorted(feature_score.items(), key= lambda x: x[1], reverse=True)

[('programmed functions', 0.31233),
 ('size', 0.27647),
 ('lifetime', 0.25422),
 ('door', 0.15698)]

In [186]:
topic_words_df = nmf_featurizer(column_names_known ='Y')
topic_words_df

topic_theme,programmed functions,door,lifetime,size
Word 0,power,door,lasted,price
Word 1,cook,open,month,large
Word 2,time,latch,working,fit
Word 3,food,close,warranty,big
Word 4,sensor,button,service,size
Word 5,reheat,door latch,repair,space
Word 6,button,push,died,powerful
Word 7,popcorn,door open,new,need
Word 8,cooking,hard,model,counter
Word 9,setting,problem,worked,kitchen


In [187]:
review_df_columns = (['review_body'] + topic_labels)

In [188]:
review_df = pd.concat([df, top_df.reindex(df.index)], axis=1)[review_df_columns]

In [189]:
def review_checker(num_samples):
    review_index = []
    for _ in range(num_samples):
        review_index.append(random.randint(0, len(df)))
        
    for _ in review_index:
        print('Review ' + str(_))
        print(' ')
        print(review_df.review_body[_])
        print(' ')
        print(review_df.loc[_][1:])
        print(' ')
        print('___________________________________________________')
        print(' ')
    

In [190]:
review_checker(5)

Review 1107
 
If you're like me and hate the way most microwaves cook food at 100% even if you set the percent to less then you will love this microwave.  I typically like to cook at 60 or 70% so meat does not get tough.  Traditional microwaves are either on or off when cooking, so when you set it to 70%, it's off 30% of the cooking time (if you listen you can hear when the microwave turns on and off even if the light is still on and the platter is spinning).  Imagine cooking in your oven with this same principle.<br /><br />However, with an inverter microwave if you set it to 70% it cooks the food with 70% of the of the heat (or power).  Meats are much more tender.  In fact I'm now cooking meat dishes at 40 - 70% and extending the cooking time.  It's similar to slow cooking.  I'm doing sauces on 20 - 40%.  What a big step forward in technology.<br /><br />I like this particular unit.  It's large at 2.2 cubit units.  It has a nice software program too, with many features and is user fr