In [120]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
plt.style.use ('ggplot')
import seaborn as sns
%matplotlib inline
sns.set_style(style="whitegrid")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.feature_extraction import text

from bs4 import BeautifulSoup

import nltk, string, contractions, random

from gensim.corpora.dictionary import Dictionary
from gensim.models.nmf import Nmf
from gensim.models.coherencemodel import CoherenceModel

from operator import itemgetter

In [3]:
df = pd.read_csv('data/modeling_ready_microwave1')

In [4]:
df.dropna(inplace=True)

In [5]:
df.review_date = pd.to_datetime(df.review_date)

In [14]:
# Words (mostly negations, plus 'fire' and 'off') that must not be pulled in
# from scikit-learn's built-in English stop-word list further down.
keep_words = ['not', 'ain', 'aren', "aren't", 'couldn', "couldn't", 
              'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 
              'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 
              'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn',
              "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 
              'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 
              'wouldn', "wouldn't", 'fire', 'off']

# Extra stop words under evaluation; they are appended to the main list last.
test_stop_words = ['great', 'excellent', '1', '2', 'feature', 'nice', 'old']

# Hand-curated, review-specific stop words.
# NOTE(review): 'not' is listed here explicitly, so it is still filtered out
# even though it also appears in keep_words -- confirm which is intended.
# NOTE(review): multi-word entries such as 'great microwave' presumably never
# match a single whitespace token -- verify they have any effect.
review_stop_words = ['panasonic', 'really', 'husband', 'thanks', 'thank', 'ha', 
              'just', 'thing', 'did', 'nn', 'wa', 'yr', 'u', 'say', 'doe',
              'mom', 'christmas', 'gift', 'got', 'way', 'le', 'daughter',
              'e','not','love','good','bought','great microwave','micro',
              'great oven','microwave','product','work great','nice work',
              'work great use','work great love','feature work great',
              'unit work great','oven work great','easy use love','old oven',
              'old old','20 year old','unit', 'not', 'work', 'amazon', 'com',
              'old', 'wife', 'highly', 'recommend', 'like', 'charm', '20', 
              'easy', 'oven', 'use', 'year', 'lot', 'pleased', 'happy', 'hope',
              'review', 'buy', 'far', 'day']

# Add every scikit-learn English stop word that is not protected by
# keep_words, then the experimental ones.
review_stop_words.extend(
    word for word in text.ENGLISH_STOP_WORDS if word not in keep_words
)
review_stop_words.extend(test_stop_words)

# Shared NLTK helpers used by lemmatize_text.
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    
def lemmatize_text(text):
    """Split ``text`` on whitespace and return the lemma of each token, in order.

    Uses the module-level ``w_tokenizer`` and ``lemmatizer`` objects.
    """
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def vectorize_this(max_features, min_df, max_df, ngram_max):
    """Fit a TF-IDF vectorizer on ``df['model_ready']``.

    Tokens come from ``lemmatize_text`` and ``review_stop_words`` is used as
    the stop-word list. N-grams from 1 up to ``ngram_max`` words are built.

    Returns
    -------
    tuple
        ``(X, vectorizer)``: the matrix returned by ``fit_transform`` and the
        fitted ``TfidfVectorizer``.
    """
    tfidf = TfidfVectorizer(
        tokenizer=lemmatize_text,
        stop_words=review_stop_words,
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        ngram_range=(1, ngram_max),
    )
    doc_term_matrix = tfidf.fit_transform(df['model_ready'])
    return doc_term_matrix, tfidf

In [80]:
# Number of NMF components (topics); passed to make_nmf and the featurizers.
topics = 7
# min_df / max_df cut-offs handed to vectorize_this for the TF-IDF vectorizer.
mindf = 1
maxdf = .95
# Largest n-gram size; vectorize_this uses ngram_range=(1, grams).
grams = 2

In [81]:
X, vectorizer = vectorize_this(10000, mindf, maxdf, grams)

In [82]:
def make_nmf(n_components, alpha, X):
    """Fit an NMF model on ``X`` and return it with both factor matrices.

    Parameters
    ----------
    n_components : int
        Number of components (topics) to extract.
    alpha : float
        Regularisation strength forwarded to ``NMF`` as ``alpha``.
        NOTE(review): newer scikit-learn releases replaced this keyword with
        ``alpha_W`` / ``alpha_H`` -- confirm against the installed version.
    X : array-like or sparse matrix
        Non-negative matrix to factorise (here the TF-IDF matrix).

    Returns
    -------
    tuple
        ``(nmf, W, H)``: the fitted estimator, the result of
        ``fit_transform(X)`` and ``nmf.components_``.
    """
    nmf = NMF(
        n_components=n_components,
        init='nndsvd',
        random_state=12345,
        alpha=alpha,
    )

    # fit_transform fits the model and returns W in one pass. Calling
    # .fit(X) first would only repeat the same factorisation, since the
    # initialisation and random_state are fixed.
    W = nmf.fit_transform(X)
    H = nmf.components_
    return nmf, W, H

In [83]:
nmf, W, H, = make_nmf(topics, .1, X)

In [84]:
def topic_keywords(vectorizer=vectorizer, lda_model=nmf, n_words=10):
    """Return the ``n_words`` highest-weighted vocabulary terms of each topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Supplies the vocabulary. The default is the notebook-level
        ``vectorizer`` as it existed when this cell was executed.
    lda_model : fitted model exposing ``components_``
        The default is the notebook-level ``nmf`` as it existed when this
        cell was executed.
    n_words : int
        Number of terms to keep per topic.

    Returns
    -------
    list of numpy.ndarray
        One array of terms per row of ``lda_model.components_``, ordered from
        highest to lowest weight.
    """
    # Prefer get_feature_names_out (newer scikit-learn) and fall back to the
    # older get_feature_names when it is not available.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    # list(...) keeps a plain string array whichever accessor was used.
    keywords = np.array(list(get_names()))

    top_words_per_topic = []
    for topic_weights in lda_model.components_:
        # Negating the weights makes argsort return the largest first.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        top_words_per_topic.append(keywords.take(top_keyword_locs))
    return top_words_per_topic

In [85]:
# NOTE: this rebinds the name `topic_keywords` from the function to the list
# it returns. Later cells read that list; the function can only be called
# again after its defining cell has been re-run.
topic_keywords = topic_keywords()        

In [86]:
topic_keywords

[array(['time', 'used', 'popcorn', 'cooking', 'long', 'cooking time',
        'item', 'cook', 'long time', 'minute'], dtype='<U24'),
 array(['door', 'open', 'latch', 'close', 'button', 'push', 'door latch',
        'door open', 'hard', 'open door'], dtype='<U24'),
 array(['lasted', 'month', 'model', 'new', 'died', 'service', 'warranty',
        'repair', 'worked', 'purchased'], dtype='<U24'),
 array(['large', 'price', 'fit', 'big', 'size', 'space', 'powerful',
        'counter', 'need', 'kitchen'], dtype='<U24'),
 array(['sensor', 'food', 'cook', 'reheat', 'sensor reheat', 'defrost',
        'heat', 'button', 'evenly', 'function'], dtype='<U24'),
 array(['working', 'stopped working', 'stopped', 'quit working', 'quit',
        'month', 'big', 'month working', 'given', 'completely stopped'],
       dtype='<U24'),
 array(['power', 'level', 'watt', 'power level', 'setting', '1250',
        'inverter', 'popcorn', '1250 watt', 'power setting'], dtype='<U24')]

In [87]:
def topic_featuring(n_components=10, n_words=10):
    """Fit an NMF model on the notebook-level ``X`` and tabulate its topics.

    The table is built from the model fitted inside this call, so
    ``n_components`` and ``n_words`` both take effect. It does not reuse a
    previously computed keyword list.

    Parameters
    ----------
    n_components : int
        Number of topics to extract.
    n_words : int
        Number of top terms to list per topic.

    Returns
    -------
    pandas.DataFrame
        Rows ``'Word 0'`` .. ``'Word n'``, one column per topic; the columns
        are numbered ``0 .. n_components - 1`` and named ``'topic_theme'``.
    """
    nmf = make_nmf(n_components, .1, X)[0]

    # Vocabulary of the notebook-level vectorizer that produced X.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    vocab = np.array(list(get_names()))

    # Negating the weights makes argsort return the heaviest terms first.
    top_terms = [
        vocab.take((-weights).argsort()[:n_words])
        for weights in nmf.components_
    ]

    # Topic - Keywords Dataframe
    df_topic_keywords = pd.DataFrame(top_terms)
    df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]

    df_topic_keywords['topic_theme'] = range(n_components)
    df_topic_keywords.set_index('topic_theme', inplace=True)
    return df_topic_keywords.T

In [88]:
topic_featuring(topics)

topic_theme,0,1,2,3,4,5,6
Word 0,time,door,lasted,large,sensor,working,power
Word 1,used,open,month,price,food,stopped working,level
Word 2,popcorn,latch,model,fit,cook,stopped,watt
Word 3,cooking,close,new,big,reheat,quit working,power level
Word 4,long,button,died,size,sensor reheat,quit,setting
Word 5,cooking time,push,service,space,defrost,month,1250
Word 6,item,door latch,warranty,powerful,heat,big,inverter
Word 7,cook,door open,repair,counter,button,month working,popcorn
Word 8,long time,hard,worked,need,evenly,given,1250 watt
Word 9,minute,open door,purchased,kitchen,function,completely stopped,power setting


In [89]:
def nmf_featurizer(max_features=10000, min_df=4, max_df=.8, ngram_max = 2, n_components=10, n_words=10, column_names_known = 'n'):
    """Vectorize the reviews, fit NMF and return a topic/keyword table.

    Parameters
    ----------
    max_features, min_df, max_df, ngram_max
        Forwarded to ``vectorize_this``.
    n_components : int
        Number of NMF components; also the number of column labels generated
        when ``column_names_known == 'n'``.
    n_words : int
        Currently not used by the body.
    column_names_known : {'n', 'Y'}
        ``'n'`` numbers the topics ``0 .. n_components - 1``; ``'Y'`` labels
        them with the notebook-level ``topic_labels`` list.

    Returns
    -------
    pandas.DataFrame
        Rows ``'Word i'``, one column per topic, columns named
        ``'topic_theme'``.

    Raises
    ------
    ValueError
        If ``column_names_known`` is neither ``'n'`` nor ``'Y'``.
    """
    # Fail fast with a clear message; an unrecognised flag would otherwise
    # only surface after both fits as an unbound `Topics_theme`.
    if column_names_known not in ('n', 'Y'):
        raise ValueError(
            "column_names_known must be 'n' or 'Y', got {!r}".format(column_names_known)
        )

    X, vectorizer = vectorize_this(max_features, min_df, max_df, ngram_max)

    nmf, W, H = make_nmf(n_components, .1, X)

    # NOTE(review): the table below is built from the notebook-level
    # `topic_keywords` list, not from the `vectorizer` / `nmf` fitted just
    # above, so the vectorizer and NMF arguments do not change the output.
    # Deriving the keywords from the local fit would alter the results of
    # existing calls that rely on defaults, so it needs a coordinated change.
    # Topic - Keywords Dataframe
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]

    if column_names_known == 'n':
        Topics_theme = range(n_components)
    else:
        # Hand-assigned labels collected earlier with label_topics.
        Topics_theme = topic_labels
    df_topic_keywords['topic_theme'] = Topics_theme
    df_topic_keywords.set_index('topic_theme', inplace=True)
    return df_topic_keywords.T

In [90]:
nmf_featurizer(max_df=maxdf, ngram_max=grams, n_components=topics)

topic_theme,0,1,2,3,4,5,6
Word 0,time,door,lasted,large,sensor,working,power
Word 1,used,open,month,price,food,stopped working,level
Word 2,popcorn,latch,model,fit,cook,stopped,watt
Word 3,cooking,close,new,big,reheat,quit working,power level
Word 4,long,button,died,size,sensor reheat,quit,setting
Word 5,cooking time,push,service,space,defrost,month,1250
Word 6,item,door latch,warranty,powerful,heat,big,inverter
Word 7,cook,door open,repair,counter,button,month working,popcorn
Word 8,long time,hard,worked,need,evenly,given,1250 watt
Word 9,minute,open door,purchased,kitchen,function,completely stopped,power setting


In [91]:
vocabulary = np.array(vectorizer.get_feature_names())

In [92]:
def label_topics(H, vocabulary, n_words=12):
    '''
    Print the most influential words of each latent topic, and prompt the user
    to label each topic. The user should use their humanness to figure out what
    each latent topic is capturing.

    H is iterated row by row (one row of term weights per topic), vocabulary
    holds the term for each column of H, and n_words is how many top terms
    are shown per topic (default 12).

    Returns the list of labels typed by the user, in topic order.
    '''
    # Fancy indexing below needs an array; this also accepts a plain list.
    vocabulary = np.asarray(vocabulary)
    topic_labels = []
    for i, row in enumerate(H):
        # Indices of the n_words largest weights, heaviest first.
        top_word_locs = np.argsort(row)[::-1][:n_words]
        print('topic', i)
        print('-->', ' '.join(vocabulary[top_word_locs]))
        label = input('please label this topic: ')
        topic_labels.append(label)
        print()
    return topic_labels

In [93]:
nmf_featurizer(max_df=maxdf, ngram_max=grams, n_components=topics)

topic_theme,0,1,2,3,4,5,6
Word 0,time,door,lasted,large,sensor,working,power
Word 1,used,open,month,price,food,stopped working,level
Word 2,popcorn,latch,model,fit,cook,stopped,watt
Word 3,cooking,close,new,big,reheat,quit working,power level
Word 4,long,button,died,size,sensor reheat,quit,setting
Word 5,cooking time,push,service,space,defrost,month,1250
Word 6,item,door latch,warranty,powerful,heat,big,inverter
Word 7,cook,door open,repair,counter,button,month working,popcorn
Word 8,long time,hard,worked,need,evenly,given,1250 watt
Word 9,minute,open door,purchased,kitchen,function,completely stopped,power setting


In [94]:
topic_labels = label_topics(H,vocabulary)

topic 0
--> time used popcorn cooking long cooking time item cook long time minute second time used
please label this topic: cooking time

topic 1
--> door open latch close button push door latch door open hard open door problem slam
please label this topic: door

topic 2
--> lasted month model new died service warranty repair worked purchased 3 second
please label this topic: customer service and support

topic 3
--> large price fit big size space powerful counter need kitchen needed look
please label this topic: size

topic 4
--> sensor food cook reheat sensor reheat defrost heat button evenly function used sensor cook
please label this topic: programmed functions

topic 5
--> working stopped working stopped quit working quit month big month working given completely stopped working month color
please label this topic: quit working

topic 6
--> power level watt power level setting 1250 inverter popcorn 1250 watt power setting cooking button
please label this topic: power



In [96]:
nmf_featurizer(max_df=maxdf, ngram_max=grams, n_components=topics, column_names_known = 'Y')

topic_theme,cooking time,door,customer service and support,size,programmed functions,quit working,power
Word 0,time,door,lasted,large,sensor,working,power
Word 1,used,open,month,price,food,stopped working,level
Word 2,popcorn,latch,model,fit,cook,stopped,watt
Word 3,cooking,close,new,big,reheat,quit working,power level
Word 4,long,button,died,size,sensor reheat,quit,setting
Word 5,cooking time,push,service,space,defrost,month,1250
Word 6,item,door latch,warranty,powerful,heat,big,inverter
Word 7,cook,door open,repair,counter,button,month working,popcorn
Word 8,long time,hard,worked,need,evenly,given,1250 watt
Word 9,minute,open door,purchased,kitchen,function,completely stopped,power setting


In [97]:
def softmax(v, temperature=1.0):
    '''
    A heuristic to convert arbitrary values into probabilities.
    See: https://en.wikipedia.org/wiki/Softmax_function

    v is divided by temperature before exponentiation, so a smaller
    temperature gives a sharper distribution. The normalisation runs over
    all elements of v. An empty input returns an empty array.
    '''
    scaled = np.asarray(v, dtype=float) / temperature
    if scaled.size == 0:
        return scaled
    # Subtracting the maximum leaves the result mathematically unchanged but
    # keeps exp() from overflowing to inf (and the ratio from becoming NaN)
    # when the scaled values are large, e.g. with a small temperature.
    expv = np.exp(scaled - np.max(scaled))
    return expv / np.sum(expv)

In [98]:
def analyze_reviews(W, topic_labels):
    '''
    Build a table of per-review topic probabilities.

    Each row of W is passed through softmax (temperature 0.01) and the
    resulting values, rounded to 5 decimals, are collected per topic. The
    returned DataFrame has one row per row of W and one column per
    hand-assigned label in topic_labels.
    '''
    # One accumulator list per label, filled row by row.
    per_topic = [[] for _ in topic_labels]

    for row in W:
        probs = softmax(row, temperature=0.01)
        for bucket, prob in zip(per_topic, probs):
            bucket.append(round(prob, 5))

    return pd.DataFrame(dict(zip(topic_labels, per_topic)))

In [99]:
top_df = analyze_reviews(W, topic_labels)

In [100]:
top_df

Unnamed: 0,cooking time,door,customer service and support,size,programmed functions,quit working,power
0,0.00028,0.99970,0.00001,0.00000,0.00000,0.00000,0.00000
1,0.00185,0.00185,0.00185,0.56816,0.42259,0.00185,0.00185
2,0.00005,0.00005,0.00005,0.99970,0.00005,0.00005,0.00005
3,0.00764,0.00764,0.95185,0.00994,0.00764,0.00764,0.00764
4,0.00000,1.00000,0.00000,0.00000,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...
1843,0.31023,0.00115,0.00115,0.67305,0.01211,0.00115,0.00115
1844,0.00002,0.00002,0.00002,0.00082,0.99909,0.00002,0.00002
1845,0.00364,0.00081,0.00069,0.91987,0.07233,0.00069,0.00197
1846,0.99993,0.00001,0.00002,0.00001,0.00001,0.00001,0.00001


In [101]:
top_df.loc[0]

cooking time                    0.00028
door                            0.99970
customer service and support    0.00001
size                            0.00000
programmed functions            0.00000
quit working                    0.00000
power                           0.00000
Name: 0, dtype: float64

In [102]:
# Mean probability of each topic over all reviews, rounded to 5 decimals.
feature_score = {
    column: round(sum(top_df[column]) / len(top_df), 5)
    for column in top_df.columns
}

In [103]:
feature_score

{'cooking time': 0.10802,
 'door': 0.13614,
 'customer service and support': 0.20974,
 'size': 0.21452,
 'programmed functions': 0.16287,
 'quit working': 0.05717,
 'power': 0.11154}

In [108]:
# Topics ranked by mean score, highest first.
sorted(feature_score.items(), key=itemgetter(1), reverse=True)

[('size', 0.21452),
 ('customer service and support', 0.20974),
 ('programmed functions', 0.16287),
 ('door', 0.13614),
 ('power', 0.11154),
 ('cooking time', 0.10802),
 ('quit working', 0.05717)]

In [109]:
topic_words_df = nmf_featurizer(column_names_known ='Y')
topic_words_df

topic_theme,cooking time,door,customer service and support,size,programmed functions,quit working,power
Word 0,time,door,lasted,large,sensor,working,power
Word 1,used,open,month,price,food,stopped working,level
Word 2,popcorn,latch,model,fit,cook,stopped,watt
Word 3,cooking,close,new,big,reheat,quit working,power level
Word 4,long,button,died,size,sensor reheat,quit,setting
Word 5,cooking time,push,service,space,defrost,month,1250
Word 6,item,door latch,warranty,powerful,heat,big,inverter
Word 7,cook,door open,repair,counter,button,month working,popcorn
Word 8,long time,hard,worked,need,evenly,given,1250 watt
Word 9,minute,open door,purchased,kitchen,function,completely stopped,power setting


In [110]:
review_df_columns = (['review_body'] + topic_labels)

['review_body',
 'cooking time',
 'door',
 'customer service and support',
 'size',
 'programmed functions',
 'quit working',
 'power']

In [111]:
# top_df holds one row per row of df, in the same order, but carries a fresh
# 0..n-1 index, while df keeps its original labels after dropna(). Give top_df
# the index of df positionally so concat pairs each review with its own topic
# scores; reindex() would match by label and shift or blank out rows wherever
# df's labels have gaps.
review_df = pd.concat([df, top_df.set_index(df.index)], axis=1)[review_df_columns]

In [124]:
def review_checker(num_samples):
    """Print ``num_samples`` randomly chosen reviews with their topic scores.

    Reviews are drawn with replacement from the index labels of the
    notebook-level ``review_df``. For each one the review text and every
    column after the first are printed.
    """
    # Draw from labels that actually exist. random.randint(0, len(df)) is
    # inclusive of len(df) and ignores gaps in the index, so it can produce a
    # label that is not in review_df.
    candidate_labels = list(review_df.index)
    review_index = [random.choice(candidate_labels) for _ in range(num_samples)]

    for label in review_index:
        print('Review ' + str(label))
        print(' ')
        print(review_df.loc[label, 'review_body'])
        print(' ')
        # Everything after the first column (review_body): the topic scores.
        print(review_df.loc[label][1:])
        print(' ')
        print('___________________________________________________')
        print(' ')
    

In [125]:
review_checker(5)

Review 1186
 
Just the perfect microwave . We had our last Panosonic for 17 years and it finally stopped working. So decided to stay with Panosonic and we love this one to. Easy to set up, easy to use. Got the product quick and in perfect shape .
 
cooking time                     0.3006
door                            0.00043
customer service and support    0.00033
size                            0.00114
programmed functions            0.01044
quit working                    0.00047
power                            0.6866
Name: 1186, dtype: object
 
___________________________________________________
 
Review 1497
 
Well designed and easy to use product. fits just right in space above oven. this is the 2nd one and there have been improvements.
 
cooking time                    0.00052
door                            0.00052
customer service and support    0.92125
size                            0.07549
programmed functions            0.00118
quit working                    0.00052
pow