# Topic Modeling with NMF

In [3]:
# Dataframing and analysis
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
plt.style.use ('ggplot')
import seaborn as sns
%matplotlib inline
sns.set_style(style="whitegrid")

# Modeling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.feature_extraction import text

import nltk, string, contractions, random, statistics
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

from operator import itemgetter

Read in the dataset and prepare for modeling.

In [5]:
# Load the review dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/model_ready_data')

In [6]:
# Summarize column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1854 entries, 0 to 1853
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   customer_id        1854 non-null   int64  
 1   review_id          1854 non-null   object 
 2   product_id         1854 non-null   object 
 3   star_rating        1854 non-null   float64
 4   helpful_votes      1854 non-null   float64
 5   total_votes        1854 non-null   float64
 6   verified_purchase  1854 non-null   object 
 7   review_headline    1854 non-null   object 
 8   review_body        1854 non-null   object 
 9   review_date        1854 non-null   object 
 10  review_wordcount   1854 non-null   int64  
 11  clean_review       1854 non-null   object 
 12  model_ready        1854 non-null   object 
dtypes: float64(3), int64(2), object(8)
memory usage: 188.4+ KB


In [7]:
# Drop incomplete rows, then renumber the index 0..n-1. Later cells look reviews
# up by their row position in the W matrix (e.g. df.review_body[num]), which is
# only correct when the index labels have no gaps.
df = df.dropna().reset_index(drop=True)
# Parse the review dates so they can be used as real timestamps.
df['review_date'] = pd.to_datetime(df['review_date'])

# Model Building

### Vectorizer

First we will build our vectorizer for our model. We will be using TfidfVectorizer to vectorize our corpus. We are using TfidfVectorizer in place of CountVectorizer because we do not want to ignore rare words that may hold intrinsic value to our model. We will be calling our stop_words and lemmatization function built previously since we will be incorporating them into our vectoring function.

In [8]:
# Words from scikit-learn's built-in English stop list that we want to KEEP.
# NOTE(review): 'not' is listed here but also appears in the literal list below,
# so it is still treated as a stop word -- confirm which behavior is intended.
keep_words = ['not', 'fire', 'off']

# Scratch list for trying out additional stop words without editing the main list.
test_stop_words = []

# Corpus-specific stop words. Duplicates ('not', 'old') are harmless.
# NOTE(review): multi-word entries such as 'work great' are presumably never
# matched, since scikit-learn filters stop words token by token before it
# builds n-grams -- confirm against the TfidfVectorizer documentation.
review_stop_words = [
    'panasonic', 'really', 'husband', 'thanks', 'thank', 'ha',
    'just', 'thing', 'did', 'nn', 'wa', 'yr', 'u', 'say', 'doe',
    'mom', 'christmas', 'gift', 'got', 'way', 'le', 'daughter', "i'mwork",
    'e', 'not', 'love', 'good', 'bought', 'great microwave', 'micro', 'nicely',
    'great oven', 'microwave', 'product', 'work great', 'nice work', "i'm",
    'work great use', 'work great love', 'feature work great', 'genius',
    'unit work great', 'oven work great', 'easy use love', 'old oven',
    'old old', '20 year old', 'unit', 'not', 'work', 'amazon', 'com', 'cu',
    'old', 'wife', 'highly', 'recommend', 'like', 'charm', 'sharp', 'ft',
    'easy', 'oven', 'use', 'year', 'lot', 'pleased', 'happy', 'hope',
    'review', 'buy', 'far', 'day', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', '10', '20', 'great', 'excellent', 'feature', 'nice', 'old',
    'geniusunit', "don't", "doesn't", '34', '1250', 'br', "it's", 'does',
    'doesn', 'don', 'mwork', '2.2',
]

# Add every built-in English stop word except the ones we chose to keep.
review_stop_words.extend(
    word for word in text.ENGLISH_STOP_WORDS if word not in keep_words
)
# Finally add any experimental stop words.
review_stop_words.extend(test_stop_words)
    
# Created once at module level and reused for every review.
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()


def lemmatize_text(text):
    '''
    Split a review on whitespace and lemmatize each resulting token.

    text: the review as a single string. (Inside this function the name
          shadows the imported sklearn `text` module, which is not used here.)

    Returns a list of lemmatized tokens, in their original order.
    '''
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def vectorize_this(df, max_features, min_df, max_df, ngram_max):
    '''
    Fit a TF-IDF vectorizer on the 'model_ready' column of `df`.

    df:           dataframe holding the text to vectorize in 'model_ready'
    max_features: passed to TfidfVectorizer as max_features
    min_df:       passed to TfidfVectorizer as min_df
    max_df:       passed to TfidfVectorizer as max_df
    ngram_max:    upper bound of the n-gram range; the lower bound is fixed at 1

    Returns (X, vectorizer): the transformed corpus and the fitted vectorizer.
    Tokenization uses lemmatize_text and stop words come from review_stop_words.
    '''
    vectorizer_settings = {
        'tokenizer': lemmatize_text,
        'strip_accents': 'unicode',
        'stop_words': review_stop_words,
        'max_features': max_features,
        'min_df': min_df,
        'max_df': max_df,
        'ngram_range': (1, ngram_max),
    }
    vectorizer = TfidfVectorizer(**vectorizer_settings)
    X = vectorizer.fit_transform(df['model_ready'])
    return X, vectorizer

## NMF Model

Next we will build our Non-negative Matrix Factorization (NMF) model. NMF factorizes the TF-IDF matrix produced by our vectorizer into two non-negative matrices: W, which holds the topic weights for each review, and H, which holds the word weights for each topic. The function below fits the model on our vectorized corpus and returns the fitted model together with W and H.

In [12]:
def make_nmf(n_components, alpha, X):
    '''
    Fit an NMF topic model on a vectorized corpus.

    n_components: number of topics to extract
    alpha:        regularization strength, passed to NMF as `alpha`
    X:            the transformed corpus returned by 'vectorize_this'

    Returns (nmf, W, H): the fitted model, the matrix returned by
    fit_transform (one row per document), and nmf.components_ (one row
    per topic).

    NOTE(review): newer scikit-learn releases reportedly replace the `alpha`
    keyword with `alpha_W` / `alpha_H` -- confirm against the installed version.
    '''
    model_settings = dict(
        n_components=n_components,
        init='nndsvda',
        solver='mu',
        beta_loss='kullback-leibler',
        random_state=12345,  # fixed seed so repeated fits give the same topics
        alpha=alpha,
    )
    nmf = NMF(**model_settings)
    W = nmf.fit_transform(X)
    return nmf, W, nmf.components_

## Hyper-Parameters

Store your hyper-parameters in the below variables. 

In [16]:
# Maximum number of features (terms) the vectorizer keeps from the corpus
max_f = 3000
# min_df - the vectorizer ignores terms whose document frequency is strictly lower than this threshold
mindf = 2
# max_df - the vectorizer ignores terms whose document frequency is strictly higher than this threshold
maxdf = .95
# Largest n-gram size to extract (used as the upper bound of ngram_range)
grams = 2
# Total number of topics the model groups associated keywords into
topics = 5
# Regularization strength, passed to NMF as `alpha`
alpha_ = .1

### Build your Vectorizer and Model by calling your Hyper-Parameters

In [44]:
# Vectorize the corpus with the hyper-parameters chosen above.
X, vectorizer = vectorize_this(df, max_f, mindf, maxdf, grams)

In [45]:
# Fit the topic model; keep the fitted model plus its W and H matrices.
nmf, W, H = make_nmf(topics, alpha_, X)

## Topic Interpretation

Now that the model and vectorizer are built we can start gleaning some insights into our topics. The below functions will return a list of arrays with the top most associated key words for each topic, then put those lists into a dataframe for improved interpretability.

In [162]:
def topic_keywords(nmf_model, vectorizer, n_words=10):
    '''
    Return the highest-weighted keywords for every topic of a fitted model.

    nmf_model:  fitted model exposing `components_` (one row of term weights
                per topic)
    vectorizer: fitted vectorizer exposing `get_feature_names_out()` or the
                older `get_feature_names()`
    n_words:    number of keywords to return per topic

    Returns a list with one array per topic, each holding that topic's
    keywords in descending order of weight.
    '''
    # Newer scikit-learn releases expose get_feature_names_out(); older ones
    # only have get_feature_names(). Support both so the notebook runs on either.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    keywords = np.asarray(feature_names)

    top_words_per_topic = []
    for topic_weights in nmf_model.components_:
        # Negate so that an ascending argsort yields the largest weights first.
        top_keyword_locs = (-np.asarray(topic_weights)).argsort()[:n_words]
        top_words_per_topic.append(keywords.take(top_keyword_locs))
    return top_words_per_topic

In [163]:
def df_featurizer(topic_keywords, topic_labels=None):
    '''
    Arrange topic keywords in an easy-to-interpret dataframe.

    topic_keywords: list with one array of keywords per topic
    topic_labels:   optional sequence with one label per topic; when omitted
                    the topics are numbered 1..n_topics

    Returns a dataframe with one column per topic (columns named
    'topic_theme') and one row per keyword rank ('Word 0', 'Word 1', ...).

    Raises ValueError if the number of labels differs from the number of topics.
    '''
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = [
        'Word ' + str(i) for i in range(df_topic_keywords.shape[1])
    ]

    if topic_labels is None:
        # One default label per topic (row). The label count must come from
        # the number of topics, not from the length of the first keyword.
        topic_labels = range(1, df_topic_keywords.shape[0] + 1)

    df_topic_keywords.index = pd.Index(list(topic_labels), name='topic_theme')
    # Transpose so each topic becomes a column and each keyword rank a row.
    return df_topic_keywords.T

In [164]:
# NOTE(review): this rebinds the name `topic_keywords` from the function to its
# result, so re-running this cell without first re-running the cell that
# defines the function will fail (a list is not callable).
topic_keywords = topic_keywords(nmf, vectorizer)
# Display the keywords with default topic labels.
df_featurizer(topic_keywords)

topic_theme,1,2,3,4,5
Word 0,large,door,model,power,sensor
Word 1,big,open,new,time,food
Word 2,fit,close,lasted,popcorn,cook
Word 3,space,latch,died,setting,reheat
Word 4,size,month,service,level,defrost
Word 5,kitchen,problem,month,button,microwav
Word 6,larger,button,price,watt,heat
Word 7,need,broke,purchased,technology,sensor reheat
Word 8,powerful,hard,working,second,evenly
Word 9,counter,push,replace,minute,function


Review this dataframe and infer relationships for each of the columns as they represent each of our topics. Once a theme is affirmed for each grouping, call the function below to label your topics.

In [165]:
def label_topics(H, vocabulary, n_words=12):
    '''
    Print the most influential words of each latent topic and prompt the user
    to label each one.

    H:          matrix with one row of term weights per topic
    vocabulary: sequence of terms aligned with the columns of H
    n_words:    how many top words to show for each topic (default 12)

    Returns the list of labels entered by the user, in topic order.
    This function is interactive: it blocks on input() once per topic.
    '''
    vocabulary = np.asarray(vocabulary)  # allow plain lists as well as arrays
    topic_labels = []
    for i, row in enumerate(H):
        # Ascending argsort reversed gives term indices by descending weight.
        top_words = np.argsort(row)[::-1][:n_words]
        print('topic', i)
        print('-->', ' '.join(vocabulary[top_words]))
        label = input('please label this topic: ')
        topic_labels.append(label)
        print()
    return topic_labels

In [166]:
# Build the vocabulary array aligned with the columns of H. Newer scikit-learn
# releases expose get_feature_names_out(); older ones only get_feature_names().
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = np.asarray(vectorizer.get_feature_names_out())
else:
    vocabulary = np.asarray(vectorizer.get_feature_names())
# Interactive: prompts for one label per topic.
topic_labels = label_topics(H, vocabulary)

topic 0
--> large big fit space size kitchen larger need powerful counter cooking cook
please label this topic: size

topic 1
--> door open close latch month problem button broke hard push door latch start
please label this topic: door

topic 2
--> model new lasted died service month price purchased working replace replaced brand
please label this topic: lifetime

topic 3
--> power time popcorn setting level button watt technology second minute cook power level
please label this topic: power

topic 4
--> sensor food cook reheat defrost microwav heat sensor reheat evenly function month button
please label this topic: functions



Once your topics are labeled, you can call our df_featurizer function again, passing in topic_labels to retitle the columns of your keywords dataframe.

In [167]:
# Same keyword table as before, now with the user-supplied labels as column names.
df_featurizer(topic_keywords, topic_labels)

topic_theme,size,door,lifetime,power,functions
Word 0,large,door,model,power,sensor
Word 1,big,open,new,time,food
Word 2,fit,close,lasted,popcorn,cook
Word 3,space,latch,died,setting,reheat
Word 4,size,month,service,level,defrost
Word 5,kitchen,problem,month,button,microwav
Word 6,larger,button,price,watt,heat
Word 7,need,broke,purchased,technology,sensor reheat
Word 8,powerful,hard,working,second,evenly
Word 9,counter,push,replace,minute,function


## Analyzing Reviews

Now that we have our topics labeled and have a firm understanding for our product features, we can begin categorizing our reviews into each of the different topics. We will build and integrate a softmax function to turn an array into a probability, and then we will call this function as we iterate through our W matrix to assign a probability for our reviews under each category.

Call the functions below to return a dataframe with every review categorized into different probabilities for each topic.

In [180]:
def softmax(v, temperature=.01):
    '''
    Convert an array of scores into probabilities that sum to 1.

    v:           array-like of scores (here, one row of the W matrix)
    temperature: divisor applied to the scores before exponentiation; smaller
                 values concentrate more probability on the largest score

    Returns an array of the same shape as `v`. An empty input yields an
    empty array.
    See: https://en.wikipedia.org/wiki/Softmax_function
    '''
    scaled = np.asarray(v, dtype=float) / temperature
    if scaled.size == 0:
        return scaled
    # Shifting by the maximum does not change the result mathematically, but it
    # keeps np.exp from overflowing to inf (and the output from becoming NaN)
    # when scores are large relative to the temperature.
    expv = np.exp(scaled - np.max(scaled))
    return expv / np.sum(expv)

In [181]:
def analyze_reviews(W, topic_labels):
    '''
    Turn every row of W into per-topic probabilities.

    W:            matrix with one row of topic weights per review
    topic_labels: labels assigned in label_topics(), one per topic

    Each row is passed through softmax (temperature .01) and each resulting
    probability is rounded to 5 decimals. Returns a dataframe with one row
    per review and one column per topic label.
    '''
    # One accumulator list per topic; each becomes a dataframe column.
    per_topic = [[] for _ in range(len(topic_labels))]

    for doc_weights in W:
        doc_probs = softmax(doc_weights, temperature=.01)
        for bucket, prob in zip(per_topic, doc_probs):
            bucket.append(round(prob, 5))

    return pd.DataFrame(dict(zip(topic_labels, per_topic)))

In [182]:
# Per-review topic probabilities, one column per topic label.
topic_df = analyze_reviews(W, topic_labels)

In [183]:
# Display the per-review topic probabilities.
topic_df

Unnamed: 0,size,door,lifetime,power,functions
0,0.00000,0.99999,0.00000,0.00000,0.00000
1,0.99813,0.00047,0.00047,0.00047,0.00047
2,0.55778,0.04897,0.04897,0.04897,0.29530
3,0.01677,0.01677,0.93290,0.01677,0.01677
4,0.00000,0.99994,0.00000,0.00000,0.00005
...,...,...,...,...,...
1849,0.99965,0.00007,0.00007,0.00007,0.00015
1850,0.00106,0.00025,0.00025,0.00025,0.99817
1851,0.02519,0.00434,0.00434,0.88605,0.08007
1852,0.98665,0.00334,0.00334,0.00334,0.00334


Now that we have a dataframe with all reviews categorized we can simply sum each of our columns and divide the result by our total number of reviews to determine our product feature priorities, in other words, what is most important to our Amazon customers and the relative importance of each product feature when compared with the others.

In [184]:
def feature_score(topic_df):
    '''
    Score each topic (product feature) by its average probability.

    topic_df: dataframe of per-review topic probabilities, one column per topic

    Returns a dict mapping each column name to the column's sum divided by
    the number of rows, rounded to 4 decimals.
    '''
    n_reviews = len(topic_df)
    return {
        column: round(sum(topic_df[column]) / n_reviews, 4)
        for column in topic_df.columns
    }

In [186]:
# NOTE(review): this rebinds the name `feature_score` from the function to its
# result, so re-running this cell without first re-running the cell that
# defines the function will fail (a dict is not callable).
feature_score = feature_score(topic_df)
# Show the topics ordered from highest to lowest score.
sorted(feature_score.items(), key= lambda x: x[1], reverse=True)

[('size', 0.2577),
 ('lifetime', 0.2298),
 ('door', 0.1817),
 ('power', 0.1656),
 ('functions', 0.1652)]

Now that we have our product features prioritized we should go through our reviews and see how our model has performed.

You can check random reviews with the code below before we move forward. See if you can identify any recurring themes or spot errors in our categorization.

In [254]:
# Pick one review at random by row position. random.randint is inclusive at
# both ends, so randint(1, len(df)) could return an out-of-range position and
# could never return the first review; randrange(len(df)) covers 0..len(df)-1.
var_index = random.randrange(len(df))
print(var_index)
print(df.review_body.iloc[var_index])
# topic_df rows are in the same order as the rows of df the model was fit on.
# It is used here because review_df is only built in a later cell.
print(topic_df.iloc[var_index])

1554
Arrived quickly, plugged it in and dead as a door knob! Nothing showed up in the display and yes I plugged it into multiple outlets. My other microwave was dying so I purchased this one rather than wait for my other microwave to die so I would not be without a microwave. I have now ordered a Sharp and will keep my fingers crossed. Of course I now have no microwave and have to return this defective one. It is unfortunate but nothing today is made well and yet the costs are fairly high. I feel sorry for those that can't afford these continual breakages, let alone the initial cost for these products.
review_body    Arrived quickly, plugged it in and dead as a d...
size                                                      0.0068
door                                                     0.00065
lifetime                                                 0.99072
power                                                    0.00142
functions                                                0.00041


Run the cells below and call the function 'review_checker' to build a larger list of reviews with scores to browse.

In [237]:
review_df_columns = ['review_body'] + topic_labels
# topic_df has one row per row of df, in the same order, but carries a fresh
# 0..n-1 index. Give it df's index before concatenating: reindexing by label
# would misalign rows (and introduce NaNs) whenever df's index has gaps.
aligned_topic_df = topic_df.copy()
aligned_topic_df.index = df.index  # raises ValueError if the lengths differ
review_df = pd.concat([df, aligned_topic_df], axis=1)[review_df_columns]

In [255]:
def review_checker(review_df, n_samples):
    '''
    Print a random selection of reviews together with their topic scores.

    review_df: dataframe whose first column is 'review_body' and whose
               remaining columns are the per-topic probabilities
    n_samples: number of reviews to print (drawn with replacement)

    Prints to stdout and returns None. Nothing is printed for an empty
    dataframe.
    '''
    n_reviews = len(review_df)
    if n_reviews == 0:
        return

    # Draw row positions in 0..n_reviews-1. random.randint(1, n_reviews) is
    # inclusive at both ends, so it could select a non-existent row and could
    # never select the first one.
    positions = [random.randrange(n_reviews) for _ in range(n_samples)]

    for position in positions:
        # Positional access keeps this correct even if the index has gaps.
        review = review_df.iloc[position]
        print('Review ' + str(review_df.index[position]))
        print(' ')
        print(review['review_body'])
        print(' ')
        # Everything after the first column, i.e. the topic scores.
        print(review.iloc[1:])
        print(' ')
        print('___________________________________________________')
        print(' ')
    

In [256]:
# Print 50 randomly chosen reviews with their topic scores.
review_checker(review_df, 50)

Review 643
 
This microwave is fantastic ! The inside is huge . It has  power levels that works easy than the old one I had. I've only had this for a 2 months but so far Its is great.
 
size         0.00361
door         0.00361
lifetime     0.00361
power        0.04201
functions    0.94717
Name: 643, dtype: object
 
___________________________________________________
 
Review 360
 
Purchased this item and within 7 days it quit working.  I followed the manuals guidance on page 25 under the &#34;Oven will not start cooking.&#34; without success.  I'll return this to Amazon and thengo to Walmart and get a microwave that works.
 
size         0.00336
door         0.00336
lifetime      0.9012
power        0.08873
functions    0.00336
Name: 360, dtype: object
 
___________________________________________________
 
Review 1553
 
great product easy to use features large cavity to hold dinner plates  used the reheat feature which is great and eliminates a lot of guesswork when reheating a singl

As you can see, our model performs excellently on some reviews and greatly misses the mark on others. Feel free to use the same code on a larger dataset with more reviews; you will probably find better results.

## Additional Analysis

For further analysis into our topics' keywords and topic assignment you can play with the following commands to gain a deeper insight into our model's performance.

These were some tools I used to gain some quick insights and to perform spot checks during the modeling of this project.

In [231]:
# Given a topic index, show the 100 words most associated with that topic.
topic_index = 1
# Newer scikit-learn releases expose get_feature_names_out(); older ones only
# get_feature_names(). Support both.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Ascending argsort reversed gives term indices by descending weight.
np.asarray(feature_names).take(nmf.components_[topic_index].argsort()[::-1])[:100]

array(['door', 'open', 'close', 'latch', 'month', 'problem', 'button',
       'broke', 'hard', 'push', 'door latch', 'start', 'shut', 'worked',
       'fine', 'working', 'door open', 'stopped', 'time', 'open door',
       'opening', 'slam', 'week', 'off', 'light', 'broken', 'turn',
       'complaint', 'door close', 'repair', 'plastic', 'stopped working',
       'shop', 'worked fine', 'fix', 'stop', 'closing', 'push button',
       'junk', 'mechanism', 'stay', 'switch', 'closed', 'ago', 'warranty',
       'started', 'half', 'issue', 'slam door', 'piece', 'hand', 'think',
       'release', 'design', 'longer', 'loud', 'noise', 'lasted', 'cheap',
       'completely', 'break', 'star', 'button open', 'make', 'quality',
       'poor', 'later', 'buying', 'little', 'hard close', '2013',
       'piece junk', 'stick', 'purchased', 'second', 'machine', 'opened',
       'pull', 'open close', 'close door', 'feel', 'closing door',
       'door hard', 'read', 'march', '2014', 'failed', 'disappointed',

In [233]:
# Given a review index and a topic index, print the position of that review
# when all reviews are ordered by ASCENDING weight for the topic (0 = lowest
# weight, len(W) - 1 = highest weight).
review_idx = 368
topic_index = 2
# Use topic_index rather than a hard-coded column so changing it takes effect.
for rank, doc_position in enumerate(W[:, topic_index].argsort()):
    if doc_position == review_idx:
        print(rank)

799


In [261]:
# Prints the row positions of the n reviews with the highest weight for the given
# topic index, from highest to lowest. For further analysis you can comment the
# other lines back in to also print each review's text and scores.
topic_index = 2
n = 20
for num in W[:,topic_index].argsort()[-n:][::-1]:
    print(num)
#     print(df.review_body[num])
#     print(df.model_ready[num])
#     print(review_df.loc[(num)])

646
1763
37
657
1189
910
1837
1457
1689
1485
48
1205
611
1313
1251
1842
743
479
1215
425
