# **Amazon Nike Reviews**

For this project, we will explore and build topic models on Nike products and reviews on Amazon.

## Imports

In [None]:
import pickle
import json
from time import sleep

import os
try:
  import tmtoolkit
except:
  !pip install tmtoolkit
  os.kill(os.getpid(), 9)

import nltk
import random
import numpy as np
from tmtoolkit.corpus import Corpus
import json
import pickle
import scipy.sparse

random.seed(20191120)   # to make the sampling reproducible
np.set_printoptions(precision=5)

from tmtoolkit.preprocess import TMPreproc

try:
  from tmtoolkit.topicmod.tm_lda import compute_models_parallel
except: 
  !pip install tmtoolkit['lda']
  from tmtoolkit.topicmod.tm_lda import compute_models_parallel

import logging
import warnings

try:
  from lda import LDA
except: 
  !pip install lda

from tmtoolkit.topicmod.model_io import print_ldamodel_topic_words

from tmtoolkit.bow.bow_stats import doc_lengths
from tmtoolkit.topicmod.model_stats import generate_topic_labels_from_top_words

from tmtoolkit.topicmod.visualize import parameters_for_ldavis
try:
  import pyLDAvis
except:
  !pip install pyLDAvis==2.1.2
  import pyLDAvis

# Data Extraction

We'll get our products data.

In [None]:
!wget http://128.138.93.164/meta_Clothing_Shoes_and_Jewelry.json.gz -P /content/drive/MyDrive/MSDS_marketing_text_analytics/master_files/2_topic_modeling

--2023-04-19 19:44:30--  http://128.138.93.164/meta_Clothing_Shoes_and_Jewelry.json.gz
Connecting to 128.138.93.164:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 279748879 (267M) [application/octet-stream]
Saving to: ‘/content/drive/MyDrive/MSDS_marketing_text_analytics/master_files/2_topic_modeling/meta_Clothing_Shoes_and_Jewelry.json.gz’


2023-04-19 19:44:35 (49.9 MB/s) - ‘/content/drive/MyDrive/MSDS_marketing_text_analytics/master_files/2_topic_modeling/meta_Clothing_Shoes_and_Jewelry.json.gz’ saved [279748879/279748879]



In [None]:
!gzip -d /content/drive/MyDrive/MSDS_marketing_text_analytics/master_files/2_topic_modeling/meta_Clothing_Shoes_and_Jewelry.json.gz

In [None]:
# Build the path to the decompressed product-metadata file and open it for
# reading; the parsing cell that follows consumes this handle line by line.
working_directory = '/content/drive/MyDrive/MSDS_marketing_text_analytics/master_files/2_topic_modeling'
working_file = f'{working_directory}/meta_Clothing_Shoes_and_Jewelry.json'
loadedjson = open(working_file)

In [None]:
#First, let's iterate through the data and store it as a python dictionary
import ast

#let's set a counter to see how many products we have in the json
count = 0
#products keyed by ASIN; each value is that product's metadata dictionary
allproducts = {}

#each line of data here is a product and its metadata.
#`with` guarantees the handle opened in the previous cell is closed when the
#loop finishes (or fails part-way through).
with loadedjson:
    for count, aline in enumerate(loadedjson, start=1):
        if count % 100000 == 0:
            #we're only going to print our count every 100k, this way we don't
            #spam our output console
            print(count)

        #SECURITY: this cell originally called eval() on every line of a file
        #downloaded over plain HTTP, which would execute arbitrary code.
        #ast.literal_eval only accepts Python literals (dicts, lists, strings,
        #numbers, ...), which is all a metadata record is expected to contain.
        aproduct = ast.literal_eval(aline)

        #making a dictionary entry with the ASIN of the product as the key
        #and its metadata as nested dictionaries
        allproducts[aproduct['asin']] = aproduct

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000


In [None]:
#Let's create a dictionary of all the product subcategories
#and by doing so, also come up with a list of brands and the number of products
#they have listed in the amazon product catalog

#category label -> number of times it appears across all products
allcategories = {}

#use the real number of products for the progress fraction instead of a
#hard-coded total, so the cell stays correct if the dataset changes
total_products = len(allproducts)

for count, aproduct in enumerate(allproducts.values(), start=1):
    if count % 100000 == 0:
        #print the fraction of products processed so far
        print(count / total_products)

    #'categories' is a list of category lists; products without the key
    #simply contribute nothing
    for categories in aproduct.get('categories', ()):
        for acategory in categories:
            #count the occurrences of each category
            allcategories[acategory] = allcategories.get(acategory, 0) + 1


0.06651660520532346
0.13303321041064692
0.19954981561597038
0.26606642082129384
0.33258302602661727
0.39909963123194075
0.4656162364372642
0.5321328416425877
0.5986494468479111
0.6651660520532345
0.7316826572585581
0.7981992624638815
0.8647158676692049
0.9312324728745284
0.9977490780798518


In [None]:
# Look up how many times the exact category label 'Nike' was counted above.
allcategories['Nike']

8327

We need all data related to Nike.

In [None]:
#Now we need to go through our product dictionary and extract the
#matching ASINs for Nike

#First, create a set where we will store our ASINs
allnikeasins = set()

#real number of products, used only for the progress fraction
total_products = len(allproducts)

for count, theproduct in enumerate(allproducts.values(), start=1):
    if count % 100000 == 0:
        print(count / total_products)

    #any given product can be assigned multiple category lists; the product is
    #kept as soon as one category label contains 'nike' (case-insensitive
    #substring match). Products without a 'categories' entry are skipped
    #instead of raising KeyError, matching the category-counting cell.
    is_nike = any(
        'nike' in acategory.lower()
        for categories in theproduct.get('categories', ())
        for acategory in categories
    )
    if is_nike:
        allnikeasins.add(theproduct['asin'])

print(len(allnikeasins))

0.06651660520532346
0.13303321041064692
0.19954981561597038
0.26606642082129384
0.33258302602661727
0.39909963123194075
0.4656162364372642
0.5321328416425877
0.5986494468479111
0.6651660520532345
0.7316826572585581
0.7981992624638815
0.8647158676692049
0.9312324728745284
0.9977490780798518
8327


In [None]:
#Let's write the ASINs out to a file so we can extract product reviews.
#The file is a single comma-separated line; `with` closes (and flushes) it
#even if the write raises.
with open('%s/allasins.txt' % working_directory, 'w') as outputfile:
    outputfile.write(','.join(allnikeasins))

Now we'll get our review data.

In [None]:
!wget http://128.138.93.164/reviews_Clothing_Shoes_and_Jewelry.json.gz -P /content/drive/MyDrive/MSDS_marketing_text_analytics/master_files/2_topic_modeling

--2023-04-19 19:52:03--  http://128.138.93.164/reviews_Clothing_Shoes_and_Jewelry.json.gz
Connecting to 128.138.93.164:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 888065454 (847M) [application/octet-stream]
Saving to: ‘/content/drive/MyDrive/MSDS_marketing_text_analytics/master_files/2_topic_modeling/reviews_Clothing_Shoes_and_Jewelry.json.gz’


2023-04-19 19:52:18 (56.7 MB/s) - ‘/content/drive/MyDrive/MSDS_marketing_text_analytics/master_files/2_topic_modeling/reviews_Clothing_Shoes_and_Jewelry.json.gz’ saved [888065454/888065454]



In [None]:
!gzip -d /content/drive/MyDrive/MSDS_marketing_text_analytics/master_files/2_topic_modeling/reviews_Clothing_Shoes_and_Jewelry.json.gz

In [None]:
# Re-point working_file at the decompressed reviews file and open it for
# reading; the parsing cell that follows consumes this handle line by line.
working_file = f'{working_directory}/reviews_Clothing_Shoes_and_Jewelry.json'
loadedjson = open(working_file)

In [None]:
#Parsing the review data
#Let's load the review data into a dictionary keyed by 1-based line number.
import ast

count = 0
allreviews = {}

#`with` guarantees the handle opened in the previous cell is closed when the
#loop finishes (or fails part-way through).
with loadedjson:
    for count, aline in enumerate(loadedjson, start=1):
        if count % 100000 == 0:
            print(count)
        #SECURITY: this cell originally called eval() on every line of a file
        #downloaded over plain HTTP, which would execute arbitrary code.
        #ast.literal_eval only accepts Python literals, which is all a review
        #record is expected to contain.
        allreviews[count] = ast.literal_eval(aline)

print(len(allreviews))

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000
5400000
5500000
5600000
5700000
5748920


We need the reviews that are related to Nike products.

In [None]:
#Now, we need to go through all the reviews and pick out the reviews that
#correspond to the matching ASINs, that is reviews that are tied to Nike ASINs

#Read the comma-separated ASINs back in. They are stored in a set (not a list)
#because the next step tests membership once per review, and a set lookup is
#O(1) whereas a list lookup scans every ASIN. Line endings are stripped and
#empty tokens dropped so a trailing newline cannot corrupt the last ASIN.
with open('%s/allasins.txt' % working_directory, 'r') as asin_file:
    allnikeasins = {
        anasin
        for data in asin_file
        for anasin in data.strip().split(',')
        if anasin
    }

In [None]:
#Keep only the reviews whose ASIN is one of the Nike ASINs.

#set() gives O(1) membership tests; probing a list of thousands of ASINs once
#per review (millions of times) is what makes this step slow
nike_asin_lookup = set(allnikeasins)

#real number of reviews, used only for the progress fraction
total_reviews = len(allreviews)

nikereviews = {}
for count, thereview in enumerate(allreviews.values(), start=1):
    if count % 100000 == 0:
        print(count / total_reviews)
    theasin = thereview['asin']
    if theasin in nike_asin_lookup:
        #key is '<asin>.<reviewerID>'; a later review with the same pair
        #overwrites the earlier one
        thekey = '%s.%s' % (theasin, thereview['reviewerID'])
        nikereviews[thekey] = thereview

print(len(nikereviews))

#let's save our data as a JSON dictionary; `with` makes sure the file is
#flushed and closed before the next cell reads it back
with open('%s/allnikereviews.json' % working_directory, 'w') as outputfile:
    json.dump(nikereviews, outputfile)

0.017394571502125616
0.03478914300425123
0.05218371450637685
0.06957828600850247
0.08697285751062808
0.1043674290127537
0.12176200051487931
0.13915657201700493
0.15655114351913055
0.17394571502125616
0.19134028652338178
0.2087348580255074
0.226129429527633
0.24352400102975863
0.2609185725318843
0.27831314403400986
0.2957077155361355
0.3131022870382611
0.33049685854038674
0.34789143004251233
0.365286001544638
0.38268057304676356
0.4000751445488892
0.4174697160510148
0.43486428755314044
0.452258859055266
0.46965343055739167
0.48704800205951726
0.5044425735616429
0.5218371450637685
0.5392317165658941
0.5566262880680197
0.5740208595701454
0.591415431072271
0.6088100025743965
0.6262045740765222
0.6435991455786478
0.6609937170807735
0.678388288582899
0.6957828600850247
0.7131774315871503
0.730572003089276
0.7479665745914015
0.7653611460935271
0.7827557175956528
0.8001502890977784
0.8175448605999039
0.8349394321020296
0.8523340036041552
0.8697285751062809
0.8871231466084064
0.904517718110532


## Preprocessing

Now that we have our data, we can build our corpus of reviews.

In [None]:
# Load the saved Nike reviews (a dict keyed by '<asin>.<reviewerID>') back
# into memory; `with` closes the file as soon as parsing is done.
json_path = "%s/allnikereviews.json" % working_directory
with open(json_path, 'r') as review_file:
    json_file = json.load(review_file)

In [None]:
# Build the full corpus: one document per review, labelled with the review's
# position in json_file (as a string) and containing the review text.
corpus = Corpus()
for doc_index, review in enumerate(json_file.values()):
    corpus.add_doc(str(doc_index), review['reviewText'])

In [None]:
# Sanity check: report the length of the corpus built above.
print(len(corpus))

21570


Due to runtime problems, we will reduce our corpus to the first 7,500 reviews.

In [None]:
# Build a smaller corpus from the first reviews in json_file only, labelled
# the same way as the full corpus (position in json_file, as a string).
max_docs = 7500
corpus_short = Corpus()
for doc_index, review in enumerate(json_file.values()):
    if doc_index >= max_docs:
        break  # every remaining review is past the cap
    corpus_short.add_doc(str(doc_index), review['reviewText'])

We need to clean our data to remove unnecessary text from our reviews in order to build better models.

In [None]:
# Text-cleaning pipeline for the shortened corpus. The steps are called for
# their effect on `preproc` (return values are not captured), and later cells
# read the cleaned result from `preproc`. Keep the order as is: the POS filter
# relies on the tags produced by pos_tag().
# NOTE(review): exact semantics of each step and of the df_threshold values
# (presumably fractions of documents) should be confirmed in the tmtoolkit docs.
preproc = TMPreproc(corpus_short, language='en')
# tag tokens with their part of speech
preproc.pos_tag()
# reduce tokens to their lemma
preproc.lemmatize()
preproc.tokens_to_lowercase()
preproc.remove_special_chars_in_tokens()
# extra stopwords; 'nt' is presumably what is left of "n't" contractions — confirm
preproc.add_stopwords(['http', 'nt'])
# keep only tokens tagged 'N', 'V' or 'ADJ' (presumably nouns, verbs, adjectives)
preproc.filter_for_pos('N', 'V', 'ADJ')
# drop numbers and tokens shorter than 2 characters
preproc.clean_tokens(remove_numbers=True, remove_shorter_than=2)
# drop very common tokens (threshold 0.8) ...
preproc.remove_common_tokens(df_threshold=0.8)
# ... and very rare tokens (threshold 0.01)
preproc.remove_uncommon_tokens(df_threshold=0.01)

<TMPreproc [7500 documents / en]>

In [None]:
# Pull the model inputs out of the preprocessor.
# document labels, converted to a NumPy array
doc_labels = np.array(preproc.doc_labels)
# vocabulary, converted to a NumPy array
vocab = np.array(preproc.vocabulary)
# document-term matrix used to fit and inspect the topic model below
# NOTE(review): presumably a sparse matrix with rows=documents and
# columns=vocabulary terms — confirm against the tmtoolkit docs
dtm_pre = preproc.dtm

# Topic Modeling

Now that we have clean data, we are finally ready to build our model using the 'LDA' method. We'll start by creating a model that contains 25 clusters with an alpha of 1/25. We will tune these parameters if need be based on the results.

In [None]:
# Keep the model-fitting output quiet.
# NOTE: this ignores *all* Python warnings for the rest of the session,
# not only those raised by lda.
warnings.filterwarnings('ignore')

# Detach the 'lda' logger from its ancestors and give it a do-nothing handler
# so its log records are discarded.
logger = logging.getLogger('lda')
logger.propagate = False
logger.addHandler(logging.NullHandler())

In [None]:
# set data to use: a single document-term matrix, registered under 'dtm'
dtms = {
    'dtm': dtm_pre
}

# number of topics; alpha is derived from it (alpha = 1 / n_topics) so the two
# cannot drift apart when the topic count is tuned
n_topics = 25

# and fixed hyperparameters
lda_params = {
    'n_topics': n_topics,
    'eta': .5,
    'alpha': 1 / n_topics,
    'n_iter': 1000,
    'random_state': 20191122  # to make results reproducible
}

# fit one model per entry in dtms, all with the same fixed parameters
models = compute_models_parallel(dtms, constant_parameters=lda_params)

Let's take a look at the top 3 words in each topic.

In [None]:
# Take the result stored under the 'dtm' key, i.e. the model fitted on dtm_pre.
# NOTE(review): models['dtm'] is presumably a list of (parameters, model)
# pairs, so [0][1] would be the fitted model of the first (only) run — confirm
# against the tmtoolkit docs.
model = models['dtm'][0][1]
# Print the 3 highest-weighted vocabulary words for every topic.
print_ldamodel_topic_words(model.topic_word_, vocab, top_n=3)

topic_1
> #1. wear (0.096159)
> #2. buy (0.065114)
> #3. comfortable (0.033018)
topic_2
> #1. order (0.075691)
> #2. love (0.055801)
> #3. great (0.049908)
topic_3
> #1. work (0.040418)
> #2. get (0.037577)
> #3. like (0.035511)
topic_4
> #1. easy (0.096475)
> #2. use (0.060592)
> #3. read (0.047057)
topic_5
> #1. look (0.061375)
> #2. like (0.058373)
> #3. make (0.045168)
topic_6
> #1. great (0.084603)
> #2. good (0.072857)
> #3. feel (0.068095)
topic_7
> #1. get (0.045156)
> #2. buy (0.044828)
> #3. break (0.043842)
topic_8
> #1. wear (0.059311)
> #2. wide (0.058661)
> #3. fit (0.056061)
topic_9
> #1. keep (0.046178)
> #2. wear (0.043756)
> #3. great (0.040989)
topic_10
> #1. small (0.098785)
> #2. run (0.069672)
> #3. order (0.067411)
topic_11
> #1. small (0.058348)
> #2. would (0.043890)
> #3. fit (0.040103)
topic_12
> #1. run (0.070888)
> #2. comfortable (0.061349)
> #3. use (0.049178)
topic_13
> #1. back (0.057314)
> #2. send (0.049887)
> #3. order (0.048918)
topic_14
> #1. find 

Now we'll look at our 25 topics.

In [None]:
# Document lengths computed from the document-term matrix.
# The result gets its own name: assigning it to `doc_lengths` would overwrite
# the imported doc_lengths() function, so re-running this cell would fail with
# "object is not callable".
dtm_doc_lengths = doc_lengths(dtm_pre)

# Build a short label for each topic from its top words.
# NOTE(review): lambda_ presumably weights how top words are ranked —
# confirm against the tmtoolkit docs.
topic_labels = generate_topic_labels_from_top_words(
    model.topic_word_,
    model.doc_topic_,
    dtm_doc_lengths,
    vocab,
    lambda_=0.7
)
# last expression in the cell, so the notebook displays the labels
topic_labels

array(['1_wear_buy', '2_order_expect', '3_work_get', '4_easy_read',
       '5_cheap_look', '6_feel_great', '7_break_purchase',
       '8_wide_narrow', '9_keep_dry', '10_small_order', '11_small_big',
       '12_run_use', '13_send_back', '14_find_happy', '15_white_black',
       '16_great_love', '17_wear_love', '18_buy_love', '19_great_wear',
       '20_little_tight', '21_try_feel', '22_good_nice', '23_look_really',
       '24_last_long', '25_recommend_highly'], dtype='<U19')

Our 25 topics look pretty good. Let's visualize our clusters.

In [None]:
# Collect the inputs pyLDAvis needs from the fitted model: the topic-word and
# document-topic distributions, the document-term matrix and the vocabulary.
# The result is unpacked as keyword arguments into pyLDAvis.prepare() below.
ldavis_params = parameters_for_ldavis(model.topic_word_,
                                      model.doc_topic_,
                                      dtm_pre,
                                      vocab)

In [None]:
# IPython magic: render matplotlib output inline in the notebook.
%matplotlib inline
# Build the pyLDAvis visualisation data from the parameters prepared above.
vis = pyLDAvis.prepare(**ldavis_params)
# Switch pyLDAvis to notebook output.
# NOTE(review): local=True presumably serves the JS/CSS assets locally rather
# than from a CDN — confirm in the pyLDAvis docs.
pyLDAvis.enable_notebook(local=True)
# last expression in the cell, so the notebook renders the visualisation
pyLDAvis.display(vis)

  and should_run_async(code)


Our topic distribution looks really good with not a lot of overlap between topics. We are happy to leave the parameters as is with 25 clusters and an alpha of 1/25.

In summary, the majority of our topics are positive reviews of Nike products, including topic 6: 'feel great', topic 24: 'last long', and topic 25: 'highly recommend'. It looks like one of the common issues is sizing. For example, topic 8 is 'wide narrow' and topic 20 is 'little tight'. In order to make suggestions to Nike, we would need to explore those two topics further, as well as topic 13: 'send back'.