
# Special Install of Packages with clearer checks and updates
print('[-] Importing packages...')

import os
import sys

# Installing or checking for tmtoolkit and dependencies
# If the import fails, tmtoolkit is installed with its optional extras via pip.
# NOTE(review): the except branch does not retry `import tmtoolkit`, so the name
# stays unbound for the rest of this run after a fresh install.
try:
    import tmtoolkit
except ImportError:
    print('Installing tmtoolkit...')
    !pip install --quiet -U "tmtoolkit[recommended,lda,sklearn,wordclouds,textproc_extra,topic_modeling_eval_extra]"
    print('tmtoolkit installed.')

# Installing or checking for matplotlib version
# Any installed version other than 3.1.3 is uninstalled and replaced by 3.1.3.
# NOTE(review): matplotlib is already imported when pip swaps the package on disk;
# presumably a runtime restart is needed before 3.1.3 is the version in use — confirm.
# NOTE(review): the pip output captured later in this notebook reports that
# tmtoolkit 0.12.0 requires matplotlib>=3.5.0, so this pin conflicts with it.
import matplotlib
if matplotlib.__version__ != "3.1.3":
    print('Installing specific matplotlib version (3.1.3)...')
    !pip uninstall --quiet -y matplotlib
    !pip install --quiet matplotlib==3.1.3
    print('matplotlib (3.1.3) installed.')

# Installing lda library
# NOTE(review): as above, `LDA` is not imported again after the install.
try:
    from lda import LDA
except ImportError:
    print('Installing lda package...')
    !pip install --quiet lda
    print('lda installed.')

# Report the machine's memory and classify the runtime by size
print("\n[INFO] Checking system resources...")

# Total physical memory, converted from bytes to decimal gigabytes
from psutil import virtual_memory
total_ram_bytes = virtual_memory().total
ram_gb = total_ram_bytes / 1e9
print(f"  [.] Your runtime has {ram_gb:.1f} GB of available RAM")

# Below 16 GB warn, from 32 GB upwards confirm; in between nothing is printed
is_low_ram = ram_gb < 16
is_high_ram = ram_gb >= 32
if is_low_ram:
    print("  [.] Warning: This runtime may not be suitable for large datasets.")
elif is_high_ram:
    print("  [.] High-RAM runtime detected. Optimal for larger datasets.")

# Check GPU status (optional, can be removed)
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if 'failed' in gpu_info.lower():
    print("  [.] No GPU available.")
else:
    print(f"  [.] GPU found: {gpu_info.splitlines()[1]}")



# Section 1: Data Loading
import pandas as pd

def load_data(filepath):
    '''
    Read a JSON dataset into a Pandas DataFrame.

    filepath is handed to pandas.read_json unchanged. On success a one-line
    summary with the row and column counts is printed and the DataFrame is
    returned. If reading fails for any reason the error is printed and None
    is returned instead of raising. Later steps of this script expect a
    column named "reviews".
    '''
    try:
        frame = pd.read_json(filepath)
    except Exception as err:
        print(f"Error loading data: {err}")
        return None

    n_rows, n_cols = frame.shape
    print(f"Data loaded successfully with {n_rows} rows and {n_cols} columns.")
    return frame

# Example usage
# NOTE(review): 'path_to_your_data.json' is a placeholder. load_data returns None
# when the read fails, in which case the later df[...] accesses will raise.
data_path = 'path_to_your_data.json'
df = load_data(data_path)



# Section 2: Text Preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages used below: the stop-word lists and WordNet
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Shared by preprocess_text: English stop words as a set, and one lemmatizer instance
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    '''
    Cleans and tokenizes input text.

    Steps: lowercase, drop every character that is not a-z or whitespace,
    split on whitespace, remove tokens found in the module-level stop_words
    set, lemmatize the rest with the module-level lemmatizer, and join the
    result with single spaces.

    Returns the cleaned string. Non-string input (for example None or NaN
    from a missing value in a DataFrame column) returns '' instead of raising.
    '''
    # Missing values arrive as None/float NaN when applied over a pandas column
    if not isinstance(text, str):
        return ''
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())  # Remove non-alphabetical characters
    kept_tokens = [token for token in letters_only.split() if token not in stop_words]
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)

# Apply preprocessing to the review column
# Adds a 'cleaned_text' column holding preprocess_text(value) for every value in df['reviews'].
df['cleaned_text'] = df['reviews'].apply(preprocess_text)
print("Text preprocessing completed.")



# Section 3: LDA Topic Modeling
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split

# Vectorize the cleaned text
# Bag-of-words counts. max_df=0.9 drops terms appearing in more than 90% of the
# documents, min_df=5 drops terms appearing in fewer than 5 documents, and
# stop_words='english' applies scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(max_df=0.9, min_df=5, stop_words='english')
X = vectorizer.fit_transform(df['cleaned_text'])

# Split data for train and evaluation
# 80% of the rows for training, 20% held out; random_state fixes the shuffle.
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# Train the LDA model
def train_lda_model(X, n_topics=10, max_iter=10, random_state=42):
    '''
    Fit a scikit-learn LatentDirichletAllocation model on X and return it.

    X            -- document-term matrix to fit on
    n_topics     -- passed to the estimator as n_components
    max_iter     -- passed to the estimator as max_iter
    random_state -- seed passed to the estimator
    '''
    estimator = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=max_iter,
        random_state=random_state,
    )
    # fit() returns the fitted estimator itself
    return estimator.fit(X)

# Fit a 10-topic model on the training split; max_iter and random_state keep
# train_lda_model's defaults.
lda_model = train_lda_model(X_train, n_topics=10)
print("LDA model training completed.")

# Step 4: Evaluating the Model
def evaluate_lda_model(lda_model, X_test):
    '''
    Print the model's log-likelihood and perplexity on X_test.

    lda_model -- fitted model exposing score(X) and perplexity(X)
    X_test    -- data to evaluate on
    Nothing is returned; both numbers are written to stdout on one line.
    '''
    held_out_score = lda_model.score(X_test)
    held_out_perplexity = lda_model.perplexity(X_test)
    report = f"Log-Likelihood: {held_out_score}, Perplexity: {held_out_perplexity}"
    print(report)

# Print log-likelihood and perplexity of the fitted model on the held-out split
evaluate_lda_model(lda_model, X_test)



# Section 4: Visualizing Topics
import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud

def display_topics(model, feature_names, n_top_words=10):
    '''
    Show each topic's highest-weighted words as a printed list and as a word cloud.

    model         -- fitted topic model; model.components_ is iterated, one row per topic
    feature_names -- sequence mapping a column index to its vocabulary term
    n_top_words   -- how many of the highest-weighted terms to show per topic
    '''
    for topic_idx, weights in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        # argsort is ascending; the reversed slice keeps the n_top_words largest, biggest first
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        print("Top words: ", [feature_names[idx] for idx in top_indices])

        # Word cloud sized by the same weights
        cloud = WordCloud()
        plt.figure()
        term_weights = {feature_names[idx]: weights[idx] for idx in top_indices}
        plt.imshow(cloud.fit_words(term_weights))
        plt.axis("off")
        plt.title(f"Topic {topic_idx} WordCloud")
        plt.show()

# Get feature names from vectorizer
# display_topics indexes feature_names by column position of model.components_, so the
# names must come from the same vectorizer that produced the training matrix.
feature_names = vectorizer.get_feature_names_out()
display_topics(lda_model, feature_names)


In [None]:
# Special Install of Packages
print('[-] Importing packages...')
#special_install_tmtoolkit
import os
try:
  import tmtoolkit
except:
  print('starting patch of tmtoolkit.')
  !pip install --quiet -U "tmtoolkit[recommended,lda,sklearn,wordclouds,textproc_extra,topic_modeling_eval_extra]"
  print('finished patch of tmtoolkit.')
  os.kill(os.getpid(), 9)

#special_install_matplotlib
import os
import matplotlib
if matplotlib.__version__ != "3.1.3":
    print('starting patch of matplotlib.')
    !pip uninstall --quiet -y matplotlib
    !pip install --quiet matplotlib==3.1.3
    print('finished patch of matplotlib.')
    os.kill(os.getpid(), 9)

#special_install_lda
import os
try:
  from tmtoolkit.topicmod.tm_lda import compute_models_parallel
except:
  !pip install --quiet tmtoolkit['lda']
  from tmtoolkit.topicmod.tm_lda import compute_models_parallel

try:
  from lda import LDA
except:
  !pip install --quiet lda
  from lda import LDA

#special_install_pyLDAvis
try:
  import pyLDAvis
except:
  !pip install --quiet pyLDAvis==2.1.2
  import pyLDAvis


[-] Importing packages...
starting patch of matplotlib.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for matplotlib (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
arviz 0.19.0 requires matplotlib>=3.5, but you have matplotlib 3.1.3 which is incompatible.
bigframes 1.21.0 requires matplotlib>=3.7.1, but you have matplotlib 3.1.3 which is incompatible.
plotnine 0.13.6 requires matplotlib>=3.7.0, but you have matplotlib 3.1.3 which is incompatible.
seaborn 0.13.2 requires matplotlib!=3.6.1,>=3.4, but you have matplotlib 3.1.3 which is incompatible.
tmtoolkit 0.12.0 requires matplotlib<4.0,>=3.5.0, but you have matplotlib 3.1.3 which is incompatible.[0m[31m
[0m

In [None]:
print('[-] Importing packages...')
# File Connection and File Manipulation
import os
import pickle
import json
import glob
# Import Usability Functions
import logging
import warnings
# Basic Data Science Toolkits
import pandas as pd
import numpy as np
import math
import random
import time
from time import sleep
# Basic Data Vizualization
import seaborn as sns
import matplotlib.pyplot as plt
# Text Preprocessing (tmtoolkit)
import tmtoolkit
from tmtoolkit.corpus import Corpus, lemmatize, to_lowercase, remove_chars, filter_clean_tokens
from tmtoolkit.corpus import filter_for_pos, remove_common_tokens, remove_uncommon_tokens
from tmtoolkit.corpus import corpus_num_tokens, corpus_tokens_flattened
from tmtoolkit.corpus import doc_tokens, tokens_table, doc_labels, dtm
from tmtoolkit.corpus import vocabulary, vocabulary_size, vocabulary_counts
from tmtoolkit.topicmod.model_io import print_ldamodel_topic_words
from tmtoolkit.topicmod.tm_lda import compute_models_parallel
from tmtoolkit.corpus.visualize import plot_doc_lengths_hist, plot_doc_frequencies_hist, plot_ranked_vocab_counts
#https://tmtoolkit.readthedocs.io/en/latest/preprocessing.html
# Text Preprocessing(other)
from string import punctuation
import nltk
import scipy.sparse
# Topic Modeling
from lda import LDA
import pyLDAvis
from tmtoolkit.topicmod import tm_lda
from tmtoolkit.topicmod.tm_lda import compute_models_parallel
from tmtoolkit.topicmod.model_io import print_ldamodel_topic_words
from tmtoolkit.topicmod.model_io import save_ldamodel_to_pickle
from tmtoolkit.topicmod.model_io import load_ldamodel_from_pickle
from tmtoolkit.topicmod.model_io import ldamodel_top_doc_topics
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results
from tmtoolkit.topicmod.visualize import parameters_for_ldavis
from tmtoolkit.topicmod.visualize import generate_wordclouds_for_topic_words
from tmtoolkit.topicmod.model_stats import generate_topic_labels_from_top_words
from tmtoolkit.bow.bow_stats import doc_lengths
# Sentiment Modeling
from textblob import TextBlob
# normalize
from sklearn.preprocessing import MinMaxScaler

## Set Global Variables

In [None]:
# Seeds Python's `random` module only; NumPy's random generator is not seeded here.
random.seed(20191120)   # to make the sampling reproducible
# Print NumPy arrays with 5 digits of floating-point precision.
np.set_printoptions(precision=5)

## Verify GPU Runtime

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
from psutil import virtual_memory

# Total physical memory in decimal gigabytes
ram_gb = virtual_memory().total / 1e9
print(f'  [.] Your runtime has {ram_gb:.1f} gigabytes of available RAM')

# 20 GB is the cut-off between a standard and a high-RAM runtime
ram_message = '  [.] Not using a high-RAM runtime' if ram_gb < 20 else '  [.] You are using a high-RAM runtime!'
print(ram_message)

## Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the project directories configured
# in the next section live underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

## Setup Directories

In [None]:
# Project root on the mounted drive and the three working directories beneath it
ROOT_DIR = "/content/drive/MyDrive/MSDS_marketing_text_analytics/master_files/2_topic_modeling"
DATA_DIR = f"{ROOT_DIR}/data"
EVAL_DIR = f"{ROOT_DIR}/evaluation"
MODEL_DIR = f"{ROOT_DIR}/models"

# Create whichever working directories are missing, announcing each one that is made
for dir_label, dir_path in (("data", DATA_DIR), ("evaluation", EVAL_DIR), ("model", MODEL_DIR)):
  if os.path.exists(dir_path):
    continue
  os.makedirs(dir_path)
  print(f"The {dir_label} directory is created!")

# 2.&nbsp;Data Source

Import the product and review data and focus the datasets on the Nike brand. This is accomplished by identifying, in the product dataset, the ASINs of products related to the Nike brand. The reviews are then filtered with that list of ASINs, resulting in a subset of review data associated with the Nike brand.

## Copy Data From Source

In [None]:
#!wget <URL> -P <COLAB PATH>
#Download the gzipped product metadata into DATA_DIR.
#source_url = 'http://128.138.93.164/meta_Clothing_Shoes_and_Jewelry.json.gz' # true source, need better link
source_url = 'https://docs.google.com/uc?export=download&id=12cPbdNpQ6Dmqg25Fb0kAxFSEug-8t3gc&confirm=t' # local source, working for testing
#NOTE(review): the destination is named .jsonl.gz while the commented-out source is .json.gz — confirm the intended name
dest_path = '%s/meta_Clothing_Shoes_and_Jewelry.jsonl.gz' % DATA_DIR
#IPython substitutes $source_url and $dest_path with the Python variables above
!wget "$source_url" -O "$dest_path"


In [None]:
#!wget <URL> -P <COLAB PATH>
#Download the gzipped review data into DATA_DIR.
#source_url = 'http://128.138.93.164/reviews_Clothing_Shoes_and_Jewelry.json.gz' # true source, need better link
source_url = "https://docs.google.com/uc?export=download&id=12detwlesuD7S-O8i9w4LOii1DWML0i7Q&confirm=t" # local source, working for testing
dest_path = '%s/reviews_Clothing_Shoes_and_Jewelry.json.gz' % DATA_DIR
#file_name is assigned but not referenced by the wget call below
file_name = 'reviews_Clothing_Shoes_and_Jewelry.json.gz'
print(dest_path)
#IPython substitutes $source_url and $dest_path with the Python variables above
!wget "$source_url" -O "$dest_path"

In [None]:
#paths of the two downloaded archives
meta_file_path = '%s/meta_Clothing_Shoes_and_Jewelry.jsonl.gz' % DATA_DIR
review_file_path = '%s/reviews_Clothing_Shoes_and_Jewelry.json.gz' % DATA_DIR

#decompress both archives in place; the later cells open the same paths without the .gz suffix
!gzip -d "$meta_file_path"
!gzip -d "$review_file_path"

## Load the Product Data

In [None]:
##this assigns the filename we're trying to load to a string variable
meta_file_path = '%s/meta_Clothing_Shoes_and_Jewelry.jsonl' % DATA_DIR
#open the decompressed metadata file for reading; the handle is consumed line by line
#by the product loading loop and is not closed in this cell
loadedjson = open(meta_file_path, 'r')

In [None]:
#The data used in this script comes from: http://jmcauley.ucsd.edu/data/amazon/links.html
#It is the 'per category' data for Clothing, Shoes and Jewelry; use the URL above
#to better understand the data, where it came from, and some tips on how to use it.

#Getting reviews is a two-step process:
#1) go through the Amazon product catalog for "Clothing, Shoes and Jewelry"
#   and extract the matching products by their ASIN
#2) go through the review data and pick out the matching reviews by ASIN

#1) - Extracting ASINs by brand
#First, iterate through the product file and store every record in a Python dictionary

#count tracks how many lines (products) have been read; start_time is used for runtime reporting
count = 0
start_time = time.time()
#the dictionary has to exist before entries can be added to it
allproducts = {}

#each line of the file is one product and its metadata
print('loading product data to dictionary:')
for aline in loadedjson:
    #progress counter for the pass over the catalog
    count += 1
    if count % 100000 == 0:
        #only report every 100k lines, so the output console is not flooded
        current_runtime = round(time.time() - start_time,3)
        print('[-] current progress:', count, 'and a runtime of', current_runtime, 'seconds.')
    #This data is not strict JSON: each line is a Python dictionary that has been
    #printed as text, so eval() is used to turn the line back into a dict.
    #https://www.programiz.com/python-programming/methods/built-in/eval
    #eval() interprets whatever string it is given as Python code and runs it.
    #NOTE(review): this file is downloaded from the network, and eval() will execute
    #any code a line contains. Consider ast.literal_eval(), which accepts only
    #literals — confirm the data parses with it before switching.

    aproduct = eval(aline)

    #store the product under its ASIN with its metadata as a nested dictionary;
    #a later line with the same ASIN overwrites the earlier entry
    allproducts[aproduct['asin']] = aproduct

In [None]:
#print a summary of the records processed
#count is the number of lines read; allproducts_length is the number of distinct ASINs kept
allproducts_length = len(allproducts)
#elapsed time is measured from the start_time set when the loading loop began
current_runtime = round(time.time() - start_time,3)
print('Process completed for', count, 'of', allproducts_length, 'records with a final runtime of', current_runtime, 'seconds.')

In [None]:
#preview the product record for one hard-coded ASIN
#(raises KeyError if that ASIN is not in the catalog)
allproducts['B00KUSKHDC']

In [None]:
#save the product dictionary to disk as a pickle checkpoint
allproducts_file_path = '%s/allproducts.p' % DATA_DIR
#the with-block closes (and flushes) the file even if pickling fails
with open(allproducts_file_path, 'wb') as products_file:
  pickle.dump(allproducts, products_file)

## Summarize the Product Categories

In [None]:
#Explore the product data to see which categories are common. Categories are
#loose labels: some are true product categories (e.g., baby, house and home)
#and some are brands. The catalog is already limited to Clothing, Shoes and
#Jewelry; the 'categories' metadata field is used to find the brand of interest.

#Build a dictionary mapping every category label to the number of times it
#occurs across the catalog. Because brands appear as categories, this also
#gives the number of products listed per brand.

allcategories = {}
count = 0
start_time = time.time()

#every value in allproducts is one product and its metadata
print('loading categories data to dictionary:')
for aproduct in allproducts.values():
    #progress counter for the pass over the catalog
    count += 1
    if count % 100000 == 0:
        #the catalog size is known by now, so progress is reported as a percentage
        current_progress = int(round(count/allproducts_length,2)*100)
        current_runtime = round(time.time() - start_time,3)
        print('[-] current progress:', current_progress, '%', 'and a runtime of', current_runtime, 'seconds.')

    #a product may have no 'categories' field at all; when present it is a
    #list of lists of labels, and every label occurrence is tallied
    for categories in aproduct.get('categories', ()):
        for acategory in categories:
            allcategories[acategory] = allcategories.get(acategory, 0) + 1

In [None]:
#print a summary of the categories processed
#allcategories_length is the number of distinct category labels found
allcategories_length = len(allcategories)
#elapsed time is measured from the start_time set when the counting loop began
current_runtime = round(time.time() - start_time,3)
print('Process completed for', allcategories_length, 'categories with a final runtime of', current_runtime, 'seconds.')

In [None]:
#create a list of (count, category) tuples sorted from most to least frequent;
#ties on count are ordered by category name, descending
sortedlist = sorted(((allcategories[acategory], acategory) for acategory in allcategories), reverse=True)
#print the top x records in the list
top_n = 20
#slicing keeps this safe when there are fewer than top_n categories
for item, entry in enumerate(sortedlist[:top_n]):
  print('[',str(item).zfill(2),']', entry)

In [None]:
#number of times the exact category label 'Nike' was counted
#(raises KeyError if that label never occurs)
nike_categories = allcategories['Nike']
print(nike_categories, 'product records for Nike.')

## Extract a List of Product Ids

In [None]:
#Pick a brand that has plenty of product entries, e.g. by inspecting
#allcategories or typing allcategories['Brand name'] to get its count:
#>>allcategories['Nike']
#>> 8327
#>>allcategories['adidas']
#>> 8645

#At least 1.5k products is recommended, but smaller counts can work;
#what matters is ending up with at least 2k reviews.


##Go through the product dictionary and extract the ASINs that match Nike

##A set is used to store the ASINs because duplicates are not wanted
allnikeasins = set()
count = 0
start_time = time.time()

for theproduct in allproducts.values():
    count += 1
    if count % 100000 == 0:
        current_progress = int(round(count/allproducts_length,2)*100)
        current_runtime = round(time.time() - start_time,3)
        print('[-] current progress:', current_progress, '%', 'and a runtime of', current_runtime, 'seconds.')

    #a product can be assigned several category lists, and some products have
    #no 'categories' field at all — those are skipped instead of raising KeyError
    for categories in theproduct.get('categories', []):
        #each entry is itself a list of labels, so iterate one level deeper
        for acategory in categories:
            #case-insensitive substring match, so capitalization cannot
            #get in the way of a match
            if 'nike' in acategory.lower():
                allnikeasins.add(theproduct['asin'])

In [None]:
#print a summary of the Nike ASINs collected
allnikeasins_length = len(allnikeasins)
#elapsed time is measured from the start_time set when the extraction loop began
current_runtime = round(time.time() - start_time,3)
print('Process completed for', allnikeasins_length, 'records with a final runtime of', current_runtime, 'seconds.')

In [None]:
# write the ASINs out to a file as a checkpoint: one line, comma-separated.
# The with-block guarantees the file is closed even if the write fails.
with open('%s/allasins.txt' % DATA_DIR, 'w') as outputfile:
  outputfile.write(','.join(allnikeasins))

## Load the Review Data

In [None]:
#this assigns the filename we're trying to load to a string variable
review_file_path = '%s/reviews_Clothing_Shoes_and_Jewelry.json' % DATA_DIR
#open the decompressed review file for reading; the handle is consumed line by line
#by the review loading loop and is not closed in this cell
loadedjson = open(review_file_path, 'r')

In [None]:
#2) - Parsing the review data
#First, iterate through the review file and store every record in a Python dictionary

#count tracks how many lines (reviews) have been read; start_time is used for runtime reporting
count = 0
start_time = time.time()
#the dictionary has to exist before entries can be added to it
allreviews = {}

#each line of the file is one review and its metadata
print('loading review data to dictionary:')
for aline in loadedjson:
    #progress counter for the pass over the review file
    count += 1
    if count % 500000 == 0:
        #only report every 500k lines, so the output console is not flooded
        current_runtime = round(time.time() - start_time,3)
        print('[-] current progress:', count, 'and a runtime of', current_runtime, 'seconds.')
    #This data is not strict JSON: each line is a Python dictionary that has been
    #printed as text, so eval() is used to turn the line back into a dict.
    #https://www.programiz.com/python-programming/methods/built-in/eval
    #eval() interprets whatever string it is given as Python code and runs it.
    #NOTE(review): this file is downloaded from the network, and eval() will execute
    #any code a line contains. Consider ast.literal_eval(), which accepts only
    #literals — confirm the data parses with it before switching.

    areview = eval(aline)

    #store the review under the running line count (1-based) with its
    #metadata as a nested dictionary
    allreviews[count] = areview
print('completed load of review data to dictionary.')

In [None]:
#print a summary of the records processed
#count is the number of lines read; allreviews_length is the number of reviews stored
allreviews_length = len(allreviews)
#elapsed time is measured from the start_time set when the loading loop began
current_runtime = round(time.time() - start_time,3)
print('Process completed for', count, 'of', allreviews_length, 'records with a final runtime of', current_runtime, 'seconds.')

## Extract a List of Reviews Related to the Product Ids

In [None]:
#Load the list of Nike ASINs from the checkpoint file

allnikeasins = []
allasins_file_path = '%s/allasins.txt' % DATA_DIR

#read the comma-separated ASINs into a list; the with-block closes the file when done
with open(allasins_file_path, 'r') as asin_file:
  for data in asin_file:
    allnikeasins.extend(data.split(','))

In [None]:
#print a summary of the ASINs loaded, plus a small sample of them
allnikeasins_length = len(allnikeasins)
print('Process completed for', allnikeasins_length, 'records.')
print('First 5 Asins in list:', allnikeasins[0:5])

In [None]:
#Go through all the reviews and keep the ones whose ASIN is a Nike ASIN

#count tracks how many reviews have been scanned; start_time is used for runtime reporting
count = 0
start_time = time.time()
#the dictionary has to exist before entries can be added to it
nikereviews = {}
#allnikeasins may be a list, where each membership test scans the whole list;
#build a set once so every lookup inside the loop is constant time
nike_asin_lookup = set(allnikeasins)

#each entry of allreviews is one review and its metadata
print('loading review data to dictionary:')
for areview in allreviews:
  count += 1
  if count % 500000 == 0:
    current_progress = int(round(count/allreviews_length,2)*100)
    current_runtime = round(time.time() - start_time,3)
    print('[-] current progress:', current_progress, '%', 'and a runtime of', current_runtime, 'seconds.')
  #the current review as a dictionary, so its entries are easy to reference
  thereview = allreviews[areview]
  theasin = thereview['asin']
  reviewerid = thereview['reviewerID']
  if theasin in nike_asin_lookup:
    #the key combines ASIN and reviewer id: keyed by ASIN alone, only the
    #last review of each product would be kept
    thekey = '%s.%s' % (theasin, reviewerid)
    nikereviews[thekey] = thereview
print('completed load of review data to dictionary.')

In [None]:
#print a summary of the records processed
#note: the message prints count (reviews scanned) before nikereviews_length (reviews kept)
nikereviews_length = len(nikereviews)
#elapsed time is measured from the start_time set when the filtering loop began
current_runtime = round(time.time() - start_time,3)
print('Process completed for', count, 'of', nikereviews_length, 'records with a final runtime of', current_runtime, 'seconds.')

In [None]:
#save the Nike reviews to a JSON file
allnikereviews_file_path = '%s/allnikereviews.json' % DATA_DIR
#the with-block closes (and flushes) the file before any later cell reads it back
with open(allnikereviews_file_path, 'w') as reviews_file:
  json.dump(nikereviews, reviews_file)

## Preview a Record from the File

In [None]:
#path of the Nike review file written earlier
allnikereviews_file_path = '%s/allnikereviews.json' % DATA_DIR
#load the whole file into a dictionary; the with-block closes the file afterwards
with open(allnikereviews_file_path, 'r') as reviews_file:
  json_file = json.load(reviews_file)

In [None]:
#print every 1000th review among the first 10,000 entries;
#the_review is left holding the last one printed
count = 0
for count, a_review in enumerate(json_file, start=1):
  if count % 1000 == 0:
    the_review = json_file[a_review]
    print(the_review)
    #sleep(10)
  if count >= 10000:
    break

In [None]:
#display the last review selected by the sampling loop
#(the_review is only defined if that loop reached at least 1000 entries)
the_review

## Extract a List of Products Related to Product Ids

In [None]:
#Load the list of Nike ASINs from the checkpoint file

allnikeasins = []
allasins_file_path = '%s/allasins.txt' % DATA_DIR

#read the comma-separated ASINs into a list; the with-block closes the file when done
with open(allasins_file_path, 'r') as asin_file:
  for data in asin_file:
    allnikeasins.extend(data.split(','))

In [None]:
#print a summary of the ASINs loaded, plus a small sample of them
allnikeasins_length = len(allnikeasins)
print('Process completed for', allnikeasins_length, 'records.')
print('First 5 Asins in list:', allnikeasins[0:5])

In [None]:
#the path of the pickled product dictionary
allproducts_file_path = '%s/allproducts.p' % DATA_DIR
#load the dict; the with-block closes the file afterwards
with open(allproducts_file_path, 'rb') as products_file:
  allproducts = pickle.load(products_file)

In [None]:
print('size of the full product catelog:', len(allproducts))
#ASINs that are both in the Nike list and present as keys of the product dictionary
keys = set(allnikeasins).intersection(allproducts)
#subset of the catalog restricted to those ASINs
allnikeproducts = {key:allproducts[key] for key in keys}
print('size of the nike product catelog:', len(allnikeproducts))

In [None]:
#save the Nike product subset to disk as a pickle
allnikeproducts_file_path = '%s/allnikeproducts.p' % DATA_DIR
#the with-block closes (and flushes) the file even if pickling fails
with open(allnikeproducts_file_path, 'wb') as products_file:
  pickle.dump(allnikeproducts, products_file)

# 3.&nbsp;Preprocessing the Data

## Load the Nike Review Data

In [None]:
#path of the Nike review file written earlier
allnikereviews_file_path = '%s/allnikereviews.json' % DATA_DIR
#load the whole file into a dictionary; the with-block closes the file afterwards
with open(allnikereviews_file_path, 'r') as reviews_file:
  json_file = json.load(reviews_file)

In [None]:
#collect the "reviewText" field of every review, in the order the file stores them
reviews = [review_record["reviewText"] for review_record in json_file.values()]

## Create the Corpus

In [None]:
#build an English corpus from the Nike reviews, labelling each document with its position in the list
indexed_reviews = dict(enumerate(reviews))
corpus = Corpus(indexed_reviews, language='en')

In [None]:
#print the length of the corpus (len() of the Corpus object)
corpus_length = len(corpus)
print('Length of the Corpus:', corpus_length)

In [None]:
#summarize one document of the corpus before preprocessing
n = 10   #number of token-table rows to show
k = 91   #key of the document to inspect
vocab_size = vocabulary_size(corpus)
df_tokens = tmtoolkit.corpus.tokens_table(corpus)
#print the summary
print('record for key', k, 'contains:')
print(corpus[k])
print('corpus vocabulary size:', vocab_size)
print('first', n, 'rows of tokens table:')
#limit the output to n rows so it matches the label printed above
print(df_tokens[df_tokens['doc'] == k].head(n))

In [None]:
#view a histogram of document lengths
fig, ax = plt.subplots(figsize=(5, 3))   # 5 x 3 inch figure
plot_doc_lengths_hist(fig, ax, corpus, y_log=False, bins=20)  # linear y axis, 20 bins
ax.set_xticks(range(0, 1001, 100))    # x axis ticks at 0, 100, ..., 1000
plt.show()

In [None]:
#rank-frequency distribution plot for token frequencies
fig, ax = plt.subplots(figsize=(5, 3))   # 5 x 3 inch figure
plot_ranked_vocab_counts(fig, ax, corpus, zipf=True)
#the trailing semicolon suppresses the cell's result echo in the notebook
plt.show();

In [None]:
#save the unprocessed corpus to disk as a pickle
corpus_file_path = '%s/corpus_source.p' % DATA_DIR
#the with-block closes (and flushes) the file even if pickling fails
with open(corpus_file_path, 'wb') as corpus_file:
  pickle.dump(corpus, corpus_file)

In [None]:
#save the raw text of reviews to disk as a pickle
text_file_path = '%s/corpus_raw_text.p' % DATA_DIR
#the with-block closes (and flushes) the file even if pickling fails
with open(text_file_path, 'wb') as text_file:
  pickle.dump(reviews, text_file)

## Preprocess Text Data

In [None]:
#location of the unprocessed corpus pickle
corpus_file_path = '%s/corpus_source.p' % DATA_DIR
#load the corpus; the with-block closes the file afterwards
with open(corpus_file_path, 'rb') as corpus_file:
  corpus = pickle.load(corpus_file)

In [None]:
#summarize one document of the reloaded corpus before preprocessing
n = 10   #number of token-table rows to show
k = 91   #key of the document to inspect
vocab_size = vocabulary_size(corpus)
df_tokens = tmtoolkit.corpus.tokens_table(corpus)
#print the summary
print('record for key', k, 'contains:')
print(corpus[k])
print('corpus vocabulary size:', vocab_size)
print('first', n, 'rows of tokens table:')
#limit the output to n rows so it matches the label printed above
print(df_tokens[df_tokens['doc'] == k].head(n))

In [None]:
#Preprocessing pipeline. Every call below modifies `corpus` in place
#(inplace=True), so the steps run strictly in the order written.
#Lines starting with #### are older TMPreproc-based calls kept for reference only.
####create an english pre-processor
####preproc = TMPreproc(corpus, language='en')
####tag the words with parts of speech
####preproc.pos_tag()
# lemmatize the words (convert to root base)
lemmatize(corpus, inplace=True)
# convert words to lowercase
to_lowercase(corpus, inplace=True)
# remove special characters (everything in string.punctuation)
#####preproc.remove_special_chars_in_tokens()
remove_chars(corpus, chars=punctuation, inplace=True)
# remove stopwords; the custom stopwords for URLs and "n't" from the old pipeline are not applied here
####preproc.add_stopwords(['http', 'nt'])       #####
filter_clean_tokens(corpus, remove_stopwords=True, inplace=True)
# limit words to nouns, verbs, and adjectives
filter_for_pos(corpus, search_pos=['N', 'V', 'ADJ'], inplace=True)
# remove numbers and any word shorter than 2 characters
filter_clean_tokens(corpus, remove_numbers=True, inplace=True)
filter_clean_tokens(corpus, remove_shorter_than=2, inplace=True)
# remove tokens that are fairly common (document-frequency threshold 0.90)
remove_common_tokens(corpus, df_threshold=0.90, inplace=True)
# remove tokens that are extremely rare (document-frequency threshold 0.01)
remove_uncommon_tokens(corpus, df_threshold=0.01, inplace=True)

## Review Preprocessing of Text

In [None]:
#create a summary of a sample
def preview_sample_review(corpus, k = 0):
  """
  Print one document of the corpus with its token count and first ten tokens.

  corpus -- corpus object supporting corpus[k] and accepted by doc_tokens
  k      -- key of the document to preview (default 0)
  """
  document = corpus[k]
  #doc_tokens(..., with_attr=True) is indexed by document key, then by the 'token' attribute
  tokens = doc_tokens(corpus, with_attr=True)[k]['token']

  print('record for key', k, 'contains:')
  print(document)
  print('the text contains', len(tokens), 'tokens.')
  print('sample token list:', tokens[0:10])
  print()

In [None]:
#preview the tokens of three specific documents in the corpus (keys 91, 1 and 2000)
preview_sample_review(corpus, k = 91)
preview_sample_review(corpus, k = 1)
preview_sample_review(corpus, k = 2000)

In [None]:
#summarize one document of the corpus after preprocessing
n = 10   #number of token-table rows to show
k = 91   #key of the document to inspect
vocab_size = vocabulary_size(corpus)
df_tokens = tmtoolkit.corpus.tokens_table(corpus)
#print the summary
print('record for key', k, 'contains:')
print(corpus[k])
print('corpus vocabulary size:', vocab_size)
print('first', n, 'rows of tokens table:')
#limit the output to n rows so it matches the label printed above
print(df_tokens[df_tokens['doc'] == k].head(n))

In [None]:
#view a histogram of document lengths
fig, ax = plt.subplots(figsize=(5, 3))   # 5 x 3 inch figure
plot_doc_lengths_hist(fig, ax, corpus, y_log=False, bins=50)  # linear y axis, 50 bins
ax.set_xticks(range(0, 201, 20))    # x axis ticks at 0, 20, ..., 200
plt.show()

In [None]:
#rank-frequency distribution plot for token frequencies
fig, ax = plt.subplots(figsize=(5, 3))   # 5 x 3 inch figure
plot_ranked_vocab_counts(fig, ax, corpus, zipf=True)
#the trailing semicolon suppresses the cell's result echo in the notebook
plt.show();

In [None]:
#summarize the size of the vocabulary after preprocessing (shown as the cell result)
vocabulary_size(corpus)

## Save the Preprocessed Data

In [None]:
#create the document labels as a NumPy array
doc_lbls = np.array(doc_labels(corpus))
#preview the first 10 document labels
print(doc_lbls[:10])

In [None]:
#create the vocabulary as a NumPy array
vocab = np.array(vocabulary(corpus))
#preview the first 10 vocabulary entries
print(vocab[:10])

In [None]:
#create the document-term matrix (DTM)
#NOTE(review): dtm_main is later written with scipy.sparse.save_npz, so it is
#expected to be a SciPy sparse matrix already — confirm
dtm_main = dtm(corpus)
#dtm_main = scipy.sparse.csr_matrix(dtm_main)
#preview the dtm (shown as the cell result)
dtm_main

In [None]:
#save the preprocessed corpus, document labels, vocabulary and DTM to disk
corpus_file_path = '%s/corpus.p' % DATA_DIR
doc_labels_file_path = '%s/doc_labels.p' % DATA_DIR
vocab_file_path = '%s/vocab.p' % DATA_DIR
dtm_file_path = '%s/dtm_main.npz' % DATA_DIR

#each with-block closes (and flushes) its file even if pickling fails
with open(corpus_file_path, 'wb') as corpus_file:
  pickle.dump(corpus, corpus_file)
with open(doc_labels_file_path, 'wb') as labels_file:
  pickle.dump(doc_lbls, labels_file)
with open(vocab_file_path, 'wb') as vocab_file:
  pickle.dump(vocab, vocab_file)
#save_npz takes the path and handles the file itself
scipy.sparse.save_npz(dtm_file_path, dtm_main)

# 4.&nbsp;Model: Parameter Tuning

## Import the Corpus

In [None]:
#location of corpus files
corpus_file_path = '%s/corpus.p' % DATA_DIR
doc_labels_file_path = '%s/doc_labels.p' % DATA_DIR
vocab_file_path = '%s/vocab.p' % DATA_DIR
dtm_file_path = '%s/dtm_main.npz' % DATA_DIR

#load the corpus; context managers close each handle after reading
#NOTE: pickle.load executes arbitrary code, so only load files this notebook wrote itself
with open(corpus_file_path, 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
with open(doc_labels_file_path, 'rb') as doc_labels_file:
    doc_lbls = pickle.load(doc_labels_file)
dtm_main = scipy.sparse.load_npz(dtm_file_path)
with open(vocab_file_path, 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

#preview the document labels, the vocabulary and the number of non-zero DTM entries
print('sample of document labels:', doc_lbls[:10])
print('sample of vocabulary:', vocab[:10])
print('dtm none zero count:', dtm_main.count_nonzero())

In [None]:
# suppress the "INFO" messages and warnings from lda
logger = logging.getLogger('lda')
# a NullHandler plus propagate=False keeps 'lda' log records away from the root logger
logger.addHandler(logging.NullHandler())
logger.propagate = False
# NOTE(review): this silences every Python warning for the rest of the session, not only lda's
warnings.filterwarnings('ignore')

## Model Evaluation Functions

In [None]:
#build the parameter grid for the model using scaling factors for eta and alpha
def build_param_inputs(kmax = 100, eta_factor = 1, alpha_factor = 1):
  """Build the varying and constant parameter sets for an LDA evaluation sweep.

  eta is held constant at 0.1/eta_factor, while alpha is recomputed for each
  topic count k as 1/(alpha_factor*k); both are rounded to 5 decimals.

  The topic counts tried depend on kmax:
    kmax <= 50  -> 5, 10, ... up to kmax
    kmax <= 100 -> 5, 10, 15, then 20, 30, ... up to kmax
    otherwise   -> 10, 20, ..., 90, then 100, 120, ... up to kmax

  Prints the selected parameters and returns (varying_params, const_params).
  """
  #constant parameters shared by every trial
  fixed = {'n_iter': 500,'eta': round(0.1/eta_factor,5), 'random_state': 20191122}
  #topic counts get coarser as kmax grows
  if kmax > 100:
    topic_counts = [*range(10, 100, 10), *range(100, kmax+1, 20)]
  elif kmax > 50:
    topic_counts = [5, 10, 15, *range(20, kmax+1, 10)]
  else:
    topic_counts = [*range(5, kmax+1, 5)]
  #one trial per topic count, with alpha scaled by the topic count
  trials = []
  for n in topic_counts:
    trials.append({'n_topics': n, 'alpha': round(1/(alpha_factor*n), 5)})
  #display the parameter selections
  print('[.] constant parameters:', fixed)
  print('[.] number of topics to try:', topic_counts)
  print('[.] number of trials:', len(trials))
  print('[.] variable parameter trials:', trials)

  return trials, fixed

In [None]:
#build the parameter grid for the model using fixed values for eta and alpha
def build_param_inputs_fixed(kmax = 100, eta_value = .1, alpha_value = 1):
  """Build the varying and constant parameter sets with fixed eta and alpha.

  eta_value is used as the constant eta and alpha_value is reused unchanged
  for every topic count; only n_topics differs between trials.

  The topic counts tried depend on kmax:
    kmax <= 50  -> 5, 10, ... up to kmax
    kmax <= 100 -> 5, 10, 15, then 20, 30, ... up to kmax
    otherwise   -> 10, 20, ..., 90, then 100, 120, ... up to kmax

  Prints the selected parameters and returns (varying_params, const_params).
  """
  #constant parameters shared by every trial
  fixed = {'n_iter': 500,'eta': eta_value, 'random_state': 20191122}
  #topic counts get coarser as kmax grows
  if kmax > 100:
    topic_counts = [*range(10, 100, 10), *range(100, kmax+1, 20)]
  elif kmax > 50:
    topic_counts = [5, 10, 15, *range(20, kmax+1, 10)]
  else:
    topic_counts = [*range(5, kmax+1, 5)]
  #one trial per topic count, all sharing the same alpha
  trials = []
  for n in topic_counts:
    trials.append({'n_topics': n, 'alpha': alpha_value})
  #display the parameter selections
  print('[.] constant parameters:', fixed)
  print('[.] number of topics to try:', topic_counts)
  print('[.] number of trials:', len(trials))
  print('[.] variable parameter trials:', trials)

  return trials, fixed

In [None]:
#evaluate the candidate models and return the performance metrics per topic count
def evaluate_model_results_custom(dtm_p, varying_p, const_p):
  """Evaluate the topic models described by the parameter sets on dtm_p.

  Runs tm_lda.evaluate_topic_models with the arun_2010, cao_juan_2009 and
  coherence_mimno_2011 metrics, groups the results by n_topics and rounds
  every metric value to 3 decimals.

  Prints and returns a list of (n_topics, {metric_name: value}) tuples.
  """
  metric_names = ['arun_2010','cao_juan_2009','coherence_mimno_2011']
  #evaluate model results
  raw_results = tm_lda.evaluate_topic_models(dtm_p,
                                             varying_parameters = varying_p,
                                             constant_parameters = const_p,
                                             metric = metric_names)
  #group by topic count and round each metric for display
  results_by_n_topics = []
  for topic_count, metrics in results_by_parameter(raw_results, 'n_topics'):
    rounded = {}
    for name, score in metrics.items():
      rounded[name] = round(score, 3)
    results_by_n_topics.append((topic_count, rounded))
  #display the results of the models as text
  print('[.] number of results calculated:', len(results_by_n_topics))
  print('[.] results by n topic models:', results_by_n_topics)

  return results_by_n_topics

In [None]:
# plot the results of the model trials
#plot_eval_results(eval_results = results_by_n_topics, figsize=(8, 6))
#I cannot get this function to work for the life of me
#package conflict with version of pyplot

In [None]:
#custom function to display the model results
def plot_eval_results_custom(results_by_n_topics):
  """Plot the three evaluation metrics against the number of topics.

  results_by_n_topics is a list of (n_topics, metrics) tuples in which metrics
  has the keys 'arun_2010', 'cao_juan_2009' and 'coherence_mimno_2011'.
  Draws one row of three line charts (one per metric) and shows the figure.
  """
  #(metric key, line label, panel title) for each panel, left to right
  panels = [
      ('arun_2010', 'Arun 2010', 'Minimize: Arun 2010'),
      ('cao_juan_2009', 'Cao Juan 2009', 'Minimize: Cao Juan 2009'),
      ('coherence_mimno_2011', 'Coherence Mimno 2011', 'Maximize: Coherence Mimno 2011'),
  ]

  #pull the x values and every metric series out before any figure is created
  topic_counts = [entry[0] for entry in results_by_n_topics]
  series = [[entry[1][metric] for entry in results_by_n_topics]
            for metric, _, _ in panels]

  #a single row with one column per metric
  fig, axes = plt.subplots(1, 3, figsize=(12, 3))
  font_size = 8

  for axis, values, (_, line_label, panel_title) in zip(axes, series, panels):
    axis.plot(topic_counts, values, label=line_label, marker='o')
    axis.set_xlabel('Number of Topics', fontsize=font_size)
    axis.set_ylabel('Metric Value', fontsize=font_size)
    axis.set_title(panel_title, fontsize=font_size)

  #tighten the spacing, then render
  plt.tight_layout()
  plt.show()

In [None]:
#repeatable steps for completing a model evaluation with the variable parameters
def automate_test_varible_model(dtm_p, eta_list, alpha_list, kmax, output_file):
  """Evaluate every (eta_factor, alpha_factor) combination and pickle the results.

  For each pair the parameter grid comes from build_param_inputs, the models are
  scored with evaluate_model_results_custom and the scores are plotted with
  plot_eval_results_custom. Progress and elapsed time are printed after each run.

  Returns a dict mapping a run code such as 'eta_factor_0001__alpha_factor_0.25'
  to that run's results; the same dict is pickled to DATA_DIR/output_file.
  """
  #establish tracking metrics
  total_runs = len(eta_list) * len(alpha_list)
  all_model_runs = {}
  start_time = time.time()
  count = 0
  #iterate through all inputs for eta and alpha
  for e in eta_list:
    for a in alpha_list:
      run_code = 'eta_factor_' + str(e).zfill(4) + '__' + 'alpha_factor_' + str(a).zfill(4)
      print('running the evaluation for:', run_code)
      #build the variable params
      varying_params, const_params = build_param_inputs(kmax = kmax, eta_factor = e, alpha_factor = a)
      #build the results
      results_by_n_topics = evaluate_model_results_custom(dtm_p = dtm_p, varying_p = varying_params, const_p = const_params)
      #record the results
      all_model_runs[run_code] = results_by_n_topics
      #update on the progress and runtime
      count += 1
      #round the percentage itself: int(round(x, 2) * 100) truncates values such as
      #0.29 * 100 == 28.999999999999996 down to 28
      current_progress = round(100 * count / total_runs)
      current_runtime = round(time.time() - start_time,3)
      print('[-] current progress:', current_progress, '%', 'and a runtime of', current_runtime, 'seconds.')
      #plot the results
      plot_eval_results_custom(results_by_n_topics)
  #print summary of the entire run
  print('evaluation data captured for', len(all_model_runs), 'sets of parameters.')
  #save the results to disk; the context manager closes the handle
  eval_file_path = '%s/%s' % (DATA_DIR, output_file)
  with open(eval_file_path, 'wb') as eval_file:
    pickle.dump(all_model_runs, eval_file)
  #return the results
  return all_model_runs

In [None]:
#repeatable steps for completing a model evaluation with the fixed parameters
def automate_test_fixed_model(dtm_p, eta_list, alpha_list, kmax, output_file):
  """Evaluate every fixed (eta, alpha) combination and pickle the results.

  For each pair the parameter grid comes from build_param_inputs_fixed, the models
  are scored with evaluate_model_results_custom and the scores are plotted with
  plot_eval_results_custom. Progress and elapsed time are printed after each run.

  Returns a dict mapping a run code such as 'eta_00.1__alpha_00.2' to that run's
  results; the same dict is pickled to DATA_DIR/output_file.
  """
  #establish tracking metrics
  total_runs = len(eta_list) * len(alpha_list)
  all_model_runs = {}
  start_time = time.time()
  count = 0
  #iterate through all inputs for eta and alpha
  for e in eta_list:
    for a in alpha_list:
      run_code = 'eta_' + str(e).zfill(4) + '__' + 'alpha_' + str(a).zfill(4)
      print('running the evaluation for:', run_code)
      #build the variable params
      varying_params, const_params = build_param_inputs_fixed(kmax = kmax, eta_value = e, alpha_value = a)
      #build the results
      results_by_n_topics = evaluate_model_results_custom(dtm_p = dtm_p, varying_p = varying_params, const_p = const_params)
      #record the results
      all_model_runs[run_code] = results_by_n_topics
      #update on the progress and runtime
      count += 1
      #round the percentage itself: int(round(x, 2) * 100) truncates values such as
      #0.29 * 100 == 28.999999999999996 down to 28
      current_progress = round(100 * count / total_runs)
      current_runtime = round(time.time() - start_time,3)
      print('[-] current progress:', current_progress, '%', 'and a runtime of', current_runtime, 'seconds.')
      #plot the results
      plot_eval_results_custom(results_by_n_topics)
  #print summary of the entire run
  print('evaluation data captured for', len(all_model_runs), 'sets of parameters.')
  #save the results to disk; the context manager closes the handle
  eval_file_path = '%s/%s' % (DATA_DIR, output_file)
  with open(eval_file_path, 'wb') as eval_file:
    pickle.dump(all_model_runs, eval_file)
  #return the results
  return all_model_runs

In [None]:
#load model evaluations previously saved in a file
def load_model_evaluations(DATA_DIR, path_location):
  """Load a pickled dict of model evaluation runs and print a short preview.

  DATA_DIR is the directory holding the file and path_location is the file name
  inside it. Prints every run key and, when the dict is not empty, the data
  stored under the first key.

  Returns the unpickled dict. Only use this on files written by this notebook,
  because pickle.load can execute arbitrary code.
  """
  #location of the model evaluation file
  eval_file_path = '%s/%s' % (DATA_DIR, path_location)
  #load the evaluations; the context manager closes the handle
  with open(eval_file_path, 'rb') as eval_file:
    all_model_runs = pickle.load(eval_file)
  #load the keys for the models
  model_keys = list(all_model_runs.keys())
  #preview the keys
  print('all keys:', model_keys)
  #preview a sample of the model data (an empty file has no sample to show)
  if model_keys:
    print('sample of model data:', all_model_runs[model_keys[0]])
  # return the model data
  return all_model_runs

In [None]:
#reprint the model evaluations
def print_model_evaluations(all_model_runs, limit = 1000, model_keys = None):
  """Re-plot stored evaluation results for up to `limit` models.

  all_model_runs maps run keys to results accepted by plot_eval_results_custom.
  model_keys optionally restricts and orders the keys to plot; by default
  every key of all_model_runs is used. A limit of 0 or less plots nothing.
  """
  key_list = list(all_model_runs.keys()) if model_keys is None else model_keys
  len_to_print = min(len(key_list), limit)
  print('displaying charts for', len_to_print, 'models:\n')
  #slice up front so the number of charts always matches the number announced
  for key in key_list[:max(len_to_print, 0)]:
    print ('model key:', key)
    plot_eval_results_custom(all_model_runs[key])
    print()

## Round 1: Evaluate Model Options

- [evaluate_topic_models()](https://tmtoolkit.readthedocs.io/en/latest/api.html#tmtoolkit.topicmod.tm_lda.evaluate_topic_models)
- [Guide: Evaluate Topic Model](https://tmtoolkit.readthedocs.io/en/latest/topic_modeling.html#Evaluation-of-topic-models)
- [tm_lda](https://tmtoolkit.readthedocs.io/en/latest/api.html#module-tmtoolkit.topicmod.tm_lda)

A high alpha value indicates that each document is likely to contain most of the topics; conversely, a low alpha value indicates that each document is likely to contain only a few topics.

A high value of η (eta) indicates that each topic is likely to cover most of the words; conversely, a low eta value indicates that each topic is likely to contain only a few words.

In [None]:
#define the input parameters: eta_factor fixed at 1, alpha_factor swept
eta_list = [1]
alpha_list = [0.25, 0.5, 1, 2, 4]
#full grid kept for reference (not used by this cell):
#eta_list   = [0.5, 1, 2, 4, 8]
#alpha_list = [0.25, 0.5, 1, 2, 4]
kmax = 80
# run the sweep; the results are also pickled to output_file under DATA_DIR
all_model_runs = automate_test_varible_model(dtm_p = dtm_main,
                                           eta_list = eta_list,
                                           alpha_list = alpha_list,
                                           kmax = kmax,
                                           output_file = 'explore_eval_vary_ae_etaf_1o0_alphaf_list.p')

In [None]:
#define the input parameters: eta_factor fixed at 2, alpha_factor swept
eta_list = [2]
alpha_list = [0.25, 0.5, 1, 2, 4]
#full grid kept for reference (not used by this cell):
#eta_list   = [0.5, 1, 2, 4, 8]
#alpha_list = [0.25, 0.5, 1, 2, 4]
kmax = 80
# run the sweep; the results are also pickled to output_file under DATA_DIR
all_model_runs = automate_test_varible_model(dtm_p = dtm_main,
                                           eta_list = eta_list,
                                           alpha_list = alpha_list,
                                           kmax = kmax,
                                           output_file = 'explore_eval_vary_ae_etaf_2o0_alphaf_list.p')

In [None]:
#define the input parameters: eta_factor fixed at 4, alpha_factor swept
eta_list = [4]
alpha_list = [0.25, 0.5, 1, 2, 4]
#full grid kept for reference (not used by this cell):
#eta_list   = [0.5, 1, 2, 4, 8]
#alpha_list = [0.25, 0.5, 1, 2, 4]
kmax = 80
# run the sweep; the results are also pickled to output_file under DATA_DIR
all_model_runs = automate_test_varible_model(dtm_p = dtm_main,
                                           eta_list = eta_list,
                                           alpha_list = alpha_list,
                                           kmax = kmax,
                                           output_file = 'explore_eval_vary_ae_etaf_4o0_alphaf_list.p')

In [None]:
#define the input parameters: eta_factor fixed at 0.5, alpha_factor swept
eta_list = [0.5]
alpha_list = [0.25, 0.5, 1, 2, 4]
#full grid kept for reference (not used by this cell):
#eta_list   = [0.5, 1, 2, 4, 8]
#alpha_list = [0.25, 0.5, 1, 2, 4]
kmax = 80
# run the sweep; the results are also pickled to output_file under DATA_DIR
all_model_runs = automate_test_varible_model(dtm_p = dtm_main,
                                           eta_list = eta_list,
                                           alpha_list = alpha_list,
                                           kmax = kmax,
                                           output_file = 'explore_eval_vary_ae_etaf_0o5_alphaf_list.p')

In [None]:
#define the input parameters: eta_factor fixed at 1, smaller alpha_factor values swept
eta_list = [1]
alpha_list = [0.125, 0.25, 0.5]
#full grid kept for reference (not used by this cell):
#eta_list   = [0.5, 1, 2, 4, 8]
#alpha_list = [0.25, 0.5, 1, 2, 4]
kmax = 80
# run the sweep; the results are also pickled to output_file under DATA_DIR
all_model_runs = automate_test_varible_model(dtm_p = dtm_main,
                                           eta_list = eta_list,
                                           alpha_list = alpha_list,
                                           kmax = kmax,
                                           output_file = 'explore_eval_vary_ae_etaf_1o0_alphaf_list_r2.p')

After reviewing many performance charts, a pattern emerged: an alpha_factor of 0.25 with 30 topics appears to perform well. There is more ambiguity around which value should be selected for eta; this is explored in more detail in round 2.

## Round 2: Evaluate Model Options

In [None]:
#define the input parameters: alpha_factor fixed at 0.25, eta_factor swept
eta_list = [0.125, 0.25, 0.5, 1, 2, 4, 8]
alpha_list = [0.25]
#round 1 grid kept for reference (not used by this cell):
#eta_list   = [0.5, 1, 2, 4, 8]
#alpha_list = [0.25, 0.5, 1, 2, 4]
kmax = 80
# run the sweep; the results are also pickled to output_file under DATA_DIR
all_model_runs = automate_test_varible_model(dtm_p = dtm_main,
                                           eta_list = eta_list,
                                           alpha_list = alpha_list,
                                           kmax = kmax,
                                           output_file = 'explore_eval_vary_ae_etaf_list_r2_alphaf_0o25.p')

In [None]:
#define the input parameters: a 3x3 grid of eta_factor and alpha_factor values around 0.25
eta_list = [0.20,0.25,0.30]
alpha_list = [0.20,0.25,0.30]
kmax = 80
# run the sweep (9 combinations); the results are also pickled to output_file under DATA_DIR
all_model_runs = automate_test_varible_model(dtm_p = dtm_main,
                                           eta_list = eta_list,
                                           alpha_list = alpha_list,
                                           kmax = kmax,
                                           output_file = 'explore_eval_vary_ae_etaf_list_r2_alphaf_list_r2.p')

In [None]:
#test loading data saved in a file into the all_model_runs object
#all_model_runs = load_model_evaluations(DATA_DIR = DATA_DIR, path_location = 'explore_eval_vary_ae_etaf_list_r2_alphaf_0o25.p')
#print()
#test the print function for 1 set of charts
#print_model_evaluations(all_model_runs = all_model_runs, limit = 1)

Main parameter sets of interest:
- eta_factor_0001__alpha_factor_0.25 - 30 topics
- eta_factor_0.25__alpha_factor_0.25 - 30 topics
- eta_factor_0.25__alpha_factor_00.3

## Round 3: Strategic Model Options

In [None]:
#model parameters
eta_factor = 1; alpha_factor = 0.25; k = 30
#calculate parameters: the values the scaled formulas give for k topics
eta = round(0.1/eta_factor,5)
alpha = round(1/(alpha_factor*k), 5)
# run calculations and visualize (the same eta and alpha are reused for every topic count up to kmax)
varying_params, const_params = build_param_inputs_fixed(kmax = 60, eta_value = eta, alpha_value = alpha)
results_by_n_topics = evaluate_model_results_custom(dtm_p = dtm_main, varying_p = varying_params, const_p = const_params)
plot_eval_results_custom(results_by_n_topics)

In [None]:
#model parameters
eta_factor = 1; alpha_factor = 0.30; k = 30
#calculate parameters: the values the scaled formulas give for k topics
eta = round(0.1/eta_factor,5)
alpha = round(1/(alpha_factor*k), 5)
# run calculations and visualize (the same eta and alpha are reused for every topic count up to kmax)
varying_params, const_params = build_param_inputs_fixed(kmax = 60, eta_value = eta, alpha_value = alpha)
results_by_n_topics = evaluate_model_results_custom(dtm_p = dtm_main, varying_p = varying_params, const_p = const_params)
plot_eval_results_custom(results_by_n_topics)

In [None]:
#model parameters
eta_factor = 0.25; alpha_factor = 0.25; k = 30
#calculate parameters: the values the scaled formulas give for k topics
eta = round(0.1/eta_factor,5)
alpha = round(1/(alpha_factor*k), 5)
# run calculations and visualize (the same eta and alpha are reused for every topic count up to kmax)
varying_params, const_params = build_param_inputs_fixed(kmax = 60, eta_value = eta, alpha_value = alpha)
results_by_n_topics = evaluate_model_results_custom(dtm_p = dtm_main, varying_p = varying_params, const_p = const_params)
plot_eval_results_custom(results_by_n_topics)

In [None]:
#model parameters
eta_factor = 0.25; alpha_factor = 0.30; k = 30
#calculate parameters: the values the scaled formulas give for k topics
eta = round(0.1/eta_factor,5)
alpha = round(1/(alpha_factor*k), 5)
# run calculations and visualize (the same eta and alpha are reused for every topic count up to kmax)
varying_params, const_params = build_param_inputs_fixed(kmax = 60, eta_value = eta, alpha_value = alpha)
results_by_n_topics = evaluate_model_results_custom(dtm_p = dtm_main, varying_p = varying_params, const_p = const_params)
plot_eval_results_custom(results_by_n_topics)

In [None]:
# 'eta': 0.1  'alpha': 0.13333
#(30, {'arun_2010': 0.155, 'cao_juan_2009': 0.223, 'coherence_mimno_2011': -461.601})

# 'eta': 0.1  'alpha': 0.11111
#(30, {'arun_2010': 0.162, 'cao_juan_2009': 0.237, 'coherence_mimno_2011': -462.157})

# 'eta': 0.4  'alpha': 0.13333
#(30, {'arun_2010': 0.158, 'cao_juan_2009': 0.232, 'coherence_mimno_2011': -461.677})

# 'eta': 0.4  'alpha': 0.11111
#(30, {'arun_2010': 0.158, 'cao_juan_2009': 0.25, 'coherence_mimno_2011': -453.226})

Based on detailed exploration, the following parameters appear to give the best evaluation charts:
- Number of Topics = 30
- Eta = 0.1
- Alpha = 1/25

## Round 4: Evaluate Model Options

In [None]:
# test loading data saved in a file into the all_model_runs object
all_model_runs = load_model_evaluations(DATA_DIR = DATA_DIR, path_location = 'explore_eval_fixed_eta_001.p')
print()
#test the print function for 1 set of charts
print_model_evaluations(all_model_runs = all_model_runs, limit = 1)

In [None]:
# print models of interest
# NOTE: the replay code below is wrapped in a bare triple-quoted string, so it is
# never executed; this cell only prints 'skipped'. Each file in pl is paired by
# position with the run keys in ml.
'''
pl = ['explore_eval_fixed_eta_010.p',
'explore_eval_fixed_eta_008.p',
'explore_eval_fixed_eta_007.p',
'explore_eval_fixed_eta_006.p',
'explore_eval_fixed_eta_004.p',
'explore_eval_fixed_eta_004.p',
'explore_eval_fixed_eta_003.p']
ml = [['eta_00.1__alpha_00.2'],
['eta_0.08__alpha_00.5'],
['eta_0.07__alpha_00.4'],
['eta_0.06__alpha_00.1'],
['eta_0.04__alpha_00.6'],
['eta_0.04__alpha_00.4'],
['eta_0.03__alpha_00.2']]

for p, m in zip (pl, ml):
  print(p, '-', m)
  all_model_runs = load_model_evaluations(DATA_DIR = DATA_DIR, path_location = p)
  print_model_evaluations(all_model_runs = all_model_runs, limit = 10, model_keys = m)
'''
print('skipped')

# 5.&nbsp;Model: Final

## Import the Corpus

In [None]:
#location of corpus files
corpus_file_path = '%s/corpus.p' % DATA_DIR
doc_labels_file_path = '%s/doc_labels.p' % DATA_DIR
vocab_file_path = '%s/vocab.p' % DATA_DIR
dtm_file_path = '%s/dtm_main.npz' % DATA_DIR

#load the corpus; context managers close each handle after reading
#NOTE: pickle.load executes arbitrary code, so only load files this notebook wrote itself
with open(corpus_file_path, 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
with open(doc_labels_file_path, 'rb') as doc_labels_file:
    doc_labels_main = pickle.load(doc_labels_file)
dtm_main = scipy.sparse.load_npz(dtm_file_path)
with open(vocab_file_path, 'rb') as vocab_file:
    vocab_main = pickle.load(vocab_file)

## Create the Model

In [None]:
#model parameters
eta_factor = 1; alpha_factor = 0.25
#alternative factors that were considered:
#eta_factor = 0.25; alpha_factor = 0.35
k = 30
#calculate parameters: eta = 0.1/eta_factor and alpha = 1/(alpha_factor*k), rounded to 5 decimals
eta = round(0.1/eta_factor,5)
alpha = round(1/(alpha_factor*k), 5)
#display parameters for model
print('k:', k,'\teta:', eta,'\talpha:', alpha)

In [None]:
# set data to use: a single document-term matrix registered under the label 'main'
dtms = {
    'main': dtm_main
}

# and fixed hyperparameters
# Here, alpha represents document-topic density - with a higher alpha, documents
# are made up of more topics, and with lower alpha, documents contain fewer topics.
# Beta represents topic-word density - with a high beta, topics are made up of
# most of the words in the corpus, and with a low beta they consist of few words.
# NOTE(review): presumably 'eta' below is the parameter the linked article calls beta - confirm
# https://www.thoughtvector.io/blog/lda-alpha-and-beta-parameters-the-intuition/
lda_params = {
    'n_topics': k,
    'eta': eta,
    'n_iter': 1000,
    'random_state': 20191122,  # to make results reproducible
    'alpha': alpha
}

In [None]:
#create the model
models = compute_models_parallel(dtms, constant_parameters=lda_params)

In [None]:
#preview the model
model_main = models['main'][0][1]
print_ldamodel_topic_words(model_main.topic_word_, vocab_main, top_n=3)

## Save Topic Model

In [None]:
#save the model to disk
model_file_path = '%s/main_model.p' % MODEL_DIR

#store the model together with its vocabulary, document labels and DTM in one pickle
with open(model_file_path, "wb") as modelfile:
    save_ldamodel_to_pickle(modelfile, model_main, vocab_main, doc_labels_main, dtm=dtm_main)

# 6.&nbsp; Classify and Enrich Topic Data

With a final topic model in place, topics can be assigned to documents to automatically cluster the review data by topic. Additional details can be added to the topic model to better filter and focus the dataset.

## Load the Model

In [None]:
#build the paths
corpus_file_path = '%s/corpus.p' % DATA_DIR
model_file_path = '%s/main_model.p' % MODEL_DIR
text_file_path = '%s/corpus_raw_text.p' % DATA_DIR
#load the files: the preprocessed corpus, the raw review text and the saved topic model
with open(corpus_file_path, "rb") as corpusfile:
    corpus = pickle.load(corpusfile)
with open(text_file_path, "rb") as textfile:
    corpus_raw_text = pickle.load(textfile)
with open(model_file_path, "rb") as modelfile:
    model_info = load_ldamodel_from_pickle(modelfile)

#preview the model info keys ("model", "vocab", "dtm" and "doc_labels" are read from it next)
model_info.keys()

In [None]:
#extract the parts of the model
model_main      = model_info["model"]
vocab_main      = model_info["vocab"]
dtm_main        = model_info["dtm"]
doc_labels_main = model_info["doc_labels"]

## Create Topic Names and Classification

In [None]:
#create a function to create topic labels
def calc_topic_labels(dtm_p, model_p, vocab_p, lamda_p):
  """Generate a label for every topic of model_p from its top words.

  dtm_p supplies the document lengths, model_p the topic-word and
  document-topic distributions and vocab_p the vocabulary. lamda_p is passed
  on as the lambda_ argument of generate_topic_labels_from_top_words, and
  each label is built from 4 words.

  Returns whatever generate_topic_labels_from_top_words returns.
  """
  #document lengths are derived from the document-term matrix
  lengths = doc_lengths(dtm_p)
  label_args = (model_p.topic_word_, model_p.doc_topic_, lengths, vocab_p)
  return generate_topic_labels_from_top_words(*label_args, lambda_=lamda_p, n_words=4)

In [None]:
#candidate lambda values 0.1, 0.2, ..., 1.0
lambda_list = [round(i/10,3) for i in range(1,11)]
print(lambda_list)

dict_topic_test = {}

#generate topic labels with a variety of different lambdas and display in a table
for lambda_value in lambda_list:
    topic_labels_main = calc_topic_labels(dtm_p = dtm_main,
                                        model_p = model_main,
                                        vocab_p = vocab_main,
                                        lamda_p = lambda_value)
    #round() rather than int(): a float product such as 0.29*100 is 28.999999999999996
    #and int() would truncate it to 28
    key = 'model_lamdda_' + str(round(lambda_value*100)).zfill(3)
    dict_topic_test[key] = topic_labels_main

#one column per lambda, one row per topic
df_dict_topic_test = pd.DataFrame(dict_topic_test)
df_dict_topic_test

In [None]:
#candidate lambda values 0.3, 0.35, ..., 0.6
lambda_list = [round(i/20,3) for i in range(6,13)]
print(lambda_list)

dict_topic_test = {}

#generate topic labels with a variety of different lambdas and display in a table
for lambda_value in lambda_list:
    topic_labels_main = calc_topic_labels(dtm_p = dtm_main,
                                        model_p = model_main,
                                        vocab_p = vocab_main,
                                        lamda_p = lambda_value)
    #round() rather than int(): a float product such as 0.55*100 is 55.00000000000001
    #and only stays correct under int() by luck
    key = 'model_lamdda_' + str(round(lambda_value*100)).zfill(3)
    dict_topic_test[key] = topic_labels_main

#one column per lambda, one row per topic
df_dict_topic_test = pd.DataFrame(dict_topic_test)
df_dict_topic_test

In [None]:
  #create the final topic labels for the topic model (lambda fixed at 0.4)
  # NOTE(review): this cell is indented by two spaces at top level; that would be an
  # IndentationError in a plain .py script - confirm the notebook tolerates it
  topic_labels_main = calc_topic_labels(dtm_p = dtm_main,
                                        model_p = model_main,
                                        vocab_p = vocab_main,
                                        lamda_p = 0.4)

  #display the list of topic labels
  print('created topic labels:')
  print(topic_labels_main)

In [None]:
# Topic Model Coherence
from tmtoolkit.topicmod.evaluate import metric_coherence_mimno_2011

# use top 10 words per topic for metric
coh = metric_coherence_mimno_2011(model_main.topic_word_, dtm_main, top_n=10, include_prob=True)
print(coh, '\n')


#display a histogram of the coherence
plt.hist(coh, bins=20)
plt.xlabel('coherence')
plt.ylabel('n')
plt.show();

#print the best and worst topics according to this metric
#argsort is ascending, so the reversed order gives the 10 highest coherence values
top10_t_indices = np.argsort(coh)[::-1][:10]
bottom10_t_indices = np.argsort(coh)[:10]

# NOTE(review): indexing with an index array assumes topic_labels_main is a numpy array - confirm
print('\ntop 10 topics:', topic_labels_main[top10_t_indices])
print('\nbottom 10 topics:', topic_labels_main[bottom10_t_indices])

## Classify the Documents

In [None]:
#classify each document with the label
doc_topic_main = model_main.doc_topic_
documentclassifications = ldamodel_top_doc_topics(doc_topic_main,
                                                  doc_labels_main,
                                                  top_n=3,
                                                  topic_labels=topic_labels_main)
#preview the document classifications
documentclassifications.head()

In [None]:
#split each combined rank column into a topic column and a probability column
#the text before the first space is the topic; the second space-separated piece,
#with its first and last characters stripped, is parsed as the probability
for rank_col in ('rank_1', 'rank_2', 'rank_3'):
    pieces = [entry.split(' ') for entry in documentclassifications[rank_col]]
    documentclassifications[rank_col + '_topic'] = [piece[0] for piece in pieces]
    documentclassifications[rank_col + '_prob'] = [float(piece[1][1:-1]) for piece in pieces]
#drop the combined columns
documentclassifications = documentclassifications.drop(columns = ['rank_1','rank_2','rank_3'])
#preview the document classifications
documentclassifications.head()

In [None]:
#plot the distribution of rank 1 topic probabilities
#sns.distplot is deprecated; histplot with a KDE overlay on a density scale is its replacement
sns.histplot(documentclassifications['rank_1_prob'], kde=True, stat="density")

In [None]:
#plot the probability distribution for topics (rank 1 - rank 3)
sns.set(color_codes=True)
sns.set(style="white", palette="muted")
#overlay the three probability columns in 0.02-wide bins, as percentages, with KDE curves
sns.histplot(documentclassifications[['rank_1_prob','rank_2_prob','rank_3_prob']],
             kde=True,
             stat="percent",
             binwidth = 0.02)

## Enrich Data with Review Info

In [None]:
#this assigns the filename we're trying to load
allnikereviews_file_path = '%s/allnikereviews.json' % DATA_DIR
#parse the review file; the context manager closes the handle once parsing finishes
with open(allnikereviews_file_path, 'r') as reviews_file:
    json_file = json.load(reviews_file)

In [None]:
#extract fields from all review details
#json_file maps a review key to that review's record; keep the records in file order
review_records = [json_file[review_key] for review_key in json_file]

#extract review details to add to the tagged documents
reviews = [record["reviewText"] for record in review_records]
asin = [record["asin"] for record in review_records]
overall_rating = [record["overall"] for record in review_records]
reviewer_id = [record["reviewerID"] for record in review_records]

In [None]:
#add details from the reviews to the label table
# NOTE(review): assigning by position assumes the reviews are in the same order as the rows - confirm
documentclassifications['text'] = reviews
documentclassifications['asin'] = asin
documentclassifications['overall_rating'] = overall_rating
documentclassifications['reviewer_id'] = reviewer_id
#preview the document classifications
documentclassifications.head()

## Enrich the Data with Product Info

In [None]:
#this assigns the filename we're trying to load
allnikeproducts_file_path = '%s/allnikeproducts.p' % DATA_DIR
#load the product records; the context manager closes the handle after reading
#NOTE: pickle.load executes arbitrary code, so only load trusted files
with open(allnikeproducts_file_path, 'rb') as products_file:
    allnikeproducts = pickle.load(products_file)

#preview a single product record
allnikeproducts['B0000V9K32']

In [None]:
nike_asin_list = list(documentclassifications['asin'])
print(nike_asin_list[1:5])

In [None]:
allnikeproducts['B0000V9K32']['salesRank']

In [None]:
allnikeproducts[nike_asin_list[0]]['salesRank'].keys()

In [None]:
sale_category = []
sale_rank = []
title = []

#extract out product details to add to the tagged documents
#the loop variable is deliberately not called `asin`: that name holds the list of
#review ASINs built by the review extraction cell and must not be overwritten
for product_asin in nike_asin_list:
    the_product = allnikeproducts[product_asin]
    sales_rank_info = the_product.get("salesRank")
    if sales_rank_info:
        #keep only the first category/rank pair of the product
        first_category, first_rank = next(iter(sales_rank_info.items()))
        sale_category.append(first_category)
        sale_rank.append(first_rank)
    else:
        #products without sales rank data get a placeholder category and no rank
        sale_category.append('Unknown')
        sale_rank.append(None)
    title.append(the_product['title'])

#add details from the products to the label table
documentclassifications['sale_category'] = sale_category
documentclassifications['sale_rank'] = sale_rank
documentclassifications['title'] = title
#preview the document classifications
documentclassifications.head()

## Enrich Data with Sentiment

In [None]:
#create a function to get the sentiment of a list of text
def get_sentiment_texts(texts):
    """Return the TextBlob sentiment polarity of every text.

    texts is a sized sequence of strings. The result is a list of the same
    length and order holding each text's ``TextBlob(text).sentiment.polarity``.
    Elsewhere in this notebook a polarity below 0.0 is treated as negative
    sentiment.
    """
    print('processing sentiments for', len(texts), 'texts:')
    return [TextBlob(text).sentiment.polarity for text in texts]

In [None]:
#add the sentiment polarity of each review text to the label table
documentclassifications['sentiment'] = get_sentiment_texts(list(documentclassifications['text']))
#add flags for sentiment (a polarity of exactly 0 counts as positive)
documentclassifications['positive_sentiment'] = [i >= 0 for i in documentclassifications['sentiment']]
documentclassifications['negative_sentiment'] = [i < 0 for i in documentclassifications['sentiment']]
#add overall rating categories (a rating of 3 is neither low nor high)
documentclassifications['overall_rating_low'] = [i <= 2 for i in documentclassifications['overall_rating']]
documentclassifications['overall_rating_high'] = [i >= 4 for i in documentclassifications['overall_rating']]
#preview the document classifications
documentclassifications.head(3)

## Preview Samples from Topic Labels

In [None]:
#preview the topic labels for a sample record
k = 901
documentclassifications.loc[k]

In [None]:
#preview the full text for the sample record
documentclassifications.loc[k]['text']

## Export the Topic Labels to Excel

In [None]:
#export the topic classification data to excel
topic_excel_path = '%s/topics.documentclassification.xlsx' % DATA_DIR
documentclassifications.to_excel(topic_excel_path)

# 7.&nbsp; Model Evaluation

## Visualize the Topics

In [None]:
#create parameters for ldavis
ldavis_params = parameters_for_ldavis(model_main.topic_word_,
                                      model_main.doc_topic_,
                                      dtm_main,
                                      vocab_main)

In [None]:
#plot the distance map
%matplotlib inline
vis = pyLDAvis.prepare(**ldavis_params)
pyLDAvis.enable_notebook(local=True)
pyLDAvis.display(vis)

## Visualize Segmentations

### Overview of Sales Categories

In [None]:
#plot the distribution of reviews across the different sales categories
sns.countplot(data=documentclassifications, y="sale_category")

### Top 5 Products Most Negative Sentiment

In [None]:
#create a table of the top negative sentiment products
df_asin = documentclassifications[['asin','title', 'sentiment']]
#keep only the reviews with negative polarity
df_asin = df_asin[df_asin['sentiment'] < 0]
#mean polarity and number of negative reviews per product
df_asin = df_asin.groupby(['asin','title']).agg(['mean','count']).reset_index()
#flatten the two-level column index ('asin' becomes 'asin_', etc.) and tidy the names
df_asin.columns = ['_'.join(col) for col in df_asin.columns]
df_asin = df_asin.rename(columns={'asin_': 'asin', 'title_': 'title', 'sentiment_count': 'count'})
#require at least 10 negative reviews, then take the 5 products with the most of them
df_asin = df_asin[df_asin['count'] >= 10]
df_asin = df_asin.sort_values(by=['count'], ascending=False)
df_asin = df_asin.head(5)
df_asin

### Top 5 Topics Most Negative Sentiment

In [None]:
#adjust properties to display all text
#None means "no truncation"; the older -1 spelling is deprecated and rejected by current pandas
pd.set_option('display.max_colwidth', None)
print(pd.get_option("display.max_colwidth"))

In [None]:
#focus on a specific product and display negative review text for the product
df_prod_dtl = documentclassifications[documentclassifications['asin'] == 'B007FXKMLW']
#keep reviews with negative polarity, a rating below 3 and a top topic probability of at least 0.3
df_prod_dtl = df_prod_dtl[df_prod_dtl['sentiment'] < 0]
df_prod_dtl = df_prod_dtl[df_prod_dtl['overall_rating'] < 3]
df_prod_dtl = df_prod_dtl[df_prod_dtl['rank_1_prob'] >= 0.3]
#order by topic, most negative review first within each topic
df_prod_dtl = df_prod_dtl.sort_values(by=['rank_1_topic', 'sentiment'], ascending=True).reset_index()
df_prod_dtl = df_prod_dtl[['asin','title', 'sale_category', 'overall_rating', 'sentiment',
                           'rank_1_topic', 'rank_1_prob', 'text']]
df_prod_dtl.head(5)

In [None]:
#Return Properties to default
# undo the display.max_colwidth override so later tables are truncated as usual
pd.reset_option("display.max_colwidth")
print(pd.get_option("display.max_colwidth"))

### Sentiment vs. Category Across Sale Categories

In [None]:
#segment data across 3 groups of the sales category field: Shoes, Watches, everything else
is_shoes = documentclassifications['sale_category'] == 'Shoes'
is_watches = documentclassifications['sale_category'] == 'Watches'
dc_shoes = documentclassifications[is_shoes]
dc_watches = documentclassifications[is_watches]
dc_other = documentclassifications[~(is_shoes | is_watches)]
#display plots for the distribution of sentiment across each overall rating level
f, axes = plt.subplots(1,3, figsize=(12, 4))
min_lim = -1.25; max_lim = 1.25
panels = [
    (dc_shoes, 'Shoes Sales Category'),
    (dc_watches, 'Watches Sales Category'),
    (dc_other, 'Not Shoes & Watches Sales Category'),
]
for ax, (subset, panel_title) in zip(axes, panels):
    sns.violinplot(data=subset, x="overall_rating", y="sentiment", ax=ax)
    # apply the shared y-range after plotting so all three panels are directly comparable
    ax.set(title=panel_title, ylim=(min_lim, max_lim))

### Heatmap of Topics vs. Categories

In [None]:
#aggregate sentiment per (sale_category, rank_1_topic) pair
# One row per pair with the min / mean / max sentiment and the number of reviews.
df_sum_cats = (
    documentclassifications[['sale_category', 'rank_1_topic', 'sentiment']]
    .groupby(['sale_category', 'rank_1_topic'])
    .agg(sentiment_min=('sentiment', 'min'),
         sentiment_mean=('sentiment', 'mean'),
         sentiment_max=('sentiment', 'max'),
         count=('sentiment', 'count'))
    .reset_index()
)
# total reviews per sales category, repeated on every row of that category,
# and each topic's share of its category
category_totals = df_sum_cats.groupby('sale_category')['count'].transform('sum')
df_sum_cats = df_sum_cats.assign(
    sale_category_count=category_totals,
    pct_of_sales_category=df_sum_cats['count'] / category_totals,
)
df_sum_cats.head(3)

In [None]:
#summarize the columns in the table (names, non-null counts and dtypes)
df_sum_cats.info()

In [None]:
#plot a heatmap of what topics relate to what sales categories
# Rows are rank-1 topics, columns are sales categories, and each cell is the share of
# the category's reviews assigned to that topic.
# pivot() is called with keywords because its arguments are keyword-only in pandas 2.x.
df_heat = df_sum_cats.pivot(index="rank_1_topic",
                            columns="sale_category",
                            values="pct_of_sales_category")
f, axes = plt.subplots(1,1, figsize=(12, 11))
sns.heatmap(data=df_heat, cmap="YlGnBu", annot=True, fmt='.2f', ax=axes)

### Explore Extended List of Words for Topics

In [None]:
#create the top topic words for each topic in the model and add to table
from tmtoolkit.topicmod.model_io import ldamodel_top_topic_words

# Request the top 15 words per topic, with rows labelled by topic_labels_main.
# reset_index() moves the row labels into a regular column (filtered as 'topic' below).
# NOTE(review): val_fmt='{lbl}' presumably keeps only the word label in each cell — confirm against the tmtoolkit docs.
top_topic_word = ldamodel_top_topic_words(model_main.topic_word_,
                                          vocab_main,
                                          row_labels=topic_labels_main,
                                          top_n=15,
                                          val_fmt = '{lbl}').reset_index()
# keep only the text before the first space in the rank_1 column
top_topic_word['rank_1'] = [i.split(' ')[0] for i in top_topic_word['rank_1']]

#preview the row for one example topic
top_topic_word[top_topic_word['topic'] == '7_pair_last_year_month']

In [None]:
#sample of the top topic words dataframe: the example topic's row as a plain list,
#skipping the first column (the topic label)
top_topic_word[top_topic_word['topic'] == '7_pair_last_year_month'].values.tolist()[0][1:]

### Negative Sentiment Proportion

In [None]:
# Share of negative-sentiment reviews for each rank-1 topic
# Sum the positive / negative flags per topic and count the reviews per topic.
df_sent_count = (
    documentclassifications[['rank_1_topic', 'positive_sentiment', 'negative_sentiment']]
    .groupby('rank_1_topic')
    .agg(positive_sentiment=('positive_sentiment', 'sum'),
         negative_sentiment=('negative_sentiment', 'sum'),
         total_count=('negative_sentiment', 'count'))
    .reset_index()
)
# fraction of each topic's reviews flagged negative, rounded to 3 decimals
negative_share = df_sent_count['negative_sentiment'] / df_sent_count['total_count']
df_sent_count['pct_neg_sentiment'] = [round(share, 3) for share in negative_share]
# most negative topics first
df_sent_count = df_sent_count.sort_values(by=['pct_neg_sentiment'], ascending=False)
df_sent_count.head(3)

In [None]:
# Average overall rating (and number of reviews) for each rank-1 topic
df_or = (
    documentclassifications[['rank_1_topic', 'overall_rating']]
    .groupby('rank_1_topic')
    .agg(overall_rating_mean=('overall_rating', 'mean'),
         total_count=('overall_rating', 'count'))
    .reset_index()
)
# round the mean to 2 decimals and list the lowest-rated topics first
df_or['overall_rating_mean'] = [round(mean_rating, 2) for mean_rating in df_or['overall_rating_mean']]
df_or = df_or.sort_values(by=['overall_rating_mean'], ascending=True)
df_or.head(3)

In [None]:
# Totals of the low / high overall-rating flags for each rank-1 topic
df_or_cat = (
    documentclassifications[['rank_1_topic', 'overall_rating_low', 'overall_rating_high']]
    .groupby('rank_1_topic')
    .agg(overall_rating_low=('overall_rating_low', 'sum'),
         overall_rating_high=('overall_rating_high', 'sum'))
    .reset_index()
)
# share of low ratings among the reviews flagged either low or high, rounded to 3 decimals
low_share = df_or_cat['overall_rating_low'] / (
    df_or_cat['overall_rating_low'] + df_or_cat['overall_rating_high']
)
df_or_cat['pct_low_rating'] = [round(share, 3) for share in low_share]
# topics with the largest share of low ratings first
df_or_cat = df_or_cat.sort_values(by=['pct_low_rating'], ascending=False)
df_or_cat.head(3)

In [None]:
# base of the topic summary table: sentiment statistics per rank-1 topic plus its top words
df_sum_rank = (
    documentclassifications[['rank_1_topic', 'sentiment']]
    .groupby('rank_1_topic')
    .agg(sentiment_min=('sentiment', 'min'),
         sentiment_mean=('sentiment', 'mean'),
         sentiment_max=('sentiment', 'max'),
         count=('sentiment', 'count'))
    .reset_index()
)
# each topic's share of all reviews (total_count repeats the grand total on every row)
df_sum_rank['total_count'] = df_sum_rank['count'].sum()
df_sum_rank['pct_of_total'] = df_sum_rank['count'] / df_sum_rank['total_count']
# round the numeric columns to 3 decimals
for metric in ('pct_of_total', 'sentiment_min', 'sentiment_mean', 'sentiment_max'):
    df_sum_rank[metric] = [round(value, 3) for value in df_sum_rank[metric]]
# lowest mean sentiment first
df_sum_rank = df_sum_rank.sort_values(by=['sentiment_mean'], ascending=True)
# attach each topic's word list: its row in top_topic_word without the leading topic label
df_sum_rank['top_topic_words'] = [
    top_topic_word[top_topic_word['topic'] == topic_label].values.tolist()[0][1:]
    for topic_label in df_sum_rank['rank_1_topic']
]
# preview data
df_sum_rank.head(3)

In [None]:
#bring the per-topic metric tables together into df_sum_rank
# Each left join on rank_1_topic adds one metric column; the order below fixes the column order.
metric_sources = [
    (df_or_cat, 'pct_low_rating'),
    (df_or, 'overall_rating_mean'),
    (df_sent_count, 'pct_neg_sentiment'),
]
for metric_table, metric_name in metric_sources:
    df_sum_rank = df_sum_rank.merge(metric_table[[metric_name, 'rank_1_topic']],
                                    on='rank_1_topic', how='left')
# these helper columns are not needed for the combined score
df_sum_rank = df_sum_rank.drop(columns=['sentiment_min', 'sentiment_max', 'total_count'])
df_sum_rank.head(3)

In [None]:
#normalize the metrics so that a higher value always means "more negative", then combine them
def _minmax_column(values):
    """Min-max scale a column to the range [0, 1] and return it as a flat array."""
    return MinMaxScaler().fit_transform(np.array(values).reshape(-1, 1)).ravel()

# already "higher = worse": share of negative-sentiment reviews and share of low ratings
df_sum_rank['pct_neg_sentiment_norm'] = _minmax_column(df_sum_rank['pct_neg_sentiment'])
df_sum_rank['pct_low_rating_norm'] = _minmax_column(df_sum_rank['pct_low_rating'])
# "higher = better" metrics are inverted so they point the same way as the two above.
# NOTE: the mean rating was previously left un-inverted, which let well-rated topics
# raise the negative score; topic rankings quoted elsewhere in the notebook may shift.
df_sum_rank['overall_rating_mean_norm'] = 1 - _minmax_column(df_sum_rank['overall_rating_mean'])
df_sum_rank['sentiment_mean_norm'] = 1 - _minmax_column(df_sum_rank['sentiment_mean'])
# combined score: plain average of the four normalized metrics, rounded, then rescaled to [0, 1]
df_sum_rank['neg_view_score'] = (df_sum_rank['pct_neg_sentiment_norm'] + df_sum_rank['overall_rating_mean_norm'] + df_sum_rank['pct_low_rating_norm'] + df_sum_rank['sentiment_mean_norm']) / 4
df_sum_rank['neg_view_score'] = [round(i,3) for i in df_sum_rank['neg_view_score']]
df_sum_rank['neg_view_score'] = _minmax_column(df_sum_rank['neg_view_score'])
# most negative topics first; reset_index() gives the rows a fresh 0..n-1 index
df_sum_rank = df_sum_rank.sort_values(by=['neg_view_score'], ascending=False).reset_index()
# "bottom" = the 5 most negatively viewed topics, "top" = the 5 least negatively viewed
bottom_5_topics = df_sum_rank['rank_1_topic'].head(5)
top_5_topics = df_sum_rank['rank_1_topic'].tail(5)
df_sum_rank.head(1)

In [None]:
# plot the distribution of topic tags by negative review score metric
sns.set(color_codes=True)
sns.set(style="white", palette="muted")
# histogram in percent of topics, bins 0.1 wide, with a KDE curve overlaid
sns.histplot(df_sum_rank['neg_view_score'],
             kde=True,
             stat="percent",
             binwidth = 0.1)

In [None]:
#display the topic word list for an example topic in the model
print(df_sum_rank[df_sum_rank['rank_1_topic'] == '2_great_look_awesome_fit']['top_topic_words'])

## Investigate: Shoe Sizing

### Select a Problem

Based on the sorted list, we can view the bottom 5 topics for Nike. For some topics (like 30_return_send_seller_item and 5_review_try_bad_problem) there is little Nike can do directly to resolve the problem. These topics relate to the buying experience: the customer had general issues and needed to return the item, or bought an item that was already poorly reviewed. Both are issues Nike can work with its sales partner (Amazon) to resolve.

For this investigation, we will focus on the topic **23_size_small_half_large** to better understand its scope and how we can improve the product or the customer experience.

In [None]:
#display the bottom 5 topics (the five rank-1 topics with the highest neg_view_score)
bottom_5_topics

### Focus on the Data

Using the enriched data, focus on the specific topic, and limit records for instances where the topic probability is above 30% to ensure the review is more likely to be talking about the topic.

In [None]:
#pick the topic to investigate and collect its higher-confidence reviews
# the entry labelled 1 in bottom_5_topics is the topic chosen for the deep dive
target = bottom_5_topics[1]
print('the target for analysis will be:', target)
focus_columns = ['rank_1_topic', 'rank_1_prob', 'sentiment', 'asin', 'sale_category', 'title', 'text']
# keep reviews whose rank-1 topic is the target with a topic probability of at least 0.3
on_target = documentclassifications['rank_1_topic'] == target
confident = documentclassifications['rank_1_prob'] >= 0.3
df_focus_prblm = documentclassifications.loc[on_target & confident, focus_columns]

print('collected', len(df_focus_prblm), 'reviews for analysis of', target)

To improve focus of the analysis, we can focus on a single sales category, shoes, that dominates the records in this topic.

In [None]:
#plot the sales categories to see if there is a main product
# (one horizontal bar per sale_category among the reviews selected for the target topic)
sns.countplot(data=df_focus_prblm, y="sale_category")

In [None]:
#filter problem category to only focus on shoes
# (overwrites df_focus_prblm, so re-running this cell keeps the Shoes-only subset)
df_focus_prblm = df_focus_prblm[df_focus_prblm['sale_category'] == 'Shoes']

### Review the Top Products in the Topic

In [None]:
#per-product summary of the reviews in the problem topic
# One row per product title with its mean sentiment and review count, most-reviewed first.
# The second reset_index() keeps the pre-sort row number as an 'index' column.
df_prod_count = (
    df_focus_prblm[['title', 'sentiment']]
    .groupby('title')
    .agg(sentiment_mean=('sentiment', 'mean'), count=('sentiment', 'count'))
    .reset_index()
    .sort_values(by=['count'], ascending=False)
    .reset_index()
)
# tidy for display: 3-decimal sentiment and titles cut to their first 50 characters
df_prod_count['sentiment_mean'] = [round(mean_score, 3) for mean_score in df_prod_count['sentiment_mean']]
df_prod_count['title'] = df_prod_count['title'].str[:50]
print('there are', len(df_prod_count), 'unique products that are part of this category')

In [None]:
#products with a count above the threshold for the topic (at least 5 reviews)
print('some of the top products:')
df_prod_count = df_prod_count[df_prod_count['count'] >= 5]
df_prod_count

In [None]:
#Most common products in the topic
# set the plotting context before the figure is created so the font scaling applies to it
sns.set_context('paper', font_scale = 0.9)
f, ax = plt.subplots(1,1, figsize=(6, 6))
# horizontal bars for the 10 most-reviewed products (df_prod_count is sorted by count)
ax = sns.barplot(data=df_prod_count.head(10), y='title', x = 'count', width=.8, ax=ax)
# render the chart; the previous plt.figure() call only opened an extra, empty figure
plt.show()

### What Are People Saying

In [None]:
#best 10 reviews for topic (highest sentiment first; reset_index keeps the original row label as a column)
df_focus_prblm.sort_values(by=['sentiment'], ascending=False).reset_index().head(10)

In [None]:
#worst 10 reviews for topic (lowest sentiment first; reset_index keeps the original row label as a column)
df_focus_prblm.sort_values(by=['sentiment'], ascending=True).reset_index().head(10)

In [None]:
#lookup where reviews talk about battery issues
#print('Results:')
#lookup = documentclassifications[documentclassifications['text'].str.contains('battery')]
#lookup.head(5)

### Generate Topic Wordcloud

In [None]:
# some options for wordcloud output
img_w = 400   # image width
img_h = 300   # image height

# build the word clouds from the model's topic-word matrix, using the top 20 words
# per topic and labelling them with topic_labels_main
topic_clouds = generate_wordclouds_for_topic_words(
    model_main.topic_word_, vocab_main,
    top_n=20, topic_labels=topic_labels_main,
    width=img_w, height=img_h
)

# list the keys of the generated word clouds (presumably the topic labels)
topic_clouds.keys()

In [None]:
#display wordcloud for a topic (looked up by its topic label)
topic_clouds['23_size_small_half_large']

# 8.&nbsp; Results

**Model**

The final topic model produced 30 topics that clustered the Nike brand reviews into groups. After completing a sentiment analysis, the following were the top 5 most positive and most negative clusters.

|Rank| |Positive| |Negative|
|----|--|--------|--|--------|
| 1  | |21_compliment_lot_stylish_get| |30_return_send_seller_item|
| 2  | |25_light_weight_training_workout| |23_size_small_half_large|
| 3  | |19_fast_delivery_arrive_condition| |4_toe_narrow_lace_bit|
| 4  | |29_color_love_bright_super| |5_review_try_bad_problem |
| 5  | |2_great_look_awesome_fit| |7_pair_last_year_month|


In [None]:
#bottom_5_topics

In [None]:
#top_5_topics

We completed a deep dive into the topic 23_size_small_half_large. Overall, customers were mostly talking about the fit of shoes in this category: they either (1) knew to size up a half size larger or (2) received their shoes and wished they had sized up a half size larger. We were able to identify some additional insights for Nike that group into 3 key categories:
- Experienced Customers
- Customer Experience and Support
- Product Opportunity

**Experienced Customers**

Generally speaking, customers who knew the Nike brand, knew to size-up when buying shoes. While they talked about this topic, they tended to order the correct size, have a more positive sentiment in their review, and provide a positive rating (4 or 5). What can be taken away from this is that the Nike brand can be challenging for new customers, especially when shopping online. Perhaps marketing spending for new customers should be focused more on getting that new customer into a store to get their initial brand experience.

**Customer Experience and Support**

Customers who order their "correct" shoe size and get shoes that don't fit are frustrated. They generally provide a negative rating (1 or 2) and have a low or negative sentiment. This is not a good customer experience, but based on the experienced customer reviews, this appears to be preventable. Nike should invest in text, images, apps, etc. that can better support customers in selecting the best shoe size. If they can get a customer to order the correct size, they are more likely to have a happy customer. As seen by both positive and negative reviews being grouped together, the topic isn't inherently negative. Customers are OK with the fact that the product runs small; they are simply frustrated and angry when their knowledge and expectations don't match what they receive. A solution to support customers can go a long way here.

**Product Opportunity**

By focusing on the products (specific shoes) within this topic, we can identify the top 5 shoes that seem to have fit (product) sizing issues. By using the below list, Nike can focus on specific products and make changes in their design to better address the sizing of the product in future iterations.

In [None]:
# show the first 5 rows of the per-product table built for the target topic
print('top 5 products to address:')
df_prod_count.head(5)

# Conclusion

In this notebook, we successfully performed topic modeling on Amazon reviews related to Nike products using Latent Dirichlet Allocation (LDA). The key steps included:

**Data Loading and Preprocessing**

We efficiently loaded and preprocessed the text data by cleaning, tokenizing, removing stopwords, and lemmatizing the reviews. This helped to standardize the text and make it ready for topic modeling.

**Topic Modeling with LDA**

We used the LDA algorithm to discover hidden topics within the review data. The model was trained using a set number of topics, and key parameters like n_topics and max_iter were tuned to ensure better performance.

**Model Evaluation**

The model's performance was evaluated using log-likelihood and perplexity, providing insights into the coherence of topics generated by the LDA model.

**Topic Visualization**

We visualized the topics using both textual lists of top words and word clouds, allowing for a more intuitive understanding of the main themes present in the reviews.

**Key Insights**

- The topics generated by the LDA model give us a clearer picture of the main concerns and points of interest from customer reviews, ranging from product quality to customer satisfaction.
- By tuning the LDA model and evaluating its performance, we ensured that the generated topics were coherent and relevant, providing valuable insights to the Nike brand regarding customer feedback.
- This topic modeling approach can be extended to other product categories, allowing businesses to automatically uncover important themes in large-scale review data, thereby enabling better decision-making based on customer feedback.

# 9.&nbsp; References

* [1] Latent Dirichlet allocation, Wikipedia, (https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation)
* [2] Text Preprocessing and Basic Text Mining, https://tmtoolkit.readthedocs.io/en/latest/preprocessing.html#Lemmatization-and-token-normalization
* [3] TextBlob Tutorial: Quickstart, https://textblob.readthedocs.io/en/dev/quickstart.html#sentiment-analysis

