### Hierarchical Dirichlet Process Analysis of Yoga Reviews

Here we apply the following procedure:
 1. Concatenate the reviews by yoga business, making sure there are no duplicate reviews for a given business.
 2. Convert to lower case, remove accents, and tokenize, retaining only tokens with alphabetical characters.
 3. Remove stop words and proper nouns.
 4. Stem.
 5. Create a corpus dictionary: (integer word ID, word, word frequency in corpus).
 6. Remove tokens that appear too often or not often enough.
 7. Convert each concatenated studio review into bag-of-words format: a list of (token ID, token count) 2-tuples.
 8. Apply tf-idf transformation to corpus.
 9. Apply Hierarchical Dirichlet Process algorithm to corpus.
 10. Look at the resulting topics.

In [1]:
'''
First get the packages we'll need.
'''
from   pymongo import MongoClient
import logging
import nltk
from   gensim import corpora, models, similarities, matutils, utils
from   collections import defaultdict
from   pprint import pprint
import re
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Show INFO-level (and higher) log records as "timestamp : level : message".
# Presumably enabled so that gensim's progress messages appear in the
# notebook output -- confirm.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
'''
Set region: NYC or LA.
'''
# Only the exact string "NYC" selects the New York data and file names;
# the other cells of this notebook treat every other value as Los Angeles.
region = "NYC"

In [3]:
'''
Make a list of the reviews we'll be analyzing, concatenating by business.

Produces three parallel lists, one entry per business that has at least
one non-empty review:
    studio_names   : "<biz_name> [at] <biz_address>"
    studio_reviews : all unique reviews of the business joined into one string
    studio_ratings : the business rating stored in the record
'''

client = MongoClient()
if region == "NYC":
    yoga = client.dsbc.yyrnyc
    print('Opening NYC database...')
else:
    yoga = client.dsbc.yyrla
    print('Opening LA database...')

# NOTE(review): Collection.count() is deprecated in recent PyMongo releases
# (and removed in PyMongo 4); switch to yoga.count_documents({}) when the
# driver is upgraded.
print('Total number of Yoga businesses = %i' %yoga.count())

cursor          = yoga.find()
studio_names    = []
studio_reviews  = []
studio_ratings  = []
for record in cursor:
    # Keep the non-empty reviews, dropping duplicates within this studio
    # (different studios may still "share" a review).  The first occurrence
    # of each review is kept in its stored order, so the concatenated text
    # is the same on every run (iterating over a plain set is not ordered).
    seen     = set()
    ureviews = []
    for review in record["usr_text"]:
        if review and review not in seen:
            seen.add(review)
            ureviews.append(review)

    # Concatenate the unique reviews by business; every review is preceded
    # by a single space, exactly as before.
    con_review = "".join(" " + review for review in ureviews)

    # Businesses without any review text are skipped.
    if con_review:
        studio = record["biz_name"]+" [at] "+record["biz_address"]
        studio_names.append(studio)
        studio_reviews.append(con_review)
        studio_ratings.append(record["biz_rating"])

print('Number of reviewed Yoga businesses = %i' %len(studio_reviews))

Opening NYC database...
Total number of Yoga businesses = 796
Number of reviewed Yoga businesses = 550


In [4]:
'''
Use this cell to search through reviews.

Prints the total number of occurrences of `item` over all concatenated
studio reviews, then prints the first review containing `item` whose
1-based position in studio_reviews is greater than `start_after`.
'''
# Search string.  Reviews are lower-cased before matching, so `item` must be
# given in lower case to match anything.
item        = "tsahi "
# Skip matches among the first `start_after` studios (set to 0 to search
# from the beginning).
start_after = 127

# Count the occurrences once per studio and reuse the counts below.
counts = [review.lower().count(item) for review in studio_reviews]
print("Total number of occurrences = %i" % sum(counts))
print(" ")

for num, (review, icount) in enumerate(zip(studio_reviews, counts), 1):
    if icount > 0 and num > start_after:
        print("Number of occurrences in review %i = %i" % (num,icount))
        print(" ")
        print('%i: %s' %(num,review))
        break

Total number of occurrences = 10
 


In [5]:
'''
Convert to lower case, remove accents, tokenize, remove stop words,
remove proper nouns, and stem.
Then make a dictionary to map stems into words (one word per stem).

Results:
    studio_reviews_1  : tokens per studio (lower-cased, de-accented)
    studio_reviews_2  : studio_reviews_1 without stop words
    studio_reviews_2a : studio_reviews_2 without the listed proper nouns
    studio_reviews_3  : Porter stems of studio_reviews_2a
    stem_to_word      : stem -> one of the words that produced it
'''

# Get list of stopwords.
stoplist = nltk.corpus.stopwords.words('english')
# NOTE(review): the tokenizer is described above as keeping alphabetical
# tokens only, so the entries containing an apostrophe presumably never
# match; they are kept unchanged.
stoplist.extend([
    u'\u0027s',   # "'s" as in "he's"
    u'n\u0027t',  # "n't" as in "he hasn't"
    u'\u0027m',   # "'m" as in "I'm"
    u'ya',        # as in "you"
    u'\u0027ve',  # "'ve" as in "I've"
    u'also',
    u've',
    u'm',
])
# Frozen copy for O(1) membership tests while filtering.
stop_words = frozenset(stoplist)

# Convert to lower case, remove accents, and tokenize (removing punctuation and numbers).
studio_reviews_1 = [list(utils.tokenize(studio_review,lowercase=True,deacc=True)) for studio_review in studio_reviews]

# Remove stop words.
studio_reviews_2 = [[word for word in studio_review if word not in stop_words] for studio_review in studio_reviews_1]

# Remove proper nouns.
ppn = ["aaron", "aarona", "abigail", "adam", "adelaide", "alice", "alicia", "amalia", "amanda", "andrea", 
       "angela", "angie", "anna", "annie", "anya", "ariel", "ash", "ashley", "audra", 
       "becker", "becky", "belle", "beverly", "bijorn", "bjorn", "brandon", "brian", 
       "caprice", "cara", "carla", "carlos", "carolyn", "cathy", "charlotte", "chris", 
       "christine", "claire", "connie", "corey", "courtney", 
       "dalton", "daniela", "davey", "david", "deborah", "deena", "diane", "dina", "dr", 
       "eddie", "edwin", "elaine", "ellen", "emily", "eric", "erica", "erik", "erika", "erin", "ezmy", 
       "fergus", 
       "gabriella", "gavin", "geralyn", "ghylian", "gina", "glenda", 
       "hannah", "heather", "heidy", "henry", "hermann", "hsiao", "hunt", 
       "ikaika", "ingrid", "ivette", 
       "jacqui", "jahaira", "james", "jane", "janet", "jen", "jeni", "jenni", "jennie", "jennifer", "jenny", 
       "jess", "jesse", "jessica", "jill", "jillian", "jim", "joe", "joetta", "jose", 
       "joy", "joyce", "jq", "judy", "julia", "juliana", "julie", 
       "kalie", "kallie", "karen", "kathleen", "katie", "kaurwar", "ken", "kerri", "kerry", 
       "lalita", "lani", "lara", "lauren", "laurie", "liliana", "lindsay", "lindsey", 
       "lisa", "liz", "lori", "luisa", "lynn", 
       "madalina", "maggie", "malaika", "mandy", "marco", "margaret", "marja", "mark", "martha", "masako", "mayuri", 
       "meagan", "megan", "melissa", "melody", "meriany", "merilynn", "mia", 
       "michael", "michelle", "mike", "mimi", "mollie", "molly", "monica", "monika", "morgan", 
       "namgyal", "naomi", "narisara", "nathaniel", "nick", "nicola", "nicole", "nikki", "novak",
       "paula", "pauline", "politeia", 
       "rachel", "rafael", "ramit", "rebeca", "rebecca", "rob", "roger", "rosie", "ruthie", "ryan", 
       "sandhya", "santoshi", "sara", "sarah", "shelly", "sherica", "sheryl", "sonja", "spencer", "stacey", "stacy", 
       "stephan", "stephanie", "stephaine", "steve", "sue", "susan", "suzanne", "suzi", "suzie", 
       "tzaki", "tsewang", "wayne", "wesley", "zander" ]
# Frozen copy for O(1) membership tests while filtering.
proper_nouns = frozenset(ppn)
studio_reviews_2a = [[word for word in studio_review if word not in proper_nouns] for studio_review in studio_reviews_2]

# Stem, and at the same time record which word each stem came from.
# Several different words can share one stem; stem_to_word keeps only the
# last such word encountered, which is enough for readable topic print-outs.
# Each distinct word is stemmed once and the result is cached, because the
# same words recur many times across reviews.
#stemmer          = nltk.stem.snowball.SnowballStemmer("english")
stemmer          = nltk.stem.porter.PorterStemmer()
stem_cache       = {}
stem_to_word     = defaultdict(str)   # unknown stems map to ""
studio_reviews_3 = []
for studio_review in studio_reviews_2a:
    stems = []
    for word in studio_review:
        word_stem = stem_cache.get(word)
        if word_stem is None:
            word_stem = stemmer.stem(word)
            stem_cache[word] = word_stem
        stem_to_word[word_stem] = word
        stems.append(word_stem)
    studio_reviews_3.append(stems)

#pprint(studio_reviews_3[0])

In [6]:
'''
Build the corpus dictionary from the stemmed reviews, or reload a
previously saved one from disk.

With make_dict set, the dictionary is created from studio_reviews_3,
filtered with filter_extremes (no_below=1, no_above=0.7, keep_n=None) and
written to disk both in gensim's native format and as text.  Otherwise the
native-format file is loaded.  Afterwards a few stem -> word examples and
the dictionary size are printed.
'''

make_dict = True

# File names depend on the region; anything other than "NYC" means LA.
fname1, fname2 = (
    ("yoga_studios_nyc.dict", "yoga_studios_nyc_txt.dict")
    if region == "NYC"
    else ("yoga_studios_la.dict", "yoga_studios_la_txt.dict")
)

if not make_dict:
    dictionary = corpora.Dictionary.load(fname1)
else:
    dictionary = corpora.Dictionary(studio_reviews_3)
    dictionary.filter_extremes(no_below=1, no_above=0.7, keep_n=None)
    dictionary.save(fname1)
    dictionary.save_as_text(fname2, sort_by_word=False)

print(dictionary)
print("")
# Show ten sample entries (token IDs 10..19) with a word for each stem.
for token_id in range(10, 20):
    stem = dictionary[token_id]
    print('Stem "%s" maps to word "%s"' % (stem, stem_to_word[stem]))
print("")
print('Length of dictionary = %i' % len(dictionary))

Dictionary(15375 unique tokens: [u'fawn', u'foodi', u'childern', u'yellow', u'interchang']...)

Stem "acroyoga" maps to word "acroyoga"
Stem "fanci" maps to word "fancy"
Stem "enchantingli" maps to word "enchantingly"
Stem "scold" maps to word "scolded"
Stem "timberlak" maps to word "timberlake"
Stem "starsi" maps to word "starsi"
Stem "tuckaho" maps to word "tuckahoe"
Stem "lord" maps to word "lord"
Stem "starse" maps to word "starseed"
Stem "desensit" maps to word "desensitize"

Length of dictionary = 15375


In [7]:
'''
Turn the stemmed reviews into a bag-of-words corpus and store it on disk,
or reuse the corpus file written by an earlier run.  In both cases
corpus_bow ends up as the corpus read back from the Matrix Market file.
'''
make_bow = True

# Region-specific corpus file; anything other than "NYC" means LA.
fname3 = "yoga_studios_nyc.mm" if region == "NYC" else "yoga_studios_la.mm"

if make_bow:
    # One bag of words per studio: a list of (token ID, token count) 2-tuples.
    bags_of_words = []
    for studio_review in studio_reviews_3:
        bags_of_words.append(dictionary.doc2bow(studio_review))

    # Write the corpus in Matrix Market format (a text format).
    corpora.MmCorpus.serialize(fname3, bags_of_words)

# Always work with the on-disk version of the corpus.
corpus_bow = corpora.MmCorpus(fname3)

print(corpus_bow)
#print(corpus_bow[0]) # retrieving first document (for example)

MmCorpus(550 documents, 15375 features, 220500 non-zero entries)


In [8]:
'''
Apply tf-idf transformation to corpus: Compute document frequencies of all the features.
'''
# Fit the tf-idf model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus_bow)
# Indexing the model with a corpus gives the tf-idf weighted version of that
# corpus (presumably evaluated lazily while iterating -- confirm).
# NOTE(review): the HDP cell in this notebook is trained on corpus_bow,
# not on corpus_tfidf.
corpus_tfidf = tfidf[corpus_bow]

In [35]:
'''
Apply Hierarchical Dirichlet Process analysis to the corpus.
The following are arguments to HdpModel, with a brief description:
    gamma:      [1]    first level concentration
    alpha:      [1]    second level concentration
    eta:        [0.01] the topic Dirichlet
    T:          [150]  top level truncation level
    K:          [15]   second level truncation level
    kappa:      [1.0]  learning rate
    tau:        [64.0] slow down parameter
    max_time:   [None] stop training after this many seconds
    max_chunks: [None] stop after having processed this many chunks (wrap around
                       corpus beginning in another corpus pass, if there are not 
                       enough chunks in the corpus)
    chunksize:  [256]  Training proceeds in chunks of `chunksize` documents at a time. 
                       The size of chunksize is a tradeoff between increased speed 
                       (bigger chunksize) and lower memory footprint (smaller chunksize). 
'''
# Region-specific path the trained model is saved to / loaded from;
# anything other than "NYC" means LA.
if region == "NYC":
    hdp_file = 'results/yoga_studios_nyc_hdp'
else:
    hdp_file = 'results/yoga_studios_la_hdp'
    
# True: train a new model and save it to hdp_file.  False: load hdp_file.
make_hdp = True
if make_hdp:
    # The model is trained on the raw bag-of-words corpus (corpus_bow).
    # kappa=0.1 and eta=0.05 differ from the values listed in brackets in the
    # docstring above (1.0 and 0.01); the other arguments passed here match.
    # %time is an IPython magic, so this cell only runs under IPython/Jupyter.
    %time hdp = models.HdpModel( corpus_bow, id2word=dictionary, \
                                 alpha=1.0, gamma=1.0, T=150, K=15, kappa=0.1, eta=0.05 )
    # NOTE(review): nothing in this cell creates the 'results' directory;
    # presumably it must already exist for save() to succeed -- confirm.
    hdp.save(hdp_file)
else:
    hdp = models.HdpModel.load(hdp_file)

CPU times: user 6.49 s, sys: 639 ms, total: 7.13 s
Wall time: 6.5 s


In [36]:
'''
Check topics found by HDP.

Topics are listed by decreasing weight (the alpha returned by
hdp.hdp_to_lda()).  At most `topics_to_print` topics are shown, the listing
stops at the first topic whose weight is not above 0.001, and each topic is
described by its first `words_per_topic` terms.  Stems are replaced by a
representative word via stem_to_word.
'''

topics_to_print = 150
words_per_topic = 5

alpha,beta = hdp.hdp_to_lda()
print('Number of topics = %i, Sum of topic probabilities = %f' % (len(alpha),sum(alpha)))
print("")
# (topic index, topic weight) pairs, heaviest topic first.
sorted_topics = sorted(enumerate(alpha), key=lambda pair: -pair[1])
topics        = hdp.show_topics( -1, formatted=False )
# Slicing (instead of indexing with range(topics_to_print)) keeps the loop
# safe when the model holds fewer than topics_to_print topics.
for topic_num, (topic_index, topic_weight) in enumerate(sorted_topics[:topics_to_print]):
    # Weights are sorted, so every remaining topic is negligible as well.
    if topic_weight <= 0.001:
        break
    terms = []
    for ind, (word, weight) in enumerate(topics[topic_index][1]):
        # Stop as soon as enough terms were collected.
        if ind >= words_per_topic:
            break
        terms.append("{:.5f}".format(weight) + "*" + stem_to_word[word])
    print("Topic {0:03d} ({1:.3f}) : ".format(topic_num, topic_weight) + " + ".join(terms))

Number of topics = 150, Sum of topic probabilities = 1.000000

Topic 000 (0.222) : 0.01088*instructors + 0.00970*place + 0.00915*like + 0.00893*teachers + 0.00763*practically
Topic 001 (0.202) : 0.00979*place + 0.00836*instructors + 0.00798*teachers + 0.00774*like + 0.00605*work
Topic 002 (0.198) : 0.01594*gym + 0.00816*like + 0.00703*work + 0.00661*rooms + 0.00656*instructors
Topic 003 (0.189) : 0.01063*instructors + 0.01005*place + 0.00798*like + 0.00720*practically + 0.00709*teachers
Topic 004 (0.039) : 0.01220*place + 0.01214*yogi + 0.01113*hapi + 0.00820*teachers + 0.00769*never
Topic 005 (0.021) : 0.00981*teachers + 0.00884*instructors + 0.00785*like + 0.00739*place + 0.00631*practically
Topic 006 (0.020) : 0.00807*practically + 0.00770*instructors + 0.00675*teachers + 0.00659*like + 0.00634*place
Topic 007 (0.016) : 0.00955*like + 0.00845*instructors + 0.00788*teachers + 0.00760*place + 0.00693*rooms
Topic 008 (0.014) : 0.00855*teachers + 0.00833*like + 0.00726*instructors + 0.0

In the New York City corpus:

* alma = Nueva Alma studio
* bonda = Bonda Yoga Studio
* daya = Daya Yoga Studio
* elahi = Elahi Yoga in the UES
* hys = Harlem Yoga Studio
* ikm = International Krav Maga
* joschi = Joschi Body Bodega
* krav maga = self-defense system developed for the military in Israel
* mrg = MRG fitness studio in Staten Island
* tenafly = borough in Bergen County, New Jersey
* vdy = Brooklyn Vindhya Yoga
* yith = Yoga in the (Jersey City) Heights