In [1]:
'''
Fetch NYC or LA yoga businesses from Mongo database and store the following information 
in local lists:

studio_names:     Studio name
studio_addresses: Street address
studio_websites:  Website URL
studio_reviews:   Concatenation of all unique (de-duplicated) reviews
studio_ratings:   Yelp rating of business

These lists will be used in subsequent cells to create a json file that includes,
for each yoga business, a street address, longitude and latitude coordinates, 
a web address, and a topic vector.
'''
from   pymongo import MongoClient

# Which collection to read: "NYC" selects dsbc.yyrnyc, any other value
# selects dsbc.yyrla.
region = "LA"

client = MongoClient()
if region == "NYC":
    yoga = client.dsbc.yyrnyc
else:
    yoga = client.dsbc.yyrla

# Parallel lists: index i in every list refers to the same business record.
studio_names     = []
studio_addresses = []
studio_websites  = []
studio_reviews   = []
studio_ratings   = []
nba              = 0   # businesses with a non-empty street address
nbr              = 0   # businesses with at least one non-empty review
cursor           = yoga.find()
for record in cursor:
    # Keep only the non-empty review texts.
    reviews = [review for review in record["usr_text"] if review]

    # Eliminate duplicate reviews for a given studio
    # (different studios may still "share" a review).
    # Set iteration order is arbitrary, so the order in which the reviews
    # are concatenated below is not guaranteed.
    ureviews = list(set(reviews))

    # Concatenate the unique reviews by studio, each preceded by one space.
    con_review = "".join(" " + review for review in ureviews)

    studio_names.append(record["biz_name"])
    studio_addresses.append(record["biz_address"])
    studio_websites.append(record["biz_website"])
    studio_reviews.append(con_review)
    studio_ratings.append(record["biz_rating"])

    if con_review != "":
        nbr += 1
    if record["biz_address"] != "":
        nba += 1

# Total number of businesses = number of records returned by the unfiltered
# find().  Counting what was actually read avoids Collection.count(), which
# newer PyMongo releases no longer provide, and saves a second round trip.
nbt = len(studio_names)

print('Total number of Yoga businesses =                    %i' % nbt)
print('Number of Yoga businesses with a street address =    %i' % nba)
print('Number of Yoga businesses with at least one review = %i' % nbr)

Total number of Yoga businesses =                    749
Number of Yoga businesses with a street address =    736
Number of Yoga businesses with at least one review = 749


In [2]:
'''
Here we define some interesting topics to look for in the yoga business reviews:
Each topic consists of a list of relevant and semantically related keywords.

For each topic and for each yoga business, we count the number of times that 
a topic keyword is mentioned in a review of the business or in its name.

For each topic we compute the median number of mentions over all businesses
(excluding businesses that have zero mention of the topic).

Finally, for each business we create a topic vector, which contains 1's at 
topic locations for which the number of mentions equals or exceeds the median,
and 0's at all other locations.
'''

# Define a function that returns the median of a list of numbers.
import numpy
def stats_median(lst):
    """Return the median of the numbers in ``lst``, computed by ``numpy.median``."""
    values = numpy.asarray(lst)
    return numpy.median(values)

# Define topics of interest.  Each topic is represented by a list of relevant words.
# The first keyword of each list is the one used to label the topic in the
# printed median report.  Keywords are looked up with str.count on lower-cased
# text, i.e. as plain substrings, which is why some entries are written as word
# stems (e.g. "challeng", "accessor").
topics = [["vinyasa"], ["anusara"], ["ashtanga"], ["jivamukti"], ["hatha"], 
          ["iyengar"], ["kripalu"], ["bikram", "hot yoga"], ["sivananda"],
          ["viniyoga"], ["yogafit"], ["kundalini"], ["yoga nidra"], 
          ["prenatal", "pregnant", "pregnancy"], 
          ["exertion", "exerting", "workout", "cardio", "challeng", "sweat", "strenuous"],
          ["meditation", "meditate", "relax", "de-stress", "chant", "heal", "mind", "om",
           "aum", "feel", "centered", "bliss", "centering", "calm", "restorative", 
           "nurturing", "nourishing", "spiritual", "peace", "silence", "serene"],
          ["well managed", "beautiful", "terrific", "cool", "fun", "clean", "lovely", 
           "wonderful", "fabulous", "awesome", "spacious", "outstanding"],
          ["competent", "kind", "friendly", "respectful", "welcoming", "warm", 
           "knowledgeable", "supportive", "informative", "caring", "solid", "vigilant", 
           "experienced", "compassionate", "helpful", "enthusiastic", "seasoned", 
           "skillful", "insightful", "professional"],
          ["beginner", "intermediate", "advanced", "mixed level", "basics"],
          ["equipment", "blanket", "block", "band", "belt", "prop", "mat", "strap", 
           "accessor"],
          ["parking"], ["shower"], 
          ["discount", "package", "donation", "bargain", "cheap", "affordable", "deal"],
          ["correction", "assist", "demo", "personal", "one on one", "personalized attention",
           "personal attention", "individual attention", "hands-on", "adjustment"],
          ["community", "togetherness", "partner work", "partnering", "family"],
          ["core", "balance", "alignment", "flexibility", "strength", "stretch", "breath",
           "asana", "pranayama", "abs", "legs", "posture", "headstand", "handstand"],
          ["music"],
          ["crowded", "smelly", "packed", "full"],
          ["lululemon", "apparel"]]
# Single-argument call form of print: prints the same text whether print is a
# statement (Python 2) or a function (Python 3).
print("Number of topics = %i" % len(topics))
print("")

# Step 1: for every business, count how often the keywords of each topic occur
# in its lower-cased concatenated reviews plus its lower-cased name.
# NOTE(review): str.count matches substrings, so short keywords such as "om",
# "mat" or "abs" are also counted inside longer words ("room", "format",
# "absolutely").  Presumably intended for the stem-like keywords; confirm
# whether it is acceptable for the short ones.
review_topic_vector = []
for review,bname in zip(studio_reviews,studio_names):
    review = review.lower()
    bname  = bname.lower()
    topic_vector = [sum(review.count(word) + bname.count(word) for word in topic)
                    for topic in topics]
    review_topic_vector.append(topic_vector)

# Step 2: per topic, take the median mention count over the businesses that
# mention the topic at least once.
# Sentinel median for a topic that no business mentions; it is far larger than
# any realistic count, so the ">=" test in step 3 never switches that topic on.
NO_MENTION_MEDIAN = 999999.0

medians = []
for i,topic in enumerate(topics):
    select_vector = [topic_vector[i] for topic_vector in review_topic_vector
                     if topic_vector[i] > 0]
    if select_vector:
        topic_median = stats_median(select_vector)
    else:
        topic_median = NO_MENTION_MEDIAN
    medians.append(topic_median)
    # Single-argument call form of print: same output under Python 2 and 3.
    print('For topic "%s", median frequency = %f' % (topic[0],topic_median))

# Step 3: binarise each count vector: 1 where the count equals or exceeds the
# topic median, 0 elsewhere.
binary_topic_vector = []
for topic_vector in review_topic_vector:
    binary_vector = [1 if freq >= medians[i] else 0
                     for i,freq in enumerate(topic_vector)]
    binary_topic_vector.append(binary_vector)
        

Number of topics = 29

For topic "vinyasa", median frequency = 2.000000
For topic "anusara", median frequency = 2.000000
For topic "ashtanga", median frequency = 1.000000
For topic "jivamukti", median frequency = 2.000000
For topic "hatha", median frequency = 2.000000
For topic "iyengar", median frequency = 1.000000
For topic "kripalu", median frequency = 1.000000
For topic "bikram", median frequency = 2.000000
For topic "sivananda", median frequency = 3.000000
For topic "viniyoga", median frequency = 1.000000
For topic "yogafit", median frequency = 1.500000
For topic "kundalini", median frequency = 2.000000
For topic "yoga nidra", median frequency = 1.000000
For topic "prenatal", median frequency = 2.000000
For topic "exertion", median frequency = 12.000000
For topic "meditation", median frequency = 48.000000
For topic "well managed", median frequency = 12.000000
For topic "competent", median frequency = 13.500000
For topic "beginner", median frequency = 3.000000
For topic "equipment"

In [3]:
'''
Here we check our previous calculations for one particular yoga business.
'''
# Lower-case name fragment of the business to inspect for the selected region.
if region == "NYC":
    studio = "community vinyasa yoga"
else:
    studio = "shaolin temple"
# Single-argument call form of print: same output under Python 2 and 3.
print(medians)
print("")
for i,studio_name in enumerate(studio_names):
    # Case-insensitive substring match on the business name.
    if studio in studio_name.lower():
        # First line: raw mention counts per topic; second line: binary vector.
        print('"%s": %s' % (studio_name,review_topic_vector[i]))
        print('"%s": %s' % (studio_name,binary_topic_vector[i]))
        print("")

[2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 3.0, 1.0, 1.5, 2.0, 1.0, 2.0, 12.0, 48.0, 12.0, 13.5, 3.0, 8.0, 8.0, 3.0, 6.0, 5.0, 3.0, 9.0, 3.0, 5.0, 1.0]

"Shaolin Temple": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 17, 5, 2, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]
"Shaolin Temple": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]



In [4]:
'''
Here we check the previous calculations on a particular topic keyword.
'''
topic_word = "apparel"

# Locate the topic whose keyword list contains topic_word.  The for/else makes
# an unknown keyword fail loudly here; otherwise topic_index would be unbound
# on a fresh kernel, or silently keep the value from an earlier run of this cell.
for i,topic in enumerate(topics):
    if topic_word in topic:
        topic_index = i
        break
else:
    raise ValueError('Keyword "%s" is not listed in any topic' % topic_word)

# List every business with at least one mention of that topic.
nstudios = 0
for studio_name,topic_vector in zip(studio_names,review_topic_vector):
    topic_freq = topic_vector[topic_index]
    if topic_freq > 0:
        # Single-argument call form of print: same output under Python 2 and 3.
        print('Studio "%s": %i' % (studio_name,topic_freq))
        nstudios += 1
print("")
print('Number of studios with reviews mentioning "%s" or similar topic: %i'
      % (topic_word,nstudios))

Studio "Modo Yoga LA": 2
Studio "One Down Dog": 3
Studio "Yogaraj": 1
Studio "YogaWorks Hollywood": 1
Studio "Corepower Yoga": 1
Studio "Bikram Yoga Silverlake": 1
Studio "Swerve Fitness Dance Yoga": 1
Studio "Pop Physique": 3
Studio "Goda Yoga": 1
Studio "Cyclelates": 1
Studio "CorePower Yoga": 1
Studio "The Yoga Collective": 1
Studio "CorePower Yoga": 6
Studio "The Bar Method": 1
Studio "Santa Monica Yoga": 1
Studio "Yoga Shelter": 1
Studio "YogaWorks Montana Ave": 3
Studio "The Green Yogi": 3
Studio "The Sweat Spot": 2
Studio "Mission Street Yoga": 1
Studio "Equinox": 2
Studio "Hot 8 Yoga": 3
Studio "lululemon athletica": 32
Studio "YogaWorks South Bay": 1
Studio "The Yoga Loft": 2
Studio "YogaHop": 1
Studio "YogaWorks Main Street": 1
Studio "Pop Physique": 2
Studio "Inner Power Yoga Studio": 1
Studio "Bikram Yoga South Pasadena": 1
Studio "YAS Fitness Centers": 1
Studio "Exhale Santa Monica": 2
Studio "Runyon Canyon Park": 10
Studio "CorePower Yoga": 1
Studio "lululemon athletica":

In [5]:
'''
Open old json file with yoga business names, street addresses and geocoordinates,
and add website information and binary topic vector.  Create new json file with
the result.
'''
import json

# Input/output file names for the selected region.
if region == "NYC":
    json_in  = "NYC_yoga_studios_v1.json"
    json_out = "NYC_yoga_studios_v2.json"
else:
    json_in  = "LA_yoga_studios_v1.json"
    json_out = "LA_yoga_studios_v2.json"

with open(json_in) as json_data:
    old_json = json.load(json_data)
# Single-argument call form of print: same output under Python 2 and 3.
print('Number of businesses listed in old json: %i' % len(old_json))

# Index the old entries by (name, address) so each business is found with one
# dictionary lookup instead of a scan of the whole list.  setdefault keeps the
# first entry seen for a key, i.e. the one a front-to-back scan would match.
old_by_key = {}
for studio in old_json:
    old_by_key.setdefault((studio["name"], studio["address"]), studio)

new_json = []
for i in range(len(studio_names)):
    # Businesses without a street address are left out of the new file.
    if studio_addresses[i] == "":
        continue
    match = old_by_key.get((studio_names[i], studio_addresses[i]))
    if match is None:
        print('Studio "%s" at "%s" not found in json'
              % (studio_names[i],studio_addresses[i]))
        # No coordinates are known for this business.  Skip it rather than
        # write the coordinates left over from the previously matched studio.
        continue
    biz            = {}
    biz["name"]    = studio_names[i]
    biz["address"] = studio_addresses[i]
    biz["website"] = studio_websites[i]
    biz["lon"]     = match["lon"]
    biz["lat"]     = match["lat"]
    biz["topics"]  = binary_topic_vector[i]
    new_json.append(biz)
print('Number of businesses listed in new json: %i' % len(new_json))

with open(json_out, 'w') as outfile:
    json.dump(new_json,outfile,indent=4,sort_keys=False)
    

Number of businesses listed in old json: 736
Number of businesses listed in new json: 736
