<h1 align="center">Are Yelp reviews gendered?</h1>

In [None]:
# Python 2-only workaround: `reload` is a builtin only on Python 2, and
# `sys` is reloaded before `setdefaultencoding` is called on it.
# Sets the interpreter-wide default string encoding to UTF-8.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

In [None]:
%matplotlib inline 

In [None]:
import json
import pyprind
import collections
import matplotlib.pyplot as plt
from textblob import TextBlob

In [None]:
# load gender detector

from gender_detector import GenderDetector 
# Detector instance built with the 'us' argument; later cells call detector.guess(name).
detector = GenderDetector('us')

In [None]:
# load all users

# users maps each record's 'user_id' to the full decoded user record.
users = {}

# The file is read line by line, decoding one JSON document per line.
with open("../yelp_dataset/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_user.json", "rb") as f:
    for line in f:
        user = json.loads(line.strip())
        users[user['user_id']] = user

In [None]:
# detect all names' gender

# Collect the distinct user names so each one is classified only once.
name_set = set()

for user in users.values():
    name_set.add(user['name'])

bar = pyprind.ProgBar(len(name_set), width = 80)

# set mapping: name -> label returned by detector.guess(name)
gender = {}

for name in name_set:
    bar.update()
    try:
        gender[name] = detector.guess(name)
    except Exception:
        # Best-effort: any name the detector fails on is labelled 'unknown'.
        # Catching Exception instead of using a bare except keeps
        # KeyboardInterrupt / SystemExit able to stop this long loop.
        gender[name] = 'unknown'

In [None]:
# save to file

# Persist the name -> gender mapping as JSON in the working directory.
with open("gender.json", "wb") as f:
    json.dump(gender, f)

In [None]:
# Reload the mapping from gender.json, rebinding `gender` to the decoded JSON object.
with open("gender.json", "rb") as f:
    gender = json.load(f)

In [None]:
# assign gender to users

for user in users.values():
    # Users without a name, or whose name is missing from the mapping,
    # are labelled 'unknown'. dict.get replaces the former bare except,
    # which would also have hidden unrelated errors.
    user['gender'] = gender.get(user.get('name'), 'unknown')

In [None]:
# One gender label per user, in users.values() order.
gender_list = [user['gender'] for user in users.values()]

print "total number of users is", len(gender_list)

In [None]:
# Share of users per gender label, drawn as a pie chart.
counter=collections.Counter(gender_list)

# Derive sizes from the label list so the two sequences are always aligned.
labels = list(counter.keys())
sizes = [counter[label] for label in labels]
colors = ['gold', 'yellowgreen', 'lightcoral']
# explode 1st slice. pie() needs one explode entry per wedge, so build the
# sequence from the actual number of labels instead of assuming three.
explode = ([0.1] + [0] * (len(sizes) - 1)) if sizes else []

# Plot
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)

plt.axis('equal')
plt.show()

In [None]:
# Load every review into memory as a list of decoded JSON records (one per line).
reviews = []
with open("../yelp_dataset/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json", "rb") as f:
    for line in f:
        reviews.append(json.loads(line.strip()))

In [None]:
bar = pyprind.ProgBar(len(reviews), width = 80)

# Attach a TextBlob built from the review text to every review record
# (each record is mutated in place).
# NOTE(review): 'text_blob' is not read again in the visible part of this notebook.
for review in reviews:
    bar.update()
    review['text_blob'] = TextBlob(review['text'])

In [None]:
# Partition reviews by the author's gender label; reviews whose author is
# neither 'male' nor 'female' end up in neither list.
m_reviews = []
f_reviews = []
for review in reviews:
    # Raises KeyError if the review's user_id is not present in `users`.
    user = users[review['user_id']]
    if user['gender'] == 'male':
        m_reviews.append(review)
    elif user['gender'] == 'female':
        f_reviews.append(review)

In [None]:
# Display the second review record to inspect its fields.
reviews[1]

In [None]:
# Scratch example: tokenize a fixed paragraph with TextBlob.
text = '''
The titular threat of The Blob has always struck me as the ultimate movie
monster: an insatiably hungry, amoeba-like mass able to penetrate
virtually any safeguard, capable of--as a doomed doctor chillingly
describes it--"assimilating flesh on contact.
Snide comparisons to gelatin be damned, it's a concept with the most
devastating of potential consequences, not unlike the grey goo scenario
proposed by technological theorists fearful of
artificial intelligence run rampant.
'''

blob = TextBlob(text)
# Last expression of the cell: displays the blob's word list.
blob.words

<h1 align = "center">Text Analysis</h1>

### 1. Number of words

In [None]:
def num_words_dist(review_set):
    """Return the word count of each review in `review_set`, in input order.

    Words are the whitespace-separated tokens of review['text']. Splitting on
    any run of whitespace (rather than on a single space) means newlines and
    repeated spaces neither merge nor invent words, and an empty text counts
    as 0 words.

    A pyprind progress bar is advanced once per review.
    """
    bar = pyprind.ProgBar(len(review_set), width = 80)

    ret = []

    for review in review_set:
        bar.update()
        ret.append(len(review['text'].split()))
    return ret

# Per-review word counts, computed separately for each gender group.
m_num_word_dist = num_words_dist(m_reviews)
f_num_word_dist = num_words_dist(f_reviews)

In [None]:
# Overlaid, normalized histograms of words per review for both groups.
# NOTE(review): newer matplotlib releases replace `normed` with `density` -
# confirm the installed version before switching.
plt.hist(m_num_word_dist, bins=40, histtype='stepfilled', normed=True, color='b', label='Male')
plt.hist(f_num_word_dist, bins=40, histtype='stepfilled', normed=True, color='r', alpha=0.5, label='Female')
plt.title("Usage of words")
plt.xlabel("Number of words")
# The histogram is normalized, so label the axis like the other normalized plots.
plt.ylabel("Probability")
plt.legend()
plt.show()

In [None]:
# Same comparison restricted to reviews with fewer than 600 words.
plt.hist([n for n in m_num_word_dist if n <600], bins=40, histtype='stepfilled', normed=True, color='b', label='Male')
plt.hist([n for n in f_num_word_dist if n <600], bins=40, histtype='stepfilled', normed=True, color='r', alpha=0.5, label='Female')
plt.title("Usage of words")
plt.xlabel("Number of words")
plt.ylabel("Probability")
plt.legend()
plt.show()

In [None]:
# Mean words per review for each group; float() forces true division.
m_avg = sum(m_num_word_dist) / float(len(m_num_word_dist))
f_avg = sum(f_num_word_dist) / float(len(f_num_word_dist))

print "Male_AVG", m_avg
print "Female_AVG", f_avg

### 2. Number of sentences

In [None]:
def num_sentences_dist(review_set):
    """Return a per-review sentence estimate, in input order.

    The estimate is the number of segments produced by cutting review['text']
    at every ". " separator, i.e. the separator count plus one.
    A pyprind progress bar is advanced once per review.
    """
    progress = pyprind.ProgBar(len(review_set), width = 80)

    counts = []
    for entry in review_set:
        progress.update()
        # n separators always yield n + 1 segments.
        counts.append(entry['text'].count(". ") + 1)
    return counts

# Per-review sentence estimates, computed separately for each gender group.
m_num_sen_dist = num_sentences_dist(m_reviews)
f_num_sen_dist = num_sentences_dist(f_reviews)

In [None]:
# Overlaid, normalized histograms of sentence estimates, restricted to
# reviews with fewer than 40 sentences.
plt.hist([n for n in m_num_sen_dist if n <40], bins=40, histtype='stepfilled', normed=True, color='b', label='Male')
plt.hist([n for n in f_num_sen_dist if n <40], bins=40, histtype='stepfilled', normed=True, color='r', alpha=0.5, label='Female')
plt.title("Usage of sentences")
plt.xlabel("Number of sentence")
plt.ylabel("Probability")
plt.legend()
plt.show()

### 3. Number of exclamation marks

In [None]:
def num_ex_dist(review_set):
    """Return the number of "!" characters in each review's text, in input order.

    A pyprind progress bar is advanced once per review.
    """
    progress = pyprind.ProgBar(len(review_set), width = 80)

    def _tally(entry):
        # Advance the bar first, then count, once per review.
        progress.update()
        return entry['text'].count("!")

    return [_tally(entry) for entry in review_set]

# Per-review exclamation-mark counts, computed separately for each gender group.
m_num_ex_dist = num_ex_dist(m_reviews)
f_num_ex_dist = num_ex_dist(f_reviews)

In [None]:
# Overlaid, normalized histograms of exclamation marks per review,
# restricted to reviews with fewer than 10 of them.
plt.hist([n for n in m_num_ex_dist if n <10], bins=40, histtype='stepfilled', normed=True, color='b', label='Male')
plt.hist([n for n in f_num_ex_dist if n <10], bins=40, histtype='stepfilled', normed=True, color='r', alpha=0.5, label='Female')
# Labels describe the plotted data (exclamation counts), not sentences.
plt.title("Usage of exclamation marks")
plt.xlabel("Number of exclamation marks")
plt.ylabel("Probability")
plt.legend()
plt.show()

<h1 align="center">Sentiment Analysis</h1>

In [None]:
def rating_dist(review_set):
    """Return the 'stars' value of every review in `review_set`, in input order.

    A pyprind progress bar is advanced once per review.
    """
    progress = pyprind.ProgBar(len(review_set), width = 80)

    stars = []
    for entry in review_set:
        progress.update()
        stars += [entry['stars']]

    return stars

# Star ratings of every review, collected separately for each gender group.
m_num_rating_dist = rating_dist(m_reviews)
f_num_rating_dist = rating_dist(f_reviews)

In [None]:
# Overlaid, normalized histograms of the star ratings given by each group.
plt.hist([m_num_rating_dist], bins=10, histtype='stepfilled', normed=True, color='b', label='Male')
plt.hist([f_num_rating_dist], bins=10, histtype='stepfilled', normed=True, color='r', alpha=0.5, label='Female')
# Labels describe the plotted data (star ratings), not sentences.
plt.title("Distribution of star ratings")
plt.xlabel("Stars")
plt.ylabel("Probability")
plt.legend(loc=2)
plt.show()

In [None]:
# Mean star rating for each group. This rebinds m_avg / f_avg, which an
# earlier cell used for the word-count averages.
m_avg = sum(m_num_rating_dist) / float(len(m_num_rating_dist))
f_avg = sum(f_num_rating_dist) / float(len(f_num_rating_dist))

print "Male_AVG", m_avg
print "Female_AVG", f_avg

<h1 align="center">Topic Analysis</h1>

In [None]:
# Add a local RAKE-tutorial directory to the module search path, presumably
# so the `rake` module imported in the next cell resolves - confirm.
# NOTE(review): absolute, machine-specific path; adjust for your environment.
import sys
sys.path.append("/Users/erichsu/Documents/research/rake_dir/RAKE-tutorial/")

In [None]:
import rake
import operator
import random
import wordcloud

In [None]:
# Keyword extractor configured with a stop-word list file and three numeric
# settings. NOTE(review): 5, 3, 7 are presumably the minimum characters per
# word, maximum words per phrase and minimum keyword frequency - confirm
# against the rake module. The stop-list path is absolute and machine-specific.
rake_object = rake.Rake("/Users/erichsu/Documents/research/rake_dir/RAKE-tutorial/SmartStoplist.txt", 5, 3, 7)

In [None]:
def save_to_file(file_name, text):
    """Write `text` to `file_name` in binary mode, replacing any existing file.

    The file is opened in a `with` block so the handle is flushed and closed
    before the function returns, even if the write raises.
    """
    with open(file_name, "wb") as f:
        f.write(text)

def get_good_reviews(review_set):
    """Return the reviews rated 4 stars or more, keeping their original order."""
    selected = []
    for entry in review_set:
        if entry['stars'] >= 4:
            selected.append(entry)
    return selected

def get_bad_reviews(review_set):
    """Return the reviews rated 2 stars or fewer, keeping their original order."""
    def _is_bad(entry):
        return entry['stars'] <= 2

    return list(filter(_is_bad, review_set))

### Good Reviews

In [None]:
m_good_reviews = get_good_reviews(m_reviews)
f_good_reviews = get_good_reviews(f_reviews)

# random.sample raises ValueError when asked for more items than the
# population holds, so cap the sample size at the number of available reviews.
# NOTE: sampling is unseeded, so the sample (and everything derived from it)
# differs between runs.
m_review_sample = random.sample(m_good_reviews, min(1000, len(m_good_reviews)))
f_review_sample = random.sample(f_good_reviews, min(1000, len(f_good_reviews)))

# Concatenate the sampled review texts into one document per group.
male_text = ". ".join([r['text'] for r in m_review_sample])
female_text = ". ".join([r['text'] for r in f_review_sample])

In [None]:
# Write the two concatenated sample texts to files in the working directory.
save_to_file("male_good_review.txt", male_text)
save_to_file("female_good_review.txt", female_text)

In [None]:
# Run the RAKE extractor over the male sample text.
keywords = rake_object.run(male_text)

In [None]:
# Display the first 20 entries of the extractor's result.
keywords[0:20]

In [None]:
def word_cloud(text):
    """Generate a word cloud from `text` and show it in a 10x8 figure without axes."""
    generator = wordcloud.WordCloud(max_font_size=40, relative_scaling=.5)
    cloud_image = generator.generate(text)

    plt.figure(figsize=(10,8))
    plt.imshow(cloud_image)
    plt.axis("off")
    plt.show()

In [None]:
# Word cloud of the current male sample text.
word_cloud(male_text)

In [None]:
# Word cloud of the current female sample text.
word_cloud(female_text)

### Bad Reviews

In [None]:
m_bad_reviews = get_bad_reviews(m_reviews)
f_bad_reviews = get_bad_reviews(f_reviews)

# random.sample raises ValueError when asked for more items than the
# population holds, so cap the sample size at the number of available reviews.
# NOTE: sampling is unseeded, so the sample differs between runs.
m_review_sample = random.sample(m_bad_reviews, min(1000, len(m_bad_reviews)))
f_review_sample = random.sample(f_bad_reviews, min(1000, len(f_bad_reviews)))

# Rebinds male_text / female_text to the bad-review samples.
male_text = ". ".join([r['text'] for r in m_review_sample])
female_text = ". ".join([r['text'] for r in f_review_sample])

In [None]:
# Run the extractor on the male sample text and display the first 20 results.
keywords = rake_object.run(male_text)
keywords[0:20]

In [None]:
# Run the extractor on the female sample text and display the first 20 results.
keywords = rake_object.run(female_text)
keywords[0:20]

In [None]:
# Word cloud of the current male sample text.
word_cloud(male_text)

In [None]:
# Word cloud of the current female sample text.
word_cloud(female_text)

<h1 align="center">Behavior Analysis</h1>

In [None]:
from collections import Counter

### 1. Business Types

In [None]:
# business maps each record's 'business_id' to the full decoded business record.
business = {}

# Use the same relative dataset directory as the user and review loaders
# instead of an absolute path that only exists on one machine.
with open("../yelp_dataset/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json", "rb") as f:
    for line in f:
        info = json.loads(line.strip())
        business[info['business_id']] = info

In [None]:
# Select the reviews rated 4 stars or more for each group.
m_good_reviews = get_good_reviews(m_reviews)
f_good_reviews = get_good_reviews(f_reviews)

In [None]:
def count_best(review_set, business):
    """Return the 20 most frequent business categories among `review_set`.

    Parameters:
        review_set: iterable of review records, each with a 'business_id'.
        business: mapping of business_id -> business record; the record's
            'categories' entry is the collection of category names.

    Returns:
        A list of (category, count) tuples sorted by descending count, at
        most 20 entries long.

    Raises:
        KeyError: if a review's business_id is not present in `business`.
    """
    category_counts = Counter()

    for r in review_set:
        # A business with a missing or null 'categories' contributes nothing.
        categories = business[r['business_id']].get('categories') or []
        category_counts.update(categories)

    return category_counts.most_common(20)

In [None]:
# Top categories among well-rated reviews written by male users.
count_best(m_good_reviews, business)

In [None]:
# Top categories among well-rated reviews written by female users.
count_best(f_good_reviews, business)

### 2. Preference of Food

In [None]:
# Category names kept as one multi-line string, one name per line, each
# preceded by a single space.
food_cat_list = """
 Restaurants
 Fast Food
 Sandwiches
 Pizza
 Nightlife
 American (Traditional)
 Bars
 Chinese
 Food
 Burgers
 Mexican
 American (New)
 Chicken Wings
 Breakfast & Brunch
 Italian
 Barbeque
 Cafes
 Korean
 Pubs
 Asian Fusion
 Thai
 Japanese
 Salad
 Sushi Bars
 Steakhouses
"""

In [None]:
# food_cat_list is a single multi-line string, so iterate over its lines
# (not its characters) and print each category name without the surrounding
# whitespace. The loop previously printed the stale file handle `f`.
for line in food_cat_list.strip().splitlines():
    print(line.strip())

In [None]:
# NOTE(review): prints a literal "s"; looks like a leftover scratch cell.
print "s"

### 3. Gender of friends

In [None]:
def get_gender(user):
    """Return the value stored under the 'gender' key of a user record."""
    label = user['gender']
    return label

def friend_stats(user, users):
    """Count the known genders among a user's friends.

    Parameters:
        user: user record whose 'friends' entry lists friend user ids.
        users: mapping of user_id -> user record, used to look up each friend.

    Returns:
        A Counter mapping gender label -> number of friends with that label.
        Friends labelled 'unknown' are not counted, and labels that never
        occur read as 0.

    Raises:
        KeyError: if a friend id is not present in `users`.
    """
    friend_genders = (get_gender(users[uid]) for uid in user['friends'])
    return Counter(label for label in friend_genders if label != 'unknown')

def get_average(arr):
    """Return the arithmetic mean of `arr` as a float.

    Raises ZeroDivisionError when `arr` is empty.
    """
    total = sum(arr)
    count = float(len(arr))
    return total / count

In [None]:
# User records grouped by gender label; other labels are left out.
male_users = [user for user in users.values() if user['gender'] == 'male']
female_users = [user for user in users.values() if user['gender'] == 'female']

In [None]:
# Number of users labelled 'male'.
len(male_users)

#### male stats

In [None]:
# One friend-gender Counter per male user.
stats_list = []

for m_user in male_users:
    stats = friend_stats(m_user, users)
    stats_list.append(stats)

# Per-user counts of male and female friends, in male_users order.
males = [st['male'] for st in stats_list]
females = [st['female'] for st in stats_list]

In [None]:
# Overlaid, normalized histograms of how many male and female friends each
# male user has, restricted to counts below 20.
plt.hist([n for n in males if n <20], bins=40, histtype='stepfilled', normed=True, color='b', label='Male')
plt.hist([n for n in females if n <20], bins=40, histtype='stepfilled', normed=True, color='r', alpha=0.5, label='Female')
# Labels describe the plotted data (friend counts), not sentences.
plt.title("Friends of male users, by friend gender")
plt.xlabel("Number of friends")
plt.ylabel("Probability")
plt.legend()
plt.show()

#### female stats

In [None]:
# One friend-gender Counter per female user; rebinds stats_list, males and
# females from the male-user cell.
stats_list = []

# NOTE(review): the loop variable is named m_user but iterates female_users.
for m_user in female_users:
    stats = friend_stats(m_user, users)
    stats_list.append(stats)

# Per-user counts of male and female friends, in female_users order.
males = [st['male'] for st in stats_list]
females = [st['female'] for st in stats_list]

In [None]:
# Overlaid, normalized histograms of how many male and female friends each
# female user has, restricted to counts below 20.
plt.hist([n for n in males if n <20], bins=20, histtype='stepfilled', normed=True, color='b', label='Male')
plt.hist([n for n in females if n <20], bins=20, histtype='stepfilled', normed=True, color='r', alpha=0.5, label='Female')
# Labels describe the plotted data (friend counts), not sentences.
plt.title("Friends of female users, by friend gender")
plt.xlabel("Number of friends")
plt.ylabel("Probability")
plt.legend()
plt.show()

### 4. Number of friends

In [None]:
# Mean length of the 'friends' list per user, for each gender group.
male_fr_count = get_average([len(user['friends']) for user in male_users])
female_fr_count = get_average([len(user['friends']) for user in female_users])

print "Male_AVG", male_fr_count
print "Female_AVG", female_fr_count

In [None]:
# Display the male average on its own (already printed by the cell above).
male_fr_count

In [None]:
# Display the female average on its own (already printed by the cell above).
female_fr_count

In [None]:
import timeit
def test():
    """Benchmark body: build a TextBlob from a fixed paragraph and read its `.tags`."""
    text = '''
    The titular threat of The Blob has always struck me as the ultimate movie
    monster: an insatiably hungry, amoeba-like mass able to penetrate
    virtually any safeguard, capable of--as a doomed doctor chillingly
    describes it--"assimilating flesh on contact.
    Snide comparisons to gelatin be damned, it's a concept with the most
    devastating of potential consequences, not unlike the grey goo scenario
    proposed by technological theorists fearful of
    artificial intelligence run rampant.
    '''
    blob = TextBlob(text)
    # `p` is not used afterwards; the attribute access itself is what gets timed.
    p= blob.tags
    
# Total time for 10000 calls of test().
timeit.timeit(test, number=10000)

In [None]:
# Number of reviews written by users labelled 'female'.
len(f_reviews)

In [None]:
# Display the second review record to inspect its fields.
reviews[1]