In [590]:
import pandas as pd
import numpy as np
import collections
import random
import gender_guesser.detector as gender

In [591]:
businesses = pd.read_csv('restaurants.csv')
businesses = businesses.loc[businesses['review_count'] >= 100] # 449

#businesses

In [592]:
reviews = pd.read_csv('cleaned_reviews.csv')
reviews = reviews.drop_duplicates()
reviews = reviews[pd.notnull(reviews['author'])]

### Add # scraped reviews

In [593]:
def add_scraped_count (row):
    return len(reviews.loc[reviews['business'] == row['id']])

businesses['num_scraped_reviews'] = businesses.apply (lambda row: add_scraped_count (row), axis=1)

### Scraped percentage

In [594]:
def add_scraped_perc (row):
    return row['num_scraped_reviews'] / row['review_count']

businesses['perc_scraped_reviews'] = businesses.apply (lambda row: add_scraped_perc (row), axis=1)

### Filter businesses & reviews

In [595]:
# Filter out businesses with less than 30 reviews
businesses = businesses.loc[businesses['num_scraped_reviews'] >= 30] # 261 businesses

In [596]:
# filter reviews on the ids
b_ids = businesses['id'].unique()
reviews = reviews[reviews['business'].isin(b_ids)]

### Add gender column

In [597]:
gd = gender.Detector()

def add_gender (row):
    return gd.get_gender(row['author'].split()[0].capitalize())

reviews['gender'] = reviews.apply (lambda row: add_gender (row), axis=1)

reviews.ix[reviews.gender == 'mostly_male', 'gender'] = 'male'
reviews.ix[reviews.gender == 'mostly_female', 'gender'] = 'female'

### Split coordinate col

In [598]:
businesses = businesses[pd.notnull(businesses['coordinate'])]
businesses['lat'], businesses['long'] = zip(*businesses['coordinate'].apply(lambda x: x.split(',', 1)))
businesses = businesses.drop('coordinate', 1)

### Add F/M frequency

In [599]:
def add_f_freq (row):
    curr_reviews = reviews.loc[reviews['business'] == row['id']]
    return round(len(curr_reviews.loc[curr_reviews['gender'] == 'female']) / len(curr_reviews), 4)

In [600]:
def add_m_freq (row):
    curr_reviews = reviews.loc[reviews['business'] == row['id']]
    return round(len(curr_reviews.loc[curr_reviews['gender'] == 'male']) / len(curr_reviews), 4)

In [601]:
businesses['f_freq'] = businesses.apply (lambda row: add_f_freq (row), axis=1)

In [602]:
businesses['m_freq'] = businesses.apply (lambda row: add_m_freq (row), axis=1)

In [603]:
#businesses

### Add F/M popularity

In [604]:
def add_f_pop (row):
    curr_reviews = reviews.loc[reviews['business'] == row['id']]
    return round(curr_reviews.loc[curr_reviews['gender'] == 'female']['stars'].mean(), 4)

In [605]:
def add_m_pop (row):
    curr_reviews = reviews.loc[reviews['business'] == row['id']]
    return round(curr_reviews.loc[curr_reviews['gender'] == 'male']['stars'].mean(), 4)

In [606]:
businesses['f_pop'] = businesses.apply (lambda row: add_f_pop (row), axis=1)

In [607]:
businesses['m_pop'] = businesses.apply (lambda row: add_m_pop (row), axis=1)

In [608]:
#len(businesses)

### Split into training, validation, and test sets

In [609]:
indexes = list(businesses.index)

In [610]:
tr_index = round(len(indexes) * 0.7)
v_index = round(len(indexes) * 0.1)

In [611]:
random.shuffle(indexes)

In [612]:
training = indexes[:tr_index]
validation = indexes[tr_index:(tr_index + v_index)]
test = indexes[(tr_index + v_index):]

In [613]:
training_set = businesses.ix[training]
validation_set = businesses.ix[validation]
test_set = businesses.ix[test]

# Start messing with training data

### Add benchmark review count

In [614]:
#training_set

In [615]:
def add_bench_count (row):
    categories = row['categories'].split(',')
    # categories = [s.strip() for s in categories]
    # categories = list(set(categories))
    avg_counts = []
    for c in categories:
        avg_counts.append(businesses.loc[businesses['categories'].str.contains(c)]['review_count'].mean())
    avg_counts = [x for x in avg_counts if str(x) != 'nan']
    return sum(avg_counts) / float(len(avg_counts))

In [616]:
training_set['bench_count'] = training_set.apply (lambda row: add_bench_count (row), axis=1)



In [617]:
validation_set['bench_count'] = validation_set.apply (lambda row: add_bench_count (row), axis=1)



In [618]:
#training_set

### Add benchmark review avg

In [619]:
def add_bench_avg (row):
    categories = row['categories'].split(',')
    # categories = [s.strip() for s in categories]
    # categories = list(set(categories))
    avg_averages = []
    for c in categories:
        avg_averages.append(businesses.loc[businesses['categories'].str.contains(c)]['rating'].mean())
    avg_averages = [x for x in avg_averages if str(x) != 'nan']
    return sum(avg_averages) / float(len(avg_averages))

In [620]:
training_set['bench_avg'] = training_set.apply (lambda row: add_bench_avg (row), axis=1)



In [621]:
validation_set['bench_avg'] = validation_set.apply (lambda row: add_bench_avg (row), axis=1)



In [622]:
#training_set

### Add whether restaurant is successful or not

In [623]:
def grade_success (row):
    return (row['review_count'] >= row['bench_count']) | (row['rating'] >= row['bench_avg'])

In [624]:
training_set['successful?'] = training_set.apply (lambda row: grade_success (row), axis=1)

In [625]:
validation_set['successful?'] = validation_set.apply (lambda row: grade_success (row), axis=1)

### K-NN

In [626]:
import math

In [627]:
def dist(row1, row2):
    distance = 0
    
    categories1 = row1['categories'].split(',')
    categories1 = [s.strip() for s in categories1]
    categories1 = list(set(categories1))
    
    categories2 = row2['categories'].split(',')
    categories2 = [s.strip() for s in categories2]
    categories2 = list(set(categories2))
    
    non_shared_count = len([item for item in categories1 if item not in categories2] + 
                           [item for item in categories2 if item not in categories1])
    distance += non_shared_count / (len(categories1) + len(categories2))
    
    #distance += int(bool(set(categories1) & set(categories2)))
    
    distance += pow((float(row1['lat']) - float(row2['lat'])), 2)
    distance += pow((float(row1['long']) - float(row2['long'])), 2)
    
    distance += pow(((row1['f_freq'] / (row1['f_freq'] + row1['m_freq'])) - 
                     (row2['f_freq'] / (row2['f_freq'] + row2['m_freq']))), 2)
    #distance += pow((row1['f_freq'] - row2['f_freq']), 2)
    #distance += pow((row1['m_freq'] - row2['m_freq']), 2)
    
    #distance += pow(((row1['f_pop'] / row1['rating']) - (row2['f_pop'] / row1['rating'])), 2)
    distance += pow((row1['f_pop'] - row2['f_pop']), 2)
    #distance += pow(((row1['m_pop'] / row1['rating']) - (row2['m_pop'] / row1['rating'])), 2)
    distance += pow((row1['m_pop'] - row2['m_pop']), 2)
    return math.sqrt(distance)

In [628]:
def get_k_nn (v_row, k):
    copy_training_set = training_set
    copy_training_set['curr_dist'] = copy_training_set.apply (lambda row: dist (v_row, row), axis=1)
    copy_training_set.sort_values('curr_dist', inplace=True)
    neighbors = np.array(copy_training_set.head(n=k)['successful?'])
    return np.average(neighbors) >= 0.5

In [632]:
best_accuracy = 0
find_best_k = 0
for i in range(1, 20):
    if 'knn_estimate' in validation_set:
        validation_set = validation_set.drop('knn_estimate', 1)
    validation_set['knn_estimate'] = validation_set.apply (lambda row: get_k_nn (row, 20), axis=1)
    if len(validation_set.loc[validation_set['successful?'] == validation_set['knn_estimate']]) / float(len(validation_set)) >= best_accuracy:
        find_best_k = i
        best_accuracy = len(validation_set.loc[validation_set['successful?'] == validation_set['knn_estimate']]) / float(len(validation_set))

In [633]:
find_best_k

19

In [634]:
best_accuracy

0.8461538461538461