# Import data

In [159]:
#disable some annoying warning
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

#plots the figures in place instead of a new window
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
import pandas.io.json as json
import geopy as geo
import time as time

from IPython.html.widgets import interact, interact_manual
from sklearn.feature_extraction import DictVectorizer
from sklearn.cluster import KMeans
import sklearn.cross_validation
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.patches as mpatches
from sklearn import decomposition
from sklearn import manifold

#from yelp.client import Client
#from yelp.oauth1_authenticator import Oauth1Authenticator

# Helper methods

Because there is no parent JSON object, I have to read the file line by line

In [160]:
# http://stackoverflow.com/questions/30088006/cant-figure-out-how-to-fix-the-error-in-the-following-code
def load_json_line_by_line(file_path):
    # read the entire file into a python array
    f = open(file_path, 'r')
    data = f.readlines()
    f.close()
    
    # remove the trailing "\n" from each line
    data = map(lambda x: x.rstrip(), data)

    # create one big JSON array, with each line being one entry
    data_json_str = "[" + ','.join(data) + "]"

    # now, load it into pandas (and normalize it got)
    data_df = json.json_normalize(json.loads(data_json_str))
    
    return data_df

In [161]:
def make_object_tuple(x):
    if(type(x) == 'list'):
        return tuple(map(lambda ele: make_object_tuple(ele), x))
    else:
        return x

Make columns which contains lists (like the categories column) to columns of tuples so the are hashable

In [162]:
# Lists like [[Coffee & Tea, coffee], [Bakeries, bakeries]] 
# have to be converted to tuples like ((Coffee & Tea, coffee), (Bakeries, bakeries))
# to be hashable

# Keeping this slow version just for demo purpose
def convert_listcolumns_to_tuplecolumns_old(df, col_names):
    i = 0
    for col_name in col_names:
        for row in df.iterrows():
            i = i+1
            print(str(i/len(df)) + "%")
            row[col_name] = eval(str(row[col_name]).replace('[','(').replace(']',')'))
    return df

In [163]:
def convert_listcolumns_to_tuplecolumns(df, col_names):
    for col_name in col_names:
        df[col_name] = df.apply(lambda row: eval(str(row[col_name]).replace('[','(').replace(']',')')), axis=1)
    return df

Converts columns with time to integer columns (e.g., 17:00 -> 1700)

In [164]:
def convert_timecolumns_to_numbercolumns(df, col_names):
    for col_name in col_names:
        df[col_name] = df.apply(lambda row: eval(str(row[col_name]).replace(':','.')), axis=1)
    return df

Converts any column to a string column

In [165]:
def convert_columns_to_stringcolumns(df, col_names):
    for col_name in col_names:
        df[col_name] = df.apply(lambda row: str(row[col_name]), axis=1)
    return df

Converts any column into a category column.

EDIT:
Note In contrast to R’s factor function, categorical data is not converting input values to strings and categories will end up the same data type as the original values.
-> Okay, then this is not really applicable

In [166]:
def convert_columns_to_categorical(df, col_names):
    for col_name in col_names:
        df[col_name] = df[col_name].astype('category')
    return df

Because many attributes are boolean we want to convert them from object to bool

In [167]:
def convert_columns_to_bool(df, col_names):
    for col_name in col_names:
        tmp = df[col_name].astype('category')
        if True in tmp.cat.categories or False in tmp.cat.categories:
            df[col_name] = df[col_name].astype('bool')
    return df

# Load data

Load the businesses from the dataset.

The dataset is a .json file, therefor we have to use methods form pandas.io.json to load the data.

On loading the data, it also gets normalized, that means substructures get flattend out and get added as single columns (most of the columns which start with attribute.* or hours.*)

Also set the index column to the businesses name.

In [168]:
start_time = time.time()
businesses = load_json_line_by_line("yelp_academic_dataset_business.json")
businesses.index = businesses.pop('name')
print('Data loading took ' + str(time.time() - start_time) + " seconds.")

FileNotFoundError: [Errno 2] No such file or directory: 'yelp_academic_dataset_business.json'

# Basic overview

Let's have a first look at the data.
We can use head() for this.

In [169]:
businesses.head()

Unnamed: 0_level_0,attributes.Accepts Credit Cards,attributes.Accepts Insurance,attributes.Ages Allowed,attributes.Alcohol,attributes.Ambience.casual,attributes.Ambience.classy,attributes.Ambience.divey,attributes.Ambience.hipster,attributes.Ambience.intimate,attributes.Ambience.romantic,...,hours.Tuesday.open,hours.Wednesday.close,hours.Wednesday.open,latitude,longitude,neighborhoods,open,review_count,stars,state
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Eric Goldberg, MD",,,,,,,,,,,...,08:00,17:00,08:00,33.499313,-111.983758,(),True,9,3,AZ
Clancy's Pub,True,,,,,,,,,,...,,,,40.350519,-79.88693,(),True,4,3,PA
Cool Springs Golf Center,,,,,,,,,,,...,,,,40.356896,-80.01591,(),False,5,2,PA
Verizon Wireless,,,,,,,,,,,...,10:00,21:00,10:00,40.35762,-80.05998,(),True,3,3,PA
Emil's Lounge,True,,,full_bar,False,False,False,False,False,False,...,10:00,19:00,10:00,40.408735,-79.866351,(),True,11,4,PA


# Conclusion from basic overview:

* We have many columns (104 + index!)
* We have many NaN attributes in the attributes columns
* We have list columns which have to be converted to tuples

# Convert list columns to tuple columns:

In [170]:
businesses = convert_listcolumns_to_tuplecolumns(businesses, ['categories','neighborhoods'])
businesses.head()

NameError: ("name 'Nightlife' is not defined", "occurred at index Clancy's Pub")

# Column overview:

Because there are so many columns, let's get an overview which columns are there:

In [None]:
for col in businesses.columns:
    print(col)

# Conclusion from column overview

Whoa! That's a whole bunch of columns. Can't wait to analyze them.

But first we have to think about missing data.

# Check which columns never contain NaN

In [None]:
for col in businesses.dropna(axis = 1, how = 'any', thresh = None, subset = None, inplace = False).columns:
    print(col)

Okay, this base columns look pretty promising: Name, type, categories, location, state, review_count and stars are available for every location

# Check if columns contain interesting information

In [None]:
businesses["type"].unique()

Okay, we can forget about the type column, this column does not add information

In [None]:
businesses = businesses.drop('type', 1)

In [None]:
businesses["neighborhoods"].unique()

Keep it, probably we may need it in the future

# Example code to get businesses with a given attribute set

Here some code to select only rows which don't have NaN values in certain columns

In [None]:
must_contain_value_in_columns = ['attributes.Accepts Credit Cards']
businesses.dropna(axis = 0, how = 'any', thresh = None, subset = must_contain_value_in_columns, inplace = False)

Let's see if there are businesses which have all attributes set

In [None]:
attributes = list()

for col in businesses:
    if col.startswith('attributes'):
        attributes.append(col)

businesses.dropna(axis = 0, how = 'any', thresh = None, subset = attributes, inplace = False)

As I thought, there is not a single business with all attributes set

# Some statistics

The general description

In [None]:
businesses.describe()

Mean review count and rating per category and per state

In [None]:
businesses.groupby(['categories']).agg({'stars' : 'mean', 'review_count': 'mean', 'full_address' : 'count'})

In [None]:
businesses.groupby(['state']).agg({'stars' : 'mean', 'review_count': 'mean', 'full_address' : 'count'})

Overview over the different attributes and their relation to review_count and rating

In [None]:
for attribute in attributes:
    res = businesses.groupby([attribute]).agg({'stars' : 'mean', 'review_count': 'mean'})
    print(res)

I found the attribute price range. That looks promising for a comparism between price and rating!

Let's have a look at the statstics:

In [None]:
must_contain_value_in_columns = ['attributes.Price Range']
price_range_businesses = businesses.dropna(axis = 0, how = 'any', thresh = None, subset = must_contain_value_in_columns, inplace = False)
price_range_businesses.groupby("attributes.Price Range").agg({"stars": 'mean'})

That are not really the result I hoped to see... I hoped for some clustering like " -10 dollar", "10-20 dollar", etc.
But at least we see that the price range does not have that much of an influence regarding rating.
So this column does not really contain valuable information.

# Plotting

Where are all the businesses?

In [None]:
plt.scatter(businesses["longitude"], businesses["latitude"])
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Location of businesses')

Okay, yet we didn't know exactly what this Yelp challenge dataset contained.
Now we know that the dataset contains american businesses.

Let's see when most of them open:

In [None]:
opening_times_columns = [
    'hours.Monday.open', 'hours.Tuesday.open', 'hours.Wednesday.open', 'hours.Thursday.open', 'hours.Friday.open', 'hours.Saturday.open', 'hours.Sunday.open',
    'hours.Monday.close', 'hours.Tuesday.close', 'hours.Wednesday.close', 'hours.Thursday.close', 'hours.Friday.close', 'hours.Saturday.close', 'hours.Sunday.close'
]

businesses_with_opening_times = businesses.dropna(axis = 0, how = 'any', thresh = None, subset = opening_times_columns, inplace = False)

In [None]:
@interact(day=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'], open_close=['open','close'])
def show_opening_times(day, open_close):
    group = businesses_with_opening_times.groupby('hours.' + day + "." + open_close)
    group.agg({'stars' : 'count'}).plot(kind='bar')

Let's take first steps into heatmaps and clustermaps:
Do the opening and closing times of the different days correlate to each other?

In [None]:
corrmat = convert_timecolumns_to_numbercolumns(businesses_with_opening_times, opening_times_columns)[opening_times_columns].corr()

# draw a clustered heatmap using seaborn
sns.clustermap(corrmat, square=True)

As expected, the opening times mostly correlate to each other during monday to friday, with a second "correlation pair" on saturday and sunday. (Special opening times on the weekend).

Also, the closing times of friday and saturday correlate much more to each other than to the other days.

As next let's have a look at the number of review count and stars to get a coarse feeling for how average ratings on help look like.

In [None]:
plt.hist(businesses['stars'],bins=9)
plt.title('Stars')

In [None]:
plt.hist(businesses['review_count'],bins=5)
plt.title('Review count')

As we can see: we don't see much, because there are very very few locations with very much reviews. Let's try another visualisation method.

In [None]:
plt.boxplot(businesses['review_count'], 0, '') # Outliners are not shown
plt.title('Review count')

A boxplot which does not show points for the outliners is a much better choice for visualizing this.

Now let's get an overview over the different attributes and ratings

In [None]:
#stringcategory_businesses = convert_columns_to_stringcolumns(businesses, ["categories"])
#food_businesses = stringcategory_businesses[stringcategory_businesses["categories"].str.contains('Food')]
#food_businesses["categories"].unique()
#plt.hist(food_businesses[food_businesses["categories"].str.contains(str(category))]["stars"],bins=9)
#plt.title('Stars of businesses with categories ' + str(category))

In [None]:
@interact(category=tuple(sorted(attributes)))
def show_stars_per_category(category):
    businesses.groupby(category).agg({'stars' : 'mean'}).plot(kind='bar')

Average rating per state

In [None]:
businesses.groupby("state").agg({'stars' : 'mean'}).plot(kind='bar')

Number of businneses per state

In [None]:
businesses.groupby("state").agg({'stars' : 'count'}).plot(kind='bar')

This is some interesting result. For some states we just have very few businesses. This may be considered in further analysis.

In [None]:
idx = businesses['state'].isin(['AZ', 'NV'])
businesses_without_AZ_NV = businesses[~idx]
businesses_without_AZ_NV.groupby("state").agg({'stars' : 'count'}).plot(kind='bar')

Okay, now we get to the point. I will now include in the number of businesses in the descriptive statistics analysis. If we now look at the statistics again, it get's clear that there is a huge difference in the amount of data per state.

Now let's see if businessis which have a high review_count (and therefor probably are often visited) also have a higher rating

In [None]:
plt.scatter(businesses[businesses["review_count"]<1000]["review_count"], businesses[businesses["review_count"]<1000]["stars"])
plt.xlabel('Review Count')
plt.ylabel('Stars')
plt.title('Review count VS Rate')

This is not the optimal visalisation method, let's try it with a differnt method

In [None]:
businesses.groupby("review_count").agg({"stars": "mean"}).plot(kind='area')

This does not really look like a monotone growing curve. So the assumption that more often rated businesses have higher ratings is not true

And just again to repeat the findings from the statistics: The attribute "Price Range" does not really have a high influence on the rating.

In [None]:
price_range_businesses = businesses.dropna(axis = 0, how = 'any', thresh = None, subset = must_contain_value_in_columns, inplace = False)
price_range_businesses.groupby("attributes.Price Range").agg({"stars": 'mean'}).plot(kind='bar')

# Clustering

May we will be able to find out if a certain restaurant has a certain attribute by looking at the state, city, stars and category?

In [None]:
colors = sns.color_palette()

@interact_manual(n_components=(1,10), random_state=(1,100), attribute=attributes, n_clusters=(1,10))
def draw_plot(n_components, random_state, attribute, n_clusters):
    columns = ["state","city","stars","categories"]
    columns_with_target = ["state","city","stars", attribute,"categories"]
    businesses_without_nans = convert_columns_to_stringcolumns(businesses.dropna(axis = 0, how = 'any', thresh = None, subset = columns_with_target, inplace = False), ['categories'])
    businesses_without_nans = businesses_without_nans[~businesses_without_nans['categories'].str.contains(',')] # Because I always got MemoryExceptions   
    
    train_as_dicts = [dict(r.iteritems()) for _, r in businesses_without_nans[columns].iterrows()]
    #print(train_as_dicts)
    
    vec = DictVectorizer()
    fit_transformed_businesses = vec.fit_transform(train_as_dicts)
    fit_transformed_businesses_array = fit_transformed_businesses.toarray()
    #print(fit_transformed_businesses_array)
    
    mds = manifold.MDS(n_components, random_state)
    pca = decomposition.PCA(n_components)
    
    pca_pos = pca.fit(fit_transformed_businesses_array).transform(fit_transformed_businesses_array)
    mds_pos = mds.fit(fit_transformed_businesses_array).embedding_
    
    plt.figure(figsize=[20,7])
    plt.subplot(121)
    plt.scatter(mds_pos[:, 0], mds_pos[:, 1], s=30, c=[colors[1 if i else 0] for i in businesses_without_nans[attribute]])
    patches = [ mpatches.Patch(color=colors[i], label='False' if i == 0 else 'True') for i in range(2) ]
    plt.legend(handles=patches)
    plt.legend()
    plt.title('MDS - ' + attribute)

    plt.subplot(122)
    plt.scatter(pca_pos[:, 0], pca_pos[:, 1], s=30, c=[colors[1 if i else 0] for i in businesses_without_nans[attribute]])
    patches = [ mpatches.Patch(color=colors[i], label='False' if i == 0 else 'True') for i in range(2) ]
    plt.legend(handles=patches)
    plt.legend()
    plt.title('PCA - ' + attribute)
    
    kmean_pred_pca = KMeans(n_clusters=n_clusters, random_state = random_state).fit_predict(pca_pos)
    kmean_pred_mds = KMeans(n_clusters=n_clusters, random_state = random_state).fit_predict(mds_pos)
    
    plt.figure(figsize=[20,7])
    plt.subplot(121)
    plt.scatter(pca_pos[:, 0], pca_pos[:, 1], s=30, c=[colors[i] for i in kmean_pred_pca])
    plt.title('KMean - PCA - ' + attribute)

    plt.subplot(122)
    plt.scatter(pca_pos[:, 0], pca_pos[:, 1], s=30, c=[colors[1 if i else 0] for i in businesses_without_nans[attribute]])
    patches = [ mpatches.Patch(color=colors[i], label='False' if i == 0 else 'True') for i in range(2) ]
    plt.legend(handles=patches)
    plt.legend()
    plt.title('Groundtruth - PCA - ' + attribute)
    
    plt.figure(figsize=[20,7])
    plt.subplot(121)
    plt.scatter(mds_pos[:, 0], mds_pos[:, 1], s=30, c=[colors[i] for i in kmean_pred_pca])
    plt.title('KMean - MDS - ' + attribute)

    plt.subplot(122)
    plt.scatter(mds_pos[:, 0], mds_pos[:, 1], s=30, c=[colors[1 if i else 0] for i in businesses_without_nans[attribute]])
    patches = [ mpatches.Patch(color=colors[i], label='False' if i == 0 else 'True') for i in range(2) ]
    plt.legend(handles=patches)
    plt.legend()
    plt.title('Groundtruth - MDS - ' + attribute)
    
    instance = DecisionTreeClassifier()
    r = instance.fit(mds_pos, businesses_without_nans[attribute].astype(int))
    tree_predict_mds = instance.predict(mds_pos)
    
    instance = DecisionTreeClassifier()
    r = instance.fit(pca_pos, businesses_without_nans[attribute].astype(int))
    tree_predict_pca = instance.predict(pca_pos)
    
    acc = accuracy_score(tree_predict_mds, businesses_without_nans[attribute].astype(int))
    plt.figure(figsize=[20,7])
    plt.subplot(121)
    plt.scatter(mds_pos[:, 0], mds_pos[:, 1], s=30, c=[colors[i] for i in tree_predict_mds])
    plt.title('Decision tree - MDS - ' + attribute + ' - Prediction accuracy: ' + str(acc))

    plt.subplot(122)
    plt.scatter(mds_pos[:, 0], mds_pos[:, 1], s=30, c=[colors[1 if i else 0] for i in businesses_without_nans[attribute]])
    patches = [ mpatches.Patch(color=colors[i], label='False' if i == 0 else 'True') for i in range(2) ]
    plt.legend(handles=patches)
    plt.legend()
    plt.title('Groundtruth - MDS - ' + attribute)
    
    acc = accuracy_score(tree_predict_pca, businesses_without_nans[attribute].astype(int))
    plt.figure(figsize=[20,7])
    plt.subplot(121)
    plt.scatter(pca_pos[:, 0], pca_pos[:, 1], s=30, c=[colors[i] for i in tree_predict_pca])
    plt.title('Decision tree - PCA - ' + attribute + ' - Prediction accuracy: ' + str(acc))

    plt.subplot(122)
    plt.scatter(pca_pos[:, 0], pca_pos[:, 1], s=30, c=[colors[1 if i else 0] for i in businesses_without_nans[attribute]])
    patches = [ mpatches.Patch(color=colors[i], label='False' if i == 0 else 'True') for i in range(2) ]
    plt.legend(handles=patches)
    plt.legend()
    plt.title('Groundtruth - PCA - ' + attribute)
    
    data_train, data_test, target_train, target_test = sklearn.cross_validation.train_test_split(fit_transformed_businesses_array, businesses_without_nans[attribute].astype(int), test_size=0.20, random_state = random_state)
    instance = DecisionTreeClassifier()
    r = instance.fit(data_train, target_train)
    target_predict = instance.predict(data_test)
    print('Prediction accuracy 20% Testset: ',accuracy_score(target_predict, target_test))

Let's discuess the results:

Unsupervised techniques are not suitable, because there are no real clusters which can be detected.

Still, with supervised techniques, pretty nice results can be obtained. 