# Analyze Data

Korte data-analyse voor het verkennen van de Yelp-data voor het verslag van week 1.

In [1]:
# Imports
import pandas as pd
import os
import json
import random
from IPython.display import display
from collections import defaultdict
import numpy as np

In [2]:
"""
This file loads the data from the data directory and shows you how.
Feel free to change the contents of this file!
Do ensure these functions remain functional:
    - get_business(city, business_id)
    - get_reviews(city, business_id=None, user_id=None, n=10)
    - get_user(username)
"""

import os
import json
import random

# Root directory of the data set; load() reads <DATA_DIR>/<city>/<name>.json.
DATA_DIR = "../data"


def load_cities():
    """
    Finds all cities (all directory names) in DATA_DIR.

    Only sub-directories are returned; stray files in DATA_DIR are skipped
    because load() treats every returned name as a city directory.

    Returns a list of city names, in the order os.listdir yields them.
    """
    return [
        entry
        for entry in os.listdir(DATA_DIR)
        if os.path.isdir(os.path.join(DATA_DIR, entry))
    ]


def load(cities, data_filename):
    """
    Given a list of city names,
        for each city extract all data from DATA_DIR/<city>/<data_filename>.json

    Each file is read as JSON lines: one JSON document per non-blank line.
    Files are decoded as UTF-8 so the result does not depend on the
    platform's default encoding.

    Returns a dictionary of the form:
        {
            <city1>: [<entry1>, <entry2>, ...],
            <city2>: [<entry1>, <entry2>, ...],
            ...
        }

    Raises whatever open() / json.loads() raise for a missing file or a
    malformed line.
    """
    data = {}
    for city in cities:
        path = os.path.join(DATA_DIR, city, f"{data_filename}.json")
        with open(path, "r", encoding="utf-8") as f:
            # Skip blank lines (e.g. a trailing newline) instead of crashing.
            data[city] = [json.loads(line) for line in f if line.strip()]
    return data


def get_business(city, business_id):
    """
    Look up a single business of `city` by its id.

    Scans BUSINESSES[city] in order and returns the first entry (a dict)
    whose "business_id" equals `business_id`.

    Raises IndexError when no entry of that city has the given id.
    """
    found = next(
        (entry for entry in BUSINESSES[city] if entry["business_id"] == business_id),
        None,
    )
    if found is None:
        raise IndexError(f"invalid business_id {business_id}")
    return found


def get_reviews(city, business_id=None, user_id=None, n=10):
    """
    Return a random sample of at most `n` reviews from `city`.

    A falsy `business_id` / `user_id` disables that filter; otherwise only
    reviews whose "business_id" / "user_id" equals the given value are kept.

    Returns a list of review entries (dicts), without duplicates, in random
    order. The list is shorter than `n` when fewer reviews match.
    """
    matching = [
        review
        for review in REVIEWS[city]
        if (not business_id or review["business_id"] == business_id)
        and (not user_id or review["user_id"] == user_id)
    ]
    sample_size = min(n, len(matching))
    return random.sample(matching, sample_size)


def get_user(username):
    """
    Find a user by display name across all cities.

    Walks the per-city user lists of USERS in order and returns the first
    entry (a dict) whose "name" equals `username`.

    Raises IndexError when no user has that name.
    """
    candidates = (
        user
        for users in USERS.values()
        for user in users
        if user["name"] == username
    )
    for user in candidates:
        return user
    raise IndexError(f"invalid username {username}")


# Load only the data sets this cell needs; the other loaders stay commented
# out and can be re-enabled by uncommenting them.
# NOTE: get_user() and get_reviews() read USERS / REVIEWS, which this cell
# does not define while their loaders remain commented out.
CITIES = load_cities()
# USERS = load(CITIES, "user")
BUSINESSES = load(CITIES, "business")
# REVIEWS = load(CITIES, "review")
# TIPS = load(CITIES, "tip")
# CHECKINS = load(CITIES, "checkin")


In [3]:
# Open businesses of the city 'ajax' that have categories listed.
business = pd.DataFrame(BUSINESSES['ajax'])
# Each condition is parenthesised: `&` binds tighter than `==`, so the
# unparenthesised form compares is_open against (1 & notna) and also keeps
# closed businesses without categories.
# .copy() makes the filtered frame independent, so the column assignment
# below does not write into a slice of the unfiltered frame.
business = business[(business['is_open'] == 1) & business['categories'].notna()].copy()
# Turn the comma-separated category string into a list of category names.
business['categories'] = business['categories'].str.split(', ')
print(business.latitude.mean(), business.longitude.mean())

43.855821985388744 -79.02810217534036


In [4]:
import operator

# Mean business coordinates per city: {city: {'long': ..., 'lat': ...}}.
citycoord = defaultdict(dict)
for city in CITIES:
    df = pd.DataFrame(BUSINESSES[city])
    citycoord[city].update(
        long=df['longitude'].mean(),
        lat=df['latitude'].mean(),
    )

# One column per city, rows 'long' and 'lat'.
city_coord_df = pd.DataFrame(citycoord)

def make_region(target_city, threshold=100, coord_df=city_coord_df):
    """
    Build a region of nearby cities around `target_city`.

    If the target city alone has at least `threshold` businesses, the region
    is just [target_city]. Otherwise cities are taken from
    find_closest_cities(target_city), nearest first, until the selected
    cities together hold at least `threshold` businesses. When even all
    cities together stay below `threshold`, every city is returned instead
    of looping forever.

    `coord_df` is accepted for backward compatibility but is not used;
    find_closest_cities reads the module-level city_coord_df.

    Returns a list of city names.
    """
    city_list = [target_city]
    business_count = len(BUSINESSES[target_city])
    closest_cities = find_closest_cities(target_city)
    city_count = 0
    # Stop once every known city is included, even if the threshold is
    # still not reached; without this bound the loop would never end.
    while business_count < threshold and city_count < len(closest_cities):
        city_count += 1
        city_list = closest_cities[:city_count]
        business_count = sum(len(BUSINESSES[city]) for city in city_list)
    return city_list

def find_closest_cities(target_city):
    """
    Rank all cities in city_coord_df by distance to `target_city`.

    The distance is the Euclidean norm of the latitude/longitude differences
    in raw coordinate units, not a geographic distance.

    Returns a list of city names, closest first; cities at equal distance
    keep their column order in city_coord_df.
    """
    origin = city_coord_df[target_city]
    target_lat, target_long = origin['lat'], origin['long']

    distances = {}
    for city in city_coord_df:
        delta_lat = city_coord_df[city]['lat'] - target_lat
        delta_long = city_coord_df[city]['long'] - target_long
        distances[city] = np.sqrt(delta_lat ** 2 + delta_long ** 2)

    ranked = sorted(distances.items(), key=operator.itemgetter(1))
    return [city for city, _ in ranked]

def make_region_dict(cities):
    """
    Compute the region of every city in `cities`.

    `cities` is iterated for city names and also forwarded to make_region
    as its `coord_df` argument.

    Returns a dict mapping each city name to the list make_region returns.
    """
    return {city: make_region(city, coord_df=cities) for city in cities}
        
# Region for every city; iterating city_coord_df yields its column labels,
# which are the city names.
regions = make_region_dict(city_coord_df)

DATA_DIR veranderen naar het pad waar de yelp-data staat.

In [5]:
# Directory that contains the Yelp data
DATA_DIR = "../data"

### Bekijken van opbouw Datafiles
Neem hier als voorbeeld de stad Westlake (lekker klein).

In [8]:
CITY = 'westlake'

# Show the first rows of each data file of CITY. Every heading except the
# first is preceded by two blank lines.
for position, dataset in enumerate(('business', 'checkin', 'review', 'tip', 'user')):
    heading = dataset + '.json'
    if position > 0:
        heading = '\n\n' + heading
    print(heading)
    display(pd.read_json(DATA_DIR+'/'+CITY+'/'+dataset+'.json', lines=True).head())

business.json


Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
0,30808 Center Ridge Rd,,-MsRvdPnuw6QuLn5Vxjruw,"Shopping, Home Services, Interior Design, Rugs...",Westlake,"{'Monday': '11:0-20:0', 'Tuesday': '10:0-20:0'...",1,41.422794,-81.962215,Brian's Furniture,44145,3,5.0,OH
1,863 Bradley Rd,,C_oejk3EzfsxP7-owQDkbQ,"Pet Services, Pet Sitting, Pets",Westlake,"{'Monday': '6:30-18:30', 'Tuesday': '6:30-18:3...",1,41.47497,-81.959405,Camp Bow Wow Westlake,44145,22,4.0,OH
2,2004 Crocker Rd,"{'GoodForKids': 'False', 'RestaurantsTakeOut':...",nYvBZYg9rfqWFTYuxSVMdw,"Mexican, Bars, Restaurants, Vegetarian, Nightlife",Westlake,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1,41.461569,-81.951303,Cantina Laredo,44145,62,4.0,OH
3,24481 Detroit Rd,"{'OutdoorSeating': 'True', 'Alcohol': 'u'full_...",0ZN2MfHyjNIkCx7qJvVhDg,"Restaurants, Nightlife, American (New), Dance ...",Westlake,"{'Monday': '11:0-2:0', 'Tuesday': '11:0-2:0', ...",0,41.470596,-81.891643,SB Eightyone,44145,34,2.5,OH
4,26410 Center Ridge Rd,{'ByAppointmentOnly': 'True'},VZKWW2zQbk-rxwpAcque8w,"Dermatologists, Doctors, Health & Medical",Westlake,"{'Monday': '9:0-17:0', 'Tuesday': '9:0-17:0', ...",1,41.452693,-81.913515,"Kish L Stephen, MD",44145,3,3.0,OH




checkin.json


Unnamed: 0,business_id,date
0,-MsRvdPnuw6QuLn5Vxjruw,2016-02-17 22:58:06
1,-ak1fx5L9cNjUE56as12MA,"2017-12-30 17:13:44, 2018-02-11 15:36:23, 2018..."
2,-lAV1uegafxCjGE306kBYQ,"2015-07-01 16:49:32, 2015-07-03 13:23:24, 2015..."
3,-mP3F3srknwKJdJ5FqcX5Q,"2010-07-04 04:42:04, 2010-08-22 18:31:16, 2010..."
4,-pmqS-odJCmxOvWfRFAQ9Q,"2011-03-20 21:32:56, 2011-06-27 17:00:25, 2011..."




review.json


Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,2lcK3d4K7FU6O8wXdWzOmA,1,2015-10-07 22:34:13,0,yO9uwcDlzcFBpp6xSq_wOg,5,Great and fresh food (had filet mignon and asp...,0,apP3CApEq6-z59tRLwEBYA
1,bzdb1jJ1j8Qn_RVHY97FnA,0,2011-10-27 13:11:48,0,ZrQ6PqgZZOcaH8pDlil1ww,2,I feel like Friday's food menu has gotten wors...,1,xtwoOTTOuZrXj4GQtsueuA
2,z71IqTCbQW7uzw2H2T1QrA,0,2015-01-18 16:00:39,0,zVBQuDrIBMPEZHyu9YJjLg,5,87 West is awesome! The decor and fireplace c...,0,ecz6xGzGm1-wwswYh8VT8A
3,KR2kRmHnRCaNzOUEGoB25w,1,2015-01-04 01:22:20,0,HzD24-WZ8pzGGOZeN_ktIA,5,"Awesome burgers, great atmosphere, attentive w...",0,bJ1ir7YZ-e-cigMahFLEIw
4,z58nyUVyDV-vC7nXFfvR5g,0,2010-05-15 00:58:29,0,DXGcEtOckOkrs-7HRp4eXQ,3,"Looking for a quick mexican fix? Good food, l...",1,FO8hILdACBsZrVDur5x8RA




tip.json


Unnamed: 0,business_id,compliment_count,date,text,user_id
0,yBPeUobSSaJQKaYYgiOKYA,0,2012-07-31 12:05:54,"For collision estimates, enter through John La...",xtwoOTTOuZrXj4GQtsueuA
1,MWcN6qLWTfRJqLySxwJNqQ,0,2015-09-12 17:05:52,They no longer have the weekend lunch buffet!,0tC5OOTuwvPGnnqhPCoRSw
2,uFL4KroMu-dFRsSJrcFzVQ,0,2012-03-15 20:31:12,Tuesday's (5p - close) are $1 Burger Nights ju...,xtwoOTTOuZrXj4GQtsueuA
3,uFL4KroMu-dFRsSJrcFzVQ,0,2012-03-15 20:29:44,"Starting March 26th - Happy Hour, Monday thru ...",xtwoOTTOuZrXj4GQtsueuA
4,cbjF6szaq2orE0BplGAKEA,0,2014-09-08 22:46:58,"The hooley hunks are great, big chunks of high...",owj_Ij0UvMi8ZiGa_Y_deg




user.json


Unnamed: 0,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,...,cool,elite,fans,friends,funny,name,review_count,useful,user_id,yelping_since
0,3.91,756,57,756,366,36,59,541,164,1696,...,6712,201020112012201320142015201620172018,1150,"vvqKD8Xow2IsmwOL1lqS9Q, -HLE-x7Lpkfprd6er-JFGg...",4257,Cara,1027,10719,NfU0zDaTMEQ4-X9dbQWd9A,2010-07-21 03:42:39
1,3.62,1,0,1,0,0,0,0,0,1,...,11,,4,"CH3YgwWreJHEU75vntW2XA, n6VF7X8cSEN3UjR2n21Rqw...",11,Brian,19,57,wVVH8vsOOhdJ-5nigb77xg,2012-09-24 02:21:48
2,4.35,32,4,32,20,1,10,19,4,48,...,361,201220132014201520162017,29,"atfOidazPQ0gtIKMc_xqIA, rBVubLc9uLDZLrIhd7OiQg...",709,Jennifer,302,840,tWVlSt-66fl6bk9_5UzltQ,2012-05-11 17:16:04
3,2.85,0,0,0,0,0,0,1,0,0,...,4,,0,"IhiSlJZKEZL0O1syeW3gvQ, mUXt36V1irLlA_UZtEEnfQ...",20,Kevin,19,25,Khecid8LZRq9n6xe4qjgLA,2009-08-09 16:33:01
4,4.38,6,0,6,3,0,1,10,2,13,...,285,2015201620172018,21,"HSWii1Xd63kXmR7Rp2rp3g, 9XJxkbzctH2eL12u-qAZUw...",127,Dawn,589,674,sxEzazybwfF86Mv8Dm3tgw,2009-11-04 14:13:29


### Kopie van functies/variabelen uit data.py bestand.

In [None]:
# Kopie van data.py-bestand van het project
"""
This file loads the data from the data directory and shows you how.
Feel free to change the contents of this file!
Do ensure these functions remain functional:
    - get_business(city, business_id)
    - get_reviews(city, business_id=None, user_id=None, n=10)
    - get_user(username)
"""

def load_cities():
    """
    Finds all cities (all directory names) in DATA_DIR.

    Entries of DATA_DIR that are not directories are left out, since load()
    opens <DATA_DIR>/<city>/<file>.json for every returned name.

    Returns a list of city names, in the order os.listdir yields them.
    """
    cities = []
    for entry in os.listdir(DATA_DIR):
        if os.path.isdir(os.path.join(DATA_DIR, entry)):
            cities.append(entry)
    return cities


def load(cities, data_filename):
    """
    Given a list of city names,
        for each city extract all data from DATA_DIR/<city>/<data_filename>.json

    The files are JSON lines: every non-blank line is parsed as one JSON
    document. Decoding is fixed to UTF-8 rather than the platform default.

    Returns a dictionary of the form:
        {
            <city1>: [<entry1>, <entry2>, ...],
            <city2>: [<entry1>, <entry2>, ...],
            ...
        }

    Raises whatever open() / json.loads() raise for a missing file or a
    malformed line.
    """
    data = {}
    for city in cities:
        city_data = []
        with open(f"{DATA_DIR}/{city}/{data_filename}.json", "r", encoding="utf-8") as f:
            for line in f:
                # Blank lines carry no record; json.loads would reject them.
                if not line.strip():
                    continue
                city_data.append(json.loads(line))
        data[city] = city_data
    return data


def get_business(city, business_id):
    """
    Return the business entry of `city` that has the given id.

    The entries of BUSINESSES[city] are checked in order; the first one
    whose "business_id" equals `business_id` is returned as a dict.

    Raises IndexError when the city has no business with that id.
    """
    candidates = iter(BUSINESSES[city])
    while True:
        entry = next(candidates, None)
        if entry is None:
            # Ran out of entries without a match.
            raise IndexError(f"invalid business_id {business_id}")
        if entry["business_id"] == business_id:
            return entry


def get_reviews(city, business_id=None, user_id=None, n=10):
    """
    Pick up to `n` random reviews written in `city`.

    When `business_id` and/or `user_id` are given (truthy), only reviews
    matching those values are candidates; a falsy value means "no filter".

    Returns a list of review entries (dicts) with min(n, matches) elements.
    """
    def matches(review):
        wrong_business = bool(business_id) and review["business_id"] != business_id
        if wrong_business:
            return False
        wrong_user = bool(user_id) and review["user_id"] != user_id
        return not wrong_user

    selected = list(filter(matches, REVIEWS[city]))
    return random.sample(selected, min(n, len(selected)))


def get_user(username):
    """
    Return the first user whose name equals `username`.

    The user lists in USERS are searched city by city, in dictionary order;
    the matching entry is returned as a dict.

    Raises IndexError when no user carries that name.
    """
    for users in USERS.values():
        match = next((user for user in users if user["name"] == username), None)
        if match is not None:
            return match
    raise IndexError(f"invalid username {username}")


# Only the user data is loaded in this cell; the other loaders are kept as
# comments and can be re-enabled by uncommenting them.
# NOTE: get_reviews() reads REVIEWS, which this cell does not define.
CITIES = load_cities()
USERS = load(CITIES, "user")
#BUSINESSES = load(CITIES, "business")
#REVIEWS = load(CITIES, "review")
#TIPS = load(CITIES, "tip")
#CHECKINS = load(CITIES, "checkin")

BUSINESSES, REVIEWS, TIPS en CHECKINS zijn uitgecommentarieerd omdat deze naar mijn mening (nog) niet nodig zijn in deze analyse; dat scheelt flink wat werkgeheugen en tijd.

all_users is een DataFrame die enkel de review count van elke gebruiker bevat.

In [None]:
# Review count of every user in every city, as a one-column DataFrame.
# The per-city Series are collected first and concatenated once; calling
# pd.concat inside the loop copies the accumulated data on every iteration.
review_counts = [pd.DataFrame(USERS[city])['review_count'] for city in CITIES]
# The single column is labelled 0 because later cells select it with
# all_users[0] / sort_values(0). Without any city the frame stays empty.
all_users = pd.concat(review_counts).to_frame(name=0) if review_counts else pd.DataFrame()

### Top 10 van gebruikers met meeste reviews.

In [None]:
# Ten rows with the highest values in column 0 (the review counts).
all_users.sort_values(0, ascending=False).head(10)

### Grafiek die het aantal reviews per gebruiker laat zien
Dit moet op een logaritmische y-schaal (logy), anders is het niet goed zichtbaar vanwege de 'long tail'.

In [None]:
# Review counts sorted from high to low, plotted against rank position
# (use_index=False) on a logarithmic y-axis.
display(all_users[0].sort_values(ascending=False).plot(use_index=False, logy=True, legend=False))

### Histogram van het aantal reviews per gebruiker

Deze is ook op een logy-schaal vanwege het reusachtige aantal gebruikers met relatief weinig reviews (meer dan 1 miljoen)

In [None]:
# Histogram of the review counts on a logarithmic y-axis.
# Column 0 is selected explicitly: the first positional parameter of
# DataFrame.plot.hist is `by` (a grouping key), not the column to plot.
display(all_users[0].plot.hist(logy=True, bins=10, legend=False))