# Analyze Data

Korte data-analyse voor het verkennen van de Yelp-data voor het verslag van week 1.

In [1]:
# Imports
import pandas as pd
import os
import json
import random
from IPython.display import display
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
"""
This file loads the data from the data directory and shows you how.
Feel free to change the contents of this file!
Do ensure these functions remain functional:
    - get_business(city, business_id)
    - get_reviews(city, business_id=None, user_id=None, n=10)
    - get_user(username)
"""

import os
import json
import random

DATA_DIR = "../data"


def load_cities():
    """
    List the cities for which data is available.

    A city is any sub-directory of DATA_DIR. Plain files that happen to live
    there (e.g. ``.DS_Store`` or a README) are ignored, because ``load``
    would otherwise try to open ``<DATA_DIR>/<file>/<name>.json`` and crash.

    Returns:
        list[str]: the city directory names in alphabetical order, so the
        result does not depend on the order in which the OS lists entries.
    """
    return sorted(
        entry
        for entry in os.listdir(DATA_DIR)
        if os.path.isdir(os.path.join(DATA_DIR, entry))
    )


def load(cities, data_filename):
    """
    Read one JSON-lines data file for each of the given cities.

    For every city the file ``<DATA_DIR>/<city>/<data_filename>.json`` is
    opened and each non-blank line is parsed as one JSON document.

    Args:
        cities: iterable of city (directory) names.
        data_filename: file name without the ``.json`` extension,
            e.g. ``"business"``.

    Returns:
        dict: ``{<city>: [<entry1>, <entry2>, ...], ...}`` with one list of
        parsed entries per city, in file order.

    Raises:
        OSError: if a city's file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data = {}
    for city in cities:
        path = f"{DATA_DIR}/{city}/{data_filename}.json"
        # Decode explicitly as UTF-8: the platform default (e.g. cp1252 on
        # Windows) would mangle or reject non-ASCII text in the records.
        with open(path, "r", encoding="utf-8") as f:
            # Skip blank lines (such as a trailing empty line), which
            # json.loads would reject.
            data[city] = [json.loads(line) for line in f if line.strip()]
    return data


def get_business(city, business_id):
    """
    Look up a single business record in one city.

    Scans BUSINESSES[city] in order and returns the first entry whose
    "business_id" equals the requested id. The entry is the dictionary
    parsed from the data file, e.g.:
        {
            name:str,
            business_id:str,
            stars:str,
            ...
        }

    Raises:
        IndexError: if no business in that city has the given id.
    """
    not_found = object()
    candidates = (
        entry for entry in BUSINESSES[city]
        if entry["business_id"] == business_id
    )
    match = next(candidates, not_found)
    if match is not_found:
        raise IndexError(f"invalid business_id {business_id}")
    return match


def get_reviews(city, business_id=None, user_id=None, n=10):
    """
    Draw a random sample of reviews from one city.

    Args:
        city: key into REVIEWS.
        business_id: when given (truthy), keep only reviews of this business.
        user_id: when given (truthy), keep only reviews written by this user.
        n: maximum number of reviews to return.

    Returns:
        A list of at most n review dictionaries, in random order; each has
        the form:
            {
                text:str,
                stars:str,
                ...
            }
        Fewer than n are returned when fewer reviews match.
    """
    matching = [
        entry
        for entry in REVIEWS[city]
        if (not business_id or entry["business_id"] == business_id)
        and (not user_id or entry["user_id"] == user_id)
    ]
    sample_size = min(n, len(matching))
    return random.sample(matching, sample_size)


def get_user(username):
    """
    Find a user record by display name.

    Walks the user lists of every city in USERS, in order, and returns the
    first record whose "name" equals username. The record has the form:
        {
            user_id:str,
            name:str,
            ...
        }

    Raises:
        IndexError: if no user with that name exists in any city.
    """
    not_found = object()
    hits = (
        person
        for city_users in USERS.values()
        for person in city_users
        if person["name"] == username
    )
    first_hit = next(hits, not_found)
    if first_hit is not_found:
        raise IndexError(f"invalid username {username}")
    return first_hit


# Load every dataset once when this cell runs. CITIES is the list of city
# names; each of the other names is a dict mapping a city to the list of
# entries parsed from that city's <name>.json file.
CITIES = load_cities()
USERS = load(CITIES, "user")
BUSINESSES = load(CITIES, "business")
REVIEWS = load(CITIES, "review")
TIPS = load(CITIES, "tip")
CHECKINS = load(CITIES, "checkin")


In [3]:
''' Schoonmaken BUSINESSES'''
#business = citymerge(BUSINESSES)
#business = business[business['is_open'] == 1 & business['categories'].notna()]
#business['categories'] = business['categories'].str.split(', ')
#print(business.latitude.mean(), business.longitude.mean())

' Schoonmaken BUSINESSES'

In [4]:
def citymerge(var):
    """
    Merge the per-city data of one dataset into a single DataFrame.

    Args:
        var: dict mapping city -> list of records (e.g. REVIEWS, BUSINESSES).

    Returns:
        pd.DataFrame with the rows of all cities stacked in dict order.
        Because the index is reset without dropping it, the result carries
        an extra "index" column holding each row's position within its own
        city.
    """
    per_city_frames = []
    for city in var:
        per_city_frames.append(pd.DataFrame(var[city]))
    stacked = pd.concat(per_city_frames)
    return stacked.reset_index()

In [5]:
def create_utility_matrix(var):
    """
    Build the utility matrix and its mean-centred version from review data.

    Args:
        var: per-city dataset as loaded above (city -> list of records) whose
            records have 'business_id', 'user_id' and 'stars' fields,
            e.g. REVIEWS.

    Returns:
        (utility_matrix, mean_utility_matrix): two DataFrames with one row
        per business and one column per user. The first holds the star
        ratings (NaN where a user did not rate a business); the second is
        the same matrix with every column's mean subtracted from it.
    """
    ratings_long = citymerge(var)

    # pivot_table averages the stars when a user rated a business more than once.
    ratings = ratings_long.pivot_table(
        index='business_id', columns='user_id', values='stars'
    )

    # ratings.mean() is one value per column (user); sub() aligns it on the columns.
    centred = ratings.sub(ratings.mean())

    return ratings, centred
    
# Build both matrices from the reviews of all cities; the later cells read
# them as notebook-level variables.
utility_matrix, mean_utility_matrix = create_utility_matrix(REVIEWS)

In [6]:
def similarity(mum):
    """
    Compute the business-to-business similarity matrix.

    Args:
        mum: mean-centred utility matrix (rows = businesses, columns = users).

    Returns:
        Square pd.DataFrame indexed and labelled by mum.index with the cosine
        similarity between every pair of rows. Missing ratings count as 0
        for the computation, and every similarity that is exactly 0 is
        reported as NaN.
    """
    labels = mum.index
    filled = mum.fillna(0)
    scores = cosine_similarity(filled)
    table = pd.DataFrame(scores, index=labels, columns=labels)
    return table.replace(0, np.nan)

# Business-by-business similarity matrix derived from the mean-centred ratings.
similarity_matrix = similarity(mean_utility_matrix)

In [7]:
def select_neighborhood(similarity_matrix, utility_matrix, target_user, target_business):
    """
    Select the businesses that may be used to predict one user's rating.

    The neighbourhood of target_business consists of every business that
    (a) target_user has rated and (b) has a similarity strictly greater
    than 0 with target_business.

    Args:
        similarity_matrix: square DataFrame of business similarities.
        utility_matrix: DataFrame of ratings (rows = businesses,
            columns = users, NaN = not rated).
        target_user: user id (column label of utility_matrix).
        target_business: business id (label of similarity_matrix).

    Returns:
        pd.Series of similarities indexed by business id, sorted from most to
        least similar. An empty float Series is returned when the user or
        the business is unknown. If the user already rated target_business
        itself, it is included too (with its self-similarity).
    """
    # Unknown user or business: return an empty result rather than raising.
    # The dtype is given explicitly because a bare pd.Series() is deprecated
    # and would yield an object-dtype Series, unlike the normal return path.
    if target_business not in similarity_matrix.index or target_user not in utility_matrix.columns:
        return pd.Series(dtype=float)

    similarities = similarity_matrix[target_business]
    rated_by_user = utility_matrix[target_user].dropna().index

    # Keep businesses the user rated that are positively similar to the target
    # (NaN similarities compare as False and are dropped as well).
    keep = similarities.index.isin(rated_by_user) & (similarities > 0)

    return similarities[keep].sort_values(ascending=False)

# Example: time the neighbourhood lookup for one user/business pair and show the result.
%time neighborhood = select_neighborhood(similarity_matrix, utility_matrix, 'e3fdrK1tMCwWLr76LFe-cA', 'ZTg8adZipR3QDoJmFZZqJw')
display(neighborhood)

Wall time: 16.5 ms


business_id
ZTg8adZipR3QDoJmFZZqJw    1.000000
oZH3Ee7Yjk7u8B4Ed0oVOg    0.052780
eigkQ_PuRON8Se265NqQDQ    0.050649
r0DureDzsHpzs_VZem5k7g    0.046030
0VJ8tBxOpD2OxuioVjaAxA    0.038859
5MWWP4Kpmw0e8d2ib9G7Kg    0.031414
EpJhRvkGDFE-GDPHM32klw    0.019380
PzuyoHj3-VrYK7N8ZestNA    0.015228
spjaRNFn9Lmh4petKBuf5g    0.011917
6tK-R3BQ-GiMxsCGtxpJyw    0.001078
irwDkp2eMP2x-4MfunRt8g    0.000097
Name: ZTg8adZipR3QDoJmFZZqJw, dtype: float64

In [9]:
def weighted_mean(neighborhood, utility_matrix, user_id):
    """
    Predict a rating as the similarity-weighted mean of a user's ratings.

    The prediction is sum(similarity * rating) / sum(similarity), taken over
    the neighbours for which both a similarity and a rating by user_id are
    available. Neighbours the user has not rated are left out of both the
    numerator and the denominator, so they cannot pull the result down.

    Args:
        neighborhood: pd.Series of similarities indexed by business id.
        utility_matrix: DataFrame of ratings (rows = businesses,
            columns = users, NaN = not rated).
        user_id: column label of the user whose rating is predicted.

    Returns:
        The weighted mean rating, or 0 when no prediction can be made
        (wrong argument types, empty inputs, unknown user, or a total
        weight of zero).
    """
    # Wrong argument types: no prediction possible.
    if not (isinstance(neighborhood, pd.Series) and isinstance(utility_matrix, pd.DataFrame)):
        return 0

    # Empty inputs or unknown user: no prediction possible.
    if neighborhood.empty or utility_matrix.empty or user_id not in utility_matrix.columns:
        return 0

    # Ratings of this user for exactly the neighbouring businesses.
    ratings = utility_matrix[user_id].reindex(neighborhood.index)

    # Use only neighbours with both a rating and a similarity.
    usable = ratings.notna() & neighborhood.notna()
    weights = neighborhood[usable]
    total_weight = weights.sum()

    # Guard against division by zero (no usable neighbours or all-zero weights).
    if total_weight == 0:
        return 0

    return (ratings[usable] * weights).sum() / total_weight
    
# Example: predicted rating for the same user, based on the neighbourhood selected above.
prediction = weighted_mean(neighborhood, utility_matrix, 'e3fdrK1tMCwWLr76LFe-cA')
print (prediction)

3.9637712566801744


In [39]:
def predictions(utility_matrix, similarity_matrix, user_id):
    """
    Predict a score for every business the user has not rated yet.

    Args:
        utility_matrix: DataFrame of ratings (rows = businesses,
            columns = users, NaN = not rated).
        similarity_matrix: square DataFrame of business similarities.
        user_id: column label of the user to predict for.

    Returns:
        dict mapping business id -> predicted rating, in the order of
        similarity_matrix.index. Businesses the user already rated are left
        out. A prediction of 0 means weighted_mean could not produce one.

    Raises:
        KeyError: if user_id is not a column of utility_matrix.
    """
    user_ratings = utility_matrix[user_id]

    # Businesses that already have a rating from this user are not predicted.
    already_rated = set(user_ratings[user_ratings.notna()].index)

    return {
        business_id: weighted_mean(
            select_neighborhood(similarity_matrix, utility_matrix, user_id, business_id),
            utility_matrix,
            user_id,
        )
        for business_id in similarity_matrix.index
        if business_id not in already_rated
    }

In [106]:
# Time the prediction of all not-yet-rated businesses for one user.
%time userpredict = predictions(utility_matrix, similarity_matrix, 'zyGHTM-BbeY7umYiVFRoSQ')

Wall time: 7.91 s


In [37]:
# Ids of the businesses the example user has rated (non-NaN entries in that user's column).
# NOTE(review): the name `test` is reassigned by a later cell, so its meaning depends on execution order.
test = utility_matrix['e3fdrK1tMCwWLr76LFe-cA'][utility_matrix['e3fdrK1tMCwWLr76LFe-cA'].notna()].index


0VJ8tBxOpD2OxuioVjaAxA
1NNBpiQ3rUT-7T8ch45SVA
47me-6Zme7RYR0zEonfaHg
5MWWP4Kpmw0e8d2ib9G7Kg
5rb2tq_AhxQuRYu_bh464g
6tK-R3BQ-GiMxsCGtxpJyw
EpJhRvkGDFE-GDPHM32klw
PzuyoHj3-VrYK7N8ZestNA
REfekXeUbPWGkgvlzSNbpw
ZTg8adZipR3QDoJmFZZqJw
cbjF6szaq2orE0BplGAKEA
dVvij7VRh55dISu02I0IGw
eigkQ_PuRON8Se265NqQDQ
irwDkp2eMP2x-4MfunRt8g
oZH3Ee7Yjk7u8B4Ed0oVOg
r0DureDzsHpzs_VZem5k7g
spjaRNFn9Lmh4petKBuf5g
uisfAopUaCje3w5_bv4nrQ
wTNWq7jrCZD1q2hSjUtTXg
z71IqTCbQW7uzw2H2T1QrA


In [107]:
# List of (business_id, prediction) pairs, highest prediction first.
# NOTE(review): this import sits mid-notebook; it belongs in the imports cell at the top.
import operator
sorted_userpredict = sorted(userpredict.items(), key=operator.itemgetter(1), reverse=True)

In [108]:
sorted_userpredict

[('NF6s6z1NLXuy4feyx2vwRA', 5.000000000000001),
 ('npIJIIypRW9kGSx0b-wUQw', 5.000000000000001),
 ('-4-MzST67P_jnX4mh3MIcw', 5.0),
 ('-YGQwikbX2fXUIjyegR7pw', 5.0),
 ('0Jyc53wOZt4dbbVPrKjuxA', 5.0),
 ('0PK7GImBWGwIJ5shGdYB0w', 5.0),
 ('0qZ9slxJIIDUocyCJefcdw', 5.0),
 ('1PPHo4ZldEQDMkKlqTgqzQ', 5.0),
 ('1mzjn8nQgipFTIrPfCeCmQ', 5.0),
 ('2AF_U-q3ChfeEtyMmJYD8Q', 5.0),
 ('2l7q7SiHvjCtAkQyTECbBA', 5.0),
 ('3PTX5LZr_qUQELqO42lNJg', 5.0),
 ('3Vi_I-TPwrrkfP99jxLVXg', 5.0),
 ('4MpINkjEExYsHCHQy-ZG7A', 5.0),
 ('5Kh5i4VhXj-Leg8gujIzjQ', 5.0),
 ('5SANr9BTMVe_-pe3QM3M-g', 5.0),
 ('7R4CfRmbTt3z1i26-VFlqA', 5.0),
 ('9uGTVWSioqn6sDPd87uVWg', 5.0),
 ('Cm3YN6rN9XOdeJU7pes4Hg', 5.0),
 ('EvDC40_758okpoPqFJy3Gg', 5.0),
 ('GLRwu_Hjf6gB05uUqXhgnQ', 5.0),
 ('GYmdm_Vy7sf9tji7WUcohg', 5.0),
 ('HeguZ8QHnYLk3rH6U72cJQ', 5.0),
 ('I6-Ex80aV62wEWQ4xKxQmA', 5.0),
 ('JQOg5iKV-c8e3b6ty8Jjbg', 5.0),
 ('LLGlAEwWsGR1MY087U9Bzw', 5.0),
 ('OxIw_dQ0kwwKUEEwZO9yAA', 5.0),
 ('OxSaGGTmIujsjDpDqwyGPQ', 5.0),
 ('Oyez2_wkRaPn_DCmN

In [52]:
# One DataFrame with the business records of all cities.
businessdf = citymerge(BUSINESSES)

In [79]:
# Debug output: print the id of every business that received a prediction.
# NOTE(review): this prints one line per business and floods the notebook output.
for k in userpredict.keys():
    print (k)

-2XMn8phKIqizvss9PBLCw
-4-MzST67P_jnX4mh3MIcw
-HbTh_spJOeyEbdj4geK2Q
-MRv4Q6MaA2w2rCJbrct_g
-MsRvdPnuw6QuLn5Vxjruw
-WOgxfwCEkt-6SVcSjdU0w
-XLjdLpTLJ672EMN5Dfrhg
-YGQwikbX2fXUIjyegR7pw
-_theFeQixeQT_V4xZexQg
-_xB9vYFTQDXKEwZrE5zIQ
-ak1fx5L9cNjUE56as12MA
-lAV1uegafxCjGE306kBYQ
-ldren1EOMh3PkExcpyRoA
-mG8f5khTz8aGVoSCSFmow
-mP3F3srknwKJdJ5FqcX5Q
-oi8V6CcxMT4pUcs1XmLEg
-pmqS-odJCmxOvWfRFAQ9Q
-yL5oGeXMwUaq4ysDwF3Lw
00so-3NuawQ98XKLPMVFLg
0I0yATfnNQJB-A-IPGgqDA
0J2QK4Hk8EvJKYGu65qH8w
0Jyc53wOZt4dbbVPrKjuxA
0P5LEcUbE-Pg8eHnLaBC5Q
0PK7GImBWGwIJ5shGdYB0w
0Tz3J6I9tPIwYpcpgMk6jg
0W-qCKguiieaKQM5Pc75zw
0Y3lHyqRHfWOBuQlS1bM0g
0ZN2MfHyjNIkCx7qJvVhDg
0ZNIAF_1d83uK1Z7mADTfA
0nAFblolSC3TfbHDq8yjGQ
0qZ9slxJIIDUocyCJefcdw
10yMh47KzS5Y3-upBuP2vA
12OphEDbFl3NS-A0ZtyC-Q
12_HZrcGfQny55r-bILQWQ
132kYoXPHORw_FgxvQfyGQ
13PSWBZyaP89VaQCRerwQA
14T3cCQcTjAHzoE_4r1bxw
19juhaq-H-bhbwV2qKH6rA
1AYsT9NunR_UXY26rVIjXw
1FnwvtISSL0HxIdPLLow1g
1GLJHF1aIrKujEmfEBHPYQ
1GbAh4ly5s0L3ILc1uTLLA
1IbrFRdTTdPUG0Ne1pEXJA
1LzgGMt7Uw3

In [124]:
# Turn the prediction dict into the top-10 open businesses for the user.
#businessdf
pserie = pd.Series(userpredict)

# Drop businesses whose prediction is 0 and order the rest from high to low.
# NOTE(review): this reuses the name `test` that an earlier cell assigned to something else.
test = pserie[pserie != 0].sort_values(ascending=False)

# Business records for the businesses that kept a non-zero prediction.
test2 = businessdf[businessdf['business_id'].isin(test.index)]

# Attach the prediction to each record and sort by it, best first.
test2  = test2.assign(prediction=test2['business_id'].map(userpredict)).sort_values('prediction', ascending=False)

# Convert the business's own star rating to a string.
test2['stars'] = test2['stars'].astype(str)

# First 10 businesses that are still open, as a list of plain dicts with the selected fields.
a =test2[test2['is_open'] == 1].head(10)[['business_id', 'stars', 'name', 'city', 'address']].to_dict(orient='records')

a

[{'business_id': 'NF6s6z1NLXuy4feyx2vwRA',
  'stars': '2.5',
  'name': 'Lion and Dragon Pub',
  'city': 'Ajax',
  'address': '109 Old Kingston Road'},
 {'business_id': '9uGTVWSioqn6sDPd87uVWg',
  'stars': '3.5',
  'name': "Beryl's Pepperpot",
  'city': 'Ajax',
  'address': '467 Westney Road S, Unit 17'},
 {'business_id': 'cRDmso6337zu81snfJIZAg',
  'stars': '3.5',
  'name': "The Puppy's Place",
  'city': 'Ajax',
  'address': '33 Harwood Avenue S'},
 {'business_id': 'VPpeZPIS3WCyMPHJO__9Hg',
  'stars': '4.5',
  'name': 'Coffee Culture Cafe & Eatery',
  'city': 'Ajax',
  'address': '60 Salem Road'},
 {'business_id': 'I6-Ex80aV62wEWQ4xKxQmA',
  'stars': '3.5',
  'name': "Wimpy's Diner",
  'city': 'Ajax',
  'address': '5 Harwood Ave S'},
 {'business_id': '2l7q7SiHvjCtAkQyTECbBA',
  'stars': '3.5',
  'name': 'Il Fornello',
  'city': 'Ajax',
  'address': '95 Kingston Road E'},
 {'business_id': 'eoR96ZQ3r4t-oglD_a4Heg',
  'stars': '4.0',
  'name': 'Sunset Grill',
  'city': 'Ajax',
  'address'

In [105]:
# Look up the first user record named 'Michael'; its user_id is the one used for `userpredict`.
# NOTE(review): the displayed record includes the user's full friends list — consider clearing this output before sharing.
get_user('Michael')

{'user_id': 'zyGHTM-BbeY7umYiVFRoSQ',
 'name': 'Michael',
 'review_count': 3,
 'yelping_since': '2016-09-25 23:25:24',
 'useful': 1,
 'funny': 1,
 'cool': 1,
 'elite': '',
 'friends': 'TAcgxvr3PTdNSHlj--kBTA, oDhC9wdsV1xdGzKmXSKYwg, 0cP_SBfOAdg_ddsGDSg92Q, UjS-Ev-8Tx2sMM32r2CdHQ, RUIcRc38N5jkeyYhj59_RA, sKi9jo1PAjB1-lJlv3h82w, A1JRZBs_Q367rBTa3x_KEQ, kMHOje0EFFNxPBxhfNF1zQ, eXGm7Ia1B-4rvOdm_CQXdQ, scUIuGLfR78Z7YLOQUyTKQ, nmxyv0TxXDfbaB1oj9IxGA, HovJuY1Ymwtb0I-w9it6iQ, Xc-pqoCvRtRW87O1iWH8FA, ZeYT3cYmDlKQJVl3GKH6sQ, qPrlyZDWAU9mlB_Gs5qviA, lYY_2cr2Jb-y43kZNoDJLg, uVFQCGiQ3TsKvT0kjJwfXg, V-F0cxsBq6vm-ARJ-6Hd_g, F61EiR5H31DIeq8URUYFgQ, irWdkM7anlq1X-lMYxDNJw, jw8H9aAlIjwf_BU1DwgFCg, rvyZlmxk9smQbVUmXVz_WQ, K40206USpgZVuaNR6rbCPw, JoGeZo9_7SddybfazZMTtg, 86Bxbs2xsBe1Yt6JPuDXaQ, TODY7r8cxUG9iylcPmu8Jw, I3ODZv74MEDhblwEzY4iyg, VZn1zXLL7FbMoyMacg5yGg, LWMc-kPe3NA7iPghXCOIbA, 7W8DKjubRBR-3saTg6sz0w, oKwtwE4YLbQd6e7NLZZLDQ, aHg2_U_A9w9RMwDzLzL_0A, pJa8BX-sYX8Pp5KNvumUnw, RhV9322zKFLJ576urk3rqQ,