# Analyze Data

Korte data-analyse voor het verkennen van de Yelp-data voor het verslag van week 1.

In [1]:
# Imports
import pandas as pd
import os
import json
import random
from IPython.display import display
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
"""
This file loads the data from the data directory and shows you how.
Feel free to change the contents of this file!
Do ensure these functions remain functional:
    - get_business(city, business_id)
    - get_reviews(city, business_id=None, user_id=None, n=10)
    - get_user(username)
"""

import os
import json
import random

# Root folder of the dataset; the loaders in this notebook read DATA_DIR/<city>/<name>.json.
DATA_DIR = "../data"


def load_cities(data_dir=None):
    """
    Find all cities, i.e. all sub-directory names, in the data directory.

    Parameters:
        data_dir: directory to scan; defaults to the module-level DATA_DIR.

    Returns a sorted list of city names. Plain files that happen to live in
    the data directory (for example ".DS_Store" or a README) are skipped,
    because load() would otherwise try to open "<file>/<name>.json" and fail.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    # Sorting makes the result independent of the file system's listing order.
    return sorted(
        entry
        for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    )


def load(cities, data_filename, data_dir=None):
    """
    Given a list of city names,
        for each city extract all data from <data_dir>/<city>/<data_filename>.json

    Parameters:
        cities: iterable of city (sub-directory) names.
        data_filename: file name without the ".json" extension, e.g. "review".
        data_dir: root directory of the data; defaults to the module-level DATA_DIR.

    Each file is expected to hold one JSON document per line. Blank lines are
    skipped.

    Returns a dictionary of the form:
        {
            <city1>: [<entry1>, <entry2>, ...],
            <city2>: [<entry1>, <entry2>, ...],
            ...
        }

    Raises OSError if a file cannot be opened and json.JSONDecodeError if a
    non-blank line is not valid JSON.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    data = {}
    for city in cities:
        path = os.path.join(data_dir, city, f"{data_filename}.json")
        # Be explicit about UTF-8: without it open() uses the locale's codec,
        # which can garble or fail to decode non-ASCII text on some platforms.
        with open(path, "r", encoding="utf-8") as f:
            data[city] = [json.loads(line) for line in f if line.strip()]
    return data


def get_business(city, business_id):
    """
    Look up one business of a city by its id.

    Scans BUSINESSES[city] in order and returns the first entry whose
    "business_id" equals the requested id. The entry is a dictionary of the form:
        {
            name:str,
            business_id:str,
            stars:str,
            ...
        }

    Raises IndexError when no entry in that city has this id.
    """
    not_found = object()
    candidates = (entry for entry in BUSINESSES[city] if entry["business_id"] == business_id)
    hit = next(candidates, not_found)
    if hit is not_found:
        raise IndexError(f"invalid business_id {business_id}")
    return hit


def get_reviews(city, business_id=None, user_id=None, n=10):
    """
    Draw up to n random reviews from a city, optionally narrowed down to one
    business and/or one user.

    A falsy business_id or user_id (None, "") switches that filter off.
    Returns a list with at most n review dictionaries, each of the form:
        {
            text:str,
            stars:str,
            ...
        }
    """
    matching = [
        review
        for review in REVIEWS[city]
        if (not business_id or review["business_id"] == business_id)
        and (not user_id or review["user_id"] == user_id)
    ]
    sample_size = min(n, len(matching))
    return random.sample(matching, sample_size)


def get_user(username):
    """
    Find a user by name across all cities.

    Walks the cities of USERS in order and returns the first user whose
    "name" equals username, as a dictionary of the form:
        {
            user_id:str,
            name:str,
            ...
        }

    Raises IndexError when no user has that name.
    """
    not_found = object()
    everyone = (user for city_users in USERS.values() for user in city_users)
    match = next((user for user in everyone if user["name"] == username), not_found)
    if match is not_found:
        raise IndexError(f"invalid username {username}")
    return match


# Load every dataset once, up front. Each name below maps a city to its list of records.
CITIES = load_cities()
USERS, BUSINESSES, REVIEWS, TIPS, CHECKINS = (
    load(CITIES, dataset)
    for dataset in ("user", "business", "review", "tip", "checkin")
)


In [3]:
''' Schoonmaken BUSINESSES'''
#business = citymerge(BUSINESSES)
#business = business[business['is_open'] == 1 & business['categories'].notna()]
#business['categories'] = business['categories'].str.split(', ')
#print(business.latitude.mean(), business.longitude.mean())

' Schoonmaken BUSINESSES'

In [4]:
# Merge the data of all cities into a single DataFrame
def citymerge(var):
    """
    Stack the records of every city in `var` into one DataFrame.

    `var` maps a city name to that city's list of records. The row numbers
    each record had within its own city are kept in an "index" column, next
    to a fresh 0..n-1 index over the combined frame.
    """
    frames = [pd.DataFrame(var[city_name]) for city_name in var]
    stacked = pd.concat(frames)
    return stacked.reset_index()

In [75]:
# Build the utility matrix and the mean-centred utility matrix from a reviews DataFrame
def create_utility_matrix(var):
    """
    Pivot a reviews DataFrame into a business x user rating matrix.

    `var` needs the columns 'business_id', 'user_id' and 'stars'.

    Returns a tuple (utility_matrix, mean_utility_matrix):
        utility_matrix: rows are business ids, columns are user ids and the
            cells hold the 'stars' value aggregated by pivot_table; pairs
            without a review are NaN.
        mean_utility_matrix: the same matrix with every column's (user's)
            mean subtracted.
    """
    ratings = pd.pivot_table(var, values='stars', index='business_id', columns='user_id')
    user_means = ratings.mean()
    centred = ratings - user_means
    return ratings, centred
    
# NOTE(review): in the visible notebook `reviews` is only assigned in a later cell
# (`reviews = pd.read_json(...)`), so on a fresh kernel that cell has to run before this one.
utility_matrix, mean_utility_matrix = create_utility_matrix(reviews)

In [6]:
# Build the similarity matrix from the mean-centred utility matrix
def similarity(mum):
    """
    Compute the pairwise cosine similarity between the rows of `mum`.

    Missing values in `mum` are treated as 0 for the computation. The result
    is a square DataFrame labelled on both axes with mum's index, in which
    every similarity that is exactly 0 is replaced by NaN.
    """
    scores = cosine_similarity(mum.fillna(0))
    labelled = pd.DataFrame(scores, index=mum.index, columns=mum.index)
    return labelled.replace(0, np.nan)

similarity_matrix = similarity(mean_utility_matrix)

In [7]:
def select_neighborhood(similarity_matrix, utility_matrix, target_user, target_business):
    """
    Select the businesses that target_user has rated and whose similarity to
    target_business is greater than 0.

    Parameters:
        similarity_matrix: square DataFrame of business-to-business similarities.
        utility_matrix: DataFrame with businesses as rows and users as columns.
        target_user: column label in utility_matrix.
        target_business: row/column label in similarity_matrix.

    Returns a Series of similarities indexed by business id, sorted from most
    to least similar. When the user or the business is unknown, an empty
    float Series is returned, so both paths yield the same dtype.
    """
    known_business = (
        target_business in similarity_matrix.index
        and target_business in similarity_matrix.columns
    )
    if not known_business or target_user not in utility_matrix.columns:
        # An explicit dtype avoids pandas' "empty Series without dtype" ambiguity.
        return pd.Series(dtype=float)

    similarities = similarity_matrix[target_business]
    rated_by_user = utility_matrix[target_user].dropna().index

    # NaN similarities compare as False here, so they are dropped as well.
    selected = similarities.index.isin(rated_by_user) & (similarities > 0)
    return similarities[selected].sort_values(ascending=False)

%time neighborhood = select_neighborhood(similarity_matrix, utility_matrix, 'e3fdrK1tMCwWLr76LFe-cA', 'ZTg8adZipR3QDoJmFZZqJw')
display(neighborhood)

Wall time: 4.99 ms


business_id
ZTg8adZipR3QDoJmFZZqJw    1.000000
oZH3Ee7Yjk7u8B4Ed0oVOg    0.052780
eigkQ_PuRON8Se265NqQDQ    0.050649
r0DureDzsHpzs_VZem5k7g    0.046030
0VJ8tBxOpD2OxuioVjaAxA    0.038859
5MWWP4Kpmw0e8d2ib9G7Kg    0.031414
EpJhRvkGDFE-GDPHM32klw    0.019380
PzuyoHj3-VrYK7N8ZestNA    0.015228
spjaRNFn9Lmh4petKBuf5g    0.011917
6tK-R3BQ-GiMxsCGtxpJyw    0.001078
irwDkp2eMP2x-4MfunRt8g    0.000097
Name: ZTg8adZipR3QDoJmFZZqJw, dtype: float64

In [8]:
def weighted_mean(neighborhood, utility_matrix, user_id):
    """
    Predict a rating as the similarity-weighted mean of a user's ratings.

    Parameters:
        neighborhood: Series of similarity weights indexed by business id.
        utility_matrix: DataFrame with businesses as rows and users as columns
            (the row index is assumed to be unique, as produced by pivot_table).
        user_id: column label of the user in utility_matrix.

    Returns sum(weight * rating) / sum(weight) over the neighbours the user
    has actually rated. Returns 0 when no prediction can be made: wrong
    argument types, empty inputs, unknown user, no rated neighbour, or
    weights that sum to 0.
    """
    if not isinstance(neighborhood, pd.Series) or not isinstance(utility_matrix, pd.DataFrame):
        return 0
    if neighborhood.empty or utility_matrix.empty:
        return 0
    if user_id not in utility_matrix.columns:
        return 0

    # Line the user's ratings up with the neighbours; unknown businesses become NaN.
    ratings = utility_matrix[user_id].reindex(neighborhood.index)

    # Normalise only by the weights of neighbours that contribute a rating,
    # otherwise unrated neighbours would drag the prediction towards 0.
    usable = ratings.notna() & neighborhood.notna()
    weights = neighborhood[usable]
    total_weight = weights.sum()
    if total_weight == 0:
        return 0

    return (ratings[usable] * weights).sum() / total_weight
    
prediction = weighted_mean(neighborhood, utility_matrix, 'e3fdrK1tMCwWLr76LFe-cA')
print (prediction)

3.9637712566801744


In [9]:
# Predict a score for businesses and return the scores as a dict
def predictions(utility_matrix, similarity_matrix, user_id):
    """
    Predict a rating for every business the user has not rated yet.

    For each business id in similarity_matrix.index that has no rating by
    user_id in utility_matrix, the prediction is
    weighted_mean(select_neighborhood(...), ...).

    Returns a dict-like mapping business id -> predicted rating, in the order
    of similarity_matrix.index.
    """
    user_ratings = utility_matrix[user_id]
    already_reviewed = user_ratings[user_ratings.notna()].index

    scores = defaultdict()
    for candidate in similarity_matrix.index:
        # Businesses the user already reviewed need no prediction.
        if candidate in already_reviewed:
            continue
        neighbours = select_neighborhood(similarity_matrix, utility_matrix, user_id, candidate)
        scores[candidate] = weighted_mean(neighbours, utility_matrix, user_id)

    return scores

In [10]:
%time userpredict = predictions(utility_matrix, similarity_matrix, 'zyGHTM-BbeY7umYiVFRoSQ')

Wall time: 3.37 s


In [11]:
test = utility_matrix['e3fdrK1tMCwWLr76LFe-cA'][utility_matrix['e3fdrK1tMCwWLr76LFe-cA'].notna()].index


In [12]:
#userpredict
import operator
# sorted_userpredict = sorted(userpredict.items(), key=operator.itemgetter(1), reverse=True)

In [77]:
# sorted_userpredict

In [14]:
# businessdf = citymerge(BUSINESSES)

In [16]:
#businessdf
# NOTE(review): in the visible notebook `businessdf` is only assigned in a commented-out cell
# (`# businessdf = citymerge(BUSINESSES)`), so on a fresh kernel this cell raises NameError.

# Turn the prediction mapping into a Series indexed by business id.
pserie = pd.Series(userpredict)

# Drop predictions equal to 0 (the value weighted_mean returns when it cannot predict) and rank the rest.
test = pserie[pserie != 0].sort_values(ascending=False)

# Keep only the businesses that received a prediction.
test2 = businessdf[businessdf['business_id'].isin(test.index)]

# Attach each business's prediction and order by it, best first.
test2  = test2.assign(prediction=test2['business_id'].map(userpredict)).sort_values('prediction', ascending=False)

# Store the star rating as text.
test2['stars'] = test2['stars'].astype(str)

# First ten open businesses in that order, as a list of plain dicts.
a =test2[test2['is_open'] == 1].head(10)[['business_id', 'stars', 'name', 'city', 'address']].to_dict(orient='records')


In [51]:
'-2XMn8phKIqizvss9PBLCw' in ibcf_predictions.keys()

True

In [76]:
# City whose data the recommender functions in this cell work on.
CITY = 'westlake'
# Read that city's reviews and businesses into DataFrames (lines=True: one JSON record per line).
reviews = pd.read_json(DATA_DIR+'/'+CITY+'/review.json', lines=True)
businesses = pd.read_json(DATA_DIR+'/'+CITY+'/business.json', lines=True)
# display(pd.read_json(DATA_DIR+'/'+CITY+'/user.json', lines=True).head())
# reviews[reviews['user_id'] == 'apP3CApEq6-z59tRLwEBYA']['business_id']

def predictions_combiner(ibcf_predictions, cb_predictions, ibcf_weight=0.8, reviews=None, businesses=None):
    """
    Blend item-based collaborative-filtering (IBCF) and content-based (CB)
    predictions into one ranking.

    Parameters:
        ibcf_predictions: mapping business id -> IBCF prediction (0 = no prediction).
        cb_predictions: mapping business id -> CB prediction (0 = no prediction).
        ibcf_weight: weight of the IBCF prediction; the CB prediction gets
            1 - ibcf_weight.
        reviews, businesses: unused; kept so existing calls that pass them
            keep working. Their defaults are None so that defining this
            function does not depend on notebook globals.

    For a business with both predictions the result is the weighted sum. If
    one of the two is 0 or absent, the other one is used as is. Businesses
    that only occur in ibcf_predictions are included as well.

    Returns a dict business id -> combined prediction, ordered from highest
    to lowest prediction (ties keep their first-seen order).
    """
    cb_weight = 1 - ibcf_weight
    hybrid_predictions = {}

    # dict.fromkeys de-duplicates while keeping order: CB keys first, then IBCF-only keys.
    for business_id in dict.fromkeys([*cb_predictions, *ibcf_predictions]):
        ibcf_prediction = ibcf_predictions.get(business_id, 0)
        cb_prediction = cb_predictions.get(business_id, 0)
        if cb_prediction == 0:
            hybrid_predictions[business_id] = ibcf_prediction
        elif ibcf_prediction == 0:
            hybrid_predictions[business_id] = cb_prediction
        else:
            hybrid_predictions[business_id] = ibcf_prediction * ibcf_weight + cb_prediction * cb_weight

    # A plain key function keeps this block independent of an `operator` import made in another cell.
    ranked = sorted(hybrid_predictions.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked)
    
def content_based_recommender(user_id, businesses=None, reviews=None):
    """
    Predict a content-based rating for every business in `businesses`.

    Parameters:
        user_id: the user to predict for.
        businesses: DataFrame with the columns 'business_id' and 'categories';
            defaults to the notebook-level `businesses`, looked up at call time.
        reviews: DataFrame with the columns 'user_id', 'business_id' and
            'stars'; defaults to the notebook-level `reviews`, looked up at
            call time.

    Both frames are passed on to make_category_dict and
    content_based_predictor, so a custom frame is used consistently.

    Returns a dict business id -> predicted rating.
    """
    # Resolve the defaults when called instead of freezing them when the function is defined.
    if businesses is None:
        businesses = globals()["businesses"]
    if reviews is None:
        reviews = globals()["reviews"]

    category_dict = make_category_dict(user_id, reviews=reviews, businesses=businesses)
    return {
        business_id: content_based_predictor(
            user_id, business_id, category_dict, reviews=reviews, businesses=businesses
        )
        for business_id in businesses['business_id']
    }
    
def content_based_predictor(user_id, business_id, category_dict, reviews=None, businesses=None):
    """
    Predict a rating for one business from the user's per-category averages.

    Parameters:
        user_id: unused; kept for interface compatibility.
        business_id: the business to predict for.
        category_dict: mapping category name -> the user's mean rating for it.
        reviews: unused; kept for interface compatibility.
        businesses: DataFrame with the columns 'business_id' and 'categories'
            (categories as one ', '-separated string); defaults to the
            notebook-level `businesses`, looked up at call time.

    Returns the mean of category_dict's values over the business's categories
    that occur in category_dict, or 0 when there is none (unknown business,
    missing categories, or no overlap with category_dict).
    """
    # Resolve the default when called instead of freezing it when the function is defined.
    if businesses is None:
        businesses = globals()["businesses"]

    is_target = businesses['business_id'] == business_id
    category_lists = businesses.loc[is_target, 'categories'].str.split(', ')

    ratings = [
        category_dict[category]
        for categories in category_lists
        # A business without categories yields NaN/None here instead of a list.
        if isinstance(categories, list)
        for category in categories
        if category in category_dict
    ]
    if not ratings:
        return 0
    return sum(ratings) / len(ratings)
        
def make_category_dict(user_id, reviews=None, businesses=None):
    """
    Compute a user's mean star rating per business category.

    Parameters:
        user_id: the user whose reviews are summarised.
        reviews: DataFrame with the columns 'user_id', 'business_id' and
            'stars'; defaults to the notebook-level `reviews`, looked up at
            call time.
        businesses: DataFrame with the columns 'business_id' and 'categories'
            (categories as one ', '-separated string); defaults to the
            notebook-level `businesses`, looked up at call time.

    Every review contributes its own star rating to each category of the
    reviewed business. Reviews of businesses that are unknown or have no
    categories are ignored.

    Returns a dict category name -> mean stars; empty when the user has no
    usable reviews.
    """
    # Resolve the defaults when called instead of freezing them when the function is defined.
    if reviews is None:
        reviews = globals()["reviews"]
    if businesses is None:
        businesses = globals()["businesses"]

    user_reviews = reviews[reviews['user_id'] == user_id]

    stars_per_category = {}
    # Pair each review with its own stars, so that several reviews of the same
    # business are each counted with their own rating.
    for business_id, stars in zip(user_reviews['business_id'], user_reviews['stars']):
        is_reviewed = businesses['business_id'] == business_id
        category_lists = businesses.loc[is_reviewed, 'categories'].str.split(', ')
        for categories in category_lists:
            # A business without categories yields NaN/None here instead of a list.
            if not isinstance(categories, list):
                continue
            for category in categories:
                stars_per_category.setdefault(category, []).append(stars)

    return {
        category: sum(values) / len(values)
        for category, values in stars_per_category.items()
    }

%time cb_predictions = content_based_recommender('e3fdrK1tMCwWLr76LFe-cA')
%time ibcf_predictions = predictions(utility_matrix, similarity_matrix, 'e3fdrK1tMCwWLr76LFe-cA')
# cb_predictions
# ibcf_predictions
# only_icbf = predictions_combiner(ibcf_predictions, cb_predictions)
all_predictions = predictions_combiner(ibcf_predictions, cb_predictions)

Wall time: 332 ms
Wall time: 2.97 s


In [115]:
def select_items(predictions, n=10, relax=0.1):
    """
    Randomly pick n distinct items from the best-scoring predictions.

    Parameters:
        predictions: mapping item id -> predicted score. It does not need to
            be sorted; NaN scores are ignored.
        n: number of items to return (default 10).
        relax: fraction by which the score threshold is lowered per step
            (default 0.1); must lie strictly between 0 and 1.

    The threshold starts at the mean of the n best scores and is lowered by
    `relax` of its value until at least n items score above it. The result is
    a random sample of n of those items. If the n-th best score is not
    positive, the threshold could never get below it, so the n best items are
    used as the pool instead.

    Returns a list of n item ids without duplicates, or of all items (in
    random order) when there are at most n of them. Returns [] for empty
    input or n <= 0.

    Raises ValueError when relax is not strictly between 0 and 1.
    """
    if not 0 < relax < 1:
        raise ValueError("relax must be strictly between 0 and 1")
    if n <= 0:
        return []

    # NaN != NaN, so this comparison drops scores that are NaN.
    scored = [(item, score) for item, score in predictions.items() if score == score]
    if not scored:
        return []
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    if len(ranked) <= n:
        everything = [item for item, _ in ranked]
        return random.sample(everything, k=len(everything))

    nth_score = ranked[n - 1][1]
    if nth_score <= 0:
        pool = [item for item, _ in ranked[:n]]
    else:
        threshold = sum(score for _, score in ranked[:n]) / n
        # threshold >= nth_score > 0, so shrinking it geometrically must bring
        # it below the n-th best score, at which point the pool holds >= n items.
        while True:
            threshold -= relax * threshold
            pool = [item for item, score in ranked if score > threshold]
            if len(pool) >= n:
                break

    return random.sample(pool, k=n)

select_items(all_predictions)

['hSOT9HNTINzBVV5MbNBMVQ',
 '7BIMKg5DAVi_vHNBd7pmiA',
 'MgO_JJB7k_Szs1sjnGfpOA',
 'JicFOJCipO_JeHzpjcmzCQ',
 'HW7JPZBImm3tyEpDgGxKGQ',
 'NGAUlZFlt8WPjWJqa_kfyQ',
 'Gd5jcl9smnwASaFYmJX75A',
 'tqvs47tAS6i2Z_IOlZbGsg',
 '8x3rDQCKUJfnJSdsU_6Y3g',
 'Nc8VF2ZbwSUIQ8w2CUJvQw']

In [68]:
all_predictions

{'lWCpbAnV232FjLo7FbHLkQ': 5.0,
 'tqvs47tAS6i2Z_IOlZbGsg': 5.0,
 'Gd5jcl9smnwASaFYmJX75A': 4.648648648648648,
 'Cw0mtFh9u2_4SVXe8Tl1rw': 4.333333333333333,
 'BUkk5nwestqU5uH04km_yg': 4.333333333333333,
 '6tK-R3BQ-GiMxsCGtxpJyw': 4.333333333333333,
 'QYPUE-U7bC4YOlSQDSGXLQ': 4.249550243713876,
 'XaqWUocr0MlYVi5yusjk0Q': 4.142745944345377,
 'NGAUlZFlt8WPjWJqa_kfyQ': 4.1110673933906705,
 'HW7JPZBImm3tyEpDgGxKGQ': 4.106779118731299,
 'j_oKMbKv4vlkyRxD9p1fKA': 4.066666666666666,
 'nYvBZYg9rfqWFTYuxSVMdw': 4.064734561544528,
 '-MsRvdPnuw6QuLn5Vxjruw': 4.0,
 'i7lFu1-iadoXW5Hn-JWaeg': 4.0,
 '9vsqbJgjUqQNJpHrSj6jKw': 4.0,
 'iu3nhUGL69utOEyKzOIy3w': 4.0,
 'zOdx4EdRSb2BM06hbq5Tww': 4.0,
 'op0sZT-TNyeTMw0m3HUUDQ': 4.0,
 'uro64fxfGuoUXF159W0n_w': 4.0,
 'Na2HX930AMuPw5eKKy5iiQ': 4.0,
 '5e5lFXdPWUZlAzHi-AUDYQ': 4.0,
 'wAP6tV4HtTHPjVTPP0OBGw': 4.0,
 'vWzfFvSxZCjSWPoVFn6idw': 4.0,
 'pYad8gX0DTOGFDcF5p8dxw': 4.0,
 'Gq9rNXqgV2qnTXy2GXGdtQ': 4.0,
 'GCTVuBcDZZolh8ND4KSwWA': 4.0,
 '8x3rDQCKUJfnJSdsU_6Y3g': 