# Analyze Data

Korte data analyse voor het verkenne van de yelp data voor het verslag van week 1.

In [1]:
# Imports
import pandas as pd
import os
import json
import random
from IPython.display import display
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances, euclidean_distances
from sklearn.model_selection import train_test_split
import numpy as np
import operator

In [2]:
"""
This file loads the data from the data directory and shows you how.
Feel free to change the contents of this file!
Do ensure these functions remain functional:
    - get_business(city, business_id)
    - get_reviews(city, business_id=None, user_id=None, n=10)
    - get_user(username)
"""

import os
import json
import random

DATA_DIR = "../data"


def load_cities():
    """
    Finds all cities (all directory names) in ./data
    Returns a list of city names
    """
    return os.listdir(DATA_DIR)


def load(cities, data_filename):
    """
    Given a list of city names,
        for each city extract all data from ./data/<city>/<data_filename>.json
    Returns a dictionary of the form:
        {
            <city1>: [<entry1>, <entry2>, ...],
            <city2>: [<entry1>, <entry2>, ...],
            ...
        }
    """
    data = {}
    for city in cities:
        city_data = []
        with open(f"{DATA_DIR}/{city}/{data_filename}.json", "r") as f:
            for line in f:
                city_data.append(json.loads(line))
        data[city] = city_data
    return data


def get_business(city, business_id):
    """
    Given a city name and a business id, return that business's data.
    Returns a dictionary of the form:
        {
            name:str,
            business_id:str,
            stars:str,
            ...
        }
    """
    for business in BUSINESSES[city]:
        if business["business_id"] == business_id:
            return business
    raise IndexError(f"invalid business_id {business_id}")


def get_reviews(city, business_id=None, user_id=None, n=10):
    """
    Given a city name and optionally a business id and/or auser id,
    return n reviews for that business/user combo in that city.
    Returns a dictionary of the form:
        {
            text:str,
            stars:str,
            ...
        }
    """
    def should_keep(review):
        if business_id and review["business_id"] != business_id:
            return False
        if user_id and review["user_id"] != user_id:
            return False
        return True

    reviews = REVIEWS[city]
    reviews = [review for review in reviews if should_keep(review)]
    return random.sample(reviews, min(n, len(reviews)))


def get_user(username):
    """
    Get a user by its username
    Returns a dictionary of the form:
        {
            user_id:str,
            name:str,
            ...
        }
    """
    for city, users in USERS.items():
        for user in users:
            if user["name"] == username:
                return user
    raise IndexError(f"invalid username {username}")


CITIES = load_cities()
USERS = load(CITIES, "user")
BUSINESSES = load(CITIES, "business")
REVIEWS = load(CITIES, "review")
TIPS = load(CITIES, "tip")
CHECKINS = load(CITIES, "checkin")


In [3]:
''' Schoonmaken BUSINESSES'''
#business = citymerge(BUSINESSES)
#business = business[business['is_open'] == 1 & business['categories'].notna()]
#business['categories'] = business['categories'].str.split(', ')
#print(business.latitude.mean(), business.longitude.mean())

' Schoonmaken BUSINESSES'

In [4]:
# Functie om data van alle steden samen te voegen in 1 DataFrame
def citymerge(var):
    return pd.concat([pd.DataFrame(var[city]) for city in var]).reset_index()

In [5]:
CITY = 'westlake'
reviews = pd.read_json(DATA_DIR+'/'+CITY+'/review.json', lines=True)
businesses = pd.read_json(DATA_DIR+'/'+CITY+'/business.json', lines=True)
#businesses = citymerge(BUSINESSES)
reviews_train, reviews_test = train_test_split(reviews)#citymerge(REVIEWS)

In [6]:
# Creeer Utility Matrix en Mean Utility Matrix uit een variabele van data.py (REVIEWS, USERS, BUSINESSES, etc.)
def create_utility_matrix(df):

    utility_matrix  = pd.pivot_table(df, index='business_id', columns='user_id', values='stars')

    mean_ultility_matrix = utility_matrix - utility_matrix.mean()
    
    return utility_matrix, mean_ultility_matrix
    
utility_matrix, mean_utility_matrix = create_utility_matrix(reviews_train)

In [15]:
def manhattan_similarity(matrix, id1, id2):
    """Compute manhattan similarity between two rows."""
    # compute distance
    distance = manhattan_distances(matrix, id1, id2)
    
    # if no distance could be computed (no shared features) return a similarity of 0
    if distance is np.nan:
        return 0
    
    # else return similarity
    return 1 / (1 + distance)

In [16]:
# Creeer Similarity Matrix uit Mean Utility Matrix
def similarity(mum):
    similarity_matrix_cosine = pd.DataFrame(cosine_similarity(mum.fillna(0)), index=mum.index, columns=mum.index).replace(0, np.nan)
    similarity_matrix_manhattan = 1 / (pd.DataFrame(manhattan_distances(mum.fillna(0)), index=mum.index, columns=mum.index).replace(0, np.nan) +1)
    similarity_matrix_euclidean = 1 / (pd.DataFrame(euclidean_distances(mum.fillna(0)), index=mum.index, columns=mum.index).replace(0, np.nan) +1)
    
    return similarity_matrix_cosine, similarity_matrix_manhattan, similarity_matrix_euclidean   
    
similarity_matrix, similarity_matrix_manhattan, similarity_matrix_euclidean = similarity(mean_utility_matrix)

In [17]:
def select_neighborhood(similarity_matrix, utility_matrix, target_user, target_business):
    """selects all items with similarity > 0"""
    # Controleer of target_user en target_business wel in de matrix zijn te vinden.
    if (target_business in similarity_matrix.index) and (target_user in utility_matrix.columns):

        # Maak een boolean mask van bedrijven die de gebruiker beoordeeld heeft met een similarity hoger dan 0.
        SelectedBusinesses = (similarity_matrix[target_business].index.isin(utility_matrix[target_user].dropna().index)) & (similarity_matrix[target_business] > 0)
    
        # return de bedrijven met de similarity door gebruik te maken van de eerder gecreeerde boolean mask.
        return similarity_matrix[target_business][SelectedBusinesses].sort_values(ascending = False)
    
    # Bij waarden die niet gevonden kunnen worden geef None terug.
    else:
        return pd.Series()

%time neighborhood = select_neighborhood(similarity_matrix, utility_matrix, 'e3fdrK1tMCwWLr76LFe-cA', 'ZTg8adZipR3QDoJmFZZqJw')
display(neighborhood)

Wall time: 4 ms


business_id
ZTg8adZipR3QDoJmFZZqJw    1.000000
oZH3Ee7Yjk7u8B4Ed0oVOg    0.065292
EpJhRvkGDFE-GDPHM32klw    0.043266
eigkQ_PuRON8Se265NqQDQ    0.040849
PzuyoHj3-VrYK7N8ZestNA    0.023105
cbjF6szaq2orE0BplGAKEA    0.020043
r0DureDzsHpzs_VZem5k7g    0.016938
0VJ8tBxOpD2OxuioVjaAxA    0.013842
spjaRNFn9Lmh4petKBuf5g    0.012479
6tK-R3BQ-GiMxsCGtxpJyw    0.012072
irwDkp2eMP2x-4MfunRt8g    0.003959
Name: ZTg8adZipR3QDoJmFZZqJw, dtype: float64

In [18]:
def weighted_mean(neighborhood, utility_matrix, user_id):
    # Controleer of neighborhood wel een Series is en utility_matrix wel een DataFrame, anders return np.nan.
    if isinstance(neighborhood, pd.Series) and isinstance(utility_matrix, pd.DataFrame):
        # Als neighborhood of de utility_matrix leeg zijn return dan 0.
        if (neighborhood.empty) or (utility_matrix.empty):
            return np.nan
        
        # Controleer of user_id als kolom te vinden is, anders return 0.
        elif user_id in utility_matrix.columns:
    
            # Gebruik de bovenstaande formule om het gewogen gemiddelde voor de neighborhood te berekenen.
            return ((utility_matrix[user_id] * neighborhood).dropna().sum()) / (neighborhood.sum())

        else:
            return np.nan
    else:
        return np.nan
    
prediction = weighted_mean(neighborhood, utility_matrix, 'e3fdrK1tMCwWLr76LFe-cA')
print (prediction)

3.8803899821903323


In [19]:
# Voorspel een score voor bedrijven en return deze als dict
def predictions(utility_matrix, similarity_matrix, user_id):
    predictdict = defaultdict()
    
    # Extract list of reviewed businesses to get new results
    review_list = utility_matrix[user_id][utility_matrix[user_id].notna()].index
    
    for business_id in similarity_matrix.index:
        if business_id in review_list:
            pass
        else:
            predictdict[business_id] = weighted_mean(select_neighborhood(similarity_matrix, utility_matrix, user_id, business_id), utility_matrix, user_id)
    
    return predictdict

In [20]:
# display(pd.read_json(DATA_DIR+'/'+CITY+'/user.json', lines=True).head())
# reviews[reviews['user_id'] == 'apP3CApEq6-z59tRLwEBYA']['business_id']

def predictions_combiner(ibcf_predictions, cb_predictions, ibcf_weight=0.8, reviews=reviews_train, businesses=businesses):
    hybrid_predictions = {}
    cb_weight = 1 - ibcf_weight
    for business_id in cb_predictions:
        if business_id in ibcf_predictions:
            ibcf_prediction = ibcf_predictions[business_id]
            cb_prediction = cb_predictions[business_id]
            hybrid_predictions[business_id] = ibcf_prediction*ibcf_weight + cb_prediction*cb_weight
            if ibcf_prediction == np.nan:
                hybrid_predictions[business_id] = cb_prediction
            if cb_prediction == np.nan:
                hybrid_predictions[business_id] = ibcf_prediction
        else:
            hybrid_predictions[business_id] = cb_prediction
            
    return  dict(sorted(hybrid_predictions.items(), key=operator.itemgetter(1), reverse=True))
    
def content_based_recommender(user_id, businesses=businesses):
    predicted_ratings = {}
    category_dict = make_category_dict(user_id)
    for business_id in businesses['business_id']:
        predicted_ratings[business_id] = content_based_predictor(user_id, business_id, category_dict)
    return predicted_ratings
    
def content_based_predictor(user_id, business_id, category_dict, reviews=reviews_train, businesses=businesses):
    ratings = []
    categories = businesses[businesses['business_id'] == business_id]['categories'].str.split(', ')
    for x in categories:
        for category in x:
            if category in category_dict:
                ratings.append(category_dict[category])
    if len(ratings) == 0:
        return np.nan
    else:
        return sum(ratings)/len(ratings)
        
def make_category_dict(user_id, reviews=reviews, businesses=businesses):
    category_dict = {}
    user_reviews = reviews[reviews['user_id'] == user_id]
    business_ids = user_reviews['business_id']
    for business_id in business_ids:
        categories = businesses[businesses['business_id'] == business_id]['categories'].str.split(', ')
        stars = user_reviews[user_reviews['business_id'] == business_id]['stars'].values[0]
        for x in categories:
            for category in x:
                if category in category_dict:
                    category_dict[category].append(stars)
                else:
                    category_dict[category] = [stars]
    for category in category_dict:
        category_dict[category] = sum(category_dict[category])/len(category_dict[category])
    return category_dict

%time cb_predictions = content_based_recommender('e3fdrK1tMCwWLr76LFe-cA')
%time ibcf_predictions = predictions(utility_matrix, similarity_matrix, 'e3fdrK1tMCwWLr76LFe-cA')
# cb_predictions
# ibcf_predictions
# only_icbf = predictions_combiner(ibcf_predictions, cb_predictions)
all_predictions = predictions_combiner(ibcf_predictions, cb_predictions)

Wall time: 340 ms
Wall time: 1.08 s


In [21]:
def select_items(predictions):
    pool = []
    safety = 0
    average = sum(list(predictions.values())[0:10])/10
    average -= 0.1 * average
    for prediction in predictions:
        if predictions[prediction] > average:
            pool.append(prediction)
        else:
            break
    if len(pool) < 10:
        pool = list(predictions.keys())[0:10]
    return random.sample(pool, k=10)

select_items(all_predictions)

['KR2kRmHnRCaNzOUEGoB25w',
 'mW28NmePxX7pZv8lCv7v2Q',
 'i7lFu1-iadoXW5Hn-JWaeg',
 '0ZN2MfHyjNIkCx7qJvVhDg',
 'K5iqS0JXnKFFujZHIWQsag',
 '-MsRvdPnuw6QuLn5Vxjruw',
 'C_oejk3EzfsxP7-owQDkbQ',
 'OD88wvH-9LxM_Gz4oHOIDg',
 '2lcK3d4K7FU6O8wXdWzOmA',
 'VZKWW2zQbk-rxwpAcque8w']

In [22]:
pd.Series(all_predictions).dropna()

0ZN2MfHyjNIkCx7qJvVhDg    3.704260
K5iqS0JXnKFFujZHIWQsag    3.267227
2lcK3d4K7FU6O8wXdWzOmA    3.550670
mW28NmePxX7pZv8lCv7v2Q    3.468254
KR2kRmHnRCaNzOUEGoB25w    3.441204
nYvBZYg9rfqWFTYuxSVMdw    3.027455
LhW7pWkVgu_bAMiRhO3Wow    3.900000
UT0FUBRmpnGHE1U4Jpegxw    3.900000
_v7lMUtdd6WlgKUEBLA_VA    2.708929
QcW0360vpeEilCLBlKEiLw    3.610239
bzdb1jJ1j8Qn_RVHY97FnA    4.689697
7dlCzYnXDaubTAfvgAX6sQ    3.687283
z58nyUVyDV-vC7nXFfvR5g    2.874526
op0sZT-TNyeTMw0m3HUUDQ    4.000000
5kCRty4p7tBwM9P7MAXgvA    3.742369
b--I9Ed6N5P9fwEb2qaiqg    4.405028
95Efv0xKoUsP5lvTnaeK5w    4.090507
m9Cme6bUPuZFtqhN1AgmdQ    3.868025
kkNWzhSpAjxm0zIxol3IzQ    3.848649
OiR6kP_mAD47vKU1WiPshg    3.824324
x5lN8HdgDVWXwoL9N7MDoQ    3.180093
hTdV1_Xn1j6yhpIR7CgCJQ    2.958851
t10X85k_m8RuAZZgoMzvXg    4.000000
EpJhRvkGDFE-GDPHM32klw    3.781081
yG9SpxEQVIhk6e1o3jbSKg    3.724324
SPb3oLIVZynmiS670Vo1kQ    3.637547
BsLxCm_DMxCNVEPHoaj8hw    3.109976
axGP6FXAZlBdmdN9juu1tQ    3.104543
mEx2NJBepbnVSjZG6Mqw

In [25]:
def predict_ratings_item_based(similarity, utility, test_data):
    # For loop door alle index waarden
    for ID in test_data.index:
        # Bereken neighbourhood
        neighborhood = select_neighborhood(similarity, utility, test_data.loc[ID, 'user_id'], test_data.loc[ID, 'business_id'])
        # Voeg het gewogen gemiddelde toe als predicted rating aan de test_data Dataframe
        test_data.loc[ID, 'predicted rating'] = weighted_mean(neighborhood, utility, test_data.loc[ID, 'user_id'])
    
    return test_data
    
review_test = reviews
predicted_item_based_cosine = predict_ratings_item_based(similarity_matrix_cosine, utility_matrix, review_test[['user_id', 'business_id', 'stars']])
# predicted_item_based_manhattan = predict_ratings_item_based(similarity_matrix_manhattan, utility_matrix, review_test[['user_id', 'business_id', 'stars']])
# predicted_item_based_euclidean = predict_ratings_item_based(similarity_matrix_euclidean, utility_matrix, review_test[['user_id', 'business_id', 'stars']])

In [27]:
def mse(predicted_ratings):
    # Bereken mse uit de formule hierboven
    return ((predicted_ratings['stars'] - predicted_ratings['predicted rating']).pow(2).sum()) / (predicted_ratings.shape[0])
    
print('Cosine Similarity MSE: '+str(mse(predicted_item_based_cosine)))
# print('Manhattan Similarity MSE: '+str(mse(predicted_item_based_manhattan)))
# print('Euclidean Similarity MSE: '+str(mse(predicted_item_based_euclidean)))

Cosine Similarity MSE: 0.15951995956208295


In [28]:
test = review_test[['user_id', 'business_id', 'stars']]
test[test['user_id'] == 'e3fdrK1tMCwWLr76LFe-cA']

Unnamed: 0,user_id,business_id,stars
186,e3fdrK1tMCwWLr76LFe-cA,cbjF6szaq2orE0BplGAKEA,3
317,e3fdrK1tMCwWLr76LFe-cA,z71IqTCbQW7uzw2H2T1QrA,3
2137,e3fdrK1tMCwWLr76LFe-cA,EpJhRvkGDFE-GDPHM32klw,3
3345,e3fdrK1tMCwWLr76LFe-cA,0VJ8tBxOpD2OxuioVjaAxA,5
3491,e3fdrK1tMCwWLr76LFe-cA,0VJ8tBxOpD2OxuioVjaAxA,5
3871,e3fdrK1tMCwWLr76LFe-cA,0VJ8tBxOpD2OxuioVjaAxA,5
3878,e3fdrK1tMCwWLr76LFe-cA,1NNBpiQ3rUT-7T8ch45SVA,1
4258,e3fdrK1tMCwWLr76LFe-cA,uisfAopUaCje3w5_bv4nrQ,2
4739,e3fdrK1tMCwWLr76LFe-cA,uisfAopUaCje3w5_bv4nrQ,3
4902,e3fdrK1tMCwWLr76LFe-cA,47me-6Zme7RYR0zEonfaHg,4


In [44]:
def predict_ratings_content_based(test_data):
    # For loop door alle index waarden
    total = len(test_data.index)
    current = 0
    for ID in test_data.index:
        current += 1
        if current % 100 == 0:
            print(str(current) + "\ " + str(total))
        # Bereken neighbourhood
        category_dict = make_category_dict(test_data.loc[ID, 'user_id'])
        test_data.loc[ID, 'predicted rating'] = content_based_predictor(test_data.loc[ID, 'user_id'], test_data.loc[ID, 'business_id'], category_dict)
    
    return test_data

predicted_content_based = predict_ratings_content_based(review_test[['user_id', 'business_id', 'stars']])

print (mse(predicted_content_based))

100\ 8895
200\ 8895
300\ 8895
400\ 8895
500\ 8895
600\ 8895
700\ 8895
800\ 8895
900\ 8895
1000\ 8895
1100\ 8895
1200\ 8895
1300\ 8895
1400\ 8895
1500\ 8895
1600\ 8895
1700\ 8895
1800\ 8895
1900\ 8895
2000\ 8895
2100\ 8895
2200\ 8895
2300\ 8895
2400\ 8895
2500\ 8895
2600\ 8895
2700\ 8895
2800\ 8895
2900\ 8895
3000\ 8895
3100\ 8895
3200\ 8895
3300\ 8895
3400\ 8895
3500\ 8895
3600\ 8895
3700\ 8895
3800\ 8895
3900\ 8895
4000\ 8895
4100\ 8895
4200\ 8895
4300\ 8895
4400\ 8895
4500\ 8895
4600\ 8895
4700\ 8895
4800\ 8895
4900\ 8895
5000\ 8895
5100\ 8895
5200\ 8895
5300\ 8895
5400\ 8895
5500\ 8895
5600\ 8895
5700\ 8895
5800\ 8895
5900\ 8895
6000\ 8895
6100\ 8895
6200\ 8895
6300\ 8895
6400\ 8895
6500\ 8895
6600\ 8895
6700\ 8895
6800\ 8895
6900\ 8895
7000\ 8895
7100\ 8895
7200\ 8895
7300\ 8895
7400\ 8895
7500\ 8895
7600\ 8895
7700\ 8895
7800\ 8895
7900\ 8895
8000\ 8895
8100\ 8895
8200\ 8895
8300\ 8895
8400\ 8895
8500\ 8895
8600\ 8895
8700\ 8895
8800\ 8895
0.15410956512692073


In [45]:
print(predicted_content_based)

def predictions_combiner(ibcf_predictions, cb_predictions, ibcf_weight=0.8, reviews=reviews_train, businesses=businesses):
    hybrid_predictions = {}
    cb_weight = 1 - ibcf_weight
    for business_id in cb_predictions['business_id']:
        if business_id in ibcf_predictions['business_id']:
            ibcf_prediction = ibcf_predictions[ibcf_predictions['business_id'] == business_id]['predicted rating']
            cb_prediction = cb_predictions[cb_predictions['business_id'] == business_id]['predicted rating']
            hybrid_predictions[business_id] = ibcf_prediction*ibcf_weight + cb_prediction*cb_weight
            if ibcf_prediction == np.nan:
                hybrid_predictions[business_id] = cb_prediction
            if cb_prediction == np.nan:
                hybrid_predictions[business_id] = ibcf_prediction
        else:
            hybrid_predictions[business_id] = cb_prediction
            
    return  dict(sorted(hybrid_predictions.items(), key=operator.itemgetter(1), reverse=True))

predictions_combiner(predicted_item_based_cosine, predicted_content_based)

                     user_id             business_id  stars  predicted rating
0     apP3CApEq6-z59tRLwEBYA  2lcK3d4K7FU6O8wXdWzOmA      5          5.000000
1     xtwoOTTOuZrXj4GQtsueuA  bzdb1jJ1j8Qn_RVHY97FnA      2          3.033333
2     ecz6xGzGm1-wwswYh8VT8A  z71IqTCbQW7uzw2H2T1QrA      5          5.000000
3     bJ1ir7YZ-e-cigMahFLEIw  KR2kRmHnRCaNzOUEGoB25w      5          5.000000
4     FO8hILdACBsZrVDur5x8RA  z58nyUVyDV-vC7nXFfvR5g      3          3.500000
5     _SoUBkOHsToeOM14bjnraA  J8klQB6PHeR5vZpz6L0S9w      3          3.000000
6     jo50q0nYHPaOrKKyFVgZzQ  4wpeWu0lCR_eEdGiyZB9rg      5          5.000000
7     feCCr_lY8yBgeVtpO23dPQ  vNPJbqYuJEFDPOLhoq3bFA      1          1.000000
8     t6LdjDcOVkeAKbDMWqHroA  nYvBZYg9rfqWFTYuxSVMdw      2          2.000000
9     XW4sF5LLN5i-Gilpn1h1vQ  KR2kRmHnRCaNzOUEGoB25w      4          3.800000
10    rvJHU_zW4xK9F7j9oLSEYg  bzdb1jJ1j8Qn_RVHY97FnA      1          1.000000
11    ZGnxn9YOWAFRxBQZjlri9A  _v7lMUtdd6WlgKUEBLA_VA      5     