# Groeps Opdracht CI
## Joost Vledder, Sadjia Safdari, Simon Kreulen & Jasper van Eck

### Inhoudsopgave <a name='Top'></a>

[Imports](#import)

[Load data](#loaddata)

[Content Based Filtering Algorithm](#contentbased)

[Data Clean Up](#cleanup)

[Basic Data Stats](#basicdata)

[Plots](#plots)

[Cosine Similarity Content Based](#cossim)

[Query vector](#queryvector)

[SVD](#svd)

[K-Means](#kmeans)

[Cohen Kappa](#cohenkappa)

[Item-Based Collaborative Filtering Algorithm](#itembasedcollab)

[Basic Data Stats for Collaborative Filtering](#basicdatacollab)

[Implementation](#implementation)

[Deel 1: Mesa](#mesa)

[Deel 2: Henderson](#henderson)


### Imports <a name='import'></a>

[Top](#Top)

In [None]:
import os
import pandas as pd
import numpy as np
np.seterr(divide='raise', over='raise', under='raise', invalid='raise')
import matplotlib.pyplot as plt
from collections import defaultdict
import json
#pip install geopy
from geopy import geocoders
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import TruncatedSVD

### Load data <a name='loaddata'></a>

[Top](#Top)

In [None]:
# Directory tree that is walked for the Yelp JSON files. The commented-out
# line points at the parent data directory instead of the Henderson subfolder.
#rootdir = './yelp/data'
rootdir = './yelp/data/Henderson'

# Empty frames that the loading loop below extends.
df_business = pd.DataFrame()
df_users = pd.DataFrame()
df_reviews = pd.DataFrame()

# Number of directories visited so far; the loading loop stops at 200.
count = 0

def load_jsons(data_path, file):
    """Load a JSON-lines file into a DataFrame.

    Parameters:
        data_path: directory that contains the file.
        file: file name inside ``data_path``.

    Returns:
        A DataFrame with one row per JSON line. If any line cannot be decoded
        or parsed, the path is printed and an empty DataFrame is returned
        (the file is skipped as a whole, best effort).
    """
    # Build the path from the argument rather than from a global loop variable.
    file_path = os.path.join(data_path, file)
    lines = []
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(file_path, encoding='utf-8') as jsons:
        try:
            lines = [json.loads(json_line) for json_line in jsons]
        except (UnicodeDecodeError, json.JSONDecodeError):
            # Report the offending file and carry on with an empty result.
            print(file_path)
    return pd.DataFrame(lines)

# Gather the per-directory frames in lists and concatenate once at the end:
# DataFrame.append no longer exists in pandas 2.x and copied the whole frame
# on every call.
business_frames, user_frames, review_frames = [], [], []

for subdir, dirs, files in os.walk(rootdir):
    if count == 200: #1078 is the total amount of cities
        break

    for file in files:
        file_name = os.path.basename(file)
        if file_name == 'business.json':
            business_frames.append(load_jsons(subdir, file))
        elif file_name == 'user.json':
            user_frames.append(load_jsons(subdir, file))
        elif file_name == 'review.json':
            review_frames.append(load_jsons(subdir, file))
    count += 1

# pd.concat raises on an empty list, so only concatenate when files were found.
if business_frames:
    df_business = pd.concat(business_frames)
if user_frames:
    df_users = pd.concat(user_frames)
if review_frames:
    df_reviews = pd.concat(review_frames)

# Use the record ids as index.
df_business = df_business.set_index('business_id')
df_users = df_users.set_index('user_id')
df_reviews = df_reviews.set_index('review_id')

display(df_business.head())
display(df_users.head())
display(df_reviews.head())

In [None]:
# Store the loaded frames as pickles; the next cell reads them back.
df_business.to_pickle('./business_pickle.pkl')
df_users.to_pickle('./users_pickle.pkl')
df_reviews.to_pickle('./reviews_pickle.pkl')

In [None]:
# Restore the three frames from the pickles written by the previous cell.
df_business = pd.read_pickle('./business_pickle.pkl')
df_users = pd.read_pickle('./users_pickle.pkl')
df_reviews = pd.read_pickle('./reviews_pickle.pkl')

### Clean up Data For Content Based <a name='cleanup'></a>

[Top](#Top)

In [None]:
# business_id -> {category: 1}; the nested defaultdict lets categories be
# added without initialising each business first.
cat_dict = defaultdict(lambda: defaultdict(int))

# Drop na categories. The explicit copy makes the column assignments below
# write to an independent frame instead of a filtered view of the original.
df_business = df_business[df_business['categories'].notna()].copy()

# Normalize long, lat, stars & review_count
#x = cos(lat) * cos(lon)
#y = cos(lat) * sin(lon)
# Min-max scaling of stars and review_count to [0, 1]; the x/y projection of
# the coordinates is currently disabled.
df_business['stars'] = (df_business['stars']-df_business['stars'].min()) / (df_business['stars'].max()-df_business['stars'].min())
df_business['review_count'] = (df_business['review_count']-df_business['review_count'].min()) / (df_business['review_count'].max()-df_business['review_count'].min())
#df_business['x_axis'] = np.cos(df_business['latitude']) * np.cos(df_business['longitude'])
#df_business['y_axis'] = np.cos(df_business['latitude']) * np.sin(df_business['longitude'])

# One column per position in the comma-separated category string; rows with
# fewer categories are padded with missing values.
df_tmp = df_business['categories'].str.split(pat=', ',expand=True)

# Create dict of categories
for index, row in df_tmp.iterrows():
    for _, elem in row.items():
        # Skip padding (None or NaN; NaN is truthy, hence the explicit check)
        # and empty strings, and flag every category once per business.
        if pd.notna(elem) and elem and elem not in cat_dict[index]:
            cat_dict[index][elem] += 1

In [None]:
# Create DF from dict
df_cats = pd.DataFrame.from_dict(cat_dict, orient='index')

#Drop NaN column
df_cats = df_cats.drop(columns='NaN',errors='ignore')

# Fillna with 0
df_cats = df_cats.fillna(0)

# Join back into business
df_business = df_business.join(df_cats,on='business_id')

# Drop cols
df_business = df_business.drop(columns=['address','state','postal_code','attributes','categories','hours'],errors='ignore')

# Remove elite & friends columns because of non use
df_users = df_users.drop(columns=['elite','friends'],axis=1,errors='ignore')

In [None]:
# Store the cleaned frames. df_cats is pickled as well because a later cell
# reloads it from './cats_pickle.pkl'.
df_business.to_pickle('./business_pickle.pkl')
df_cats.to_pickle('./cats_pickle.pkl')
df_users.to_pickle('./users_pickle.pkl')
df_reviews.to_pickle('./reviews_pickle.pkl')

In [None]:
# Reload the cleaned frames from disk.
df_business = pd.read_pickle('./business_pickle.pkl')
df_users = pd.read_pickle('./users_pickle.pkl')
df_reviews = pd.read_pickle('./reviews_pickle.pkl')

### Basic Data Stats for Content Based <a name='basicdata'></a>

[Top](#Top)

In [None]:
# Some information about the category amounts per business

# series of amount of categories per business (row sums of the one-hot frame)
cats_business = df_cats.sum(axis=1).sort_values(ascending=False)
display(cats_business)

# Summary statistics of the number of categories per business
avg_cats_bus = cats_business.mean()
max_cats_bus = cats_business.max()
min_cats_bus = cats_business.min()

print('Average amount of categories for businesses: ', avg_cats_bus)
print('Minimum amount of categories for businesses: ', min_cats_bus)
print('Maximum amount of categories for businesses: ', max_cats_bus)

# series of amount of businesses with the index amount of categories
cat_bus_distribution = cats_business.value_counts()
display(cat_bus_distribution)

In [None]:
# series of number of businesses with index as category (column sums of the
# one-hot frame), most frequent category first
cats_presences = df_cats.sum(axis=0).sort_values(ascending=False)
display(cats_presences)

# Summary statistics of how often a category occurs
avg_cats = cats_presences.mean()
max_cats = cats_presences.max()
min_cats = cats_presences.min()

print('Average amount of presence of a category: ', avg_cats)
print('Minimum amount of presence of a category: ', min_cats)
print('Maximum amount of presences of a category: ', max_cats)

### Plots <a name='plots'></a>

[Top](#Top)

In [None]:
# Histograms of the (normalised) stars and review counts
hist_bus = df_business.hist(column=['stars','review_count'])
#df_business.plot(x='x_axis',y='y_axis',kind='scatter')
# Scatter plot of the business coordinates
df_business.plot(x='longitude',y='latitude',kind='scatter')
plt.show()
# Number of businesses per category, in descending order
cats_presences.plot()
plt.show()

# Histogram of the number of categories per business
cats_business.plot(kind='hist',subplots=False,sharex=False,sharey=False,title='Amount of businesses per amount of categories')
plt.show()

# Convert the counts per "number of categories" into percentages
total_cats = sum(cat_bus_distribution.values)
percentages = [(value/total_cats) * 100 for value in cat_bus_distribution.values]

# Pie chart of those percentages, labelled with the number of categories
fig1, circle = plt.subplots()
circle.pie(percentages, labels=cat_bus_distribution.index, autopct='%1.1f%%')
circle.axis('equal')
plt.title('Percentual distribution of amount of catergories per business')
plt.show()

In [None]:
# Reload the frames used by the content-based part.
# NOTE(review): make sure './cats_pickle.pkl' has been written with
# df_cats.to_pickle before this cell is run; read_pickle fails otherwise.
df_business = pd.read_pickle('./business_pickle.pkl')
df_cats = pd.read_pickle('./cats_pickle.pkl')
df_users = pd.read_pickle('./users_pickle.pkl')
df_reviews = pd.read_pickle('./reviews_pickle.pkl')

### Cosine Similarity <a name='cossim'></a>

[Top](#Top)

In [None]:
# Input vectors are series, convert to np array
def cosineSim(vector, docVector):
    """Return the cosine similarity between two equally long vectors.

    Parameters:
        vector: query vector (pandas Series or any array-like of numbers).
        docVector: document vector of the same length.

    Returns:
        The cosine of the angle between both vectors, or NaN when either
        vector has zero length (the similarity is undefined in that case).
    """
    # np.asarray accepts Series as well as plain lists/arrays.
    vector = np.asarray(vector, dtype=float)
    docVector = np.asarray(docVector, dtype=float)

    # Product of the Euclidean lengths of both vectors
    norm_product = np.sqrt(vector.dot(vector)) * np.sqrt(docVector.dot(docVector))

    # Ensure no division by 0
    if norm_product == 0.:
        return np.nan

    # Calculate cosine sim
    return vector.dot(docVector) / norm_product

In [None]:
def rankedBusinessList(df_business, queryVector):
    """Rank businesses by cosine similarity to a query vector.

    Parameters:
        df_business: frame with one row per business. The columns 'name',
            'city', 'stars', 'review_count', 'is_open' and 'Score Cos' are
            ignored (when present); all remaining columns are features.
        queryVector: Series/array with one value per feature column, in the
            same order as the feature columns.

    Returns:
        ``df_business`` sorted by 'Score Cos' and then 'stars', both
        descending. Rows whose similarity is undefined get NaN and end up last.

    Side effects:
        A 'Score Cos' column is added to (or overwritten in) the frame that
        was passed in.
    """
    features = df_business.drop(
        columns=['name','city','stars','review_count','is_open','Score Cos'],
        errors='ignore',
    ).to_numpy(dtype=float)
    query = np.asarray(queryVector, dtype=float)

    # Product of the vector lengths for every row at once
    norm_product = np.sqrt((features ** 2).sum(axis=1)) * np.sqrt(query.dot(query))

    # Only divide where the product is non-zero; the other rows stay NaN.
    # This also avoids a FloatingPointError when numpy is set to raise.
    scoreList = np.full(len(features), np.nan)
    valid = norm_product != 0.
    scoreList[valid] = features[valid].dot(query) / norm_product[valid]

    df_business['Score Cos'] = scoreList
    return df_business.sort_values(by=['Score Cos','stars'],ascending=False)

### Create Query Vector <a name='queryvector'></a>

[Top](#Top)

This first cell gets the longitude and latitude of a given city and state and stores them in the query vector (the coordinate normalisation is currently commented out).

In [None]:
# Number of businesses per city, largest first
display(df_business['city'].value_counts().sort_values(ascending=False))

In [None]:
# Look up the coordinates of the requested location (network call).
loc_string = 'Scottsdale AZ'
gn = Nominatim(user_agent='WalterGKurtz')
location = gn.geocode(loc_string)
# geocode returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the lines below.
if location is None:
    raise ValueError(f'Could not geocode location {loc_string!r}')

# Start from an all-zero vector with one entry per business column and fill
# in the coordinates.
queryVector = pd.Series(data=0.,index=df_business.columns)
queryVector.at['latitude'] = location.latitude
queryVector.at['longitude'] = location.longitude
#x = cos(lat) * cos(lon)
#y = cos(lat) * sin(lon)
#queryVector.at['x_axis'] = np.cos(location.latitude) * np.cos(location.longitude)
#queryVector.at['y_axis'] = np.cos(location.latitude) * np.sin(location.longitude)


The second cell inputs the preferences of the user.

In [None]:
# Preferences
# Remove the entries that are not features, then switch on every category the
# user is interested in.
queryVector = queryVector.drop(labels=['name','city','stars','review_count','is_open'],errors='ignore')
for category in (
    'Automotive',
    'Auto Repair',
    'Fast Food',
    'Restaurants',
    'Pizza',
    'Men\'s Clothing',
    'Women\'s Clothing',
    'Fashion',
    'Bars',
):
    queryVector.at[category] = 1.
display(queryVector)
#queryVector.to_csv('test.csv', header=False)

In [None]:
# Show the 20 businesses that are most similar to the query vector
display(rankedBusinessList(df_business,queryVector).head(20))

### SVD <a name='svd'></a>

[Top](#Top)

In [None]:
# Keep the descriptive columns aside; they are re-attached after the reduction
stars = df_business['stars'].values
cities = df_business['city'].values
names = df_business['name'].values
bus_ids = df_business.index
# Feature matrix (numpy array) without the descriptive/score columns
df_tmp_svd = df_business.copy().drop(columns=['name','city', 'stars','review_count','is_open', 'Score Cos'],errors='ignore').to_numpy()

In [None]:
# This code can be used to find the optimal amount of dimension by the SVD

# trunc_svd = TruncatedSVD(n_components=len(df_tmp_svd[0])-1).fit(df_tmp_svd)
# reduced_x = trunc_svd.transform(df_tmp_svd)
# singles = trunc_svd.singular_values_

# eigvals = singles**2 / np.sum(singles**2)
# fig = plt.figure(figsize=(8,5))
# sing_vals = np.arange(len(df_tmp_svd[0])-1) + 1
# plt.plot(sing_vals[2:200], eigvals[2:200], 'ro-', linewidth=2)
# plt.title('Scree Plot')
# plt.xlabel('Component number')
# plt.ylabel('Eigenvalue')
# plt.show()

In [None]:
# Number of components to keep
# NOTE(review): presumably chosen from the scree plot in the commented-out
# cell above — confirm.
optimal_dims = 25
# Fit the truncated SVD and project the feature matrix onto the components
trunc_svd = TruncatedSVD(n_components=optimal_dims).fit(df_tmp_svd)
reduced_x = trunc_svd.transform(df_tmp_svd)

In [None]:
# Project the query vector with the same fitted SVD (transform expects a 2-D
# array, hence the wrapping list)
queryVectorReduced = trunc_svd.transform(np.array([queryVector]))

# Re-attach name, city and stars to the reduced features
df_reduced_x = pd.DataFrame(data=reduced_x, index=bus_ids)
df_reduced_x.insert(0, 'stars', stars)
df_reduced_x.insert(0, 'city', cities)
df_reduced_x.insert(0, 'name', names)
df_queryVectorReduced = pd.Series(data=queryVectorReduced[0])
# Rank the businesses in the reduced space and show the top 20
display(rankedBusinessList(df_reduced_x,df_queryVectorReduced).head(20))

### K-means <a name='kmeans'></a>

[Top](#Top)

In [None]:
# # This block of code was used to find the optimal K, it takes very long to run, so that's why it is commented now.

# from yellowbrick.cluster import KElbowVisualizer

# df_bus_kmeans = df_reduced_x.copy().drop(columns=['name','city','stars','Score Cos'],errors='ignore').to_numpy()

# # choose k

# model = KMeans(random_state=0)
# visualizer = KElbowVisualizer(model, k=[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000])
# visualizer.fit(df_bus_kmeans)        # Fit the data to the visualizer
# visualizer.show() 

In [None]:
# Cluster the businesses on the reduced features only (descriptive and score
# columns are dropped); 300 clusters with a fixed random_state.
df_bus_kmeans = df_reduced_x.copy().drop(columns=['name','city','stars','Score Cos'],errors='ignore').to_numpy()
kmeans = KMeans(n_clusters=300,random_state=0).fit(df_bus_kmeans)

In [None]:
# Assign every business and the query vector to a cluster
labels = np.array(kmeans.predict(df_bus_kmeans))
prediction = kmeans.predict([df_queryVectorReduced.to_numpy()])

# Row positions of the businesses that share the query's cluster
matches = np.flatnonzero(labels == prediction).tolist()

df_matched = df_business.iloc[matches]

# Show the best-rated businesses of that cluster
top_matches = df_matched.sort_values(by=['stars'],ascending=False)
display(top_matches.head(20))

### Cohen's Kappa <a name='cohenkappa'></a>

[Top](#Top)

In [None]:
# Dataformat
# 1 | 0
# 1 | 1
# 0 | 1

def PE(data):
    '''Return P(E), the expected agreement, for the judgements in data.

    data is a sequence of rows, one per document, holding the judgements
    (1 = relevant, 0 = nonrelevant) of the two judges.
    '''
    # Flatten all judgements and count both classes
    judgements = [judgement for row in data for judgement in row]
    relevant = sum(1 for judgement in judgements if judgement == 1)
    nonrelevant = sum(1 for judgement in judgements if judgement == 0)

    # Two judgements per document
    total = len(data) * 2

    # Pooled marginals
    rel = relevant / total
    nonrel = nonrelevant / total

    return nonrel**2 + rel**2


def kappa(data, P_E):
    '''Return Cohen's kappa for the judgements in data.

    data is a sequence of rows with the judgements per document, P_E is the
    expected agreement. A row counts as an agreement for every judgement that
    equals the one before it. When P_E equals 1 the result is defined as 1.
    '''
    agree = 0
    for row in data:
        items = list(row)
        # Pair every judgement with its predecessor (None before the first)
        for previous, current in zip([None] + items, items):
            if previous == current:
                agree += 1

    P_A = agree / len(data)
    if P_E == 1:
        return 1
    return (P_A - P_E) / (1 - P_E)

In [None]:
def AveragePrecision(ranked_list_of_results, list_of_relevant_objects):
    """Return the average precision of a ranked result list.

    Parameters:
        ranked_list_of_results: results in ranked order, best first.
        list_of_relevant_objects: the objects that are considered relevant.

    Returns:
        The sum of precision@k over every rank k that holds a relevant
        result, divided by the number of relevant objects. Returns 0.0 when
        there are no relevant objects (instead of dividing by zero).
    """
    total = len(list_of_relevant_objects)
    if total == 0:
        return 0.

    sumPk = 0
    relevant = 0
    for rank, result in enumerate(ranked_list_of_results, start=1):
        if result in list_of_relevant_objects:
            relevant += 1
            # Precision at this rank
            sumPk += relevant / rank

    return sumPk / total

In [None]:
# Positions 0..19 stand for the top-20 results of each method.
# NOTE(review): the two lists are presumably hand-made relevance judgements
# of those top-20 results — confirm where they come from.
ranked_result_list = list(range(20))
relevant_objects_cos = [0,1,2,3,4,5,6,7,8,10,11,12,19]
relevant_objects_kmeans = [0,1,4,5,6,7,8,9,10,11,15,16,17,18]
print('Average Precision of cosine sim: ', AveragePrecision(ranked_result_list,relevant_objects_cos))
print('Average Precision of K-Means: ', AveragePrecision(ranked_result_list,relevant_objects_kmeans))

## Item-Based Collaborative Filtering Algorithm <a name='itembasedcollab'></a>

[Top](#Top)


### Basic Data Stats for Collaborative Based <a name='basicdatacollab'></a>

[Top](#Top)


In [None]:
# reset index so review_id becomes a regular column again
df_reviews = df_reviews.reset_index()

# count reviews per user and reviews per business, largest count first
reviews_per_user = df_reviews.groupby("user_id")["review_id"].count().sort_values(ascending=False).to_frame()
reviews_per_business = df_reviews.groupby("business_id")["review_id"].count().sort_values(ascending=False).to_frame()


# plot both review counts
user_plot = reviews_per_user.plot()
business_plot = reviews_per_business.plot()

In [None]:
# userplot --> the distribution of the reviews over the users (long-tail):
# considering the fact that there are more reviews on business than users
# (one user can rate more than 1 business), it is probably better to choose
# item-based CF than user-based CF.

# Distinct review counts of the users, highest first
users_series = (
    df_users['review_count']
    .sort_values(ascending=False)
    .drop_duplicates(keep='first')
)
display(users_series)

userplot = users_series.plot()
userplot.set(xlabel='user_id', ylabel='review_count')
plt.show()

### Implementation <a name='implementation'></a>


[Top](#Top)

### Deel 1: Mesa <a name='mesa'></a>

[Top](#Top)

In [None]:
# Work with the first 5000 reviews only; the bare expression displays them
reviews = df_reviews[:5000]
reviews

In [None]:
def pivot_ratings(ratings, fill_value=0):
    """Build the business x user utility matrix from a review frame.

    Parameters:
        ratings: frame with 'business_id', 'user_id' and 'stars' columns.
        fill_value: value used for (business, user) pairs without a review.
            The default 0 keeps the original behaviour; pass None to leave
            those cells as NaN.

    Returns:
        A frame indexed by business_id with one column per user_id holding
        the (mean) star rating.

    Note:
        With the default fill value the matrix holds no NaN, so code that
        detects unrated items through notna()/isnan treats every item as
        rated with 0 stars.
    """
    pivottable = ratings.pivot_table(index="business_id", columns="user_id", values="stars")
    if fill_value is not None:
        pivottable = pivottable.fillna(fill_value)
    return pivottable

# Utility matrix: businesses as rows, users as columns
utility_matrix = pivot_ratings(reviews)
# Ratings of one example business.
# NOTE(review): pivot_ratings fills missing ratings with 0, so the notna()
# mask below selects every user.
display(utility_matrix.loc['mRUVMJkUGxrByzMQ2MuOpA'][utility_matrix.loc['mRUVMJkUGxrByzMQ2MuOpA'].notna()])
display(utility_matrix)

In [None]:
def cosine_distance(matrix, id1, id2):
    """Return the cosine similarity between two rows of ``matrix``.

    Despite its name the function returns a similarity, not a distance.
    Only the columns in which both rows hold a value (not NaN) are used.

    Parameters:
        matrix: frame whose rows are compared.
        id1, id2: index labels of the two rows.

    Returns:
        NaN when the rows share no columns, 1.0 when both labels are equal,
        NaN when either row has zero length on the shared columns, and the
        cosine similarity otherwise.
    """
    # Look each row up once
    row1 = matrix.loc[id1]
    row2 = matrix.loc[id2]
    selected_features = row1.notna() & row2.notna()

    # if no matching features, return 'not a number' (NaN)
    if not selected_features.any():
        return np.nan

    if id1 == id2:
        return 1.

    # get the shared features as plain float arrays
    features1 = row1[selected_features].to_numpy(dtype=float)
    features2 = row2[selected_features].to_numpy(dtype=float)

    # Product of both vector lengths
    sim = np.sqrt(features1.dot(features1)) * np.sqrt(features2.dot(features2))

    # Undefined when one of the vectors is all zeros
    if sim == 0.:
        return np.nan

    return features1.dot(features2) / sim
    
def create_similarity_matrix_cosine(matrix):
    """Return the pairwise cosine similarity between all rows of ``matrix``.

    Parameters:
        matrix: frame whose rows are compared with cosine_distance.

    Returns:
        A float frame with the index of ``matrix`` as both index and columns.
    """
    ids = list(matrix.index)
    size = len(ids)
    values = np.zeros((size, size), dtype=float)

    # The similarity is symmetric, so every pair is computed once and stored
    # in both triangles. Filling a numpy array avoids per-cell frame writes
    # and DataFrame.iteritems, which no longer exists in pandas 2.x.
    for pos1, id1 in enumerate(ids):
        for pos2 in range(pos1, size):
            score = cosine_distance(matrix, id1, ids[pos2])
            values[pos1, pos2] = score
            values[pos2, pos1] = score

    return pd.DataFrame(values, index=matrix.index, columns=matrix.index)

#display(create_similarity_matrix_euclid(matrix))
    
#similarity = create_similarity_matrix_cosine(utility_matrix)
#display(similarity)

In [None]:
def mean_center_columns(matrix):
    """Return a copy of ``matrix`` in which every column has its own mean subtracted.

    The input frame is left untouched.
    """
    return matrix.apply(lambda column: column - column.mean())


# Subtract every user's (column) mean rating from the utility matrix
centered_utility_matrix = mean_center_columns(utility_matrix)
display(centered_utility_matrix)


# Business-to-business similarities on the centered ratings
similarity = create_similarity_matrix_cosine(centered_utility_matrix)
display(similarity)

In [None]:
# Store the similarity matrix and read it straight back.
# NOTE(review): similarity_matrix_mesa is not referenced again in the visible
# cells; later cells keep using `similarity`.
similarity.to_pickle('./similarity_pickle.pkl')
similarity_matrix_mesa = pd.read_pickle('./similarity_pickle.pkl')

In [None]:
# TEST neighborhood op eentje

#def select_neighborhood(similarities, ratings, k):
#    """ selects all items with similarity > 0  """
#    moviesNotWatched = ratings[ratings.isna()].index
    
#    similarities = similarities.where(lambda x : x > 0.00).dropna()
#    similarities = similarities.drop(labels=moviesNotWatched,errors='ignore')
    
#    return similarities.sort_values(ascending=False)[:k]

def select_neighborhood(similarities, ratings, k):
    """Select the k most similar items that the user has rated.

    Parameters:
        similarities: Series with the similarity of every item to the target
            item, indexed by item id.
        ratings: Series with the user's ratings, indexed by item id; NaN
            marks an unrated item.
        k: maximum number of neighbours to return.

    Returns:
        The similarities that are greater than 0 and belong to a rated item,
        sorted from high to low and cut off after k entries.
    """
    # Align the ratings with the similarities; items without a rating
    # (missing or NaN) count as unrated.
    rated = ratings.reindex(similarities.index).notna()

    # NaN similarities fail the > 0 test and are dropped as well.
    candidates = similarities[(similarities > 0.) & rated]

    return candidates.sort_values(ascending=False)[:k]

# Example: neighbourhood of one business (column of the similarity matrix)
# for one user (column of the utility matrix), at most 10 neighbours
neighborhood = select_neighborhood(similarity["mRUVMJkUGxrByzMQ2MuOpA"], utility_matrix["6G6_qNcvzRgAQdr2AWvkKw"], 10)
display(neighborhood)

In [None]:
# TEST
def weighted_mean(neighborhood, ratings):
    """Return the similarity-weighted mean of the ratings of the neighbours.

    neighborhood maps item ids to similarities, ratings maps item ids to the
    user's ratings. NaN is returned when the similarities sum to zero
    (for instance for an empty neighbourhood).
    """
    weighted_sum = 0.
    weight_total = 0.

    for item, weight in neighborhood.items():
        weighted_sum += ratings.get(item) * weight
        weight_total += weight

    return np.nan if weight_total == 0. else weighted_sum / weight_total

# Example prediction for one user/business pair: select the neighbourhood,
# then take the similarity-weighted mean of the user's ratings
neighborhood1 = select_neighborhood(similarity["mRUVMJkUGxrByzMQ2MuOpA"], utility_matrix["6G6_qNcvzRgAQdr2AWvkKw"], 10)
prediction1 = weighted_mean(neighborhood1, utility_matrix["6G6_qNcvzRgAQdr2AWvkKw"])


print(f"User 6G6_qNcvzRgAQdr2AWvkKw predicted rating for business mRUVMJkUGxrByzMQ2MuOpA is {prediction1:.2f}")


In [None]:
# First 500 reviews as (business, user, stars) triples to predict.
# NOTE(review): these rows are also among the first 5000 reviews the utility
# matrix was built from, so the true rating is available to the predictor.
test_data = df_reviews[:500][["business_id", "user_id", "stars"]].copy()

def predict_ratings_item_based(similarity, utility, user_item_pairs, k=1000):
    """Predict a rating for every (business, user) pair with item-based CF.

    Parameters:
        similarity: business x business similarity frame.
        utility: business x user utility matrix.
        user_item_pairs: frame with 'business_id' and 'user_id' columns.
        k: maximum neighbourhood size (default 1000, the value that used to
            be hard-coded).

    Returns:
        A copy of ``user_item_pairs`` with an extra 'predicted rating' column;
        the value is NaN when no prediction could be made.
    """
    ratings_test_c = user_item_pairs.copy()
    ratings_test_c['predicted rating'] = 0.
    for index, row in user_item_pairs.iterrows():
        # Ratings of this user for all businesses (looked up once per pair)
        user_ratings = utility[row['user_id']]
        neighborhood = select_neighborhood(similarity[row['business_id']], user_ratings, k)
        ratings_test_c.at[index, 'predicted rating'] = weighted_mean(neighborhood, user_ratings)
    return ratings_test_c

# Predict a rating for every pair in test_data
predicted_item_based = predict_ratings_item_based(similarity, utility_matrix, test_data)
# display(predicted_item_based.head())

### Predicted Ratings

In [None]:
# Same selection and the same call as in the previous cell; the result is
# kept in PR for the MSE computation below.
test_data = df_reviews[:500][["business_id", "user_id", "stars"]].copy()

PR = predict_ratings_item_based(similarity, utility_matrix, test_data)
# display(PR)

### MSE Predicted Ratings

In [None]:
def mse(predicted_ratings):
    """Return the mean squared error between 'stars' and 'predicted rating'.

    A NaN prediction makes the result NaN.
    """
    errors = predicted_ratings['stars'] - predicted_ratings['predicted rating']
    squared_errors = errors ** 2
    return sum(squared_errors) / len(predicted_ratings)
    

# Mean squared error of the item-based predictions
mse_item_based = mse(PR)
print(mse_item_based)

### Random Predictions

In [None]:
# Baseline: predict a uniformly random rating for every test pair
mse_random = 0
predicted_random = test_data.copy()[['user_id', 'business_id', 'stars']]
# random_sample draws from [0, 1), so the predictions lie in [0.5, 5.0).
# NOTE(review): the star ratings presumably range from 1 to 5 — confirm that
# the 0.5 lower bound is intended.
predicted_random['predicted rating'] = 4.5 * np.random.random_sample((len(predicted_random),)) + 0.5

display(predicted_random.head())

mse_random = mse(predicted_random)

print(f"mse for item based prediction: {mse_item_based:.2f}")
print(f"mse for random prediction: {mse_random:.2f}")

### Deel 2: Henderson <a name='henderson'></a>

[Top](#Top)

### Utility Matrix & Similarity Matrix

In [None]:
# Work with the first 5000 reviews only
reviews = df_reviews[:5000]

### Utility Matrix

In [None]:
def pivot_ratings(ratings, fill_value=0):
    """Build the business x user utility matrix from a review frame.

    Parameters:
        ratings: frame with 'business_id', 'user_id' and 'stars' columns.
        fill_value: value used for (business, user) pairs without a review.
            The default 0 keeps the original behaviour; pass None to leave
            those cells as NaN.

    Returns:
        A frame indexed by business_id with one column per user_id holding
        the (mean) star rating.

    Note:
        With the default fill value the matrix holds no NaN, so code that
        detects unrated items through notna()/isnan treats every item as
        rated with 0 stars.
    """
    pivottable = ratings.pivot_table(index="business_id", columns="user_id", values="stars")
    if fill_value is not None:
        pivottable = pivottable.fillna(fill_value)
    return pivottable

# Utility matrix: businesses as rows, users as columns
utility_matrix = pivot_ratings(reviews)
display(utility_matrix)

### Similarity

In [None]:
def cosine_similarity(matrix, id1, id2):
    """Return the cosine similarity between rows id1 and id2 of ``matrix``.

    Only columns in which both rows hold a value (not NaN) are used. The
    result is 1 for identical labels and NaN when the rows share no columns
    or when either row is all zeros on the shared columns.
    """
    row1 = matrix.loc[id1]
    row2 = matrix.loc[id2]
    shared = row1.notna() & row2.notna()
    features1 = row1[shared]
    features2 = row2[shared]

    if id1 == id2:
        return 1

    if not shared.any():
        return np.nan

    # A zero vector has no direction, so the similarity is undefined
    if (features1 == 0).all() or (features2 == 0).all():
        return np.nan

    numerator = sum(features1 * features2)
    denominator = np.sqrt(sum(features1**2)) * np.sqrt(sum(features2**2))
    return numerator / denominator


def create_similarity_matrix_cosine(matrix):
    """Return the pairwise cosine similarity between all rows of ``matrix``.

    Parameters:
        matrix: frame whose rows are compared with cosine_similarity.

    Returns:
        A float frame with the index of ``matrix`` as both index and columns.
    """
    ids = list(matrix.index.values)
    size = len(ids)
    values = np.zeros((size, size), dtype=float)

    # The similarity is symmetric, so every pair is computed once and stored
    # in both triangles. Writing into a numpy array replaces the chained
    # assignment frame[i1][i2] = ..., which does not update the frame when
    # pandas copy-on-write is active.
    for pos1, i1 in enumerate(ids):
        for pos2 in range(pos1, size):
            similarity = cosine_similarity(matrix, i1, ids[pos2])
            values[pos1, pos2] = similarity
            values[pos2, pos1] = similarity

    return pd.DataFrame(values, index=matrix.index, columns=matrix.index)


#similarity = create_similarity_matrix_cosine(utility_matrix)
#display(similarity)

### Mean Centering

In [None]:
def mean_center_columns(matrix):
    """Return a copy of ``matrix`` with the column mean subtracted per column.

    Parameters:
        matrix: numeric frame (here: businesses as rows, users as columns).

    Returns:
        A new frame of the same shape; the input is not modified.
    """
    # One vectorised subtraction instead of a cell-by-cell chained assignment
    # (result[i][j] = ...), which is slow and does not update the frame when
    # pandas copy-on-write is active.
    return matrix.sub(matrix.mean(), axis='columns')


# Subtract every user's (column) mean rating from the utility matrix
centered_utility_matrix = mean_center_columns(utility_matrix)
display(centered_utility_matrix)


# Business-to-business similarities on the centered ratings
similarity = create_similarity_matrix_cosine(centered_utility_matrix)
display(similarity)

In [None]:
# Store the similarity matrix on disk
similarity.to_pickle('./similarity_pickle.pkl')

In [None]:
# Reload the similarity matrix written by the previous cell
similarity = pd.read_pickle('./similarity_pickle.pkl')

### Neighborhood

In [None]:
def select_neighborhood(similarities, ratings, k):
    """Select the k most similar items that the user has rated.

    Parameters:
        similarities: Series with the similarity of every item to the target
            item, indexed by item id.
        ratings: Series with the user's ratings, indexed by item id; NaN
            marks an unrated item.
        k: maximum number of neighbours to return.

    Returns:
        The similarities that are greater than 0 and belong to a rated item,
        sorted from high to low and cut off after k entries.
    """
    # Align the ratings with the similarities; items without a rating
    # (missing or NaN) count as unrated.
    rated = ratings.reindex(similarities.index).notna()

    # NaN similarities fail the > 0 test and are dropped as well. The debug
    # print of every selected similarity has been removed.
    candidates = similarities[(similarities > 0.000) & rated]

    return candidates.sort_values(ascending=False)[:k]
        
    
#print(similarity["-3n__pVgU99k4jaSANVFgw"], utility_matrix["-InhDRRVG7wrwsgAUvN4Qw"])
# Example: neighbourhood of one business (column of the similarity matrix)
# for one user (column of the utility matrix), at most 10 neighbours
neighborhood = select_neighborhood(similarity["znRorbwFubHZaACq8qj2Rg"], utility_matrix["zyFu57CLm1q752bkG9OjXQ"], 10)
display(neighborhood)

In [None]:
# Random 80/20 split of the reviews (fixed random_state), keeping only the
# business, user and stars columns.
# NOTE(review): utility_matrix and similarity were built from all of
# `reviews` before this split, so the ratings of the test rows are part of
# the data the predictions are based on — confirm this is intended.
reviews_copy = reviews.copy()
training_set = reviews_copy.sample(frac=0.80, random_state=0)[["business_id", "user_id", "stars"]]
test_set = reviews_copy.drop(training_set.index)[["business_id", "user_id", "stars"]]

display(training_set)
display(test_set)

### Voorspelling

In [None]:
def weighted_mean(neighborhood, ratings):
    """Return the similarity-weighted mean of the ratings of the neighbours.

    Parameters:
        neighborhood: Series mapping item ids to similarities.
        ratings: Series mapping item ids to the user's ratings.

    Returns:
        sum(similarity * rating) / sum(similarity), or NaN when the
        similarities sum to zero (for instance for an empty neighbourhood).
    """
    a = 0.
    b = 0.

    # Series.items replaces iteritems, which no longer exists in pandas 2.x.
    # The debug print of the full ratings Series has been removed; it ran on
    # every call.
    for (i, m) in neighborhood.items():
        a += m * ratings[i]
        b += m

    if b == 0.:
        return np.nan
    else:
        return a/b
    
# Example prediction for one user/business pair. The ids are named once so
# the printed message always matches the pair that was actually used
# (similarity is indexed by business, utility_matrix columns are users).
example_business = "znRorbwFubHZaACq8qj2Rg"
example_user = "zyFu57CLm1q752bkG9OjXQ"
neighborhood = select_neighborhood(similarity[example_business], utility_matrix[example_user], 10)
prediction = weighted_mean(neighborhood, utility_matrix[example_user])
print(f'User "{example_user}" predicted rating for "{example_business}" is {prediction:.3f}')

### Baseline

### Predict Ratings

In [None]:
def predict_ratings_item_based(similarity, utility, user_item_pairs, k=1000):
    """Predict a rating for every (business, user) pair with item-based CF.

    Parameters:
        similarity: business x business similarity frame.
        utility: business x user utility matrix.
        user_item_pairs: frame with 'business_id' and 'user_id' columns.
        k: maximum neighbourhood size (default 1000, the value that used to
            be hard-coded).

    Returns:
        A copy of ``user_item_pairs`` with an extra 'predicted rating' column;
        the value is NaN when no prediction could be made.
    """
    ratings_test_c = user_item_pairs.copy()
    ratings_test_c['predicted rating'] = 0.
    for index, row in user_item_pairs.iterrows():
        # Ratings of this user for all businesses (looked up once per pair)
        user_ratings = utility[row['user_id']]
        neighborhood = select_neighborhood(similarity[row['business_id']], user_ratings, k)
        ratings_test_c.at[index, 'predicted rating'] = weighted_mean(neighborhood, user_ratings)
    return ratings_test_c

In [None]:
# Predict ratings for every pair in the training and in the test set
PR_training = predict_ratings_item_based(similarity, utility_matrix, training_set)
PR_test = predict_ratings_item_based(similarity, utility_matrix, test_set)

#display(PR_training)
#display(PR_test)
#PR = predict_ratings_item_based(similarity, utility_matrix, test_set)
#test_data = df_reviews[:500][['business_id','user_id','stars']].copy()
#display(PR)

### MSE Predicted Ratings


In [None]:
def mse(predicted_ratings):
    """Return the mean squared error between 'stars' and 'predicted rating'.

    A NaN prediction makes the result NaN.
    """
    errors = predicted_ratings['stars'] - predicted_ratings['predicted rating']
    squared_errors = errors ** 2
    return sum(squared_errors) / len(predicted_ratings)

# Mean squared error of the item-based predictions on both sets
mse_item_based_training = mse(PR_training)
mse_item_based_test = mse(PR_test)

display(mse_item_based_training)
display(mse_item_based_test)

### MSE Random Predictions

In [None]:
# Baseline: predict a uniformly random rating for every test pair
mse_random = 0
predicted_random = test_set.copy()[['user_id', 'business_id', 'stars']]
# random_sample draws from [0, 1), so the predictions lie in [0.5, 5.0).
# NOTE(review): the star ratings presumably range from 1 to 5 — confirm that
# the 0.5 lower bound is intended.
predicted_random['predicted rating'] = 4.5 * np.random.random_sample((len(predicted_random),)) + 0.5

display(predicted_random.head())

mse_random = mse(predicted_random)
print(f'mse for item based prediction: {mse_item_based_test:.2f}')
print(f'mse for random prediction: {mse_random:.2f}')

# Evaluatie

In [None]:
# Create a copy of the DataFrame
# Split data in training and test set

# Same 80/20 sample as before (same frac and random_state), this time
# keeping all review columns.
reviews_copy = reviews.copy()
training_set = reviews_copy.sample(frac=0.80, random_state=0)
test_set = reviews_copy.drop(training_set.index)

In [None]:
def recommended(predictions, threshold):
    """Return the rows whose predicted rating is at least ``threshold``."""
    keep = predictions['predicted rating'] >= threshold
    return predictions[keep]


def hidden(predictions, threshold):
    """Return the rows whose predicted rating is below ``threshold``."""
    keep = predictions['predicted rating'] < threshold
    return predictions[keep]
    
#predicted_item_based    
# A test item is recommended when its predicted rating reaches this value
treshold_recommended = 3.75
recommended_items = recommended(PR_test, treshold_recommended)
hidden_items = hidden(PR_test, treshold_recommended)

# Items with a NaN prediction end up in neither group (both comparisons
# are False for NaN).
print(f'Test items : {PR_test.shape[0]}')
print(f'Recommended: {recommended_items.shape[0]}')
print(f'Hidden     : {hidden_items.shape[0]}')
display(recommended_items.head())
display(hidden_items.head())

In [None]:
def used(predictions, threshold):
    """Return the rows whose actual star rating is at least ``threshold``."""
    keep = predictions['stars'] >= threshold
    return predictions[keep]


def unused(predictions, threshold):
    """Return the rows whose actual star rating is below ``threshold``."""
    keep = predictions['stars'] < threshold
    return predictions[keep]
    
    
# A test item counts as used when its actual star rating reaches this value
treshold_used = 4.0
used_items = used(PR_test, treshold_used)
unused_items = unused(PR_test, treshold_used)

print(f'Test items: {PR_test.shape[0]}')
print(f'Used      : {used_items.shape[0]}')
print(f'Unused    : {unused_items.shape[0]}')
display(used_items.head())
display(unused_items.head())

In [None]:
def confusion(recommended, hidden, used, unused):
    """Return the 2x2 confusion matrix of recommended/hidden versus used/unused.

    Every cell is the number of rows of an inner merge (on all shared
    columns) of the two frames involved.
    """
    def overlap(left, right):
        # Size of the inner merge of both frames
        return len(left.merge(right))

    counts = [
        [overlap(recommended, used), overlap(recommended, unused)],
        [overlap(hidden, used), overlap(hidden, unused)],
    ]
    return pd.DataFrame(counts, index=['recommended', 'hidden'], columns=['used', 'unused'])

# Confusion matrix of the item-based predictions on the test set
confusion_matrix = confusion(recommended_items, hidden_items, used_items, unused_items)
display(confusion_matrix)

In [None]:
def precision(confusion_matrix):
    """Return the precision TP / (TP + FP) of a confusion matrix.

    Parameters:
        confusion_matrix: frame with rows 'recommended'/'hidden' and columns
            'used'/'unused'.

    Returns:
        The fraction of recommended items that were used, or NaN when
        nothing was recommended (the precision is undefined in that case).
    """
    true_positives = confusion_matrix.at['recommended','used']
    false_positives = confusion_matrix.at['recommended','unused']
    recommended_total = true_positives + false_positives

    # Avoid 0/0, which raises FloatingPointError when numpy is set to raise
    if recommended_total == 0:
        return np.nan

    return true_positives / recommended_total

# Precision of the item-based recommendations
precision_item_based = precision(confusion_matrix)

print(f'precision for item based prediction: {precision_item_based:.2f}')

In [None]:
def recall(confusion_matrix):
    """Return the recall TP / (TP + FN) of a confusion matrix.

    Parameters:
        confusion_matrix: frame with rows 'recommended'/'hidden' and columns
            'used'/'unused'.

    Returns:
        The fraction of used items that were recommended, or NaN when there
        are no used items (the recall is undefined in that case).
    """
    true_positives = confusion_matrix.at['recommended','used']
    false_negatives = confusion_matrix.at['hidden','used']
    used_total = true_positives + false_negatives

    # Avoid 0/0, which raises FloatingPointError when numpy is set to raise
    if used_total == 0:
        return np.nan

    return true_positives / used_total

# Recall of the item-based recommendations
recall_item_based = recall(confusion_matrix)

print(f'recall for item based prediction: {recall_item_based:.2f}')

In [None]:
# Compare the item-based predictions with the random baseline at one pair
# of thresholds.
treshold_recommended = 3.75
treshold_used = 4.0

# Split the item-based predictions
recommended_item_based = recommended(PR_test, treshold_recommended)
hidden_item_based = hidden(PR_test, treshold_recommended)
used_item_based = used(PR_test, treshold_used)
unused_item_based = unused(PR_test, treshold_used)

# Split the random predictions in the same way
recommended_random = recommended(predicted_random, treshold_recommended)
hidden_random = hidden(predicted_random, treshold_recommended)
used_random = used(predicted_random, treshold_used)
unused_random = unused(predicted_random, treshold_used)

confusion_item_based = confusion(recommended_item_based,hidden_item_based,used_item_based,unused_item_based)
confusion_random = confusion(recommended_random, hidden_random,used_random,unused_random)

# The item-based precision and recall printed below are the values computed
# in the earlier cells; only the random ones are computed here.
precision_random = precision(confusion_random)
recall_random = recall(confusion_random)

print('           | precision | recall')
print(f'item based |      {precision_item_based:.2f} |   {recall_item_based:.2f}')
print(f'random     |      {precision_random:.2f} |   {recall_random:.2f}')

In [None]:
# Precision/recall curve: keep the "used" threshold fixed and vary the
# "recommended" threshold.
treshold_used = 3.75
treshold_recommended = [2.5,3.,3.5,4.,4.5]

# One recommended/hidden split per threshold for the item-based predictions
recommended_item_based = [recommended(PR_test, limit) for limit in treshold_recommended]
hidden_item_based = [hidden(PR_test, limit) for limit in treshold_recommended]
used_item_based = used(PR_test, treshold_used)
unused_item_based = unused(PR_test, treshold_used)

# The same for the random baseline
recommended_random = [recommended(predicted_random, limit) for limit in treshold_recommended]
hidden_random = [hidden(predicted_random, limit) for limit in treshold_recommended]
used_random = used(predicted_random, treshold_used)
unused_random = unused(predicted_random, treshold_used)

# One confusion matrix per threshold
confusion_item_based = [
    confusion(rec, hid, used_item_based, unused_item_based)
    for rec, hid in zip(recommended_item_based, hidden_item_based)
]
confusion_random = [
    confusion(rec, hid, used_random, unused_random)
    for rec, hid in zip(recommended_random, hidden_random)
]

precision_item_based = [precision(matrix) for matrix in confusion_item_based]
recall_item_based = [recall(matrix) for matrix in confusion_item_based]

precision_random = [precision(matrix) for matrix in confusion_random]
recall_random = [recall(matrix) for matrix in confusion_random]

# Draw both curves and label every point with its threshold
curves = (
    (recall_item_based, precision_item_based),
    (recall_random, precision_random),
)
for recalls, precisions in curves:
    plt.plot(recalls, precisions)
    for r, p, t in zip(recalls, precisions, treshold_recommended):
        plt.text(r, p, t)

plt.xlim(0.0, 1.0)
plt.ylim(0.3, 1.0)
plt.xlabel('recall')
plt.ylabel('precision')

plt.legend(['item_based','random'], loc = 'lower left')