## Description：
This notebook tries to implement the ItemCF method we tried to improve. The data set used is the Montréal data set in the Yelp data set, and the reviews part of the data set is mainly used because it contains the user's rating of the business. The code is mainly divided into the following steps：
1. Import the yelp data set, filter the data set and divide it into training set and test set.
2. Make 5 recommendations by using training set, and each recommendation generates a recommendation list for the first 200 users in the training set.
3. During the recommendation process, if the popularity of the business is greater than the threshold (here the threshold is 0.6), the business will be punished.
4. Calculate the PRU.

**We should have found the optimal threshold through cross-validation, but we didn't have time to do it before the presentation 3 deadline.**

In [97]:
import os
import time

# For reading JSON files
import json

from operator import itemgetter

# data science imports
import math
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

# utils import
from fuzzywuzzy import fuzz

# visualization imports
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as colors

# For random sampling
import random

# Calculate the similarity matrix by using sparse matrix
import scipy.sparse as sp

# For data Spliting
from sklearn.model_selection import train_test_split

# Compute SRC
from scipy import stats

## Load Json data:

In [3]:
def read_josn (path):
    file = open(path, "r", encoding = 'utf-8')
    df = []
    for line in file.readlines():
        dic = json.loads(line)
        df.append(dic)
    return df

In [163]:
# load business data
path = "data/Charlotte_Data/Charlotte_business.json"
business_list = read_josn(path)
# load reviews data
path = "data/Charlotte_Data/Charlotte_Review.json"
review_list = read_josn(path)
# load user data
path = "data/Charlotte_Data/Charlotte_User.json"
user_list = read_josn(path)

## Transform to dataframe:

In [164]:
# business list into dataframe
df_business = pd.DataFrame()
business_id = []
name = []
for element in business_list:
    business_id.append(element["business_id"])
    name.append(element["name"])

Business_Num_Id = {}
U = 1
B = 1
Business_Id = []
for index in range(len(business_id)):
    if business_id[index] not in Business_Num_Id.keys():
        Business_Num_Id[business_id[index]] = B
        B = B + 1
    Business_Id.append(Business_Num_Id[business_id[index]])
    

df_business["RawBusinessId"] = business_id
df_business["businessId"] = Business_Id
df_business["title"] = name

df_business.head()

Unnamed: 0,RawBusinessId,businessId,title
0,gnKjwL_1w79qoiV3IC_xQQ,1,Musashi Japanese Restaurant
1,HhyxOkGAM07SRYtlQ4wMFQ,2,Queen City Plumbing
2,irft4YkdNsww4DNf_Aftew,3,So Cool Frozen Yogurt
3,BvYU3jvGd0TJ7IyZdfiN2Q,4,Manzetti's Tavern
4,Qnz3ywR7BosTr8qDk6G-Pw,5,Mattress Firm Final Markdown


In [165]:
# User list into dataframe
df_user = pd.DataFrame()
user_id = []
name = []
for element in user_list:
    user_id.append(element["user_id"])
    name.append(element["name"])

User_Num_Id = {}
U = 1
User_Id = []
for index in range(len(user_id)):
    if user_id[index] not in User_Num_Id.keys():
        User_Num_Id[user_id[index]] = U
        U = U + 1
    User_Id.append(User_Num_Id[user_id[index]])
    
df_user["RawUserId"] = user_id
df_user["UserId"] = User_Id
df_user["name"] = name

df_user.head()

Unnamed: 0,RawUserId,UserId,name
0,Ps_zkoSnuv2Gy-QIt0jEJg,1,Lea
1,djQLJTLA4Tx7TpzYCKIqJQ,2,Kris
2,eSlOI3GhroEtcbaD_nFXJQ,3,Jason
3,NmC_ZemjAQ6TvzgeaAuTmA,4,Lindsay
4,gTAMqkDSj8z84XN8YvsSJg,5,Morgan


In [166]:
# review list into dataframe
df_ratings = pd.DataFrame()
#review_id = []
user_id = []
business_id = []
ratings = []
for element in review_list:
    uid = User_Num_Id[element["user_id"]]
    bid = Business_Num_Id[element["business_id"]]
    
    user_id.append(uid)
    business_id.append(bid)
    ratings.append(element["stars"])

#df_ratings["review_id"] = review_id
df_ratings["userId"] = user_id
df_ratings["businessId"] = business_id
df_ratings["rating"] = ratings

df_ratings.head()

Unnamed: 0,userId,businessId,rating
0,5282,166,5.0
1,9782,521,1.0
2,5280,166,5.0
3,1934,245,4.0
4,4678,36,5.0


## Filter data:
### For business:

In [167]:
# get rating frequency
df_business_cnt = pd.DataFrame(df_ratings.groupby('businessId').size(), columns=['count'])
df_business_cnt.head()

Unnamed: 0_level_0,count
businessId,Unnamed: 1_level_1
1,178
2,4
3,6
4,18
5,4


In [168]:
# top quantiles of rating counts
df_business_cnt['count'].quantile(np.arange(1, 0.6, -0.05))

1.00    1895.0
0.95     139.0
0.90      77.0
0.85      50.0
0.80      36.0
0.75      27.0
0.70      21.0
0.65      17.0
Name: count, dtype: float64

In [169]:
# filter data - movies count less than 20
popularity_thres = 10
popular_business = list(set(df_business_cnt.query('count >= @popularity_thres').index))
df_ratings_drop_business = df_ratings[df_ratings.businessId.isin(popular_business)]
print('shape of original ratings data: ', df_ratings.shape)
print('shape of ratings data after dropping unpopular business: ', df_ratings_drop_business.shape)

shape of original ratings data:  (309425, 3)
shape of ratings data after dropping unpopular business:  (285809, 3)


### For users:

In [170]:
# get number of ratings given by every user
df_users_cnt = pd.DataFrame(df_ratings_drop_business.groupby('userId').size(), columns=['count'])
df_users_cnt.head()

Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
1,3
2,11
3,11
4,62
5,3


In [171]:
df_users_cnt['count'].quantile(np.arange(1, 0.5, -0.05))

1.00    799.0
0.95      9.0
0.90      5.0
0.85      4.0
0.80      3.0
0.75      2.0
0.70      2.0
0.65      2.0
0.60      1.0
0.55      1.0
Name: count, dtype: float64

In [172]:
# filter data
ratings_thres = 2
active_users = list(set(df_users_cnt.query('count >= @ratings_thres').index))
df_ratings_drop_users = df_ratings_drop_business[df_ratings_drop_business.userId.isin(active_users)]
print('shape of original ratings data: ', df_ratings.shape)
print('shape of ratings data after dropping both unpopular business and inactive users: ', df_ratings_drop_users.shape)

shape of original ratings data:  (309425, 3)
shape of ratings data after dropping both unpopular business and inactive users:  (231296, 3)


In [173]:
df_ratings_drop_users.index = list(range(0, df_ratings_drop_users.shape[0]))
df_ratings_drop_users = df_ratings_drop_users.drop_duplicates(keep='last', subset=['userId','businessId'])

## Split data into training set and test set:

In [175]:
training,test = train_test_split(df_ratings_drop_users, test_size=0.2, random_state=0)
training.index = list(range(0, training.shape[0]))
training
print(training)

        userId  businessId  rating
0         2596         830     4.0
1        38371        1493     5.0
2        15845        5934     5.0
3        43901        3444     4.0
4        35146        7098     3.0
...        ...         ...     ...
177430   28238        7247     5.0
177431   30452        4852     4.0
177432   71989        7588     5.0
177433    4845        1407     1.0
177434   58014        7949     3.0

[177435 rows x 3 columns]


In [176]:
test.index = list(range(0, test.shape[0]))
print(test)

       userId  businessId  rating
0       27340        5468     5.0
1        4544        6224     3.0
2       20966         372     1.0
3       14194        7448     4.0
4       16673         329     5.0
...       ...         ...     ...
44354   57748        9415     5.0
44355   42697        3371     5.0
44356     528        8350     4.0
44357    5397        2534     4.0
44358    1275        3444     5.0

[44359 rows x 3 columns]


## Compute the user similarity matrix:

In [177]:
row_i = np.array(training['userId'])
col_i = np.array(training['businessId'])
train_m = sp.csr_matrix((np.array(training['rating']), (row_i, col_i)), shape = (max(row_i) + 1, max(col_i) + 1))

In [178]:
train_multiplied = train_m.dot(train_m.T)
diag = np.sqrt(1 / train_multiplied.diagonal())
diag[np.isnan(diag)] = 0
diag[np.isinf(diag)] = 0
diag = sp.csr_matrix(diag)
similarity_m = train_multiplied.multiply(diag)
similarity_m = (similarity_m.T).multiply(diag)

  diag = np.sqrt(1 / train_multiplied.diagonal())


## Compute the RMSEs for both the training set and test set by using the original method:

In [179]:
def get_pred(rt_df, similarity_m, top_k):
    # For each business in test set, find users who have rated it.
    u_buy = {}
    for b in list(rt_df['businessId']):
        u_buy[b] = list(rt_df[rt_df['businessId'] == b]['userId'])
        
    # Calculate the predictive ratings for the given rating dataframe rt_df.
    predict = []
    for i in rt_df.index:
        ui = rt_df['userId'][i]
        bi = rt_df['businessId'][i]
        numerator = 0
        denominator = 0
        top_k_i = list((np.array(similarity_m[ui, u_buy[bi]].todense()).flatten()).argsort()[::-1][0:top_k]) # Use the k users who rated the business b and are most similar to user u_i to predict the ratings.
        if ui in [u_buy[bi][j] for j in top_k_i]:
            # If u_i is in the k users who rated the business b and are most similar to user u_i to predict the ratings,
            # we should delete from the candidate users since we don't want to use u_i's information to predict u_i's rating. 
            top_k_i = list((np.array(similarity_m[ui, u_buy[bi]].todense()).flatten()).argsort()[::-1][1:top_k+1])
        for u in [u_buy[bi][j] for j in top_k_i]:
            numerator += similarity_m[ui, u] * float(rt_df[(rt_df['userId'] == u) & (rt_df['businessId'] == bi)]['rating'])
            denominator += similarity_m[ui, u]
        if denominator == 0:
            predict.append(0)
        else:
            predict.append(numerator/denominator)
    pred_df = pd.DataFrame()
    pred_df['userId'] = list(rt_df['userId'])
    pred_df['businessId'] = list(rt_df['businessId'])
    pred_df['prediction'] = predict
    pred_df.index = rt_df.index
    return pred_df 

In [180]:
pred_train_df = get_pred(training, similarity_m, 20) 
pred_train_df

KeyboardInterrupt: 

In [None]:
RMSE_train = (sum((pred_train_df['prediction'] - training['rating'])**2) / pred_train_df.shape[0])**(1/2)
print('The overall RMSE on the train data is %s'%RMSE_train)

The overall RMSE on the test data is 1.0687550632441847


In [181]:
pred_test_df = get_pred(test, similarity_m, 20) 
pred_test_df

Unnamed: 0,userId,businessId,prediction
0,27340,5468,4.599344
1,4544,6224,3.289313
2,20966,372,4.694030
3,14194,7448,3.962721
4,16673,329,4.352063
...,...,...,...
44354,57748,9415,0.000000
44355,42697,3371,5.000000
44356,528,8350,0.000000
44357,5397,2534,0.000000


In [182]:
RMSE_test = (sum((pred_test_df['prediction'] - test['rating'])**2) / pred_test_df.shape[0])**(1/2)
print('The overall RMSE on the test data is %s'%RMSE_test)

The overall RMSE on the test data is 2.2253182126618145


## Compute the RMSEs for both the training set and test set by using the method with penalty:

### Calculate the normalized popularity for each business:

In [183]:
# Calculate the norm popularity for each business, which is how many times each business is rated by users.
training_popular = training.groupby('businessId').size().reset_index(name = 'popular')
training_popular['popular'] = (training_popular['popular'] -  np.min(training_popular['popular']))/ (np.max(training_popular['popular'])-np.min(training_popular['popular']))

In [184]:
# Calculate the norm popularity for each business, which is how many times each business is rated by users.
test_popular = test.groupby('businessId').size().reset_index(name = 'popular')
test_popular['popular'] = (test_popular['popular'] - np.min(test_popular['popular']))/ (np.max(test_popular['popular'])-np.min(test_popular['popular']))

In [185]:
def get_pred_with_penalty(rt_df, similarity_m, top_k, threshold, popular):
    # For each business in test set, find users who have rated it.
    u_buy = {}
    for b in list(rt_df['businessId']):
        u_buy[b] = list(rt_df[rt_df['businessId'] == b]['userId'])
        
    # Calculate the predictive ratings for the given rating dataframe rt_df.
    predict = []
    for i in rt_df.index:
        ui = rt_df['userId'][i]
        bi = rt_df['businessId'][i]
        numerator = 0
        denominator = 0
        top_k_i = list((np.array(similarity_m[ui, u_buy[bi]].todense()).flatten()).argsort()[::-1][0:top_k]) # Use the k users who rated the business b and are most similar to user u_i to predict the ratings.
        if ui in [u_buy[bi][j] for j in top_k_i]:
            # If u_i is in the k users who rated the business b and are most similar to user u_i to predict the ratings,
            # we should delete from the candidate users since we don't want to use u_i's information to predict u_i's rating. 
            top_k_i = list((np.array(similarity_m[ui, u_buy[bi]].todense()).flatten()).argsort()[::-1][1:top_k+1])
        for u in [u_buy[bi][j] for j in top_k_i]:
            if float(popular[popular['businessId'] == bi]['popular']) > threshold:
                numerator += similarity_m[ui, u] * float(rt_df[(rt_df['userId'] == u) & (rt_df['businessId'] == bi)]['rating']) * float(popular[popular['businessId'] == bi]['popular'])
            else:
                numerator += similarity_m[ui, u] * float(rt_df[(rt_df['userId'] == u) & (rt_df['businessId'] == bi)]['rating']) / float(popular[popular['businessId'] == bi]['popular'])
            denominator += similarity_m[ui, u]
        if denominator == 0:
            predict.append(0)
        else:
            predict.append(numerator/denominator)
    pred_df = pd.DataFrame()
    pred_df['userId'] = list(rt_df['userId'])
    pred_df['businessId'] = list(rt_df['businessId'])
    pred_df['prediction'] = predict
    pred_df.index = rt_df.index
    return pred_df

In [141]:
pred_train_penalty_df = get_pred_with_penalty(training, similarity_m, 20, 0.6, training_popular) 
pred_train_penalty_df

Unnamed: 0,userId,businessId,prediction
0,7962,5291,4.456879
1,935,2146,4.412000
2,3421,4333,4.215910
3,895,2348,4.832512
4,2511,6201,4.527690
...,...,...,...
85079,1975,1219,4.670157
85080,2280,3135,2.837746
85081,19795,1961,4.483163
85082,7269,2737,4.697386


In [143]:
pred_train_penalty_df['prediction']

0        4.456879
1        4.412000
2        4.215910
3        4.832512
4        4.527690
           ...   
85079    4.670157
85080    2.837746
85081    4.483163
85082    4.697386
85083    4.675303
Name: prediction, Length: 85084, dtype: float64

In [145]:
RMSE_train_penalty = (sum((pred_train_penalty_df['prediction'] - training['rating'])**2) / pred_train_penalty_df.shape[0])**(1/2)
print('The overall RMSE on the test data is %s'%RMSE_train_penalty)

The overall RMSE on the test data is 1.0715517656184912


In [186]:
pred_test_penalty_df = get_pred_with_penalty(test, similarity_m, 20, 0.6, test_popular) 
pred_test_penalty_df

Unnamed: 0,userId,businessId,prediction
0,27340,5468,67.150426
1,4544,6224,38.419180
2,20966,372,195.808120
3,14194,7448,19.285242
4,16673,329,45.385802
...,...,...,...
44354,57748,9415,0.000000
44355,42697,3371,121.666667
44356,528,8350,0.000000
44357,5397,2534,0.000000


Normalize the predictive ratings to the interval [0, 5]:

In [204]:
k = (5-0) / (np.max(pred_test_penalty_df['prediction']) - np.min(pred_test_penalty_df['prediction']))
pred_test_penalty_df['prediction'] = k*(pred_test_penalty_df['prediction']-np.min(pred_test_penalty_df['prediction']))

In [206]:
RMSE_test_penalty = (sum((pred_test_penalty_df['prediction'] - test['rating'])**2) / pred_test_penalty_df.shape[0])**(1/2)
print('The overall RMSE on the test data is %s'%RMSE_test_penalty)

The overall RMSE on the test data is 3.673848792136394


## Make recommendations for 10 random user in the test set:

For each user in the test set, we should find out the businesses they havn't rated yet:

In [207]:
u_no_rate_test = {}
bid_test = np.unique(list(test['businessId']))
group_u_test = test.groupby('userId')
for name, group in group_u_test:
    u_no_rate_test[name] = list(set(bid_test) - set(list(group['businessId'])))

In [208]:
# Randomly select 100 users as target users
random.seed(0)
target_u = random.sample(list(u_no_rate_test.keys()), 10)

Make recommendations for 10 random users by using the original method:

In [209]:
rec = {}
for tu in target_u: # For each user in the target users.
    tu_no_rate = u_no_rate_test[tu] # Find all the businesses that hasn't been rated by user tu.

    # Find users who has rated the businesses that hasn't been rated by user tu.
    u_buy = {}
    for b in tu_no_rate:
        u_buy[b] = list(test[test['businessId'] == b]['userId'])

    tu_rec = {}

    for b in tu_no_rate: # For each business b that has not been rated by user tu
        numerator = 0
        denominator = 0
        top_k_i = list((np.array(similarity_m[tu, u_buy[b]].todense()).flatten()).argsort()[::-1][0:20])
        # print('u_buy[b]...')
        # print(u_buy[b])
        # print('top_20...')
        # print([u_buy[b][j] for j in top_k_i])
        for u_rated in [u_buy[b][j] for j in top_k_i]: # For each user who has rated buisness b 
            numerator += similarity_m[tu, u_rated] * float(test[(test['userId'] == u_rated) & (test['businessId'] == b)]['rating'])
            denominator += similarity_m[tu, u_rated]
        if denominator == 0:
            tu_rec[b] = 0
        else:
            tu_rec[b] = numerator/ denominator
    
    rec[tu] = tu_rec

In [210]:
final_rec = {}
for u in rec.keys():
    final_rec[u] = sorted(rec[u].items(), key = lambda x:x[1],reverse = True)[0:20]

Make recommendations for 10 random users by using the penalty method:

In [211]:
rec_penalty = {}
for tu in target_u: # For each user in the target users.
    tu_no_rate = u_no_rate_test[tu] # Find all the businesses that hasn't been rated by user tu.

    # Find users who has rated the businesses that hasn't been rated by user tu.
    u_buy = {}
    for b in tu_no_rate:
        u_buy[b] = list(test[test['businessId'] == b]['userId'])

    tu_rec = {}

    for b in tu_no_rate: # For each business b that has not been rated by user tu
        numerator = 0
        denominator = 0
        top_k_i = list((np.array(similarity_m[tu, u_buy[b]].todense()).flatten()).argsort()[::-1][0:20])
        # print('u_buy[b]...')
        # print(u_buy[b])
        # print('top_20...')
        # print([u_buy[b][j] for j in top_k_i])
        for u_rated in [u_buy[b][j] for j in top_k_i]: # For each user who has rated buisness b
            if float(test_popular[test_popular['businessId'] == b]['popular']) > 0.5:
                numerator += similarity_m[tu, u_rated] * float(test[(test['userId'] == u_rated) & (test['businessId'] == b)]['rating']) * float(test_popular[test_popular['businessId'] == b]['popular']) 
            else:
                numerator += similarity_m[tu, u_rated] * float(test[(test['userId'] == u_rated) & (test['businessId'] == b)]['rating']) / float(test_popular[test_popular['businessId'] == b]['popular']) 
            denominator += similarity_m[tu, u_rated]
        if denominator == 0:
            tu_rec[b] = 0
        else:
            tu_rec[b] = numerator / denominator
    
    rec_penalty[tu] = tu_rec

Normalize the predictive ratings to the interval [0, 5]:

In [212]:
for u in rec_penalty:
    r_l = list(rec_penalty[u].values())
    if max(r_l) == min(r_l):
        k = 0
    else:
        k = (5-0) / (max(r_l) - min(r_l))
    r_l = [k*(x-min(r_l)) for x in r_l]
    j = 0
    for b in rec_penalty[u].keys():
        rec_penalty[u][b] = r_l[j]
        j += 1

ZeroDivisionError: division by zero

In [216]:
final_rec_penalty = {}
for u in rec_penalty.keys():
    final_rec_penalty[u] = sorted(rec_penalty[u].items(), key = lambda x:x[1],reverse = True)[0:20]

## Calculate the PRUs and PRIs:

Define a function to compute SRC:

In [217]:
def get_SRC(recom_list, rank_posi, business_popular):
    # popularity list
    popularity = []
    for pair in recom_list:
        popularity.append(float(business_popular[business_popular['businessId'] == pair[0]]['popular']))
    
    SRC,_ = stats.spearmanr(rank_posi, popularity)
    
    return SRC

Compute the PRU and PRI for the original method:

In [218]:
SRC_list = []
SRC = 0
pop_list = []
for uid in final_rec:
    #print(uid)
    rank_posi = list(np.arange(1, 21))
    SRC = get_SRC(final_rec[uid], rank_posi, test_popular)
    SRC_list.append(SRC)
    
PRU = -np.mean(SRC_list)
PRU

-0.009387276698548127

In [219]:
ave_rank = {}
rank_count = {}
pop = {}
for uid in final_rec:
    recom_list = final_rec[uid] 
    rank_posi = list(np.arange(1, 21))
    for index in range(len(recom_list)):
        bid = recom_list[index][0]
        ave_rank.setdefault(bid,0)
        rank_count.setdefault(bid,0)
        ave_rank[bid] += rank_posi[index]
        rank_count[bid] += 1

for business in ave_rank:
    ave_rank[business] = ave_rank[business]/rank_count[business]
    pop.setdefault(business,0)
    pop[business] = float(test_popular[test_popular['businessId'] == business]['popular'])


SRC,_ = stats.spearmanr(list(ave_rank.values()), list(pop.values()))
PRI = -SRC
PRI

0.021214723802199267

Compute the PRU and PRI for the method with penalty:

In [220]:
SRC_list_penalty = []
SRC = 0
pop_list = []
for uid in final_rec_penalty:
    #print(uid)
    rank_posi = list(np.arange(1, 21))
    # print(final_rec_penalty)
    SRC = get_SRC(final_rec_penalty[uid], rank_posi, test_popular)
    if math.isnan(SRC):
        continue
    SRC_list_penalty.append(SRC)
    
PRU_penalty = -np.mean(SRC_list_penalty)
PRU_penalty



-0.558868152048498

In [221]:
ave_rank = {}
rank_count = {}
pop = {}
for uid in final_rec_penalty:
    recom_list = final_rec_penalty[uid] 
    rank_posi = list(np.arange(1, 21))
    for index in range(len(recom_list)):
        bid = recom_list[index][0]
        ave_rank.setdefault(bid,0)
        rank_count.setdefault(bid,0)
        ave_rank[bid] += rank_posi[index]
        rank_count[bid] += 1

for business in ave_rank:
    ave_rank[business] = ave_rank[business]/rank_count[business]
    pop.setdefault(business,0)
    pop[business] = float(test_popular[test_popular['businessId'] == business]['popular'])


SRC,_ = stats.spearmanr(list(ave_rank.values()), list(pop.values()))
PRI = -SRC
PRI

-0.15567061479339828