# Model Evaluation 
This book is used to build and hybrid user-based and item-based CF and assign weights to both by calculating the Root Mean Square Error (RMSE). In this way, we can make our prediction more precisely.

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
ratings_matrix = pd.read_pickle('./Dataset/ratings_matrix_new.pkl')
review = pd.read_pickle('./Dataset/review_new.pkl')
user_index = pd.read_pickle('./Dataset/unique_user.pkl')
business_index = pd.read_pickle('./Dataset/unique_business.pkl')

In [4]:
business_index
user_index

Unnamed: 0,business_num,business_id
0,0,kxX2SOes4o-D3ZQBkiMRfA
1,1,YtSqYv1Q_pOltsVPSx54SA
2,2,eFvzHawVJofxSnD7TgbZtg
3,3,kq5Ghhh14r-eCxlVmlyd8w
4,4,oBhJuukGRqPVvYBfTkhuZA
...,...,...
1700,1700,3ut1fzbMfQ1VhFvHpeLOMw
1701,1701,5R3-eCIk4dRBtXo0A5MAzQ
1702,1702,KTgZXj6xh8aN_tLfI-YZ1Q
1703,1703,saVXla5i8TjE51S5uCaf6w


Unnamed: 0,user_num,user_id
0,0,_7bHUi9Uuf5__HHc_Q8guQ
1,1,kSMOJwJXuEUqzfmuFncK4A
2,2,mqBWACmaHflW4eh_Ofp16Q
3,3,Z-xgVb4nM42943m2wbBkFw
4,4,2SEoXb6r6hPKrl9V9VzBgA
...,...,...
168671,168671,-2qfrhPeLqUfcfyZURBPmg
168672,168672,hncXq9D32g-KQKa8hiF9uQ
168673,168673,-2ZzM5wRWnYR3g0aTtLfeg
168674,168674,X-PJ2iZyw_zqhnwgyoqxyw


In [5]:
ratings_matrix
review

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1695,1696,1697,1698,1699,1700,1701,1702,1703,1704
0,5.0,,,,,,,,,,...,,,,,,,,,,
1,2.0,,,,,,,,,,...,,,,,,,,,,
2,5.0,,,,3.0,,,,,,...,,,,,,,,,,
3,5.0,,,,,,,,,,...,,,,,,,,,,
4,5.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168671,,,,,,,,,,,...,,,,,,,,,,
168672,,,,,,,,,,,...,,,,,,,,,,
168673,,,,,,,,,,,...,,,,,,,,,,
168674,,,,,,,,,,,...,,,,,,,,,,


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,date
0,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,2015-01-04 00:01:03
1,HME_ksGph3se7Aze5hxa-Q,kSMOJwJXuEUqzfmuFncK4A,kxX2SOes4o-D3ZQBkiMRfA,2,0,0,1,2014-07-13 17:25:47
2,EJWyA5wpdVMji1j4TwSZqQ,mqBWACmaHflW4eh_Ofp16Q,kxX2SOes4o-D3ZQBkiMRfA,5,13,6,5,2010-08-20 19:16:04
3,T_kAb2NeylB-JdNDKphryw,Z-xgVb4nM42943m2wbBkFw,kxX2SOes4o-D3ZQBkiMRfA,5,1,1,1,2017-01-02 14:25:26
4,NENaCqb6TNj5CyY1LOdI6Q,2SEoXb6r6hPKrl9V9VzBgA,kxX2SOes4o-D3ZQBkiMRfA,5,0,0,0,2015-07-28 17:15:20
...,...,...,...,...,...,...,...,...
474985,ral4AK_Zglae9IS1LOivOw,Gflk362qyCMGLF1bxJJEhw,J8UPVO_FTALzvJ0tlMdr8w,4,1,0,0,2019-04-13 02:22:15
474986,H6jwqzViHdpAR_6iAxjKtQ,o95300V4zO8GXSnBqhKgVQ,_7V_3b2dSSVIXqVMJwGmoQ,1,0,0,1,2021-03-29 01:38:26
474987,_0Vnle-wo_IZEQQMwY_jww,o95300V4zO8GXSnBqhKgVQ,_7V_3b2dSSVIXqVMJwGmoQ,2,0,0,0,2021-02-11 09:11:27
474988,Tll1m0G3TslbKj96Kc6XPQ,JOrDiXIgpb0sjtd7Cr3CdA,_7V_3b2dSSVIXqVMJwGmoQ,3,1,0,1,2015-10-26 14:48:50


In [6]:
ratings_matrix.shape

(168676, 1705)

# Item_based CF

In [7]:
def find_business_similarity(businessA, businessB, ratings_matrix):
    
    users_rated_businessA = ~ratings_matrix.loc[:, businessA].isna()
    users_rated_businessB = ~ratings_matrix.loc[:, businessB].isna()

    users_rated_AB = users_rated_businessA & users_rated_businessB

    ratings_of_businessA = ratings_matrix.loc[users_rated_AB, businessA].values.reshape(1, -1)
    ratings_of_businessB = ratings_matrix.loc[users_rated_AB, businessB].values.reshape(1, -1)
    
    similarity = cosine_similarity(ratings_of_businessA, ratings_of_businessB)[0][0]

    return similarity

In [8]:
def item_item_rating_prediction(target_user, target_business, ratings_matrix):

    similarities_to_target_business = []
    ratings_given_by_target_user = []
    
    list_of_businesses_rated_by_target_user = list(ratings_matrix.loc[:, ~ratings_matrix.iloc[target_user, :].isna()].columns)
    
    for other_business in list_of_businesses_rated_by_target_user:
        try:
            similarity = find_business_similarity(target_business, other_business, ratings_matrix)
            similarities_to_target_business.append(similarity)
            ratings_given_by_target_user.append(ratings_matrix.loc[target_user, other_business])
        except:
            pass
        
    return np.dot(ratings_given_by_target_user, similarities_to_target_business)/np.sum(similarities_to_target_business)

In [9]:
def find_user_similarity(userA, userB, ratings_matrix):
    
    businesses_rated_by_userA = ~ratings_matrix.loc[userA, :].isna()
    businesses_rated_by_userB = ~ratings_matrix.loc[userB, :].isna()
    
    businesses_rated_by_AB = businesses_rated_by_userA & businesses_rated_by_userB
    
    ratings_of_userA = ratings_matrix.loc[userA, businesses_rated_by_AB].values.reshape(1, -1)
    ratings_of_userB = ratings_matrix.loc[userB, businesses_rated_by_AB].values.reshape(1, -1)
    
    similarity = cosine_similarity(ratings_of_userA, ratings_of_userB)[0][0]
    
    return similarity

# User_based CF

In [10]:
def user_item_rating_prediction(target_user, target_business, ratings_matrix):
   
    similarities_to_target_user = []
    ratings_given_to_target_business = []
    
    list_of_users_rating_target_business = list(ratings_matrix[~ratings_matrix.iloc[:, target_business].isna()].index)
    
    for other_user in list_of_users_rating_target_business:
        try:
            similarity = find_user_similarity(target_user, other_user, ratings_matrix)
            similarities_to_target_user.append(similarity)
            ratings_given_to_target_business.append(ratings_matrix.loc[other_user, target_business])
        except:
            pass
    return np.dot(ratings_given_to_target_business, similarities_to_target_user)/np.sum(similarities_to_target_user)

# Split as train/test group

In [11]:
train_df, test_df = train_test_split(ratings_matrix, test_size = 0.00020)
train_df.shape
test_df.shape

(168642, 1705)

(34, 1705)

In [12]:
test_users = pd.Series(list(test_df.index), name = 'user_num')
test_users

0      69121
1      93059
2       7790
3       4877
4     113217
5     144528
6      57456
7      39557
8      54738
9     157366
10    154304
11     17123
12    113614
13    116287
14    102083
15    118941
16     47694
17    136323
18    146477
19    132226
20     41050
21     94537
22     21310
23     55556
24    105408
25    116370
26     40712
27     36292
28     10529
29     20483
30     25034
31    163847
32    144076
33    159158
Name: user_num, dtype: int64

In [13]:
review_edit = review.copy()
review_edit = pd.merge(review_edit,user_index,how = 'left')
review_edit = pd.merge(review_edit,business_index,how = 'left')
review_edit.head(1)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,date,user_num,business_num
0,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,2015-01-04 00:01:03,0,0


In [14]:
review_edit = review_edit.sort_values('date', ascending = False)

In [15]:
review_edit = review_edit.drop_duplicates(subset = ['user_num','business_num'],keep='first')
review_edit.shape

(460362, 10)

In [16]:
review_edit = review_edit[['user_num', 'business_num', 'stars']]
review_edit.tail(1)

Unnamed: 0,user_num,business_num,stars
339782,135575,1230,5


In [17]:
review_edit = pd.merge(review_edit, test_users, on = 'user_num', how = 'inner')
review_edit

Unnamed: 0,user_num,business_num,stars
0,57456,353,5
1,25034,1525,5
2,25034,134,5
3,146477,1253,4
4,146477,1493,5
...,...,...,...
126,17123,418,3
127,7790,922,5
128,7790,6,5
129,39557,272,3


In [18]:
review_edit.insert(3, 'check', 0)

In [19]:
review_edit

Unnamed: 0,user_num,business_num,stars,check
0,57456,353,5,0
1,25034,1525,5,0
2,25034,134,5,0
3,146477,1253,4,0
4,146477,1493,5,0
...,...,...,...,...
126,17123,418,3,0
127,7790,922,5,0
128,7790,6,5,0
129,39557,272,3,0


In [20]:
for record in review_edit.index:
    review_edit.loc[record, ['check']] = ratings_matrix.loc[review_edit.loc[record, ['user_num'][0]], review_edit.loc[record, ['business_num'][0]]]

In [21]:
(review_edit['stars'] - review_edit['check']).value_counts()

0    131
dtype: int64

In [22]:
review_edit = review_edit.drop(columns = {'check'})

In [23]:
review_edit.insert(3, 'item-item', np.nan)
review_edit.insert(4, 'user-item', np.nan)
review_edit

Unnamed: 0,user_num,business_num,stars,item-item,user-item
0,57456,353,5,,
1,25034,1525,5,,
2,25034,134,5,,
3,146477,1253,4,,
4,146477,1493,5,,
...,...,...,...,...,...
126,17123,418,3,,
127,7790,922,5,,
128,7790,6,5,,
129,39557,272,3,,


In [24]:
%%time

for record in review_edit.index:
    user_num = review_edit.loc[record, ['user_num'][0]]
    business_num = review_edit.loc[record, ['business_num'][0]]
    
    review_edit.loc[record, ['user-item']] = round(user_item_rating_prediction(user_num, business_num, ratings_matrix),2)
    review_edit.loc[record, ['item-item']] = round(item_item_rating_prediction(user_num, business_num, ratings_matrix),2)

CPU times: user 1min 27s, sys: 1.98 s, total: 1min 29s
Wall time: 1min 29s


In [25]:
review_edit

Unnamed: 0,user_num,business_num,stars,item-item,user-item
0,57456,353,5,5.00,3.70
1,25034,1525,5,5.00,4.45
2,25034,134,5,5.00,4.30
3,146477,1253,4,4.49,4.06
4,146477,1493,5,4.51,4.27
...,...,...,...,...,...
126,17123,418,3,3.98,3.74
127,7790,922,5,5.00,4.26
128,7790,6,5,5.00,3.67
129,39557,272,3,3.00,3.43


From the above result, we can see item_based is closer to the real 'stars' most of the time.

In [26]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

√[∑di^2/n]=Re: Reflects the degree of dispersion of a data set.

In [27]:
rmse(review_edit['user-item'], review_edit['stars'])
rmse(review_edit['item-item'], review_edit['stars'])

1.0814374092979249

0.9626129357754019

# Hybrid. Give them different weights.

In [28]:
review_edit.insert(5, 'hybrid1-9', (review_edit['user-item']*.1 + review_edit['item-item']*.9))
review_edit.insert(6, 'hybrid2-8', (review_edit['user-item']*.2 + review_edit['item-item']*.8))
review_edit.insert(7, 'hybrid3-7', (review_edit['user-item']*.3 + review_edit['item-item']*.7))
review_edit.insert(8, 'hybrid4-6', (review_edit['user-item']*.4 + review_edit['item-item']*.6))
review_edit.insert(9, 'hybrid5-5', (review_edit['user-item']*.5 + review_edit['item-item']*.5))
review_edit.insert(10, 'hybrid6-4', (review_edit['user-item']*.6 + review_edit['item-item']*.4))
review_edit.insert(11, 'hybrid7-3', (review_edit['user-item']*.7 + review_edit['item-item']*.3))
review_edit.insert(12, 'hybrid8-2', (review_edit['user-item']*.8 + review_edit['item-item']*.2))
review_edit.insert(13, 'hybrid9-1', (review_edit['user-item']*.9 + review_edit['item-item']*.1))

In [29]:
review_edit

Unnamed: 0,user_num,business_num,stars,item-item,user-item,hybrid1-9,hybrid2-8,hybrid3-7,hybrid4-6,hybrid5-5,hybrid6-4,hybrid7-3,hybrid8-2,hybrid9-1
0,57456,353,5,5.00,3.70,4.870,4.740,4.610,4.480,4.350,4.220,4.090,3.960,3.830
1,25034,1525,5,5.00,4.45,4.945,4.890,4.835,4.780,4.725,4.670,4.615,4.560,4.505
2,25034,134,5,5.00,4.30,4.930,4.860,4.790,4.720,4.650,4.580,4.510,4.440,4.370
3,146477,1253,4,4.49,4.06,4.447,4.404,4.361,4.318,4.275,4.232,4.189,4.146,4.103
4,146477,1493,5,4.51,4.27,4.486,4.462,4.438,4.414,4.390,4.366,4.342,4.318,4.294
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,17123,418,3,3.98,3.74,3.956,3.932,3.908,3.884,3.860,3.836,3.812,3.788,3.764
127,7790,922,5,5.00,4.26,4.926,4.852,4.778,4.704,4.630,4.556,4.482,4.408,4.334
128,7790,6,5,5.00,3.67,4.867,4.734,4.601,4.468,4.335,4.202,4.069,3.936,3.803
129,39557,272,3,3.00,3.43,3.043,3.086,3.129,3.172,3.215,3.258,3.301,3.344,3.387


In [30]:
# Print the range of RMSE values for each of the rating prediction columns
# Ranging from using 100% of the item-item ratings to 100% of the user-item ratings
print('item-item:', rmse(review_edit['item-item'], review_edit['stars']))
print('hybrid 1-9:', rmse(review_edit['hybrid1-9'], review_edit['stars']))
print('hybrid 2-8:', rmse(review_edit['hybrid2-8'], review_edit['stars']))
print('hybrid 3-7:', rmse(review_edit['hybrid3-7'], review_edit['stars']))
print('hybrid 4-6:', rmse(review_edit['hybrid4-6'], review_edit['stars']))
print('hybrid 5-5:', rmse(review_edit['hybrid5-5'], review_edit['stars']))
print('hybrid 6-4:', rmse(review_edit['hybrid6-4'], review_edit['stars']))
print('hybrid 7-7:', rmse(review_edit['hybrid7-3'], review_edit['stars']))
print('hybrid 8-8:', rmse(review_edit['hybrid8-2'], review_edit['stars']))
print('hybrid 9-1:', rmse(review_edit['hybrid9-1'], review_edit['stars']))
print('user-item:', rmse(review_edit['user-item'], review_edit['stars']))

item-item: 0.9626129357754019
hybrid 1-9: 0.946348664020365
hybrid 2-8: 0.936389264655374
hybrid 3-7: 0.932936679163299
hybrid 4-6: 0.9360629097407749
hybrid 5-5: 0.9457027154207596
hybrid 6-4: 0.9616602374091071
hybrid 7-7: 0.983628044420752
hybrid 8-8: 1.0112145067323521
hybrid 9-1: 1.043974309443784
user-item: 1.0814374092979249


Conclusion:
1. Item-based approach is much better than User-based filtering system;
2. Hybrid approach can further reduce our Root Squared Mean Error (RSME);
3. 10% item-based and 90% user-based，gives an answer that is close to the correct answer and minimizes RSME at most time.