# Install and load necesary packages

In [1]:
# Please don't change this cell

import pandas as pd
import numpy as np  

import math
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Please don't change this cell
df = pd.read_csv('ml-100k/u.data', names=['user_id', 'item_id', 'rating', 'timestamp'], sep='\t')

df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


# Split dataset
## Random Train and Test Split

In [3]:
from sklearn.model_selection import train_test_split

n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print(str(n_users) + ' users')
print(str(n_items) + ' items')

train_df, test_df = train_test_split(df, test_size=0.2, random_state = 10)
train_df, test_df

# Training Dataset
train_ds = np.zeros((n_users, n_items))
for row in train_df.itertuples():
    train_ds[row[1]-1, row[2]-1] = row[3]
train_ds = pd.DataFrame(train_ds)

# Testing Dataset
test_ds = np.zeros((n_users, n_items))
for row in test_df.itertuples():
    test_ds[row[1]-1, row[2]-1] = row[3]
test_ds = pd.DataFrame(test_ds)

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)



943 users
1682 items
Construct the rating matrix based on train_df:
     0     1     2     3     4     5     6     7     8     9     ...  1672  \
0     0.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   0.0   
1     4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   2.0  ...   0.0   
2     0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
3     0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
4     4.0   3.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
..    ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
938   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   5.0   0.0  ...   0.0   
939   0.0   0.0   0.0   2.0   0.0   0.0   4.0   5.0   3.0   0.0  ...   0.0   
940   5.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
941   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
942   0.0   5.0   0.0   0.0   0.0   0.0   0.0   0.0   3.0   0.0  ...   0.0

# Utils

In [4]:
# Please don't change this cell
EPSILON = 1e-9

def evaluate(test_ds, predicted_ds):
    '''
    Function for evaluating on MAE and RMSE
    '''
    # MAE
    mask_test_ds = test_ds > 0
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))

    # RMSE
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))

    return MAE, RMSE

# Your Solution

## Get User Similarities

In [5]:
#get user similarities
GAMMA = 30
EPSILON = 1e-9

np_user_pearson_corr = np.zeros((n_users, n_users))

for i, user_i_vec in enumerate(train_ds.values):
    for j, user_j_vec in enumerate(train_ds.values):

        # ratings corated by the current pair of users
        mask_i = user_i_vec > 0
        mask_j = user_j_vec > 0

        # corrated item index, skip if there are no corrated ratings
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            continue

        # average value of user_i_vec and user_j_vec
        mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)
        mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)

        # compute pearson corr
        user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
        user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j

        r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
        r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

        r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
        r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

        sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)

        # significance weighting
        weighted_sim = (min(len(corrated_index), GAMMA) / GAMMA) * sim

        np_user_pearson_corr[i][j] = weighted_sim

np_user_pearson_corr

array([[ 1.        ,  0.15598017,  0.01121976, ...,  0.08421613,
        -0.02888167, -0.05569836],
       [ 0.15598017,  1.        ,  0.04418078, ..., -0.01518162,
         0.02540037,  0.1535033 ],
       [ 0.01121976,  0.04418078,  1.        , ...,  0.0566994 ,
        -0.07264523,  0.03333333],
       ...,
       [ 0.08421613, -0.01518162,  0.0566994 , ...,  0.56666667,
        -0.06666667,  0.00212055],
       [-0.02888167,  0.02540037, -0.07264523, ..., -0.06666667,
         1.        ,  0.17427709],
       [-0.05569836,  0.1535033 ,  0.03333333, ...,  0.00212055,
         0.17427709,  1.        ]])

## Get Deviations

In [6]:
def deviation_j_i(item_j,item_i,u_prime,common_users):
    
    # co-rated index of users who have rated both item i and j (u ∈ S_sub_j,i(χ))
    corrated_index = common_users
    
    #return a deviation of 0 if no users are present that rated both items
    if len(corrated_index) == 0:
            return 0
       
    #to hold: card((Sj,i(χ)))
    card_SX = len(corrated_index) + EPSILON
    
    #to hold: sum[(uj - ui) / card(S_sub_j,i(χ))]
    sum_uj_minus_ui_over_card_Sx = 0
    
    #to hold: sum u∈Sj,i(χ)[((uj − ui) · exp(sim(u, u′)))]
    sum_uj_minus_ui_into_exp_sim=0
    
    #to hold: sum u∈Sj,i(χ)[exp(sim(u, u′)) · card(Sj,i(χ))]
    sum_exp_sim_card=0
    
    #iterate through corated users (u∈Sj,i(χ))
    for user_index in corrated_index:
        #skip same user
        if(user_index==u_prime):
            continue
            
        #to hold uj - ui
        uj_minus_ui = np_train_ds[user_index][item_j]-np_train_ds[user_index][item_i]
        
        #to hold: exp(sim(u, u′))
        exp_sim = math.pow(2,np_user_pearson_corr[user_index,u_prime])
        
        #calc sum[(uj - ui) / card(S_sub_j,i(χ))]
        sum_uj_minus_ui_over_card_Sx += uj_minus_ui/card_SX
        
        #calc sum u∈Sj,i(χ)[((uj − ui) · exp(sim(u, u′)))]
        sum_uj_minus_ui_into_exp_sim += uj_minus_ui*exp_sim
        
        #calc sum u∈Sj,i(χ)[exp(sim(u, u′)) · card(Sj,i(χ))]
        sum_exp_sim_card += exp_sim * card_SX
    
#   LHS of plus sign in dev(j,i) equation
    dev_lhs = LAMDA * sum_uj_minus_ui_over_card_Sx
        
#   RHS of plus sign in dev(j,i) equation
    dev_rhs = (1-LAMDA) * (sum_uj_minus_ui_into_exp_sim/sum_exp_sim_card)
    
    #return final deviation of item i and j
    dev = dev_lhs+dev_rhs    
    return dev      

## Get co-rated users for each item pair

In [7]:
#this cell took -  1 min 30 secs to run
#this cell calculates all users who have rated all pairs of items i and j and stores them in a dictionary to improve efficiency
#the dictionary will hold a key which is a pair of items, eg: co_rated_users_dict[(item_i,item_j)] would return a np array of users who has rated both items

#convert train_ds into numpy array for faster calculations
np_train_ds = train_ds.values

#keep dictionary to store co-rated user of items i and j
co_rated_users_dict = {}

# Iterate over all pairs of items (i, j)
for i in range(n_items):   
    # Get the indices of users who have rated item i
    mask_i = np.where(np_train_ds[:, i] > 0)[0]
    
    for j in range(n_items):
        if(j==i):
            continue
        # Get the indices of users who have rated item j
        mask_j = np.where(np_train_ds[:, j] > 0)[0]

        # Find the common users who have rated both item i and j
        corrated_index = np.intersect1d(mask_i, mask_j)


        # Store the common users in the dictionary
        co_rated_users_dict[(i, j)] = corrated_index
    #uncomment print to track i value, once i reaches 1681 run ends
#     print(i)

## Make Predictions

In [8]:
#this cell took -  15 mins to run
#predict ratings personalised weighted slope one method

np_predictions = np.zeros((n_users, n_items))

K = 100
EPSILON = 1e-9
LAMDA = 0.8

for (u_prime, j), rating in np.ndenumerate(test_ds.values):
    if rating <= 0:
        continue
    
    prediction = 0
    sum_numerator = 0
    sum_denominator = EPSILON
    
    #skip if user has already rated item j
    if(np_train_ds[u_prime][j]!=0):
        continue
    
    #calc Si∈S(u′)−{j}[((devj,i + u′i)cj,i)] / Si∈S(u′)−{j}[C_ji] 
    for i in range(n_items):
        #skip same item
        if(j==i):
            continue

        # co-rated index of users who have rated both item i and j
        corrated_index = co_rated_users_dict[(i, j)]
        #C_sub_ji is all co-rated users of items i,j
        c_j_i = len(corrated_index) + EPSILON
#         print(c_j_i)
        
        #get deviation of item i with respect to item j
        dev_j_i = deviation_j_i(j,i,u_prime,corrated_index)
        
        #get rating of item i by u_prime
        u_prime_i_rating = np_train_ds[u_prime][i]
        
        sum_numerator += (dev_j_i+u_prime_i_rating)*c_j_i
#         sum_denominator += c_j_i
        sum_denominator += 1
        
            
    prediction = sum_numerator/sum_denominator
    np_predictions[u_prime,j] = np.clip(prediction, 0, 5)
    
    #uncomment print(u_prime), when u_prime reaches 942 run ends
#     print(u_prime)
    
np_predictions       

array([[5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## MAE and RMSE

In [9]:
#MAE and RMSE results
MAE, RMSE = evaluate(test_ds, pd.DataFrame(np_predictions))

MAE = np.mean(MAE)
RMSE = np.mean(RMSE)

In [10]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 1.838357066554049, RMSE: 2.0683103676073196
