# Download the MovieLens-100K dataset

In [1]:
import pandas as pd
import numpy as np

# Load MovieLens 100K dataset into a dataframe of pandas
names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=names)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [2]:
# Select 500 most active users and 500 most active items from the dataset
n_most_active_users = 500
n_most_active_items = 500

user_ids = df.groupby('user_id').count().sort_values(by='rating', ascending=False).head(n_most_active_users).index
item_ids = df.groupby('item_id').count().sort_values(by='rating', ascending=False).head(n_most_active_items).index
df = df[(df['user_id'].isin(user_ids)) & (df['item_id'].isin(item_ids))]

In [3]:
# Map new internal ID for items
i_ids = df['item_id'].unique().tolist()
item_dict = dict(zip(i_ids, [i for i in range(len(i_ids))]))
df['item_id'] = df['item_id'].map(item_dict)

# Split Dataset

In [4]:
# The number of training users and active users
n_training_users = 300
n_active_users = n_most_active_users - n_training_users

# The number of GIVEN ratings for active users
GIVEN = 20

# Randomly select users from the most active users as training set
random_uids = np.random.choice(df.user_id.unique(), n_training_users, replace=False)
train_df = df[df['user_id'].isin(random_uids)]
# Map new internal ID for all users in the training set
u_ids = train_df['user_id'].unique().tolist()
user_dict = dict(zip(u_ids, [i for i in range(len(u_ids))]))
train_df['user_id'] = train_df['user_id'].map(user_dict)

# The rest of users are active users for testing
remain_df = df[~df['user_id'].isin(random_uids)]
# Map new internal ID for all active users
u_ids = remain_df['user_id'].unique().tolist()
user_dict = dict(zip(u_ids, [i for i in range(len(u_ids))]))
remain_df['user_id'] = remain_df['user_id'].map(user_dict)

# Randomly select GIVEN ratings for active users
active_df = remain_df.groupby('user_id').sample(n=GIVEN, random_state=1024)

test_df = remain_df[~remain_df.index.isin(active_df.index)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['user_id'] = train_df['user_id'].map(user_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  remain_df['user_id'] = remain_df['user_id'].map(user_dict)


In [5]:
# Convert the format of datasets to matrices
df_zeros = pd.DataFrame({'user_id': np.tile(np.arange(0, n_training_users), n_most_active_items), 'item_id': np.repeat(np.arange(0, n_most_active_items), n_training_users), 'rating': 0})
train_ds = df_zeros.merge(train_df, how='left', on=['user_id', 'item_id']).fillna(0.).pivot_table(values='rating_y', index='user_id', columns='item_id')

df_zeros = pd.DataFrame({'user_id': np.tile(np.arange(0, n_active_users), n_most_active_items), 'item_id': np.repeat(np.arange(0, n_most_active_items), n_active_users), 'rating': 0})
active_ds = df_zeros.merge(active_df, how='left', on=['user_id', 'item_id']).fillna(0.).pivot_table(values='rating_y', index='user_id', columns='item_id')
test_ds = df_zeros.merge(test_df, how='left', on=['user_id', 'item_id']).fillna(0.).pivot_table(values='rating_y', index='user_id', columns='item_id')

train_ds, active_ds, test_ds

(item_id  0    1    2    3    4    5    6    7    8    9    ...  490  491  492  \
 user_id                                                    ...                  
 0        0.0  0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  4.0  ...  0.0  0.0  0.0   
 1        4.0  0.0  5.0  0.0  0.0  3.0  4.0  2.0  0.0  2.0  ...  0.0  2.0  0.0   
 2        0.0  0.0  0.0  0.0  5.0  5.0  0.0  3.0  5.0  0.0  ...  0.0  4.0  0.0   
 3        0.0  0.0  0.0  5.0  4.0  4.0  0.0  0.0  0.0  5.0  ...  0.0  0.0  0.0   
 4        0.0  4.0  0.0  0.0  0.0  0.0  3.0  0.0  0.0  4.0  ...  0.0  0.0  3.0   
 ...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
 295      4.0  0.0  4.0  0.0  0.0  4.0  3.0  4.0  0.0  0.0  ...  0.0  0.0  0.0   
 296      0.0  0.0  5.0  4.0  0.0  1.0  0.0  0.0  0.0  1.0  ...  0.0  0.0  0.0   
 297      4.0  0.0  3.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
 298      0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
 299      0.0  1

# Algorithm that you need to implement

In [6]:
## Put all your implementation for your algorithm in this cell only to handle the missing values; 
## and please DO NOT change anything in the rest of the cells in this framework. 

## Note: 
## The user-item rating matrix is train_ds, and the missing values are those 0s in train_ds. 

## The following parameters are required in the given report "", please do not change them. 
LAMBDA = 0.7    # λ
GAMMA = 10      # γ
DELTA = 10      # δ
ITA = 0.7       # η
THETA = 0.7     # θ
EPSILON = 1e-9

# Compute Pearson Correlation Coefficient for All Pairs of Users in training set
user_pearson_corr = np.zeros((train_ds.shape[0], train_ds.shape[0]))

for i, user_i_vec in enumerate(train_ds.values):
    for j, user_j_vec in enumerate(train_ds.values):

        if i == j:
            continue

        # ratings corated by the current pair od users
        mask_i = user_i_vec > 0
        mask_j = user_j_vec > 0

        # corrated item index, skip if there are no corrated ratings
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            continue

        # average value of user_i_vec and user_j_vec
        mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)
        mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)

        # compute pearson corr
        user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
        user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j

        r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
        r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

        r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
        r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

        sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)

        # significance weighting
        weighted_sim = (min(len(corrated_index), GAMMA) / GAMMA) * sim

        user_pearson_corr[i][j] = weighted_sim


# Compute Pearson Correlation Coefficient for All Pairs of Items in training set
item_pearson_corr = np.zeros((train_ds.shape[1], train_ds.shape[1]))

for i, item_i_vec in enumerate(train_ds.T.values):
    for j, item_j_vec in enumerate(train_ds.T.values):

        if i == j:
            continue

        # ratings corated by the current pair od items
        mask_i = item_i_vec > 0
        mask_j = item_j_vec > 0

        # corrated index, skip if there are no corrated ratings
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            continue

        # average value of item_i_vec and item_j_vec
        mean_item_i = np.sum(item_i_vec) / (np.sum(np.clip(item_i_vec, 0, 1)) + EPSILON)
        mean_item_j = np.sum(item_j_vec) / (np.sum(np.clip(item_j_vec, 0, 1)) + EPSILON)

        # compute pearson corr
        item_i_sub_mean = item_i_vec[corrated_index] - mean_item_i
        item_j_sub_mean = item_j_vec[corrated_index] - mean_item_j

        r_ui_sub_ri_sq = np.square(item_i_sub_mean)
        r_uj_sub_rj_sq = np.square(item_j_sub_mean)

        r_ui_sub_ri_sq_sum_sqrt = np.sqrt(np.sum(r_ui_sub_ri_sq))
        r_uj_sub_rj_sq_sum_sqrt = np.sqrt(np.sum(r_uj_sub_rj_sq))

        sim = np.sum(item_i_sub_mean * item_j_sub_mean) / (r_ui_sub_ri_sq_sum_sqrt * r_uj_sub_rj_sq_sum_sqrt + EPSILON)

        # significance weighting
        weighted_sim = (min(len(corrated_index), DELTA) / DELTA) * sim

        item_pearson_corr[i][j] = weighted_sim


# Predicting All Missing Data in training set
imputed_train_ds = train_ds.values.copy()

for (i, j), rating in np.ndenumerate(train_ds.values):
    if rating > 0:
        continue
    sim_user_ids = np.where(user_pearson_corr[i] > ITA)[0]
    sim_item_ids = np.where(item_pearson_corr[j] > THETA)[0]

    if len(sim_user_ids) > 0 and len(sim_item_ids) > 0:
        #==================user-based==================#
        # the coefficient values of similar users
        sim_val = user_pearson_corr[i][sim_user_ids]

        # the average value of the current user's ratings
        sim_users = train_ds.values[sim_user_ids]
        user_mean = np.sum(train_ds.values[i]) / (np.sum(np.clip(train_ds.values[i], 0, 1)) + EPSILON)
        sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

        # select the users who rated item j
        mask_rated_j = sim_users[:, j] > 0
        
        # sim(u, v) * (r_vj - mean_v)
        sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
        
        user_based_pred = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)
        user_based_pred = np.clip(user_based_pred, 0, 5)

        #==================item-based==================#
        # the coefficient values of similar items
        sim_val = item_pearson_corr[j][sim_item_ids]

        # the average value of the current item's ratings
        sim_items = train_ds.T.values[sim_item_ids]
        item_mean = np.sum(train_ds.T.values[j]) / (np.sum(np.clip(train_ds.T.values[j], 0, 1)) + EPSILON)
        sim_item_mean = np.sum(sim_items, axis=1) / (np.sum(np.clip(sim_items, 0, 1), axis=1) + EPSILON)

        # sim(u, v) * (r_v - mean_v)
        sim_r_sum_mean = sim_val * (sim_items[:, i] - sim_item_mean) 

        # filter unrated items
        w = np.clip(sim_items[:, i], 0, 1)
        sim_r_sum_mean *= w

        item_based_pred = item_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val * w) + EPSILON)    
        item_based_pred = np.clip(item_based_pred, 0, 5)

        imputed_train_ds[i][j] = LAMBDA * user_based_pred + (1 - LAMBDA) * item_based_pred

    if len(sim_user_ids) > 0 and len(sim_item_ids) == 0:
        #==================user-based==================#
        # the coefficient values of similar users
        sim_val = user_pearson_corr[i][sim_user_ids]

        # the average value of the current user's ratings
        sim_users = train_ds.values[sim_user_ids]
        user_mean = np.sum(train_ds.values[i]) / (np.sum(np.clip(train_ds.values[i], 0, 1)) + EPSILON)
        sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

        # select the users who rated item j
        mask_rated_j = sim_users[:, j] > 0
        
        # sim(u, v) * (r_vj - mean_v)
        sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
        
        user_based_pred = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)
        user_based_pred = np.clip(user_based_pred, 0, 5)

        imputed_train_ds[i][j] = user_based_pred
    
    if len(sim_user_ids) == 0 and len(sim_item_ids) > 0:
        #==================item-based==================#
        # the coefficient values of similar items
        sim_val = item_pearson_corr[j][sim_item_ids]

        # the average value of the current item's ratings
        sim_items = train_ds.T.values[sim_item_ids]
        item_mean = np.sum(train_ds.T.values[j]) / (np.sum(np.clip(train_ds.T.values[j], 0, 1)) + EPSILON)
        sim_item_mean = np.sum(sim_items, axis=1) / (np.sum(np.clip(sim_items, 0, 1), axis=1) + EPSILON)

        # sim(u, v) * (r_v - mean_v)
        sim_r_sum_mean = sim_val * (sim_items[:, i] - sim_item_mean) 

        # filter unrated items
        w = np.clip(sim_items[:, i], 0, 1)
        sim_r_sum_mean *= w

        item_based_pred = item_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val * w) + EPSILON)    
        item_based_pred = np.clip(item_based_pred, 0, 5)
        
        imputed_train_ds[i][j] = item_based_pred
        
imputed_train_ds = pd.DataFrame(imputed_train_ds)
imputed_train_ds


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,4.224719,3.333333,0.000000,3.958333,5.000000,3.307692,3.934211,0.000000,3.874647,4.000000,...,2.994085,2.878049,3.866667,4.074074,3.200000,2.161290,1.239088,4.169421,2.931034,3.679737
1,4.000000,3.333333,5.000000,3.958333,2.477002,3.000000,4.000000,2.000000,1.874647,2.000000,...,2.765957,2.000000,3.342878,3.981593,3.156220,1.820865,0.858333,4.000000,2.931034,1.705810
2,5.000000,3.333333,0.000000,2.333333,5.000000,5.000000,3.934211,3.000000,5.000000,0.000000,...,3.375511,4.000000,4.924124,4.185191,3.200000,2.261290,3.000000,4.393315,3.000000,1.941064
3,5.000000,3.333333,0.000000,5.000000,4.000000,4.000000,3.934211,0.000000,3.043478,5.000000,...,3.180592,2.878049,3.866667,5.000000,2.587755,2.161290,3.401805,3.470539,2.000000,3.625815
4,3.584754,4.000000,3.241596,3.395747,3.657221,3.533828,3.000000,3.662102,2.537958,4.000000,...,2.943897,2.675406,3.000000,2.892008,3.168247,2.856635,3.017338,2.813486,3.087558,3.144278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,4.000000,3.333333,4.000000,4.333333,3.477002,4.000000,3.000000,4.000000,2.874647,0.000000,...,4.144336,2.878049,2.166667,4.395050,3.200000,2.261290,2.747183,3.169421,2.931034,1.692655
296,3.276001,1.970600,5.000000,4.000000,3.645833,1.000000,3.934211,0.000000,3.043478,1.000000,...,1.388100,1.761770,3.866667,3.156877,2.426667,2.261290,1.808079,2.736219,1.000000,1.431598
297,4.000000,3.333333,3.000000,3.958333,3.645833,3.234522,3.934211,0.000000,3.043478,0.000000,...,2.765957,2.520084,3.260912,4.849807,3.100100,2.161290,2.636559,2.808316,2.931034,3.670467
298,4.297266,4.432506,4.496675,5.000000,4.123601,4.702862,4.210114,4.903579,3.942894,4.903579,...,3.809638,4.087383,4.030767,4.273906,3.686870,3.678238,3.729851,4.336236,3.909161,3.734818


# Evaluation

### Compute Pearson Correlation Coefficient of All Pairs of Items between active set and imputed training set

In [8]:
active_item_pearson_corr = np.zeros((active_ds.shape[1], imputed_train_ds.shape[1]))

# Compute Pearson Correlation Coefficient of All Pairs of Items between active set and imputed training set
for i, item_i_vec in enumerate(active_ds.T.values):
    for j, item_j_vec in enumerate(imputed_train_ds.T.values):

        if i == j:
            continue

        # Put all your implementation for your algorithm here
        # active_item_pearson_corr[i, j] = ?

        # ratings corated by the current pair od items
        mask_i = item_i_vec > 0
        mask_j = item_j_vec > 0

        # corrated index, skip if there are no corrated ratings
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            continue

        # average value of item_i_vec and item_j_vec
        mean_item_i = np.sum(item_i_vec) / (np.sum(np.clip(item_i_vec, 0, 1)) + EPSILON)
        mean_item_j = np.sum(item_j_vec) / (np.sum(np.clip(item_j_vec, 0, 1)) + EPSILON)

        # compute pearson corr
        item_i_sub_mean = item_i_vec[corrated_index] - mean_item_i
        item_j_sub_mean = item_j_vec[corrated_index] - mean_item_j

        r_ui_sub_ri_sq = np.square(item_i_sub_mean)
        r_uj_sub_rj_sq = np.square(item_j_sub_mean)

        r_ui_sub_ri_sq_sum_sqrt = np.sqrt(np.sum(r_ui_sub_ri_sq))
        r_uj_sub_rj_sq_sum_sqrt = np.sqrt(np.sum(r_uj_sub_rj_sq))

        sim = np.sum(item_i_sub_mean * item_j_sub_mean) / (r_ui_sub_ri_sq_sum_sqrt * r_uj_sub_rj_sq_sum_sqrt + EPSILON)

        # significance weighting
        weighted_sim = (min(len(corrated_index), DELTA) / DELTA) * sim

        active_item_pearson_corr[i][j] = weighted_sim

active_item_pearson_corr

array([[ 0.00000000e+00,  2.31472290e-01, -1.95760684e-01, ...,
        -6.60895559e-02,  1.17677792e-01, -3.79290715e-01],
       [-1.29397347e-01,  0.00000000e+00,  1.89826929e-01, ...,
         2.01058086e-01,  6.58238709e-02,  4.82247840e-02],
       [-3.57056212e-02,  2.63336521e-01,  0.00000000e+00, ...,
         1.51089219e-01,  1.79837871e-01,  3.09124939e-02],
       ...,
       [-1.83995613e-01,  2.88952749e-01, -1.52412416e-01, ...,
         0.00000000e+00, -4.08620055e-10, -4.30270005e-01],
       [-1.68880626e-01,  6.41550043e-02,  0.00000000e+00, ...,
        -1.96464476e-01,  0.00000000e+00, -3.80446161e-02],
       [-1.69460879e-01,  1.96843792e-01,  9.99999997e-02, ...,
         1.99906694e-01,  1.93676411e-01,  0.00000000e+00]])

## Predict Ratings of Testing Set

In [None]:
test_ds_pred = np.zeros_like(test_ds.values)

for (i, j), rating in np.ndenumerate(test_ds.values):

    if rating > 0:

        sim_item_ids = np.argsort(active_item_pearson_corr[j])[-10:]

        #==================item-based==================#
        # the coefficient values of similar items
        sim_val = active_item_pearson_corr[j][sim_item_ids]

        # the average value of the current item's ratings
        sim_items = train_ds.T.values[sim_item_ids]
        item_mean = np.sum(active_ds.T.values[j]) / (np.sum(np.clip(active_ds.T.values[j], 0, 1)) + EPSILON)
        sim_item_mean = np.sum(sim_items, axis=1) / (np.sum(np.clip(sim_items, 0, 1), axis=1) + EPSILON)

        # sim(u, v) * (r_v - mean_v)
        sim_r_sum_mean = sim_val * (sim_items[:, i] - sim_item_mean) 

        # filter unrated items
        w = np.clip(sim_items[:, i], 0, 1)
        sim_r_sum_mean *= w

        item_based_pred = item_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val * w) + EPSILON)    
        item_based_pred = np.clip(item_based_pred, 0, 5)

        test_ds_pred[i][j] = item_based_pred

        
test_ds_pred

In [9]:
test_ds_pred = np.zeros_like(test_ds.values)

for (i, j), rating in np.ndenumerate(test_ds.values):

    if rating > 0:

        sim_user_ids = np.where(active_user_pearson_corr[i] > ITA)[0]
        sim_item_ids = np.where(active_item_pearson_corr[j] > THETA)[0]

        if len(sim_user_ids) == 0 and len(sim_item_ids) == 0:
            # Put all your implementation for your algorithm here
            # test_ds_pred[i, j] = ?
            user_mean = np.sum(active_ds.values[i]) / (np.sum(np.clip(active_ds.values[i], 0, 1)) + EPSILON)
            item_mean = np.sum(active_ds.values.T[j]) / (np.sum(np.clip(active_ds.values.T[j], 0, 1)) + EPSILON)
            test_ds_pred[i][j] = LAMBDA * user_mean + (1 - LAMBDA) * item_mean

        if len(sim_user_ids) > 0 and len(sim_item_ids) > 0:
            # Put all your implementation for your algorithm here
            # test_ds_pred[i, j] = ?
            #==================user-based==================#
            # the coefficient values of similar users
            sim_val = active_user_pearson_corr[i][sim_user_ids]

            # the average value of the current user's ratings
            sim_users = imputed_train_ds.values[sim_user_ids]
            user_mean = np.sum(active_ds.values[i]) / (np.sum(np.clip(active_ds.values[i], 0, 1)) + EPSILON)
            sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

            # select the users who rated item j
            mask_rated_j = sim_users[:, j] > 0
            
            # sim(u, v) * (r_vj - mean_v)
            sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
            
            user_based_pred = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)
            user_based_pred = np.clip(user_based_pred, 0, 5)

            #==================item-based==================#
            # the coefficient values of similar items
            sim_val = active_item_pearson_corr[j][sim_item_ids]

            # the average value of the current item's ratings
            sim_items = imputed_train_ds.T.values[sim_item_ids]
            item_mean = np.sum(active_ds.T.values[j]) / (np.sum(np.clip(active_ds.T.values[j], 0, 1)) + EPSILON)
            sim_item_mean = np.sum(sim_items, axis=1) / (np.sum(np.clip(sim_items, 0, 1), axis=1) + EPSILON)

            # sim(u, v) * (r_v - mean_v)
            sim_r_sum_mean = sim_val * (sim_items[:, i] - sim_item_mean) 

            # filter unrated items
            w = np.clip(sim_items[:, i], 0, 1)
            sim_r_sum_mean *= w

            item_based_pred = item_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val * w) + EPSILON)    
            item_based_pred = np.clip(item_based_pred, 0, 5)

            test_ds_pred[i][j] = LAMBDA * user_based_pred + (1 - LAMBDA) * item_based_pred

        if len(sim_user_ids) > 0 and len(sim_item_ids) == 0:
            # Put all your implementation for your algorithm here
            # test_ds_pred[i, j] = ?
            #==================user-based==================#
            # the coefficient values of similar users
            sim_val = active_user_pearson_corr[i][sim_user_ids]

            # the average value of the current user's ratings
            sim_users = imputed_train_ds.values[sim_user_ids]
            user_mean = np.sum(active_ds.values[i]) / (np.sum(np.clip(active_ds.values[i], 0, 1)) + EPSILON)
            sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

            # select the users who rated item j
            mask_rated_j = sim_users[:, j] > 0
            
            # sim(u, v) * (r_vj - mean_v)
            sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
            
            user_based_pred = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)
            user_based_pred = np.clip(user_based_pred, 0, 5)

            test_ds_pred[i][j] = user_based_pred
        
        if len(sim_user_ids) == 0 and len(sim_item_ids) > 0:
            # Put all your implementation for your algorithm here
            # test_ds_pred[i, j] = ?
            #==================item-based==================#
            # the coefficient values of similar items
            sim_val = active_item_pearson_corr[j][sim_item_ids]

            # the average value of the current item's ratings
            sim_items = imputed_train_ds.T.values[sim_item_ids]
            item_mean = np.sum(active_ds.T.values[j]) / (np.sum(np.clip(active_ds.T.values[j], 0, 1)) + EPSILON)
            sim_item_mean = np.sum(sim_items, axis=1) / (np.sum(np.clip(sim_items, 0, 1), axis=1) + EPSILON)

            # sim(u, v) * (r_v - mean_v)
            sim_r_sum_mean = sim_val * (sim_items[:, i] - sim_item_mean) 

            # filter unrated items
            w = np.clip(sim_items[:, i], 0, 1)
            sim_r_sum_mean *= w

            item_based_pred = item_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val * w) + EPSILON)    
            item_based_pred = np.clip(item_based_pred, 0, 5)
            
            test_ds_pred[i][j] = item_based_pred
        
test_ds_pred


array([[0.   , 0.   , 0.   , ..., 0.   , 3.165, 0.   ],
       [0.   , 3.34 , 0.   , ..., 0.   , 0.   , 0.   ],
       [0.   , 0.   , 4.1  , ..., 0.   , 0.   , 0.   ],
       ...,
       [0.   , 0.   , 0.   , ..., 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , ..., 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , ..., 0.   , 0.   , 0.   ]])

## Compute MAE and RMSE

In [10]:
# MAE
MAE = np.sum(np.abs(test_ds_pred - test_ds.values)) / np.sum(np.clip(test_ds.values, 0, 1))

# RMSE
RMSE = np.sqrt(np.sum(np.square(test_ds_pred - test_ds.values)) / np.sum(np.clip(test_ds.values, 0, 1)))

print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.8197537148201074, RMSE: 1.0419547104577915
