# Effective Missing Data Prediction for Collaborative Filtering

In [1]:
import pandas as pd
import numpy as np

# Load MovieLens 100K dataset into a dataframe of pandas
names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=names)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [2]:
# Select 500 most active users and 500 most active items from the dataset
n_most_active_users = 500
n_most_active_items = 500

user_ids = df.groupby('user_id').count().sort_values(by='rating', ascending=False).head(n_most_active_users).index
item_ids = df.groupby('item_id').count().sort_values(by='rating', ascending=False).head(n_most_active_items).index
df = df[(df['user_id'].isin(user_ids)) & (df['item_id'].isin(item_ids))]

In [3]:
# Map new internal ID for items
i_ids = df['item_id'].unique().tolist()
item_dict = dict(zip(i_ids, [i for i in range(len(i_ids))]))
df['item_id'] = df['item_id'].map(item_dict)

# Split Dataset

In [4]:
# The number of training users and active users
n_training_users = 300
n_active_users = n_most_active_users - n_training_users

# The number of GIVEN ratings for active users
GIVEN = 20

# Randomly select users from the most active users as training set
random_uids = np.random.choice(df.user_id.unique(), n_training_users, replace=False)
train_df = df[df['user_id'].isin(random_uids)]
# Map new internal ID for all users in the training set
u_ids = train_df['user_id'].unique().tolist()
user_dict = dict(zip(u_ids, [i for i in range(len(u_ids))]))
train_df['user_id'] = train_df['user_id'].map(user_dict)

# The rest of users are active users for testing
remain_df = df[~df['user_id'].isin(random_uids)]
# Map new internal ID for all active users
u_ids = remain_df['user_id'].unique().tolist()
user_dict = dict(zip(u_ids, [i for i in range(len(u_ids))]))
remain_df['user_id'] = remain_df['user_id'].map(user_dict)

# Randomly select GIVEN ratings for active users
active_df = remain_df.groupby('user_id').sample(n=GIVEN, random_state=1024)

test_df = remain_df[~remain_df.index.isin(active_df.index)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['user_id'] = train_df['user_id'].map(user_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  remain_df['user_id'] = remain_df['user_id'].map(user_dict)


In [5]:
# Convert the format of datasets to matrices
df_zeros = pd.DataFrame({'user_id': np.tile(np.arange(0, n_training_users), n_most_active_items), 'item_id': np.repeat(np.arange(0, n_most_active_items), n_training_users), 'rating': 0})
train_ds = df_zeros.merge(train_df, how='left', on=['user_id', 'item_id']).fillna(0.).pivot_table(values='rating_y', index='user_id', columns='item_id')

df_zeros = pd.DataFrame({'user_id': np.tile(np.arange(0, n_active_users), n_most_active_items), 'item_id': np.repeat(np.arange(0, n_most_active_items), n_active_users), 'rating': 0})
active_ds = df_zeros.merge(active_df, how='left', on=['user_id', 'item_id']).fillna(0.).pivot_table(values='rating_y', index='user_id', columns='item_id')
test_ds = df_zeros.merge(test_df, how='left', on=['user_id', 'item_id']).fillna(0.).pivot_table(values='rating_y', index='user_id', columns='item_id')

train_ds, active_ds, test_ds

(item_id  0    1    2    3    4    5    6    7    8    9    ...  490  491  492  \
 user_id                                                    ...                  
 0        0.0  2.0  0.0  4.0  0.0  4.0  4.0  0.0  0.0  2.0  ...  0.0  4.0  4.0   
 1        4.0  0.0  5.0  0.0  1.0  0.0  3.0  2.0  0.0  0.0  ...  0.0  0.0  0.0   
 2        0.0  0.0  0.0  0.0  5.0  5.0  0.0  3.0  5.0  0.0  ...  0.0  4.0  0.0   
 3        0.0  0.0  0.0  5.0  4.0  4.0  0.0  0.0  0.0  5.0  ...  0.0  0.0  0.0   
 4        0.0  4.0  0.0  0.0  0.0  0.0  3.0  0.0  0.0  4.0  ...  0.0  0.0  3.0   
 ...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
 295      4.0  0.0  4.0  0.0  0.0  4.0  3.0  4.0  0.0  0.0  ...  0.0  0.0  0.0   
 296      0.0  0.0  5.0  4.0  0.0  1.0  0.0  0.0  0.0  1.0  ...  0.0  0.0  0.0   
 297      4.0  0.0  3.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
 298      0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
 299      0.0  1

In [6]:
# Predicting All Missing Data in training set
imputed_train_ds = train_ds.values.copy()

# Full implementation to predict the missing values

Note: 
The user-item rating matrix is imputed_train_ds, and the missing values are those 0s in imputed_train_ds. 

The following parameters are required in the given report, which is named "Effective Missing Data Prediction for Collaborative Filtering".

In [7]:


LAMBDA = 0.7    # λ
GAMMA = 10      # γ
DELTA = 10      # δ
ITA = 0.7       # η
THETA = 0.7     # θ
EPSILON = 1e-9

# a K value of 50 because it gave the best results in my end
K = 50

#=======================================PCC FOR USER BASED========================================#

imputed_train_ds = pd.DataFrame(imputed_train_ds)
imputed_train_ds

imputed_trained_pearson_corr = np.zeros((train_ds.shape[0], train_ds.shape[0]))

# Compute Pearson Correlation Coefficient of All Pairs of Users in imputed training set
for i, user_i_vec in enumerate(imputed_train_ds.values):
    for j, user_j_vec in enumerate(imputed_train_ds.values):

        # a mask of the ratings corrated by the current pair of users
        mask_i = user_i_vec > 0
        mask_j = user_j_vec > 0

        #===The nominator part===#
        
        # the intersection or the corrated item index, skip if there are no corrated ratings
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            continue

        # calculating the average value of user_i_vec and user_j_vec
        mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)
        mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)

        # compute pearson corr by passing in the corrated index and substract the mean
        user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
        user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j

        
        #===The denominator part===#
        
        #the square of (ra,i − ra) ==> the user_i_sub_mean which is the first part of the nominator
        r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
        
        # the square of (ru,i − ru) ==> the ser_j_sub_mean
        r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

        #the square root of the sum of the square of (ra,i − ra) ==> the user_i_sub_mean
        r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
        #the square root of the sum of the square of (ru,i − ru) ==> the ser_j_sub_mean
        r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

        # finally the similarity calculation for users ==> Equation 1
        sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)
        

        # significance weighting
        weighted_sim = (min(len(corrated_index), GAMMA) / GAMMA) * sim
        imputed_trained_pearson_corr[i][j] = weighted_sim






#=======================================PCC FOR ITEM BASED========================================#


#The reason for making the size 500x500 is because there was an error when setting 
#the size to the correct true size of the dataset, and it only works with a minimum of 500x500
item_trained_pearson_corr = np.zeros((500,500))    

#In the big loop, we use T or transpose, the reason is- 
#we want to interchange each row with its corresponding column


for i, item_i_vec in enumerate(imputed_train_ds.T.values):
    for j, item_j_vec in enumerate(imputed_train_ds.T.values):

        
        mask_i = item_i_vec > 0
        mask_j = item_j_vec > 0

        #===The nominator part===#
        
        # the intersection or the corrated item index, skip if there are no corrated ratings
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            continue

        # average value of item_i_vec and item_j_vec
        mean_item_i = np.sum(item_i_vec) / (np.sum(np.clip(item_i_vec, 0, 1)) + EPSILON)
        mean_item_j = np.sum(item_j_vec) / (np.sum(np.clip(item_j_vec, 0, 1)) + EPSILON)

        
        #compute pearson corr by passing in the corrated index and substract the mean
        item_i_sub_mean = item_i_vec[corrated_index] - mean_item_i
        item_j_sub_mean = item_j_vec[corrated_index] - mean_item_j

        
        #===The denominator Part===#
        
        #the square of (ru,i − ri) ==> the item_i_sub_mean which is the first part of the nominator
        r_ui_sub_r_i_sq = np.square(item_i_sub_mean)
        
        # the square of (ru,i − ri) ==> the item_j_sub_mean
        r_uj_sub_r_j_sq = np.square(item_j_sub_mean)

        #the square root of the sum of the square of (ru,i − ri) ==> the item_j_sub_mean
        r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
        
        #the square root of the sum of the square of (ru,i − ri) ==> the item_j_sub_mean
        r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

        # finally the similarity calculation for items ==> Equation 2
        sim = np.sum(item_i_sub_mean * item_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)

        # significance weighting
        weighted_sim = (min(len(corrated_index), DELTA) / DELTA) * sim
        
        item_trained_pearson_corr[i][j] = weighted_sim
        
        

#FINDING SIMILAR USERS / ITEMS

train_ds_pred = np.zeros_like(imputed_train_ds.values)

#The main loop for finding similar users/items, and predicting the values.
for (i, j), rating in np.ndenumerate(imputed_train_ds.values):
    
    if rating > 0:

        #============USER-BASED============#
        
        # the first set of top K similar users, this set is going to be broken and compared against ITA-
        # to pick the final similar users for the current user. 
        sim_user_ids = np.argsort(imputed_trained_pearson_corr[i])[-(K+1):-1]
        
        # Making the array which will contain the similar users, and set the type to be an int-
        # because we don't want the values to be floats as they are ids
        new_user_ids = np.array([])
        new_user_ids = new_user_ids.astype(int)
      
        # this is the loop where we loop inside each user of those top K similar users
        for x in sim_user_ids:
             
            # we check each user x in the top K set against the current user and get the PCC value
            # if the value is greater than ITA then we pick this user as a similar user
            if imputed_trained_pearson_corr[i][x] > ITA:
                
                # we add the user to a new set
                new_user_ids = np.append(new_user_ids, x)

            # get the coeffecient values of those new similar users.
            sim_val = imputed_trained_pearson_corr[i][new_user_ids]
    
            # take the values of those users
            sim_users = imputed_train_ds.values[new_user_ids]    
        
        # calculate the mean of the user
        user_mean = np.sum(imputed_train_ds.values[i]) / (np.sum(np.clip(imputed_train_ds.values[i], 0, 1)) + EPSILON)
        
        #calculate the mean of those similar users for the current user
        sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

        # a mask in order to filter and have those users who rated item j
        mask_rated_j = sim_users[:, j] > 0
        
        
        # this is one part of the user equation (nominator) ==> sim(u, v) * (r_vj - mean_v)
        sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
        
        # the prediction eqation for users 
        # NOTE: there is no mean value added here, the reason is I already added it inside the conditions down-
        # to avoid adding the mean twice.
        user_based_pred = np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)
      
        
        #============ITEM-BASED============#
        
        # Making the array which will contain the similar items, and set the type to be an int-
        # because we don't want the values to be floats as they are ids
        new_item_ids = np.array([])
        new_item_ids = new_item_ids.astype(int)
        
        # This variable contains all of the top K similar items
        sim_item_ids = np.argsort(item_trained_pearson_corr[j])[-(K + 1): -1]
      
        #This is the loop where we loop through each item from the top K items and check against THETA 
        for x in sim_item_ids:
             
            # Check if the pcc value between the current item and the similar item is greater than THETA
            if item_trained_pearson_corr[j][x] > THETA:
            
                # if this is the case, add this similar item to a new set-
                # containing only items who are greater than THETA in pcc value
                new_item_ids = np.append(new_item_ids, x)

                
            # getting the pcc values for the new similar items set.
            sim_item_val = item_trained_pearson_corr[j][new_item_ids]
        
            # pass the values to the main dataset
            sim_items = imputed_train_ds.T.values[new_item_ids]

        #calculate this item's average
        item_mean = np.sum(imputed_train_ds.T.values[j]) / (np.sum(np.clip(imputed_train_ds.T.values[j], 0, 1)) + EPSILON)
        
        #calculate the similar items' average
        sim_item_mean = np.sum(sim_items, axis=1) / (np.sum(np.clip(sim_items, 0, 1), axis=1) + EPSILON)
        
        #the denominator of the equation (the sum)
        sim_sum_mean = sim_item_val * (sim_items[:, i] - sim_item_mean) 

        
        w = np.clip(sim_items[:, i], 0, 1)
        sim_r_sum_mean = sim_sum_mean * w  
          
        # the prediction eqation for items 
        # NOTE: there is no mean value added here, the reason is I already added it inside the conditions down-
        # to avoid adding the mean twice.
        item_based_pred = np.sum(sim_r_sum_mean) / (np.sum(sim_item_val * w) + EPSILON)

        
        
        #====================THE CONDITIONS PART AS WELL AS THE PREDICTION=================#
        
        #Equation 8 ==> if the both the similar users set as well as the similar items set are not empty
        if new_user_ids.size != 0 and new_item_ids.size != 0:
          
            train_ds_pred[i][j] = LAMBDA * (user_mean + user_based_pred) + (1 - LAMBDA) * (item_mean + item_based_pred)
            train_ds_pred[i][j] = np.clip(train_ds_pred[i][j], 0, 5)
            
            
        #Equation 9 ==> if the set of similar users isn't empty while the set of similar items is empty
        elif new_user_ids.size != 0 and new_item_ids.size == 0:
           
            train_ds_pred[i][j] = (user_mean + user_based_pred)
            train_ds_pred[i][j] = np.clip(train_ds_pred[i][j], 0, 5)  
             
            
        #Equation 10 ==> if the set of similar users is empty while the set of similar items isn't empty
        elif new_user_ids.size == 0 and new_item_ids.size != 0:
       
            train_ds_pred[i][j] = (item_mean + item_based_pred)
            train_ds_pred[i][j] = np.clip(train_ds_pred[i][j], 0, 5) 
            
        
        #Equation 11 ==> if both sets are empty then set the prediction to 0
        elif new_user_ids.size == 0 and new_item_ids.size == 0:
            
            train_ds_pred[i][j] = 0   
            
imputed_train_ds = train_ds_pred

# i made train_ds_pred to have the prediction values and after the prediction part is done I assigned
# the imputed_train_ds to equal to train_ds_pred.
# The reason i did not assign directly is because whenever i use imputed to 
# take the prediction values in the loop i keep getting errors and that's how i avoided it

# Evaluation

### Compute Pearson Correlation Coefficient of All Pairs of Items between active set and imputed training set

In [8]:
imputed_train_ds = pd.DataFrame(imputed_train_ds)
imputed_train_ds

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.000000,3.275990,0.000000,3.739449,0.000000,3.583973,3.887027,0.000000,0.000000,3.674033,...,0.000,3.478206,3.667172,0.000000,3.646973,3.394594,0.000000,0.0,0.000000,0.000000
1,5.000000,0.000000,4.212929,0.000000,2.751726,0.000000,3.689196,3.703704,0.000000,0.000000,...,0.000,0.000000,0.000000,4.084999,2.976744,0.000000,0.000000,0.0,0.000000,1.567856
2,0.000000,0.000000,0.000000,0.000000,5.000000,4.248832,0.000000,3.892893,3.075472,0.000000,...,0.000,4.531081,0.000000,0.000000,0.000000,0.000000,2.096774,0.0,2.784153,0.000000
3,0.000000,0.000000,0.000000,3.892086,3.908254,3.373832,0.000000,0.000000,0.000000,0.000000,...,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,3.052190,0.000000
4,0.000000,3.662128,0.000000,0.000000,0.000000,0.000000,2.690376,0.000000,0.000000,3.154639,...,0.000,0.000000,3.658820,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,4.541002,0.000000,4.292929,0.000000,0.000000,3.373832,3.887500,3.703704,0.000000,0.000000,...,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
296,0.000000,0.000000,4.292929,3.892086,0.000000,3.373832,0.000000,0.000000,0.000000,0.000000,...,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,2.810811,0.000000
297,3.738485,0.000000,3.769697,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
298,0.000000,0.000000,0.000000,4.197477,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000


In [9]:
active_user_pearson_corr = np.zeros((active_ds.shape[0], train_ds.shape[0]))

# Compute Pearson Correlation Coefficient of All Pairs of Users between active set and imputed training set
for i, user_i_vec in enumerate(active_ds.values):
    for j, user_j_vec in enumerate(imputed_train_ds.values):
        
        # ratings corated by the current pair od users
        mask_i = user_i_vec > 0
        mask_j = user_j_vec > 0

        # corrated item index, skip if there are no corrated ratings
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            continue

        # average value of user_i_vec and user_j_vec
        mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)
        mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)

        # compute pearson corr
        user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
        user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j

        r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
        r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

        r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
        r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

        sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)

        # significance weighting
        weighted_sim = (min(len(corrated_index), GAMMA) / GAMMA) * sim

        active_user_pearson_corr[i][j] = weighted_sim

active_user_pearson_corr

array([[-0.26923265, -0.11917643, -0.11548116, ..., -0.01339985,
         0.15839501,  0.17667748],
       [-0.14204254,  0.23799415, -0.1837835 , ..., -0.09195238,
         0.11807396, -0.03917053],
       [-0.14907712, -0.20083562,  0.29992195, ..., -0.18307445,
         0.14165623, -0.30144471],
       ...,
       [-0.12789834,  0.17694681,  0.15679091, ...,  0.1509176 ,
         0.19461845,  0.20828322],
       [ 0.41966109, -0.05944625, -0.05517665, ...,  0.03151726,
        -0.1946242 , -0.1929105 ],
       [ 0.21222837,  0.28088528,  0.02545523, ..., -0.1872918 ,
         0.26957021,  0.18250369]])

## Predict Ratings of Testing Set

In [10]:
K = 10

test_ds_pred = np.zeros_like(test_ds.values)

for (i, j), rating in np.ndenumerate(test_ds.values):

    if rating > 0:

        sim_user_ids = np.argsort(active_user_pearson_corr[i])[-1:-(K + 1):-1]

        #==================user-based==================#
        # the coefficient values of similar users
        sim_val = active_user_pearson_corr[i][sim_user_ids]

        # the average value of the current user's ratings
        sim_users = imputed_train_ds.values[sim_user_ids]
        user_mean = np.sum(active_ds.values[i]) / (np.sum(np.clip(active_ds.values[i], 0, 1)) + EPSILON)
        sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

        # select the users who rated item j
        mask_rated_j = sim_users[:, j] > 0
        
        # sim(u, v) * (r_vj - mean_v)
        sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
        
        user_based_pred = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)
        user_based_pred = np.clip(user_based_pred, 0, 5)

        test_ds_pred[i][j] = user_based_pred
        
test_ds_pred


array([[0.        , 0.        , 0.        , ..., 0.        , 2.7637069 ,
        0.        ],
       [0.        , 0.        , 4.18111228, ..., 0.        , 0.        ,
        0.        ],
       [3.99842276, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

## Compute MAE and RMSE

In [11]:
# MAE
MAE = np.sum(np.abs(test_ds_pred - test_ds.values)) / np.sum(np.clip(test_ds.values, 0, 1))

# RMSE
RMSE = np.sqrt(np.sum(np.square(test_ds_pred - test_ds.values)) / np.sum(np.clip(test_ds.values, 0, 1)))

print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.7800486926349319, RMSE: 0.9829960346158855
