# Effective Missing Data Prediction

In [61]:
import pandas as pd
import numpy as np

# Load MovieLens 100K dataset into a dataframe of pandas
names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=names)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [62]:
# Select 500 most active users and 500 most active items from the dataset
n_most_active_users = 500
n_most_active_items = 500

user_ids = df.groupby('user_id').count().sort_values(by='rating', ascending=False).head(n_most_active_users).index
item_ids = df.groupby('item_id').count().sort_values(by='rating', ascending=False).head(n_most_active_items).index
df = df[(df['user_id'].isin(user_ids)) & (df['item_id'].isin(item_ids))]

In [63]:
# Map new internal ID for items
i_ids = df['item_id'].unique().tolist()
item_dict = dict(zip(i_ids, [i for i in range(len(i_ids))]))
df['item_id'] = df['item_id'].map(item_dict)

In [64]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
1,186,0,3,891717742
3,244,1,2,880606923
5,298,2,4,884182806
6,115,3,2,881171488
7,253,4,5,891628467


# Split Dataset

In [65]:
# The number of training users and active users
n_training_users = 300
n_active_users = n_most_active_users - n_training_users

# The number of GIVEN ratings for active users
GIVEN = 20

# Randomly select users from the most active users as training set
random_uids = np.random.choice(df.user_id.unique(), n_training_users, replace=False)
train_df = df[df['user_id'].isin(random_uids)]
# Map new internal ID for all users in the training set
u_ids = train_df['user_id'].unique().tolist()
user_dict = dict(zip(u_ids, [i for i in range(len(u_ids))]))
train_df['user_id'] = train_df['user_id'].map(user_dict)

# The rest of users are active users for testing
remain_df = df[~df['user_id'].isin(random_uids)]
# Map new internal ID for all active users
u_ids = remain_df['user_id'].unique().tolist()
user_dict = dict(zip(u_ids, [i for i in range(len(u_ids))]))
remain_df['user_id'] = remain_df['user_id'].map(user_dict)

# Randomly select GIVEN ratings for active users
# n=20 means 20 items should be returmed from each group
active_df = remain_df.groupby('user_id').sample(n=20, random_state=1024)

test_df = remain_df[~remain_df.index.isin(active_df.index)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['user_id'] = train_df['user_id'].map(user_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  remain_df['user_id'] = remain_df['user_id'].map(user_dict)


In [66]:
# Convert the format of datasets to matrices
df_zeros = pd.DataFrame({'user_id': np.tile(np.arange(0, n_training_users), n_most_active_items), 'item_id': np.repeat(np.arange(0, n_most_active_items), n_training_users), 'rating': 0})
train_ds = df_zeros.merge(train_df, how='left', on=['user_id', 'item_id']).fillna(0.).pivot_table(values='rating_y', index='user_id', columns='item_id')

df_zeros = pd.DataFrame({'user_id': np.tile(np.arange(0, n_active_users), n_most_active_items), 'item_id': np.repeat(np.arange(0, n_most_active_items), n_active_users), 'rating': 0})
active_ds = df_zeros.merge(active_df, how='left', on=['user_id', 'item_id']).fillna(0.).pivot_table(values='rating_y', index='user_id', columns='item_id')
test_ds = df_zeros.merge(test_df, how='left', on=['user_id', 'item_id']).fillna(0.).pivot_table(values='rating_y', index='user_id', columns='item_id')

train_ds, active_ds, test_ds

(item_id  0    1    2    3    4    5    6    7    8    9    ...  490  491  492  \
 user_id                                                    ...                  
 0        0.0  2.0  0.0  4.0  0.0  4.0  4.0  0.0  0.0  2.0  ...  0.0  4.0  4.0   
 1        0.0  0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  4.0  ...  0.0  0.0  0.0   
 2        4.0  0.0  5.0  0.0  1.0  0.0  3.0  2.0  0.0  0.0  ...  0.0  0.0  0.0   
 3        3.0  0.0  4.0  0.0  0.0  3.0  2.0  2.0  0.0  5.0  ...  0.0  4.0  0.0   
 4        0.0  4.0  0.0  0.0  0.0  0.0  3.0  0.0  0.0  4.0  ...  0.0  0.0  3.0   
 ...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
 295      0.0  0.0  0.0  0.0  0.0  0.0  0.0  5.0  3.0  5.0  ...  0.0  0.0  0.0   
 296      0.0  0.0  5.0  4.0  0.0  1.0  0.0  0.0  0.0  1.0  ...  0.0  0.0  0.0   
 297      0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
 298      0.0  0.0  0.0  3.0  0.0  0.0  0.0  4.0  0.0  0.0  ...  0.0  0.0  5.0   
 299      0.0  0

In [67]:
# Predicting All Missing Data in training set
imputed_train_ds = train_ds.values.copy()

# Missing Value Prediction Implementation

In [68]:
## The following parameters are required in the given report, 
## which is named "Effective Missing Data Prediction for Collaborative Filtering", 
## and we need to use them. We are not changing the values for this implementation.
LAMBDA = 0.7    # λ
GAMMA = 10      # γ
DELTA = 10      # δ
ITA = 0.7       # η
THETA = 0.7     # θ
EPSILON = 1e-9

# For the prediction of the missing values we require the user by user and item by item pearson similarity matrix
# as per the solution in the paper. So here first the user by user pearson similarity matrix is calculated followed
# by item by item pearson similarity matrix and at last using these data the missing data has been predicted.


# Pearson similarity calculation for each pair of users in the training data set.
# Initialising the user by user pearson similarity matrix for the training data set.
np_user_pearson_corr_user_by_user = np.zeros((n_training_users, n_training_users))
imputed_train_ds = pd.DataFrame(imputed_train_ds)

# Looping the users against each other for forming the pearson similarity matrix.
for selected_user_data in imputed_train_ds.itertuples():
    for looping_user_data in imputed_train_ds.itertuples():
        
        # Forming the ratings vector for the pair of users.
        looping_user_vec = np.array(looping_user_data[1:])
        selected_user_vec = np.array(selected_user_data[1:])
        
        # Finding the indices of items commonly rated by the users.
        common_indices = np.intersect1d(np.where(selected_user_vec > 0), np.where(looping_user_vec>0))
        # If there are no commonly rated items then the user by user similarity is considered 0 and will proceed with the next pair of users. The below condition will make it possible.
        if not len(common_indices):
            continue
        
        # Finding the average ratings of the users.
        looping_user_vec_avg = np.sum(looping_user_vec)/(np.count_nonzero(looping_user_vec) + EPSILON)
        selected_user_vec_avg = np.sum(selected_user_vec)/(np.count_nonzero(selected_user_vec) + EPSILON)
        
        # Centering the ratings of the users by subtracting the average ratings from ratings.
        looping_user_vec_common_avgs= looping_user_vec[common_indices] - looping_user_vec_avg
        selected_user_vec_common_avgs = selected_user_vec[common_indices] - selected_user_vec_avg
        
        # Calculation of squares of the centered user ratings.
        looping_user_vec_common_squares = np.square(looping_user_vec_common_avgs)
        selected_user_vec_common_squares = np.square(selected_user_vec_common_avgs)
        
        # Similarity calculation between the users.
        similarity = (np.sum(looping_user_vec_common_avgs * selected_user_vec_common_avgs)/ 
                      (np.sqrt(np.sum(looping_user_vec_common_squares)) * np.sqrt(np.sum(selected_user_vec_common_squares)) +
                       EPSILON))
        
        # Applying the significance weighting to calculated similarities
        weighted_similarity = (min(len(common_indices), GAMMA)/ GAMMA) * similarity
        
        # Adding the weighted similarity to the pearson similarity matrix
        np_user_pearson_corr_user_by_user[selected_user_data[0]][looping_user_data[0]] = weighted_similarity
    
# Pearson similarity calculation for each pair of items in the training data set.
# Initialising the item by item pearson similarity matrix for the training data set.
np_item_pearson_corr_item_by_item = np.zeros((n_most_active_items, n_most_active_items))

# Looping the items against each other for forming the pearson similarity matrix.
for selected_item_data in imputed_train_ds.transpose().itertuples():
    for looping_item_data in imputed_train_ds.transpose().itertuples():
        
        # Forming the ratings vector for the pair of items.
        looping_item_vec = np.array(looping_item_data[1:])
        selected_item_vec = np.array(selected_item_data[1:])
        
        # Finding the indices of users who have rated the pair of items.
        common_indices = np.intersect1d(np.where(selected_item_vec > 0), np.where(looping_item_vec>0))
        # If there are no common users then the item by item similarity is considered 0 and will proceed with the next pair of items. The below condition will make it possible.
        if not len(common_indices):
            continue
        
        # Finding the average ratings of the items.
        looping_item_vec_avg = np.sum(looping_item_vec)/(np.count_nonzero(looping_item_vec) + EPSILON)
        selected_item_vec_avg = np.sum(selected_item_vec)/(np.count_nonzero(selected_item_vec) + EPSILON)
        
        # Centering the ratings of the items by subtracting the average ratings from ratings.
        looping_item_vec_common_avgs= looping_item_vec[common_indices] - looping_item_vec_avg
        selected_item_vec_common_avgs = selected_item_vec[common_indices] - selected_item_vec_avg
        
        # Calculation of squares of the centered item ratings.
        looping_item_vec_common_squares = np.square(looping_item_vec_common_avgs)
        selected_item_vec_common_squares = np.square(selected_item_vec_common_avgs)
        
        # Similarity calculation between the items.
        similarity = (np.sum(looping_item_vec_common_avgs * selected_item_vec_common_avgs)/ 
                      (np.sqrt(np.sum(looping_item_vec_common_squares)) * np.sqrt(np.sum(selected_item_vec_common_squares)) +
                       EPSILON))
        
        # Applying the significance weighting to calculated similarities
        weighted_similarity = (min(len(common_indices), DELTA)/ DELTA) * similarity
        
        # Adding the weighted similarity to the pearson similarity matrix
        np_item_pearson_corr_item_by_item[selected_item_data[0]][looping_item_data[0]] = weighted_similarity

# Missing value predictions
# Looping thorugh each user, item , rating combination
for (current_user, current_item), rating in np.ndenumerate(imputed_train_ds.values):
    
    # Condition to check whether the rating is 0. (Those ratings need to be predicted)
    if not rating:
        
        # Finding similar user ids and item ids based on the condition mentioned in the paper.
        similar_users_ids_with_current_user_condition_based = np.argwhere(np_user_pearson_corr_user_by_user[current_user] > ITA).flatten()
        similar_items_ids_with_current_item_condition_based = np.argwhere(np_item_pearson_corr_item_by_item[current_item] > THETA).flatten()
        
        # removing the current user and current item from the array.
        similar_users_ids_with_current_user_condition_based = similar_users_ids_with_current_user_condition_based[similar_users_ids_with_current_user_condition_based != current_user]
        similar_items_ids_with_current_item_condition_based = similar_items_ids_with_current_item_condition_based[similar_items_ids_with_current_item_condition_based != current_item]
        
        # Skipping the calculation if there are no similar items and similar users.
        if not len(similar_users_ids_with_current_user_condition_based) and not len(similar_items_ids_with_current_item_condition_based):
            continue
            
        # Finding the pearson coefficients for similar users and items.    
        pearson_coeff_similar_users = np_user_pearson_corr_user_by_user[current_user][similar_users_ids_with_current_user_condition_based]
        pearson_coeff_similar_items = np_item_pearson_corr_item_by_item[current_item][similar_items_ids_with_current_item_condition_based]
            
        # Finding the similar users and items.
        similar_users = imputed_train_ds.values[similar_users_ids_with_current_user_condition_based]
        similar_items = imputed_train_ds.transpose().values[similar_items_ids_with_current_item_condition_based]
        
        # Calculating the current user and item ratings mean
        current_user_mean = np.sum(imputed_train_ds.values[current_user]) / (np.count_nonzero(imputed_train_ds.values[current_user]) + EPSILON)
        current_item_mean = np.sum(imputed_train_ds.transpose().values[current_item]) / (np.count_nonzero(imputed_train_ds.transpose().values[current_item]) + EPSILON)
        
        # Calculating the means of all the similar users and items.
        similar_users_mean = np.sum(similar_users, axis=1)/ (np.count_nonzero(similar_users, axis=1) + EPSILON)
        similar_items_mean = np.sum(similar_items, axis=1)/ (np.count_nonzero(similar_items, axis=1) + EPSILON)
        
        # Condition for finding the users from similar users who has rated current item.
        mask_for_users = similar_users[:,current_item] > 0
        # Condition for finding items from similar items which has been rated by the current user.
        mask_for_items = similar_items[:,current_user] > 0
        
        # Calculation of the numerator values (pearson coeff * (similar user/item - similar user/item mean)) for both user and item based equations.
        equation_numerator_for_users = pearson_coeff_similar_users[mask_for_users] * (similar_users[mask_for_users, current_item] - similar_users_mean[mask_for_users])
        equation_numerator_for_items = pearson_coeff_similar_items[mask_for_items] * (similar_items[mask_for_items, current_user] - similar_items_mean[mask_for_items])
        
        # Calculation of the contribution by the similar users/items for determining the missing value for rating.
        similar_user_contribution_for_missing_value = LAMBDA * (current_user_mean + np.sum(equation_numerator_for_users)/(np.sum(pearson_coeff_similar_users[mask_for_users]) + EPSILON))
        similar_item_contribution_for_missing_value = (1 - LAMBDA) * (current_item_mean + np.sum(equation_numerator_for_items)/(np.sum(pearson_coeff_similar_items[mask_for_items]) + EPSILON))
        
        # Predicted missing value is fitted into the data set to use it for further calculations.
        imputed_train_ds.loc[current_user,current_item] = similar_item_contribution_for_missing_value + similar_user_contribution_for_missing_value

# Evaluation

### Compute Pearson Correlation Coefficient of All Pairs of Items between active set and imputed training set

In [69]:
imputed_train_ds = pd.DataFrame(imputed_train_ds)
imputed_train_ds

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,4.773384,2.000000,3.858157,4.000000,3.631905,4.000000,4.000000,3.671784,3.329546,2.000000,...,3.437049,4.000000,4.000000,3.702078,3.000000,3.000000,3.315039,3.621273,3.541033,3.377337
1,4.027060,3.822324,4.064443,3.929967,5.000000,3.793034,4.293254,3.880195,3.704583,4.000000,...,3.679636,3.801295,3.558522,3.792968,3.723858,3.661998,3.535933,3.824936,3.744714,3.704883
2,4.000000,3.581206,5.000000,3.687211,1.000000,3.895212,3.000000,2.000000,3.540864,0.000000,...,3.480987,3.553548,3.623580,4.000000,1.000000,3.243941,3.259192,3.619155,3.486359,2.000000
3,3.000000,3.423400,4.000000,3.540970,3.677548,3.000000,2.000000,2.000000,3.300217,5.000000,...,3.139294,4.000000,3.484309,3.415390,3.265096,3.171145,3.101744,3.191237,3.271225,3.254461
4,3.450534,4.000000,3.490534,3.357695,2.789510,3.578672,3.000000,3.665430,3.187632,4.000000,...,3.196297,2.999485,3.000000,3.175066,3.047991,3.026877,3.054435,3.134980,3.025812,3.140561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,4.068902,3.944301,0.000000,3.834134,3.943636,3.737538,3.849516,5.000000,3.000000,5.000000,...,3.793275,3.817800,3.837375,3.799350,3.798869,3.597973,3.701528,3.811268,3.829540,3.876241
296,3.011273,2.260679,5.000000,4.000000,3.132104,1.000000,3.247527,2.910556,2.742530,1.000000,...,1.782109,2.676711,3.069327,2.933046,2.771768,2.402720,2.496856,2.789270,1.000000,2.717306
297,4.917099,3.989189,4.929022,5.000000,4.229015,3.986037,4.272382,4.451433,3.572355,4.291366,...,3.207213,3.993434,4.684550,4.973383,3.809335,3.598022,2.979038,4.358734,3.801924,3.941878
298,4.160124,2.802849,3.443326,3.000000,3.169072,3.117646,3.253825,4.000000,2.380384,3.377221,...,1.886821,3.363340,5.000000,3.362280,3.105172,3.000000,2.963161,3.043053,3.178635,3.080179


In [70]:
active_user_pearson_corr = np.zeros((active_ds.shape[0], train_ds.shape[0]))

# Compute Pearson Correlation Coefficient of All Pairs of Users between active set and imputed training set
for i, user_i_vec in enumerate(active_ds.values):
    for j, user_j_vec in enumerate(imputed_train_ds.values):
        
        # ratings corated by the current pair od users
        mask_i = user_i_vec > 0
        mask_j = user_j_vec > 0

        # corrated item index, skip if there are no corrated ratings
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            continue

        # average value of user_i_vec and user_j_vec
        mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)
        mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)

        # compute pearson corr
        user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
        user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j

        r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
        r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

        r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
        r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

        sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)

        # significance weighting
        weighted_sim = (min(len(corrated_index), GAMMA) / GAMMA) * sim

        active_user_pearson_corr[i][j] = weighted_sim

active_user_pearson_corr

array([[ 0.01229161,  0.04031051,  0.00280008, ..., -0.01334131,
         0.31727715, -0.20961611],
       [-0.24496546,  0.2818903 ,  0.17869313, ...,  0.55986397,
         0.36594731,  0.33537382],
       [-0.42084656, -0.01262957,  0.27249925, ...,  0.27956129,
         0.05013149,  0.12308677],
       ...,
       [ 0.22763596,  0.10310597, -0.02987605, ...,  0.38189949,
         0.11947157,  0.0763004 ],
       [ 0.05668059,  0.5216229 , -0.10993777, ...,  0.31617461,
        -0.08481742,  0.08925924],
       [ 0.14947792,  0.29389936, -0.01882839, ...,  0.13568678,
         0.13235258, -0.18547186]])

## Predict Ratings of Testing Set

In [71]:
K = 10

test_ds_pred = np.zeros_like(test_ds.values)

for (i, j), rating in np.ndenumerate(test_ds.values):

    if rating > 0:

        sim_user_ids = np.argsort(active_user_pearson_corr[i])[-1:-(K + 1):-1]

        #==================user-based==================#
        # the coefficient values of similar users
        sim_val = active_user_pearson_corr[i][sim_user_ids]

        # the average value of the current user's ratings
        sim_users = imputed_train_ds.values[sim_user_ids]
        user_mean = np.sum(active_ds.values[i]) / (np.sum(np.clip(active_ds.values[i], 0, 1)) + EPSILON)
        sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

        # select the users who rated item j
        mask_rated_j = sim_users[:, j] > 0
        
        # sim(u, v) * (r_vj - mean_v)
        sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
        
        user_based_pred = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)
        user_based_pred = np.clip(user_based_pred, 0, 5)

        test_ds_pred[i][j] = user_based_pred
        
test_ds_pred


array([[0.        , 0.        , 0.        , ..., 0.        , 3.33513081,
        0.        ],
       [0.        , 0.        , 4.43778735, ..., 0.        , 0.        ,
        0.        ],
       [4.20219527, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [3.72373725, 0.        , 3.77960363, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 4.02021054, 0.        , ..., 0.        , 0.        ,
        0.        ]])

## Compute MAE and RMSE

In [72]:
# MAE
MAE = np.sum(np.abs(test_ds_pred - test_ds.values)) / np.sum(np.clip(test_ds.values, 0, 1))

# RMSE
RMSE = np.sqrt(np.sum(np.square(test_ds_pred - test_ds.values)) / np.sum(np.clip(test_ds.values, 0, 1)))

print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.7657677902122191, RMSE: 0.9744411877844816
