# Install and load necesary packages

In [2]:
# Please don't change this cell

import pandas as pd
import numpy as np  

import warnings
warnings.filterwarnings("ignore")

## Load the dataset using pandas

In [3]:
# Please don't change this cell
df = pd.read_csv('ml-100k/u.data', names=['user_id', 'item_id', 'rating', 'timestamp'], sep='\t')

df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


# Split dataset


## Randomly select one rating from each user as test set

In [4]:
# please do not change this cell

from sklearn.model_selection import train_test_split

n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print(str(n_users) + ' users')
print(str(n_items) + ' items')

train_df, test_df = train_test_split(df, test_size=0.2, random_state = 10)
train_df, test_df

# Training Dataset
train_ds = np.zeros((n_users, n_items))
item_popularity = np.zeros(n_items)
for row in train_df.itertuples():
    train_ds[row[1]-1, row[2]-1] = row[3]
    item_popularity[row[2]-1] =  item_popularity[row[2]-1] + 1
#train_ds = pd.DataFrame(train_ds)

# Testing Dataset
testsize = 0
test_ds = np.zeros((n_users, n_items))
for row in test_df.itertuples():
    if item_popularity[row[2]-1] > 30:
        test_ds[row[1]-1, row[2]-1] = row[3]
        testsize = testsize + 1
#test_ds = pd.DataFrame(test_ds)

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)

print("Testsize = " + str(testsize))

943 users
1682 items
Construct the rating matrix based on train_df:
[[0. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]
Construct the rating matrix based on test_df:
[[5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Testsize = 17678


# MAE and RMSE Utils

In [5]:
# Please don't change this cell
# you can use this devaluate Utils here, and you can also implement your own MAE and RMSE calculation. 

EPSILON = 1e-9

def evaluate(test_ds, predicted_ds):
    '''
    Function for evaluating on MAE and RMSE
    '''
    # MAE
    mask_test_ds = test_ds > 0
    MAE = np.sum(np.abs(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32))

    # RMSE
    RMSE = np.sqrt(np.sum(np.square(test_ds[mask_test_ds] - predicted_ds[mask_test_ds])) / np.sum(mask_test_ds.astype(np.float32)))

    return MAE, RMSE

# Your Solution for Method 1

In [None]:
# Write your code here for Method 1
# You are required to implement the required solution 1 here. 
# Then, evaluate your implementation by predicting the ratings in the test set (test_ds).
# Finally, save the corresponding MAE and RMSE of your implementation 
# into the following defined corresponding variable. 

# Method 1: User average rating

# create binary matrix for ratings (1 is rated, 0 is not rated)
ratings_mark = (train_ds > 0).astype(int)

# calculate user average ratings
user_rated_counts = ratings_mark.sum(axis=1)
user_ratings_sum = train_ds.sum(axis=1)
user_avg_ratings = user_ratings_sum / (user_rated_counts + EPSILON)

In [7]:
# Make predictions for the test set
predicted_ds_method1 = np.zeros_like(test_ds)
test_mask = test_ds > 0
for user_idx in range(n_users):
    user_test_items = test_mask[user_idx]
    predicted_ds_method1[user_idx, user_test_items] = user_avg_ratings[user_idx]


In [8]:
# get the MAE and RMSE of method 1
MAE_solution1, RMSE_solution1 = evaluate(test_ds, predicted_ds_method1)

In [9]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE_solution1, RMSE_solution1))

MAE: 0.8258905090160901, RMSE: 1.0311430705959619


# Your Solution for Method 2

In [None]:
# Method 2: item average rating
# create binary matrix for ratings (1 is rated, 0 is not rated)
ratings_mark = (train_ds > 0).astype(int)

# calculate user average ratings
item_rated_counts = ratings_mark.sum(axis=0)
item_rating_sums = train_ds.sum(axis=0)
item_avg_ratings = item_rating_sums / (item_rated_counts + EPSILON)

In [11]:
# Make predictions for test items
predicted_ds_method2 = np.zeros_like(test_ds)
test_mask = test_ds > 0

# Still need a loop for applying predictions
for item_idx in range(n_items):
    item_test_users = test_mask[:, item_idx]
    predicted_ds_method2[item_test_users, item_idx] = item_avg_ratings[item_idx]

In [12]:
MAE_solution2, RMSE_solution2 = evaluate(test_ds, predicted_ds_method2)

In [13]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE_solution2, RMSE_solution2))

MAE: 0.7961203951019012, RMSE: 1.001314210158761


# Your Solution for Method 3

#### Method 3: User knn based collaborative filtering 
- Compute the similarity by pearson similarity and cosine similarity, we could add another method to optimize such as Eucludian, in this case compare two correlations above.
- Then choose the better method, we tune the parameters. 
- By doing that, we find weight, then choose the k.

In [19]:
# Write your code here for Method 3
# func to compute Pearson similarity
def compute_pearson_similarity(train_data, sig_weight=25):
    n_users = train_data.shape[0]
    similarity_matrix = np.zeros((n_users, n_users))

    # pre calculate user ratings mean
    user_means = np.zeros(n_users)
    for i in range(n_users):
        mask = train_data[i] > 0
        if np.sum(mask) > 0:  
            user_means[i] = np.sum(train_data[i][mask]) / np.sum(mask)

    # compute similarity

    for i in range(n_users):
        # self similarity always equals to 1
        similarity_matrix[i, i] = 1.0

        for j in range(i + 1, n_users):
            # find co-rated items 
            mask_i = train_data[i] > 0
            mask_j = train_data[j] > 0
            corrated_idx = np.logical_and(mask_i, mask_j)
            corrated_items = np.where(corrated_idx)[0]

            if len(corrated_items) == 0:
                continue

            # center the ratings vectors
            user_i_centered = train_data[i][corrated_items] - user_means[i]
            user_j_centered = train_data[j][corrated_items] - user_means[j]

            # calculate the Pearson correlation coefficient
            r_ui_sub_r_i_sq = np.square(user_i_centered)
            r_uj_sub_r_j_sq = np.square(user_j_centered)

            r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
            r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

            sim = np.sum(user_i_centered * user_j_centered) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)
            weight_sim = (min(len(corrated_items), sig_weight) / sig_weight) * sim
            # Store in both positions (symmetric matrix)
            similarity_matrix[i, j] = weight_sim
            similarity_matrix[j, i] = weight_sim
    
    return similarity_matrix


In [20]:
# func to compute cosine similarity
def compute_consine_similarity(train_data, sig_weight=25):
    n_users = train_data.shape[0]
    similarity_matrix = np.zeros((n_users, n_users))

    # compute similarity
    for i in range(n_users):
        # self similarity always equals to 1
        similarity_matrix[i, i] = 1.0

        for j in range(i + 1, n_users):
            # find co-rated items 
            mask_i = train_data[i] > 0
            mask_j = train_data[j] > 0
            corrated_idx = np.logical_and(mask_i, mask_j)
            num_corated = np.sum(corrated_idx)

            # If no co-rated items, skip
            if num_corated == 0:
                continue

            # Extract ratings for co-rated items
            user_i_corated = train_data[i][corrated_idx]
            user_j_corated = train_data[j][corrated_idx]

            # Calculate cosine similarity
            dot_product = np.sum(user_i_corated * user_j_corated)
            norm_i = np.sqrt(np.sum(np.square(user_i_corated)))
            norm_j = np.sqrt(np.sum(np.square(user_j_corated)))

            sim = dot_product / (norm_i * norm_j + EPSILON)
            weighted_sim = (min(num_corated, sig_weight) / sig_weight) * sim
            
            # Store in both positions (symmetric matrix)
            similarity_matrix[i, j] = weighted_sim
            similarity_matrix[j, i] = weighted_sim
    
    return similarity_matrix

In [21]:
# func to predict ratings using user-based collaborative filtering 
def predict_ratings(train_data, similarity_matrix, test_data, k=30):
    n_users, n_items = train_data.shape 
    predicted_ratings = np.zeros_like(test_data)
    test_mask = test_data > 0

    # pre-calculate user means to fill value for ratings if no neighbors rate the item
    user_means = np.zeros(n_users)
    for i in range(n_users):
        mask = train_data[i] > 0
        if np.sum(mask) > 0:
            user_means[i] = np.sum(train_data[i][mask]) / np.sum(mask)

    # Make the predict for all test items
    for i in range(n_users):
        user_test_items = np.where(test_mask[i])[0]
        if len(user_test_items) == 0:
            continue

        # Find the k most similar users (exluding self)
        similar_users = np.argsort(similarity_matrix[i])[::-1]
        similar_users = similar_users[similar_users != i][:k]

        for item_idx in user_test_items:
            # find the neighbors who rate that item
            mark_rated_item = train_data[similar_users, item_idx] > 0
            neighbors = similar_users[mark_rated_item]

            if len(neighbors) == 0:
                predicted_ratings[i, item_idx] = user_means[i]  # use user mean if no neighbors rated the item
                continue
        
            # Get similarities and ratings of neighbors 
            neighbor_similarities = similarity_matrix[i, neighbors]
            neighbor_ratings = train_data[neighbors, item_idx]

            # Calculate the neighbor means and weight difference
            neighbor_means = user_means[neighbors]
            weighted_diff = neighbor_ratings - neighbor_means

            # Calculate the predicted rating
            sim_sum = np.sum(neighbor_similarities) 
            prediction = user_means[i] + np.sum(neighbor_similarities * weighted_diff) / (sim_sum + EPSILON)
            predicted_ratings[i, item_idx] = np.clip(prediction, 1, 5)

    return predicted_ratings


In [23]:
# calculate similarity matrix using Pearson correlation and Cosine similarity
pearson_sim = compute_pearson_similarity(train_ds, sig_weight=25)
cosine_sim = compute_consine_similarity(train_ds, sig_weight=25)


In [24]:
# Test with different parameters k and weight
# Different k values

k_values = [20, 30, 50, 100]

# save in this result
results = []

In [25]:
# test different k values for both similarity methods
for k in k_values:
    # Test with Pearson similarity
    predicted_ratings_pearson = predict_ratings(train_ds, pearson_sim, test_ds, k=k)
    mae_p, rmse_p = evaluate(test_ds, predicted_ratings_pearson)
    results.append(("Standard", "Pearson", 25, k, mae_p, rmse_p))
    print(f"Pearson k={k}: MAE={mae_p:.4f}, RMSE={rmse_p:.4f}")
    
    # Test with Cosine similarity
    predicted_ratings_cosine = predict_ratings(train_ds, cosine_sim, test_ds, k=k)
    mae_c, rmse_c = evaluate(test_ds, predicted_ratings_cosine)
    results.append(("Standard", "Cosine", 25, k, mae_c, rmse_c))
    print(f"Cosine k={k}: MAE={mae_c:.4f}, RMSE={rmse_c:.4f}")
    

Pearson k=20: MAE=0.7775, RMSE=1.0047
Cosine k=20: MAE=0.7898, RMSE=1.0152
Pearson k=30: MAE=0.7574, RMSE=0.9775
Cosine k=30: MAE=0.7734, RMSE=0.9922
Pearson k=50: MAE=0.7393, RMSE=0.9542
Cosine k=50: MAE=0.7555, RMSE=0.9683
Pearson k=100: MAE=0.7250, RMSE=0.9345
Cosine k=100: MAE=0.7382, RMSE=0.9462


In [None]:
MAE_solution3 = 0 # 0 is an intial value, you need to update this with the actual perofrmancae of your implementation.
RMSE_solution3 = 0 # 0 is an intial value, you need to update this with the actual perofrmance of your implementation.

In [10]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE_solution3, RMSE_solution3))

MAE: 0, RMSE: 0


# Your Solution for Method 4

In [11]:
# Write your code here for Method 4
# You are required to implement the required solution 1 here. 
# Then, evaluate your implementation by predicting the ratings in the test set (test_ds).
# Finally, save the corresponding MAE and RMSE of your implementation 
# into the following defined corresponding variable. 

MAE_solution4 = 0 # 0 is an intial value, you need to update this with the actual perofrmance of your implementation.
RMSE_solution4 = 0 # 0 is an intial value, you need to update this with the actual perofrmance of your implementation.


In [12]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE_solution4, RMSE_solution4))

MAE: 0, RMSE: 0


# Your Solution for Method 5

In [13]:
# Write your code here for Method 5
# You are required to implement the required solution 1 here. 
# Then, evaluate your implementation by predicting the ratings in the test set (test_ds).
# Finally, save the corresponding MAE and RMSE of your implementation 
# into the following defined corresponding variable. 

MAE_solution5 = 0 # 0 is an intial value, you need to update this with the actual perofrmance of your implementation.
RMSE_solution5 = 0 # 0 is an intial value, you need to update this with the actual perofrmance of your implementation.


In [14]:
# Please don't change this cell

print("===================== The MAE and RMSE of Your Implementation =====================")
print("MAE: {}, RMSE: {}" .format(MAE_solution5, RMSE_solution5))

MAE: 0, RMSE: 0
