In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the tab-separated, header-less train and test splits with one shared
# schema; the first path yields the training frame, the second the test frame.
train_df, test_df = (
    pd.read_csv(
        split_path,
        sep="\t",
        header=None,
        names=["user_id", "item_id", "rating", "timestamp"],
    )
    for split_path in ("ml-100k/u1.base", "ml-100k/u1.test")
)

In [3]:
# Preview the first five training rows to confirm the columns parsed as expected.
train_df.head(n=5)

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [4]:
# Reshape the long rating log into a user x item table: one row per user_id,
# one column per item_id, NaN wherever a user has not rated an item.
ratings_matrix = train_df.pivot_table(
    index="user_id",
    columns="item_id",
    values="rating",
)
ratings_matrix

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,,4.0,1.0,5.0,,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [5]:
# Mean-centre every user's row: subtract that user's average rating (computed
# across the item columns, NaN skipped) from each of their ratings.
normalized_ratings_matrix = ratings_matrix.sub(
    ratings_matrix.mean(axis="columns"),
    axis="index",
)
normalized_ratings_matrix

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.318519,-0.681481,0.318519,-0.681481,-0.681481,,0.318519,-2.681481,1.318519,,...,,,,,,,,,,
2,0.200000,,,,,,,,,-1.8,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,0.734694,,...,,,,,,,,,,
940,,,,-1.457944,,,0.542056,1.542056,-0.457944,,...,,,,,,,,,,
941,0.954545,,,,,,-0.045455,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [6]:
# User-user similarity: put users on the columns, then correlate every pair of
# columns. Pairs without enough co-rated items come out as NaN.
similarity_matrix = ratings_matrix.transpose().corr(method="pearson")
similarity_matrix

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000e+00,0.269680,5.000000e-01,,0.147833,0.298108,0.234765,0.870864,1.000000,-0.080917,...,0.008439,-3.194444e-01,0.203564,-2.354571e-01,0.551592,-4.875694e-16,0.210060,0.679366,-0.193489,0.243038
2,2.696799e-01,1.000000,,,,0.484802,0.763590,,,,...,0.400000,-5.163978e-01,0.293674,6.180416e-01,0.170254,1.740777e-01,-0.129460,0.000000,0.259437,0.627495
3,5.000000e-01,,1.000000e+00,-3.845925e-16,,-0.694365,,0.500000,,,...,,,-0.186551,-7.401487e-17,-0.166667,,-0.086207,1.000000,0.648886,
4,,,-3.845925e-16,1.000000e+00,,,-0.188982,1.000000,,,...,,,-0.200000,,0.258199,,0.836056,1.000000,0.327327,
5,1.478334e-01,,,,1.000000,0.025565,0.237864,0.270501,,-0.159901,...,0.293481,-5.222330e-01,-0.080582,,0.568360,8.944272e-01,0.039459,0.577350,0.233984,0.396352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,-4.875694e-16,0.174078,,,0.894427,0.617213,0.454794,,,,...,0.374351,-3.305898e-02,0.471172,-2.758386e-01,-0.073374,1.000000e+00,-0.534522,-0.131306,-0.500000,-0.187317
940,2.100603e-01,-0.129460,-8.620690e-02,8.360556e-01,0.039459,0.006962,0.016899,0.266557,,0.206725,...,-0.125059,4.352858e-01,-0.338327,-1.486075e-01,0.110022,-5.345225e-01,1.000000,0.632746,-0.022813,0.332497
941,6.793662e-01,0.000000,1.000000e+00,1.000000e+00,0.577350,0.912871,,0.182574,,,...,-0.500000,-2.355139e-16,0.273060,3.929526e-01,-0.214147,-1.313064e-01,0.632746,1.000000,-0.577350,-0.395285
942,-1.934892e-01,0.259437,6.488857e-01,3.273268e-01,0.233984,0.490971,0.299786,1.000000,1.000000,0.507093,...,0.438252,-8.703883e-01,-0.216119,4.472136e-01,0.244989,-5.000000e-01,-0.022813,-0.577350,1.000000,0.277433


In [7]:
def calculate_score(u, i, default_rating=2.5):
    """Predict the rating user ``u`` would give item ``i``.

    The estimate is the user's own mean training rating plus the
    similarity-weighted average of the mean-centred ratings that the other
    users gave to the item.  The function reads the module-level
    ``ratings_matrix``, ``normalized_ratings_matrix`` and
    ``similarity_matrix``.

    Parameters
    ----------
    u : label of the user in the index of ``ratings_matrix``.
    i : label of the item in the columns of ``ratings_matrix``.
    default_rating : float, optional
        Value returned when no neighbourhood estimate can be formed (unknown
        item, unknown user, or no rater of the item with a defined similarity
        to ``u``).  Defaults to 2.5.

    Returns
    -------
    float
        The predicted rating.  It is not clipped to the rating scale.
    """
    # Cold start: the item never appears in the training data.
    if i not in ratings_matrix.columns:
        return default_rating
    # Cold start: the user never appears in the training data. Without this
    # guard the lookups below would raise KeyError.
    if u not in ratings_matrix.index or u not in similarity_matrix.columns:
        return default_rating

    # Mean-centred ratings of the item from every *other* user who rated it.
    neighbour_ratings = normalized_ratings_matrix[i].drop(index=u).dropna()

    # Similarity between u and each of those raters. A NaN similarity means
    # the pair has no usable overlap, so such raters are ignored.
    neighbour_weights = similarity_matrix[u].reindex(neighbour_ratings.index).dropna()
    if neighbour_weights.empty:
        return default_rating

    # Only this user's row is needed; averaging the whole matrix on every
    # call is wasted work.
    avg_user_rating = ratings_matrix.loc[u].mean()

    total_weight = neighbour_weights.abs().sum()
    if total_weight == 0:
        # Every usable similarity is exactly zero: the neighbours carry no
        # information, so predict the user's own mean instead of 0/0 = NaN.
        return avg_user_rating

    total_score = (neighbour_ratings.loc[neighbour_weights.index] * neighbour_weights).sum()
    return avg_user_rating + total_score / total_weight

In [8]:
# Ground-truth ratings of the held-out split as a NumPy array.
test_ratings = test_df["rating"].to_numpy(copy=True)

# Score every (user, item) pair of the test split in file order.
user_item_pairs = zip(test_df["user_id"], test_df["item_id"])
pred_ratings = np.array([calculate_score(*pair) for pair in user_item_pairs])

# Root-mean-squared error of the collaborative-filtering predictions.
print(np.sqrt(mean_squared_error(y_true=test_ratings, y_pred=pred_ratings)))

0.9725765287253909


In [9]:
# Naive reference model: predict the global mean training rating for every
# test row, then report its RMSE for comparison with the model above.
baseline_rating = train_df["rating"].mean()
baseline_ratings = np.full(test_df.shape[0], baseline_rating)
print(np.sqrt(mean_squared_error(y_true=test_ratings, y_pred=baseline_ratings)))

1.1536759477860323


In [10]:
# Mean absolute error of the collaborative-filtering predictions on the test split.
mae = mean_absolute_error(y_true=test_ratings, y_pred=pred_ratings)
mae

0.7667239772743153