In [17]:
import numpy as np
from scipy.sparse import dok_matrix
from scipy.stats import pearsonr

In [9]:
data = {}
max_user, max_movie = 0, 0
with open('u1.base', 'r') as f:
    for l in f:
        user_id, movie_id, rating, timestamp = l.split()
        user_id, movie_id, rating = int(user_id), int(movie_id), int(rating)
        
        max_user = max(max_user, user_id)
        max_movie = max(max_movie, movie_id)
        data[(user_id, movie_id)] = rating

In [13]:
M = dok_matrix((max_user + 1, max_movie + 1), dtype=np.int32)
for (uid, mid), r in data.items():
    M[uid, mid] = r

In [53]:
# TODO: similarity has to be computed for those values that are present for both users
def similarity(matrix, u1, u2):
    cor, p_val = pearsonr(matrix.getrow(u1).A[0], matrix.getrow(u2).A[0])
    return cor

In [87]:
def most_similar(matrix, u, n):
    similarities = [similarity(matrix, u, other) for other in range(0, matrix.shape[0])]
    res = np.argsort(similarities)[-(n+1):][::-1]
    res = res[res != 0]
    return res

In [88]:
def predict_rating(matrix, u, mid):
    user_mean_rating = matrix[u].A[matrix[u].A.nonzero()].mean()
    top_similars = most_similar(matrix, u, 5)
    print('Most similars:', top_similars)
    
    res = 0.
    for other in top_similars:
        other_mean = matrix[other].A[matrix[other].A.nonzero()].mean()
        print('Other mean:', other_mean)
        res += similarity(matrix, u, other) * (matrix[other].A[0][mid] - other_mean)
    print('Res:', res)
    
    res /= sum(similarity(matrix, u, other) for other in top_similars)
    print('Res:', res)
    return res

In [89]:
predict_rating(M, 1, 5)

Most similars: [  1 823 514 521 864]
Other mean: 3.6814814814814816
Other mean: 4.005405405405406
Other mean: 3.8402061855670104
Other mean: 3.14
Other mean: 3.82312925170068
Res: -4.311895553335287
Res: -1.8648334222052085


-1.8648334222052085

## Pen and paper

In [76]:
A = np.array([
    [2, 4, 3, 3],
    [1, 5, 5, 3],
    [1, 3, 3, -1],
    [3, 2, 1, -1],
])

In [77]:
A

array([[ 2,  4,  3,  3],
       [ 1,  5,  5,  3],
       [ 1,  3,  3, -1],
       [ 3,  2,  1, -1]])

In [None]:
# sim(u1, u2) = 0.866
# sim(i1, i2) = 0.773
# ans = 0.85