In [76]:
import pandas as pd
import numpy as np
import math

In [25]:
def convert_matrix(data,user,item,rating):
    """Pivot a long-format ratings table into a dense 2-D NumPy array.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format table with one row per (user, item) rating.
    user, item, rating : str
        Column names used as the pivot index, the pivot columns and the
        cell values respectively.

    Returns
    -------
    numpy.ndarray
        Array with one row per distinct ``user`` value and one column per
        distinct ``item`` value; combinations absent from ``data`` are NaN.

    Notes
    -----
    ``DataFrame.pivot`` raises ``ValueError`` if a (user, item) pair occurs
    more than once in ``data``.
    """
    pivoted = data.pivot(index=user, columns=item, values=rating)
    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``to_numpy()`` is
    # the supported way to obtain the underlying ndarray.
    return pivoted.to_numpy()

In [29]:
def get_user_mean(matrix):
    """Return the NaN-ignoring mean of every row of ``matrix``.

    The result is a 1-D array with one entry per row; NaN cells are left
    out of each row's average.
    """
    row_means = np.nanmean(matrix, axis=1)
    return row_means

In [44]:
def centralize_matrix(matrix):
    """Subtract each row's NaN-aware mean from that row.

    Row means come from ``get_user_mean``; NaN cells stay NaN because the
    subtraction propagates them.
    """
    # Insert a length-1 second axis so the means broadcast across columns.
    row_offsets = np.expand_dims(get_user_mean(matrix), axis=1)
    return matrix - row_offsets

In [267]:
def cosine(matrix,i,j):
    """Cosine similarity between two columns of ``matrix`` over shared rows.

    Parameters
    ----------
    matrix : numpy.ndarray
        2-D array in which NaN marks a missing entry.
    i, j : int
        1-based column positions of the two items to compare (column
        ``i - 1`` and column ``j - 1`` of ``matrix`` are read).

    Returns
    -------
    int or float
        ``1`` when ``i == j``. Otherwise the dot product of the two columns
        divided by the product of their Euclidean norms, using only the rows
        where both columns are non-NaN. NaN when that denominator is zero
        (for example when no row has both entries present).
    """
    if i == j:
        return 1
    pair = matrix[:, [i - 1, j - 1]]
    # Keep only the rows in which both items have a value.
    co_rated = pair[~np.isnan(pair).any(axis=1)]
    ratings_item_i = co_rated[:, 0]
    ratings_item_j = co_rated[:, 1]
    num = np.dot(ratings_item_i, ratings_item_j)
    den = (math.sqrt(np.dot(ratings_item_i, ratings_item_i))
           * math.sqrt(np.dot(ratings_item_j, ratings_item_j)))
    if den == 0:
        # Similarity is undefined here; return NaN explicitly rather than
        # relying on a 0/0 division.
        return np.nan
    return num / den

In [266]:
def cosine_matrix(matrix):
    """All-pairs column cosine similarity, restricted to shared rows.

    Parameters
    ----------
    matrix : numpy.ndarray
        2-D array in which NaN marks a missing entry.

    Returns
    -------
    numpy.ndarray
        Square array with one row and column per column of ``matrix``.
        Entry ``[a, b]`` is the dot product of columns ``a`` and ``b``
        divided by the product of their norms, where every sum runs only
        over rows in which both columns are non-NaN. Entries whose
        denominator is zero are NaN.
    """
    missing = np.isnan(matrix)
    filled = np.copy(matrix)
    filled[missing] = 0
    # 1 where a value is present, 0 where it is missing.
    rated = (~missing).astype(filled.dtype)

    # Missing cells are zero, so only rows present in both columns
    # contribute to each dot product.
    num = filled.T.dot(filled)

    # co_sq[a, b] = sum of column a squared over the rows where b is present
    # (rows where a is missing add zero).
    co_sq = np.square(filled).T.dot(rated)
    den = np.sqrt(co_sq.T * co_sq)

    # Divide only where the denominator is non-zero; other entries keep the
    # NaN fill, which avoids a 0/0 RuntimeWarning.
    output = np.full_like(den, np.nan)
    np.divide(num, den, out=output, where=den != 0)
    return output

In [185]:
def item_similarity(matrix):
    """Build the symmetric column-by-column similarity table with ``cosine``.

    Only the upper triangle (diagonal included) is computed; each value is
    then mirrored into the lower triangle.
    """
    n_items = matrix.shape[1]
    output = np.zeros((n_items, n_items))
    for row in range(n_items):
        for col in range(row, n_items):
            # ``cosine`` takes 1-based column positions.
            output[row, col] = cosine(matrix, row + 1, col + 1)
            output[col, row] = output[row, col]
    return output

In [4]:
# Directory and file name of the ratings data; the directory is given
# relative to the notebook's working directory.
path = '../ml-latest-small/'
filename = 'ratings.csv'

In [5]:
# Load the ratings CSV; the full path is built by plain string concatenation,
# so `path` must end with a separator.
data = pd.read_csv(path+filename)

In [6]:
# Preview the first three rows to check the column layout.
data.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [7]:
# Column names passed to convert_matrix as the pivot index, pivot columns
# and cell values.
user_column_name = 'userId'
item_column_name = 'movieId'
rating_column_name = 'rating'

In [26]:
# Pivot the ratings into a 2-D array: rows follow userId, columns follow movieId.
matrix = convert_matrix(data, user_column_name, item_column_name, rating_column_name)

  


In [31]:
# NaN-ignoring mean of each row of the ratings matrix.
user_mean = get_user_mean(matrix)

In [49]:
# Subtract each row's mean from that row; missing entries stay NaN.
centered_matrix = centralize_matrix(matrix)

array([-0.36637931,         nan, -0.36637931, ...,         nan,
               nan,         nan])

In [80]:
# Spot-check one pair. The arguments are 1-based column positions in the
# matrix, not movieId values.
cosine(centered_matrix,1,3)

0.18118220238070776

In [194]:
# Vectorized all-pairs similarity on the centered ratings; entries with a
# zero denominator come out as NaN.
output = cosine_matrix(centered_matrix)

  if sys.path[0] == '':


In [195]:
# First row of the similarity matrix.
# NOTE(review): the recorded output shows 0.7107 at [0, 0] and 0.0861 at
# [0, 1], while cosine(centered_matrix,1,2) is recorded as 0.4422. The
# execution counts (In [194]/[195] here vs. In [266] for cosine_matrix)
# suggest this output predates the current cosine_matrix. Re-run to confirm.
print(output[0,:])

[0.71075152 0.08613971 0.12821179 ...        nan        nan        nan]


In [196]:
# Pairwise value for columns 1 and 2, to compare with output[0, 1].
# NOTE(review): the recorded "1 2" line is not printed by the current
# cosine definition; presumably left over from an earlier debug version.
cosine(centered_matrix,1,2)

1 2


0.4421741403584015

In [242]:
# Small hand-written fixture (5 rows x 6 columns) with NaN marking missing
# entries, used to compare cosine() against cosine_matrix().
test_matrix = np.array([
    [1.5, 0.5, 1.5, -1.5, -0.5, -1.5],
    [1.2, 2.2, np.nan, -0.8, -1.8, -0.8],
    [np.nan, 1, 1, -1, -1, np.nan],
    [-1.5, -0.5, -0.5, 0.5, 0.5, 1.5],
    [-1, np.nan, -1, 0, 1, 1],
])

In [224]:
# Display the fixture to confirm where the NaN entries sit.
test_matrix

array([[ 1.5,  0.5,  1.5, -1.5, -0.5, -1.5],
       [ 1.2,  2.2,  nan, -0.8, -1.8, -0.8],
       [ nan,  1. ,  1. , -1. , -1. ,  nan],
       [-1.5, -0.5, -0.5,  0.5,  0.5,  1.5],
       [-1. ,  nan, -1. ,  0. ,  1. ,  1. ]])

In [212]:
# Pairwise similarity of columns 1 and 3 (1-based); should equal entry
# [0, 2] of cosine_matrix(test_matrix).
cosine(test_matrix,1,3)

1 3
[[ 1.5  1.5]
 [ 1.2  nan]
 [ nan  1. ]
 [-1.5 -0.5]
 [-1.  -1. ]]


0.9116846116771036

In [265]:
# Vectorized similarity on the fixture, for comparison with the pairwise
# cosine() values.
cosine_matrix(test_matrix)

[[ 6.94  4.14  4.   -3.96 -4.66 -6.46]
 [ 4.14  6.34  2.   -3.76 -5.46 -3.26]
 [ 4.    2.    4.5  -3.5  -3.   -4.  ]
 [-3.96 -3.76 -3.5   4.14  3.44  3.64]
 [-4.66 -5.46 -3.    3.44  5.74  3.94]
 [-6.46 -3.26 -4.    3.64  3.94  6.14]]
[[1. 1. 1. 1. 1. 1.]
 [1. 1. 0. 1. 1. 1.]
 [0. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1. 1.]
 [1. 0. 1. 1. 1. 1.]]
[[6.94 5.94 5.5  6.94 6.94 6.94]
 [5.34 6.34 1.5  6.34 6.34 5.34]
 [3.5  3.5  4.5  4.5  4.5  3.5 ]
 [3.14 4.14 3.5  4.14 4.14 3.14]
 [4.74 4.74 2.5  5.74 5.74 4.74]
 [6.14 5.14 5.5  6.14 6.14 6.14]]
[[48.1636 31.7196 19.25   21.7916 32.8956 42.6116]
 [31.7196 40.1956  5.25   26.2476 30.0516 27.4476]
 [19.25    5.25   20.25   15.75   11.25   19.25  ]
 [21.7916 26.2476 15.75   17.1396 23.7636 19.2796]
 [32.8956 30.0516 11.25   23.7636 32.9476 29.1036]
 [42.6116 27.4476 19.25   19.2796 29.1036 37.6996]]


array([[ 1.        ,  0.73508319,  0.91168461, -0.84830227, -0.8124881 ,
        -0.9896203 ],
       [ 0.73508319,  1.        ,  0.87287156, -0.73391041, -0.99599886,
        -0.62225073],
       [ 0.91168461,  0.87287156,  1.        , -0.8819171 , -0.89442719,
        -0.91168461],
       [-0.84830227, -0.73391041, -0.8819171 ,  1.        ,  0.70567109,
         0.82899588],
       [-0.8124881 , -0.99599886, -0.89442719,  0.70567109,  1.        ,
         0.73033626],
       [-0.9896203 , -0.62225073, -0.91168461,  0.82899588,  0.73033626,
         1.        ]])

In [244]:
# Inspect the first row of the fixture.
test_matrix[0,:]

array([ 1.5,  0.5,  1.5, -1.5, -0.5, -1.5])

In [255]:
# Pairwise similarity of columns 4 and 6 (1-based); should equal entry
# [3, 5] of cosine_matrix(test_matrix).
cosine(test_matrix,4,6)

[[-1.5 -1.5]
 [-0.8 -0.8]
 [-1.   nan]
 [ 0.5  1.5]
 [ 0.   1. ]]
[-1.5 -0.8  0.5  0. ]
[-1.5 -0.8  1.5  1. ]

0.9060559154195247


0.8289958835741487