In [269]:
import pandas as pd
import numpy as np
import math

In [270]:
def convert_matrix(data,user,item,rating):
    """Pivot a long-format ratings table into a 2-D NumPy array.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per (user, item) rating.
    user, item, rating : str
        Column names used for the row index, the column index and the
        cell values of the pivot, respectively.

    Returns
    -------
    numpy.ndarray
        Array with one row per distinct ``user`` value and one column per
        distinct ``item`` value; combinations absent from ``data`` are NaN.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.pivot`` when ``data`` holds duplicate
        (user, item) pairs.
    """
    pivoted = data.pivot(index=user, columns=item, values=rating)
    # DataFrame.as_matrix() was removed in pandas 1.0; `.values` is available
    # in every pandas release and yields the same ndarray.
    return pivoted.values

In [271]:
def get_user_mean(matrix):
    """Return the mean of every row of ``matrix``, skipping NaN entries.

    The result is a 1-D array with one value per row.
    """
    row_means = np.nanmean(matrix, axis=1)
    return row_means

In [272]:
def centralize_matrix(matrix):
    """Subtract each row's NaN-aware mean from every entry of that row.

    NaN entries stay NaN; the input array is not modified.
    """
    # Turn the per-row means into a column so they broadcast across columns.
    row_means_as_column = np.expand_dims(get_user_mean(matrix), axis=1)
    return matrix - row_means_as_column

In [273]:
def cosine(matrix,i,j):
    """Cosine similarity between two item columns over their co-rated rows.

    Parameters
    ----------
    matrix : numpy.ndarray
        2-D ratings array (rows are raters, columns are items); NaN marks a
        missing rating.
    i, j : int
        1-based column positions of the two items to compare.

    Returns
    -------
    float
        ``1.0`` when ``i == j``; otherwise the cosine of the two columns
        restricted to rows where both are non-NaN. NaN is returned when the
        denominator is zero (no co-rated rows, or one restricted column is
        all zeros).
    """
    if i == j:
        return 1.0
    # Callers pass 1-based item positions; shift to 0-based column indices.
    pair = matrix[:, [i - 1, j - 1]]
    # Keep only the rows in which both items have a rating.
    co_rated = pair[~np.isnan(pair).any(axis=1)]
    ratings_item_i = co_rated[:, 0]
    ratings_item_j = co_rated[:, 1]
    num = np.dot(ratings_item_i, ratings_item_j)
    den = (math.sqrt(np.dot(ratings_item_i, ratings_item_i))
           * math.sqrt(np.dot(ratings_item_j, ratings_item_j)))
    if den == 0:
        # 0/0: the similarity is undefined. Return NaN explicitly instead of
        # relying on a division that emits a RuntimeWarning.
        return float("nan")
    return num / den

In [274]:
def cosine_matrix(matrix):
    """Vectorised all-pairs cosine similarity between the columns of ``matrix``.

    For every pair of columns only the rows where both entries are non-NaN
    contribute to the numerator and to both norms. Pairs whose denominator is
    zero come out as NaN/inf, exactly as NumPy's division produces them.
    """
    observed = ~np.isnan(matrix)
    # Missing ratings contribute nothing to any dot product once set to 0.
    filled = np.where(observed, matrix, 0)
    # 1 where a rating exists, 0 where it is missing (same dtype as the data).
    indicator = observed.astype(filled.dtype)

    numerator = filled.T.dot(filled)
    # partial_sq[a, b] = sum of squares of column a over rows where b is rated.
    partial_sq = np.square(filled).T.dot(indicator)
    denominator = np.sqrt(partial_sq.T * partial_sq)
    return np.divide(numerator, denominator)

In [275]:
def item_similarity(matrix):
    """Build the symmetric item-item similarity matrix one pair at a time.

    Calls ``cosine`` for every pair of columns on or above the diagonal and
    mirrors each value into the lower triangle.
    """
    n_items = matrix.shape[1]
    similarities = np.zeros((n_items, n_items))
    for row in range(n_items):
        for col in range(row, n_items):
            # cosine() expects 1-based item positions.
            score = cosine(matrix, row + 1, col + 1)
            similarities[row, col] = score
            similarities[col, row] = score
    return similarities

In [310]:
def get_nearest_neighbors(matrix, k):
    """Return, for every row, the column indices of its ``k`` largest entries.

    Parameters
    ----------
    matrix : array-like
        2-D similarity matrix. When it is square, row ``r`` is taken to
        describe item ``r`` and index ``r`` is never returned as its own
        neighbour.
    k : int
        Number of neighbours to keep per row.

    Returns
    -------
    numpy.ndarray
        Integer array of 0-based column indices, most similar first, with at
        most ``k`` columns. NaN similarities sort after every real value.
    """
    scores = np.asarray(matrix, dtype=float)
    # Stable sort keeps tied similarities in ascending index order, which
    # makes the neighbour list deterministic.
    order = np.argsort(-scores, axis=1, kind="stable")
    n_rows, n_cols = order.shape
    if n_rows != n_cols or n_rows == 0:
        # No diagonal to exclude: keep the historical "skip the top hit" slice.
        return order[:, 1:1 + k]
    # Drop each row's own index wherever it landed. The previous code assumed
    # it was always in column 0, which breaks on ties at the top or on a NaN
    # self-similarity.
    not_self = order != np.arange(n_rows)[:, np.newaxis]
    neighbours = order[not_self].reshape(n_rows, n_cols - 1)
    return neighbours[:, :k]

In [315]:
def predict(ratings,similarity,ranking,user,item):
    """Predict a rating as the similarity-weighted mean of neighbour ratings.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D ratings array indexed ``[user, item]``; NaN marks a missing rating.
    similarity : numpy.ndarray
        2-D item-item similarity array.
    ranking : numpy.ndarray
        2-D integer array; row ``r`` holds the 0-based indices of the
        neighbours of item ``r``.
    user, item : int
        1-based positions of the target user and item.

    Returns
    -------
    float
        ``sum(rating * sim) / sum(|sim|)`` over the neighbours for which both
        the user's rating and the similarity are available. NaN when no such
        neighbour exists or the weights sum to zero.
    """
    neighbours = ranking[item - 1]
    neighbour_ratings = ratings[user - 1, neighbours]
    neighbour_sims = similarity[item - 1, neighbours]
    # A single missing rating (or undefined similarity) would otherwise turn
    # the whole prediction into NaN, so use only the neighbours with values.
    usable = ~(np.isnan(neighbour_ratings) | np.isnan(neighbour_sims))
    weights = neighbour_sims[usable]
    norm = np.sum(np.absolute(weights))
    if norm == 0:
        return np.nan
    return np.dot(neighbour_ratings[usable], weights) / norm

In [316]:
# Hand-made fixture: 5 users x 6 items of raw ratings (NaN = not rated).
rank = np.array([
    [7, 6, 7, 4, 5, 4],
    [6, 7, np.nan, 4, 3, 4],
    [np.nan, 3, 3, 1, 1, np.nan],
    [1, 2, 2, 3, 3, 4],
    [1, np.nan, 1, 2, 3, 3],
])
# The same ratings with each row's mean subtracted.
test_matrix = np.array([
    [1.5, 0.5, 1.5, -1.5, -0.5, -1.5],
    [1.2, 2.2, np.nan, -0.8, -1.8, -0.8],
    [np.nan, 1, 1, -1, -1, np.nan],
    [-1.5, -0.5, -0.5, 0.5, 0.5, 1.5],
    [-1, np.nan, -1, 0, 1, 1],
])
# Item-item similarities of the fixture; show item 2 versus items 3 and 4.
t = cosine_matrix(test_matrix)
t[1, [2, 3]]

array([ 0.87287156, -0.73391041])

In [317]:

# Two most similar items for every item of the hand-made fixture.
neighbour = get_nearest_neighbors(t,2)

In [319]:
# Predict user 3's rating for item 6 (both 1-based) from that item's neighbours.
predict(rank,t,neighbour,3,6)

1.0

In [276]:
# Directory (relative path, trailing slash required by the later `path+filename`)
# and file name of the ratings CSV.
path = '../ml-latest-small/'
filename = 'ratings.csv'

In [277]:
# Load the ratings CSV into a DataFrame.
data = pd.read_csv(path+filename)

In [278]:
# Preview the first three rows to confirm the column layout.
data.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [279]:
# Column names of `data` that are handed to convert_matrix as the
# user, item and rating fields.
user_column_name = 'userId'
item_column_name = 'movieId'
rating_column_name = 'rating'

In [280]:
# Pivot the long ratings table into a 2-D array: rows from userId, columns from movieId.
matrix = convert_matrix(data, user_column_name, item_column_name, rating_column_name)

  


In [281]:
# NaN-aware mean of every row of the ratings matrix.
user_mean = get_user_mean(matrix)

In [282]:
# Subtract each row's mean from its entries before computing similarities.
centered_matrix = centralize_matrix(matrix)

In [283]:
# Pairwise similarity of the 1st and 3rd item columns (cosine takes 1-based positions).
cosine(centered_matrix,1,3)

0.18118220238070776

In [284]:
# Full item-item similarity matrix from the vectorised implementation.
output = cosine_matrix(centered_matrix)

  


In [285]:
# Pairwise similarity of the 1st and 2nd item columns, for comparison with output[0,1].
cosine(centered_matrix,1,2)

0.13213674206059256

In [286]:
# Same pair taken from the vectorised matrix (0-based indices here).
output[0,1]

0.13213674206059262

In [311]:
# Two most similar items for every item in the full similarity matrix.
nearest_items = get_nearest_neighbors(output,2)