In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from scipy.linalg import svd

In [2]:
# Cosine similarity between two vectors
def cosine_similarity(v,u):
    """Return the cosine of the angle between vectors ``v`` and ``u``.

    Parameters
    ----------
    v, u : 1-D numeric arrays of equal length.

    Returns
    -------
    float
        ``(v @ u) / (||v|| * ||u||)``, or ``0.0`` when either vector has
        zero norm.
    """
    denominator = np.linalg.norm(v) * np.linalg.norm(u)
    # A zero-norm vector has no direction; dividing would produce NaN, and a
    # single NaN would contaminate any sum the caller accumulates.
    if denominator == 0:
        return 0.0
    return (v @ u) / denominator

# Number of leading singular values needed to retain a given share of the energy
def get_k(sigma,percentage):
    """Return the smallest ``k`` whose first ``k`` squared singular values
    sum to at least ``percentage`` of the total squared sum.

    Parameters
    ----------
    sigma : 1-D sequence of singular values, in the order they should be kept.
    percentage : float in ``[0, 1]``
        Fraction of ``sum(sigma**2)`` that must be retained.

    Returns
    -------
    int
        A count between 1 and ``len(sigma)``; ``0`` when ``sigma`` is empty.

    Raises
    ------
    ValueError
        If ``percentage`` is outside ``[0, 1]`` (the threshold could never be
        met, or the request is meaningless).
    """
    if not 0 <= percentage <= 1:
        raise ValueError("percentage must be between 0 and 1, got %r" % (percentage,))

    # Running total of squared singular values; the last entry is the full sum.
    energy = np.cumsum(np.asarray(sigma, dtype=float) ** 2)
    if energy.size == 0:
        return 0

    threshold = energy[-1] * percentage
    # side="left" gives the first position whose running total is >= threshold;
    # +1 converts that 0-based position into a count.
    k = int(np.searchsorted(energy, threshold, side="left")) + 1
    return min(k, int(energy.size))

In [3]:
def svdEst(testdata,user,simMeas,item,percentage=0.9):
    """Estimate ``user``'s rating of ``item`` from SVD-smoothed item similarities.

    The rating matrix is factorised, truncated to its leading singular
    values and multiplied back into a low-rank approximation with the same
    shape as ``testdata``. Each *column* of that approximation is the vector
    describing one item. The estimate is the similarity-weighted average of
    the ratings ``user`` has already given to other items.

    Parameters
    ----------
    testdata : 2-D array indexed as ``testdata[user, item]``; a value of 0 is
        treated as "not rated".
    user : int
        Row index of the user.
    simMeas : callable
        ``simMeas(a, b)`` returning a similarity between two 1-D vectors.
    item : int
        Column index of the item whose rating is estimated.
    percentage : float, optional
        Accepted for interface compatibility but currently unused: the rank
        is fixed below rather than derived from a retained-energy target.

    Returns
    -------
    ``0`` when the accumulated similarity is zero (including when the user
    has rated no other item); otherwise the weighted average rounded to
    three decimals.

    Notes
    -----
    The SVD is recomputed on every call, so calling this once per candidate
    item repeats the factorisation each time.
    """
    n=testdata.shape[1]
    sim_total=0.0
    rat_sim_total=0.0

    # Thin SVD: only the first min(rows, cols) singular vectors can be used
    # below, and it keeps u, sigma and vt conformable for any k.
    u,sigma,vt=svd(testdata,full_matrices=False)

    # Need to find the best k by exploring; never ask for more factors than exist.
    k=min(10,sigma.shape[0])

    # Construct the diagonal matrix
    sigma_k=np.diag(sigma[:k])

    # Rank-k approximation of the rating matrix, shape (users, items).
    formed_items=np.around(np.dot(np.dot(u[:,:k], sigma_k),vt[:k, :]),decimals=3)
    for j in range(n):
        user_rating=testdata[user,j]
        if user_rating==0 or j==item:
            continue
        # Items live on the column axis, so compare columns (indexing rows
        # here would compare users and can run out of bounds).
        similarity=simMeas(formed_items[:,item],formed_items[:,j])
        sim_total+=similarity
        # product of similarity and the rating of user to item j, then sum
        rat_sim_total+=similarity*user_rating
    if sim_total==0:
        return 0
    else:
        return np.round(rat_sim_total/sim_total,decimals=3)

def recommend(testdata,user,sim_meas,est_method, percentage=0.9):
    """Rank the items ``user`` has not rated by their estimated score.

    Parameters
    ----------
    testdata : 2-D array indexed as ``testdata[user, item]``; entries equal
        to 0 mark unrated items.
    user : int
        Row index of the user to recommend for.
    sim_meas : similarity callable, forwarded untouched to ``est_method``.
    est_method : callable
        Invoked as ``est_method(testdata, user, sim_meas, item, percentage)``
        for every unrated item.
    percentage : float, optional
        Forwarded untouched to ``est_method``.

    Returns
    -------
    list of ``(item, estimated_score)`` tuples sorted by score, highest
    first; ``None`` (after printing a message) when the user has no unrated
    items.
    """
    user_row = testdata[user, :]
    candidates = np.nonzero(user_row == 0)[0].tolist()
    if not candidates:
        print('everything is rated')
        return None

    scored = [
        (candidate, est_method(testdata, user, sim_meas, candidate, percentage))
        for candidate in candidates
    ]
    # Stable descending sort on the score component.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored

In [None]:
# Rough wall-clock timing of a single recommend() call for user 0: the
# current time is printed immediately before and after the call.
# NOTE(review): no cell above this one assigns `reviewSparse`, so this cell
# relies on kernel state from a cell that appears later in the notebook.
# The return value of recommend() is discarded here.
from datetime import datetime

print(datetime.now().strftime("%H:%M:%S"))
recommend(reviewSparse,0,sim_meas=cosine_similarity,est_method=svdEst, percentage=0.9)
print(datetime.now().strftime("%H:%M:%S"))


In [6]:
# Path to the tab-separated ratings file
PATH = './data/anime/anime_ratings.dat'

# Load the ratings and report (rows, columns)
df = pd.read_csv(PATH, sep='\t')
print(df.shape)

# Build a User_ID x Anime_ID table of Feedback values; user/anime pairs with
# no feedback become 0 (pivot_table aggregates duplicate pairs with the mean
# by default).
reviewmatrix = df.pivot_table(index="User_ID", columns="Anime_ID", values="Feedback").fillna(0)

# Despite its name, reviewSparse is a dense NumPy array here: the CSR
# conversion below is commented out.
reviewSparse = reviewmatrix.values
# reviewSparse = csr_matrix(reviewSparse)

(419943, 3)


In [103]:
# Full SVD of the rating matrix; k is the number of leading singular values
# the following cell keeps for the low-rank reconstruction.
# NOTE(review): the earlier comment called this a "csr matrix", but
# reviewSparse is the dense array taken from reviewmatrix.values when the
# anime_ratings.dat loading cell was the last one run -- confirm which
# loading cell preceded this.
k=20
u,sigma,vt=svd(reviewSparse)

In [106]:
# Show the shapes of the three SVD factors.
print(u.shape, sigma.shape, vt.shape)
# Keep the first k singular values as a diagonal matrix ...
sigma_k=np.diag(sigma[:k])
# ... and multiply the truncated factors back together: a rank-k
# approximation with the same shape as reviewSparse, rounded to 3 decimals.
formed_items=np.around(np.dot(np.dot(u[:,:k], sigma_k),vt[:k, :]),decimals=3)
# Bare last expression so the notebook displays the array.
formed_items

(4714, 4714) (4714,) (7157, 7157)


array([[ 4.500e-02,  1.988e+00,  1.380e-01, ...,  5.000e-03,  4.000e-03,
        -1.000e-03],
       [ 1.425e+00,  1.419e+00, -4.690e-01, ...,  1.000e-03, -3.000e-03,
        -6.000e-03],
       [ 1.039e+00,  7.210e-01,  2.500e-02, ...,  1.000e-03,  1.000e-03,
         0.000e+00],
       ...,
       [ 1.360e-01,  1.571e+00,  2.810e-01, ...,  3.000e-03,  3.000e-03,
         1.000e-03],
       [ 5.700e-02,  5.900e-01, -2.800e-01, ...,  8.000e-03,  5.000e-03,
        -1.000e-03],
       [ 1.400e-02,  3.500e-02, -3.800e-02, ..., -0.000e+00,  1.000e-03,
        -1.000e-03]])

In [108]:
# Display the rating matrix for comparison with the reconstruction
# (NumPy abbreviates large arrays in the output).
reviewSparse

array([[8., 0., 5., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Rough wall-clock timing of recommend() for user 0: the current time is
# printed immediately before and after the call, and the ranked result is
# kept in recommendation_0.
from datetime import datetime

print(datetime.now().strftime("%H:%M:%S"))
recommendation_0 = recommend(reviewSparse,0,sim_meas=cosine_similarity,est_method=svdEst, percentage=0.9)
print(datetime.now().strftime("%H:%M:%S"))

In [None]:
# Path to the second ratings file (comma-separated).
# NOTE(review): this cell rebinds PATH, df, reviewmatrix and reviewSparse,
# replacing the values produced by the anime_ratings.dat loading cell.
PATH = './data/rating.csv'

# Load the ratings and report (rows, columns)
df = pd.read_csv(PATH)
print(df.shape)

#-1 rating means the viewer has watched the anime but did not assign a rating, so in our case it is equivalent to NaN
# (the replacement is applied to every column of df, not only "rating")
reviews = df.replace(-1,np.nan)

# Build a user_id x anime_id table of ratings; missing pairs become 0.
reviewmatrix = reviews.pivot_table(index="user_id", columns="anime_id", values="rating").fillna(0)

# Take the dense values, then convert them to a SciPy CSR sparse matrix.
# NOTE(review): svdEst passes its input to scipy.linalg.svd and indexes it
# like a dense array -- confirm it is not called with this CSR matrix.
reviewSparse = reviewmatrix.values
reviewSparse = csr_matrix(reviewSparse)