# Recommender Systems

## 1. Basics of Recommendation Algorithm

In [1]:
from scipy.spatial.distance import cosine
import sklearn.metrics as metrics
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation, cosine
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error

In [2]:
M = np.asarray([[4,3,2,3], 
                [1,2,3,1],
                [np.nan,2,1,np.nan],
                [4,3,np.nan,np.nan],
#                [1,1,1,1],
               ])
print(M.shape)
print(np.nanmean(M, axis=0))
pd.DataFrame(M)

NameError: name 'np' is not defined

### Compute similarities

#### Cosine

In [None]:
import math
def cosine_similarity(v1,v2, metric='cosine'):
    if metric == 'correlation':
        v1 = v1 - np.nanmean(v1)
        v2 = v2 - np.nanmean(v2)
    "compute similarity of v1 to v2: (v1 dot v2)/{||v1||*||v2||)"
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i]; y = v2[i]
        if np.isnan(x) or np.isnan(y): continue
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    return sumxy/math.sqrt(sumxx*sumyy)

def sim_matrix(M, dimension='user', metric='cosine'):
    N = M.shape[0] if dimension == 'user' else M.shape[1]
    sim = np.zeros([N,N])
    for i in range(N):
        for j in range(N):
            if i == j:
                sim[i,j] = 0 #Cancel out the effect of self-similarity in the sums later
                continue
            if dimension == 'user':
                v1, v2 = M[i,:], M[j,:]
            else:
                v1, v2 = M[:,i], M[:,j]
            sim[i][j] = cosine_similarity(v1,v2,metric)
    return sim

In [None]:
cosine_similarity(M[0,:], M[2,:], 'cosine')

In [None]:
sim_matrix(M, 'user')

In [None]:
sim_matrix(M, 'item')

#### Pearson

In [None]:
cosine_similarity(M[0,:], M[2,:], 'correlation')

In [None]:
sim_matrix(M, 'user', 'correlation')

In [None]:
sim_matrix(M, 'item', 'correlation')

### a) Compute the missing rating in this table using user-based collaborative filtering (CF). (Use cosine similarity, then use Pearson similarity). Assume taking all neighbors

In [None]:
# Some tests
n_users, n_items = M.shape
avg_ratings = np.nanmean(M, axis=1)
sim_users = sim_matrix(M, 'user', 'cosine')
print(sim_users[2])
print(M[:,0])
print(avg_ratings)
print(M[:,0] - avg_ratings)
print(sim_users[2] * (M[:,0] - avg_ratings))

In [None]:
def user_cf(M, metric='cosine'):
    pred = np.copy(M)
    n_users, n_items = M.shape
    avg_ratings = np.nanmean(M, axis=1)
    sim_users = sim_matrix(M, 'user', metric)
    for i in range(n_users):
        for j in range(n_items):
            if np.isnan(M[i,j]):
                pred[i,j] = avg_ratings[i] + np.nansum(sim_users[i] * (M[:,j] - avg_ratings)) / sum(sim_users[i])
    return pred

In [None]:
print("User-based CF (Cosine): \n" + str(pd.DataFrame(user_cf(M, 'cosine'))))
print("User-based CF (Pearson): \n" + str(pd.DataFrame(user_cf(M, 'correlation'))))

### b) Similarly, computing the missing rating using item-based CF.

In [None]:
def item_cf(M, metric='cosine'):
    pred = np.copy(M)
    n_users, n_items = M.shape
    avg_ratings = np.nanmean(M, axis=0)
    sim_items = sim_matrix(M, 'item', metric)
    for i in range(n_users):
        for j in range(n_items):
            if np.isnan(M[i,j]):
                pred[i,j] = avg_ratings[j] + np.nansum(sim_items[j] * (M[i,:] - avg_ratings)) / sum(sim_items[j])
    return pred

In [None]:
print("Item-based CF (Cosine): \n" + str(pd.DataFrame(item_cf(M, 'cosine'))))
print("Item-based CF (Pearson): \n" + str(pd.DataFrame(item_cf(M, 'correlation'))))

In [None]:
# M_test1 = np.asarray([[3,0,3,3], 
#                 [5,4,0,2],
#                 [1,2,4,2],
#                 [3,np.nan,1,0],
#                [2,2,0,1],
#                ])
# print("User-based CF (Cosine): \n" + str(pd.DataFrame(user_cf(M_test1, 'cosine'))))
# print("Item-based CF (Cosine): \n" + str(pd.DataFrame(item_cf(M_test1, 'cosine'))))

# M_test2 = np.asarray([[4,3,np.nan,3], 
#                 [1,2,3,1],
#                 [np.nan,2,1,np.nan],
#                 [4,3,2,4],
#                ])
# print("User-based CF (Pearson): \n" + str(pd.DataFrame(user_cf(M_test2, 'correlation'))))
# print("Item-based CF (Pearson): \n" + str(pd.DataFrame(item_cf(M_test2, 'correlation'))))

## 2. Evaluating Recommendation Algorithms

### Predictive Accuracy

In [None]:
M_result = np.asarray([[4,3,2,3], 
                [1,2,3,1],
                [1,2,1,2],
                [4,3,2,4]])
pd.DataFrame(M_result)

In [None]:
def evaluateRS(ratings, groundtruth, method='user_cf', metric='cosine'):
    if method == 'user_cf':
        prediction = user_cf(ratings, metric)
    else:
        prediction = item_cf(ratings, metric)
    MSE = mean_squared_error(prediction, groundtruth)
    RMSE = round(sqrt(MSE),3)
    print("RMSE using {0} approach ({2}) is: {1}".format(method, RMSE, metric))
    print(pd.DataFrame(prediction))
    return

In [None]:
evaluateRS(M, M_result, 'user_cf', 'cosine')
evaluateRS(M, M_result, 'user_cf', 'correlation')
evaluateRS(M, M_result, 'item_cf', 'cosine')
evaluateRS(M, M_result, 'item_cf', 'correlation')

### Ranking Accuracy

In [None]:
import scipy.stats as stats

def evaluate_rank(ratings, groundtruth, method='user_cf', metric='cosine'):
    if method == 'user_cf':
        prediction = user_cf(ratings, metric)
    else:
        prediction = item_cf(ratings, metric)
    
    avg_tau = 0
    for i in range(n_users):
        tau, p_value = stats.kendalltau(M_result[i,:], prediction[i,:])
        avg_tau += tau
    avg_tau = avg_tau / n_users
    clear_output(wait=True)
    return avg_tau

results = []
for method in ['user_cf', 'item_cf']:
    for metric in ['cosine', 'correlation']:
        rank_acc = evaluate_rank(M, M_result, method, metric)
        results += ["Rank accuracy of {0} with {1} metric: {2}".format(method[1], metric, rank_acc)]
print("\n".join(results))