# Collaborative Filtering Recommendation Systems

## Authors:
* Igor Menezes Chaves Moura - 374184
* Moésio Júnior de Meneses Filho - 374199


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors
from IPython.display import Image

### First of all, let's build a simple user-rating matrix to use as data

In [2]:
# Rating are in range(1-10) and 0 means no rating

M = np.asarray([[3,7,4,9,9,7], 
                [7,0,5,3,8,8],
                [7,5,5,0,8,4],
                [5,6,8,5,9,8],
                [5,8,8,8,10,9],
                [7,7,0,4,7,8]])
M = pd.DataFrame(M) 

In [3]:
M

Unnamed: 0,0,1,2,3,4,5
0,3,7,4,9,9,7
1,7,0,5,3,8,8
2,7,5,5,0,8,4
3,5,6,8,5,9,8
4,5,8,8,8,10,9
5,7,7,0,4,7,8


### Creating some global variables

In [4]:
global k, metric

k=4 # k similar users
metric = 'cosine' #similarity metric, Pearson similarity can be used too

## User-Based Recommendation Systems

### Now we need calculate the cosine similarity

In [5]:
cosine_sim = 1-pairwise_distances(M, metric=metric)

In [6]:
pd.DataFrame(cosine_sim)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.799268,0.779227,0.934622,0.97389,0.8846
1,0.799268,1.0,0.874744,0.90585,0.866146,0.827036
2,0.779227,0.874744,1.0,0.909513,0.865454,0.853275
3,0.934622,0.90585,0.909513,1.0,0.989344,0.865614
4,0.97389,0.866146,0.865454,0.989344,1.0,0.88164
5,0.8846,0.827036,0.853275,0.865614,0.88164,1.0


### This function finds k similar users given the user_id and the user-ratings matrix (kNN)

In [7]:
def find_k_similar_users(user_id,ratings,metric = metric, k = k):
    
    similarities=[]
    indices=[]
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute') 
    model_knn.fit(ratings)

    distances, indices = model_knn.kneighbors(ratings.iloc[user_id-1, :].values.reshape(1, -1), n_neighbors = k+1)
    
    similarities = 1-distances.flatten() #similarity is the inverse of distance
    
    print('{0} most similar users for User {1}:\n'.format(k,user_id))
    
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i]+1 == user_id:
            continue

        else:
            print('{0}: User {1}, with similarity of {2}'.format(i, indices.flatten()[i]+1, similarities.flatten()[i]))
            
    
    return similarities,indices

In [8]:
similarities,indices = find_k_similar_users(1,M, metric='cosine')

4 most similar users for User 1:

1: User 5, with similarity of 0.9738899354018393
2: User 4, with similarity of 0.934621684178377
3: User 6, with similarity of 0.8846004572297814
4: User 2, with similarity of 0.7992679780524187


### To use the user-based approach , we're going the use the following formula

In [9]:
Image(url="https://cdn-images-1.medium.com/max/800/1*MdEImGMBgGY_5xltOJJAQA.png")

#### Where p(a,i) is the prediction for target or active user a for item i, w(a,u) is the similarity between users a and u, and K is the neighborhood of most similar users.

#### This formula compute predictions as the weighted avarage of deviations of neighbor's mean and adding it to the active user's mean rating. Deviations are used to ajust for the user associated biases. User biases occur as certain users may tend to always give high or low ratings to all items.

In [10]:
def predict_userbased(user_id,item_id, ratings, metric = metric, k = k):
     
    prediction = 0
    
    similarities,indices = find_k_similar_users(user_id,ratings,metric,k)
    
    
    mean_rating = ratings.loc[user_id-1].mean()
    
    sum_wt = np.sum(similarities) - 1 # -1 to desconsider the similarity between the user i and user i which is 1
    
    product = 1
    
    weighted_sum = 0
    
    
    iflatten = indices.flatten()
    
    for i in range(0,len(iflatten)):
        
        if((i+1) == user_id):
            continue
        
        else:
            
            ratings_diff = ratings.iloc[iflatten[i],item_id - 1] - np.mean(ratings.iloc[iflatten[i]])
            
            product = ratings_diff * (similarities[i])
            
            weighted_sum += product
            
        prediction = int(round(mean_rating + (weighted_sum/sum_wt))) 
        
        
    print('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))
    
    return prediction

In [11]:
predict_userbased(3,4,M)

4 most similar users for User 3:

1: User 4, with similarity of 0.9095126893401909
2: User 2, with similarity of 0.8747444148494656
3: User 5, with similarity of 0.8654538781497916
4: User 6, with similarity of 0.853274963343837

Predicted rating for user 3 -> item 4: 3


3

## Item-Based Recommendation Systems

### This function finds k similar items given the item_id and user-ratings matrix(kNN for items) 

In [14]:
def find_k_similar_items(item_id,ratings,metric=metric,k=k):
    
    similarities = []
    
    indices = []
    
    ratings = ratings.T #Now the rows are the items
    
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute')
    model_knn.fit(ratings)
    
    distances,indices = model_knn.kneighbors(ratings.iloc[item_id-1].values.reshape(1, -1), n_neighbors = k+1)
    
    similarities = 1-distances.flatten()
    
    print('{0} most similar items for item {1}:\n'.format(k,item_id))
    
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i]+1 == item_id:
            continue;

        else:
            print('{0}: Item {1} :, with similarity of {2}'.format(i,indices.flatten()[i]+1, similarities.flatten()[i]))


    return similarities,indices
    

In [15]:
similarities,indices=find_k_similar_items(3,M)

4 most similar items for item 3:

1: Item 5 :, with similarity of 0.9183361255345219
2: Item 6 :, with similarity of 0.8747597730381951
3: Item 1 :, with similarity of 0.8103647462221737
4: Item 4 :, with similarity of 0.7969178003023933


### To use the item-based approach, we're now going to use the following formula

In [16]:
Image(url='https://cdn-images-1.medium.com/max/800/1*4LhLv-MRP29aHESuaWwMAA.png')

#### Where K is the neighborhood of most similar items rated by active user a, and w(i,j) is the similarity between items i and j.

#### In this formula above we use the simple weighted average

In [18]:
def predict_itembased(user_id,item_id,ratings,metric = metric,k = k):
    
    prediction = 0
    
    weighted_sum = 0
    
    similarities,indices = find_k_similar_items(item_id,ratings)
    
    sum_wt = np.sum(similarities) - 1
    
    product = 1
    
    iflatten = indices.flatten()
    
    for i in range(0,len(iflatten)):
        
        if(iflatten[i] + 1 == user_id):
            
            continue
            
        else:
            
            product = ratings.iloc[user_id-1,iflatten[i]] * (similarities[i])
            
            weighted_sum += product
            
        
    
    prediction = int(round(weighted_sum/sum_wt)) 
            
    print('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))      
        
        
    
    return prediction

In [19]:
prediction = predict_itembased(1,3,M)

4 most similar items for item 3:

1: Item 5 :, with similarity of 0.9183361255345219
2: Item 6 :, with similarity of 0.8747597730381951
3: Item 1 :, with similarity of 0.8103647462221737
4: Item 4 :, with similarity of 0.7969178003023933

Predicted rating for user 1 -> item 3: 8


## Adjusted Cosine Similarity

#### Using cosine similarity metric for item-based CF approach does not consider difference in ratings of users (bias). Adjusted cosine similarity offsets this drawback by subtracting respective user’s average rating from each co-rated pair.

#### Adjusted Cosine Similarity is defined as below

In [20]:
Image(url="https://cdn-images-1.medium.com/max/800/1*P-JTYdDjXGwpgnUMCu2YKw.png")

In [21]:
def adjusted_cosine_similarity(M):
    
    sim_matrix = np.zeros((M.shape[1], M.shape[1]))
    
    M_u = M.mean(axis=1) #means
          
    for i in range(M.shape[1]):
        for j in range(M.shape[1]):
            
            if i == j:
                
                sim_matrix[i][j] = 1
            
            else:                
                
                if i<j:
                    
                    sum_num = sum_den1 = sum_den2 = 0
                    for k,row in M.loc[:,[i,j]].iterrows(): 

                        if ((M.loc[k,i] != 0) & (M.loc[k,j] != 0)):
                            num = (M[i][k]-M_u[k])*(M[j][k]-M_u[k])
                            den1= (M[i][k]-M_u[k])**2
                            den2= (M[j][k]-M_u[k])**2
                            
                            sum_num = sum_num + num
                            sum_den1 = sum_den1 + den1
                            sum_den2 = sum_den2 + den2
                        
                        else:
                            continue                          
                                       
                    den=(sum_den1**0.5)*(sum_den2**0.5)
                    
                    if den!=0:
                        sim_matrix[i][j] = sum_num/den
                    
                    else:
                        sim_matrix[i][j] = 0


                else:
                    sim_matrix[i][j] = sim_matrix[j][i]           
            
    return pd.DataFrame(sim_matrix)

In [22]:
def find_k_similar_items_adjcos(item_id,ratings,k = k):
    
    sim_matrix = adjusted_cosine_similarity(ratings)
    
    similarities = sim_matrix[item_id-1].sort_values(ascending=False)[:k+1].values
    
    indices = sim_matrix[item_id-1].sort_values(ascending=False)[:k+1].index
    
    print('{0} most similar items for item {1}:\n'.format(k,item_id))
    for i in range(0, len(indices)):
            if (indices[i]+1 == item_id):
                
                continue

            else:
                
                print ('{0}: Item {1} :, with similarity of {2}'.format(i,indices[i]+1, similarities[i]))
        
    return similarities ,indices

In [24]:
similarities, indices = find_k_similar_items_adjcos(3,M)

4 most similar items for item 3:

1: Item 1 :, with similarity of 0.42126273187113467
2: Item 6 :, with similarity of -0.05363989048891138
3: Item 5 :, with similarity of -0.2305213582693748
4: Item 4 :, with similarity of -0.7679410465751941


In [31]:
def predict_itembased_adjcos(user_id,item_id,ratings,metric = metric, k = k):
    
    prediction = 0
    
    similarities, indices = find_k_similar_items_adjcos(item_id,ratings,k)
    
    sum_wt = np.sum(similarities)-1
    
    product = 1
    
    weighted_sum = 0
    
    for i in range(0,len(indices)):
        
        if(indices[i]+1 == item_id):
            
            continue
        
        else:
            
            product = ratings.iloc[user_id-1,indices[i]] * (similarities[i])
            
            weighted_sum += product
            
            
    prediction = int(round(weighted_sum/sum_wt))
    
    if prediction < 0:
        prediction = 1
    
    elif prediction >10:
        prediction = 10
    
    print('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))
    
    return prediction

In [32]:
prediction = predict_itembased_adjcos(3,4,M)

4 most similar items for item 4:

1: Item 2 :, with similarity of 0.08574143411490752
2: Item 5 :, with similarity of -0.2990588277904165
3: Item 1 :, with similarity of -0.5190852688949424
4: Item 6 :, with similarity of -0.6445502869540708

Predicted rating for user 3 -> item 4: 6


#### Now our item-based approach deals with users biases, that means it deals with users that usually has high or low ratings