In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

### User-based collaborative filter

In [2]:
class CF(object):
    def __init__(self, R, k):
        self.ori_data = R.copy()
        self.data = R.astype(np.float64)
        self.n, self.m = self.data.shape
        self.kneighbors = k
    
    def __normalize(self):
        self.col_means = np.nanmean(self.data, axis=0)
        nan_indices = np.isnan(self.data)
        self.data -= self.col_means
        self.data[nan_indices] = 0

    def __similarity(self):
        self.S = cosine_similarity(self.data.T)

    def fit(self):
        self.__normalize()
        self.__similarity()
    
    def predict(self, u, i):
        similiar_users = np.argsort(self.S[u])[::-1][1:self.kneighbors+1]
        users_with_rating = [v for v in similiar_users if not np.isnan(self.ori_data[i, v])]
        
        weighted_sum = np.sum([self.S[u, v] * self.data[i, v] for v in users_with_rating])
        sum_of_weights = np.sum([np.abs(self.S[u, v]) for v in users_with_rating]) + 1e8
        
        return weighted_sum / sum_of_weights + self.col_means[u]

    def recommend(self, u):
        recommended_items = []
        for i in range(self.n):
            if np.isnan(self.ori_data[i, u]):
                rating = self.predict(u, i)
                if rating > 0:
                    recommended_items.append(i)
        return recommended_items


### Item-based collaborative filter

In [10]:
class CF(object):
    def __init__(self, R, k):
        self.ori_data = R.copy()
        self.data = R.copy()
        self.n, self.m = self.data.shape
        self.kneighbors = k
    
    def __normalize(self):
        self.row_means = np.nanmean(self.data, axis=1)
        nan_indices = np.isnan(self.data)
        self.data -= self.row_means[:, np.newaxis]
        self.data[nan_indices] = 0

    def __similarity(self):
        self.S = cosine_similarity(self.data)

    def fit(self):
        self.__normalize()
        self.__similarity()
    
    def predict(self, u, i):
        similiar_items = np.argsort(self.S[i])[::-1][1:self.kneighbors+1]
        rated_items = [v for v in similiar_items if not np.isnan(self.ori_data[v, u])]
        
        weighted_sum = np.sum([self.S[i, v] * self.data[v, u] for v in rated_items])
        sum_of_weights = np.sum([np.abs(self.S[i, v]) for v in rated_items]) + 1e-8
        
        return weighted_sum / sum_of_weights + self.row_means[i]

    def recommend(self, u):
        recommended_items = []
        for i in range(self.n):
            if np.isnan(self.ori_data[i, u]):
                rating = self.predict(u, i)
                if rating > 0:
                    recommended_items.append(i)
        return recommended_items

### Load the MovieLens data

In [None]:
# Download MovieLens data.
print("Downloading movielens data...")
from urllib.request import urlretrieve
import zipfile

urlretrieve("http://files.grouplens.org/datasets/movielens/ml-100k.zip", "movielens.zip")
zip_ref = zipfile.ZipFile('movielens.zip', "r")
zip_ref.extractall()
print("Done. Dataset contains:")
print(zip_ref.read('ml-100k/u.info'))

In [8]:
# Load data
columns = ['User_id', 'Movie_id', 'Rating', 'timestamp']

test_data = pd.read_csv('ml-100k/u.data', sep='\t', names=columns, encoding='latin-1')

# Convert train and test data into dense tables
testing_data = test_data.pivot(index='Movie_id', columns='User_id', values='Rating')

print('\n testing data:')
print(testing_data)


 testing data:
User_id   1    2    3    4    5    6    7    8    9    10   ...  934  935  \
Movie_id                                                    ...             
1         5.0  4.0  NaN  NaN  4.0  4.0  NaN  NaN  NaN  4.0  ...  2.0  3.0   
2         3.0  NaN  NaN  NaN  3.0  NaN  NaN  NaN  NaN  NaN  ...  4.0  NaN   
3         4.0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
4         3.0  NaN  NaN  NaN  NaN  NaN  5.0  NaN  NaN  4.0  ...  5.0  NaN   
5         3.0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
...       ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
1678      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
1679      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
1680      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
1681      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
1682      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  

### Output with user-user CF

In [9]:
rs = CF(testing_data.values, k = 40)
rs.fit()

n_tests = test_data.shape[0]
SE = 0
for n in test_data.values:
    pred = rs.predict(n[0]-1, n[1]-1)
    SE += (pred - n[2])**2

RMSE = np.sqrt(SE / n_tests)
print('User-user CF, RMSE =', RMSE)

User-user CF, RMSE = 1.0308117837325852


### Output with item-item CF

In [11]:
rs = CF(testing_data.values, k = 40)
rs.fit()

n_tests = test_data.shape[0]
SE = 0
for n in test_data.values:
    pred = rs.predict(n[0]-1, n[1]-1)
    SE += (pred - n[2])**2

RMSE = np.sqrt(SE / n_tests)
print('Item-item CF, RMSE =', RMSE)

Item-item CF, RMSE = 0.761879860653961
