In [2]:
from __future__ import print_function
import numpy as np
import pandas as pd

# Reading user file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols)
n_users = users.shape[0]
print('Number of users:', n_users)

# Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols)
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols)

rate_train = ratings_base.values
rate_test = ratings_test.values

print('Number of training rates:', rate_train.shape[0])
print('Number of test rates:', rate_test.shape[0])


Number of users: 943
Number of training rates: 90570
Number of test rates: 9430


In [3]:
# Reading items file:
i_cols = ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Specify the encoding
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='ISO-8859-1')
n_items = items.shape[0]
print('Number of items:', n_items)


Number of items: 1682


In [4]:
print(rate_train[:4, :])

[[        1         1         5 874965758]
 [        1         2         3 876893171]
 [        1         3         4 878542960]
 [        1         4         3 876893119]]


In [7]:
X0 = items.values
X_train_counts = X0[:, -19:]

# Importing TfidfTransformer from sklearn
from sklearn.feature_extraction.text import TfidfTransformer

# Creating an instance of TfidfTransformer
transformer = TfidfTransformer(smooth_idf=True, norm='l2')

# Transforming the data
X = transformer.fit_transform(X_train_counts.tolist()).toarray()


In [8]:
def get_items_rated_by_user(rate_matrix, user_id):
    """
    return (item_ids, scores)
    """
    y = rate_matrix[:,0] # all users
    # item indices rated by user_id
    # we need to +1 to user_id since in the rate_matrix, id starts from 1 
    # but id in python starts from 0
    ids = np.where(y == user_id +1)[0] 
    item_ids = rate_matrix[ids, 1] - 1 # index starts from 0 
    scores = rate_matrix[ids, 2]
    return (item_ids, scores)

In [9]:
from sklearn.linear_model import Ridge
from sklearn import linear_model

d = X.shape[1] # data dimension
W = np.zeros((d, n_users))
b = np.zeros(n_users)

for n in range(n_users):    
    ids, scores = get_items_rated_by_user(rate_train, n)
    model = Ridge(alpha=0.01, fit_intercept  = True)
    Xhat = X[ids, :]
    model.fit(Xhat, scores) 
    W[:, n] = model.coef_
    b[n] = model.intercept_

In [10]:
# predicted scores
Yhat = X.dot(W) + b

In [11]:
n = 100
np.set_printoptions(precision=2) # 2 digits after . 
ids, scores = get_items_rated_by_user(rate_test, 10)
Yhat[n, ids]
print('Rated movies ids :', ids )
print('True ratings     :', scores)
print('Predicted ratings:', Yhat[ids, n])

Rated movies ids : [ 37 109 110 226 424 557 722 724 731 739]
True ratings     : [3 3 4 3 4 3 5 3 3 4]
Predicted ratings: [2.65 3.9  3.21 3.28 2.11 2.05 2.41 2.11 3.21 3.34]


In [13]:
def evaluate(Yhat, rates, W, b):
    se = cnt = 0
    for n in range(n_users):
        ids, scores_truth = get_items_rated_by_user(rates, n)
        scores_pred = Yhat[ids, n]
        e = scores_truth - scores_pred
        se += (e * e).sum(axis=0)
        cnt += e.size
    return np.sqrt(se / cnt)

print('RMSE for training: %.2f' % evaluate(Yhat, rate_train, W, b))
print('RMSE for test    : %.2f' % evaluate(Yhat, rate_test, W, b))

RMSE for training: 0.91
RMSE for test    : 1.27
