In [8]:
import pandas as pd
import numpy as np

In [4]:
u_cols =  ['user_id', 'age', 'sex', 'occupation', 'zip_code']
pd.read_csv("../data/movielens_100K/u.user", sep="|", names=u_cols, encoding="latin-1")

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [75]:
#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('../data/movielens_100K/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('../data/movielens_100K/ua.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = np.array(ratings_base)
rate_test = np.array(ratings_test)

In [5]:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

items = pd.read_csv('../data/movielens_100K/u.item', sep='|', names=i_cols,
 encoding='latin-1')

n_items = items.shape[0]
print('Number of items:'), n_items

Number of items:


(None, 1682)

In [78]:
X0 = np.array(items)
X_train_counts = X0[:, -19:]
X_train_counts

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=object)

In [79]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=True, norm ='l2')
tfidf = transformer.fit_transform(X_train_counts.tolist()).toarray()

In [88]:
import numpy as np
def get_items_rated_by_user(rate_matrix, user_id):
    """
    in each line of rate_matrix, we have infor: user_id, item_id, rating (scores), time_stamp
    we care about the first three values
    return (item_ids, scores) rated by user user_id
    """
    y = rate_matrix[:,0] # all users
    # item indices rated by user_id
    # we need to +1 to user_id since in the rate_matrix, id starts from 1 
    # while index in python starts from 0
    ids = np.where(y == user_id +1)[0] 
    item_ids = rate_matrix[ids, 1] - 1 # index starts from 0 
    scores = rate_matrix[ids, 2]
    return (item_ids, scores)

In [84]:
get_items_rated_by_user(rate_train, 942)

(158,)


(array([   1,    8,   11,   21,   22,   23,   26,   27,   30,   37,   40,
          41,   49,   50,   52,   53,   54,   55,   61,   63,   66,   67,
          68,   71,   72,   75,   78,   79,   91,   93,   95,   96,   97,
          99,  116,  120,  121,  123,  126,  131,  138,  150,  160,  167,
         171,  172,  173,  180,  181,  183,  184,  186,  187,  192,  193,
         194,  195,  199,  200,  201,  203,  204,  209,  215,  216,  217,
         218,  225,  226,  227,  228,  229,  230,  232,  233,  236,  238,
         273,  280,  281,  283,  317,  366,  372,  384,  385,  390,  392,
         398,  400,  401,  402,  404,  405,  411,  414,  418,  420,  422,
         425,  426,  430,  442,  448,  449,  467,  469,  470,  474,  484,
         507,  525,  540,  545,  548,  558,  565,  567,  568,  575,  580,
         584,  594,  608,  613,  624,  654,  671,  684,  716,  719,  720,
         721,  723,  731,  738,  755,  762,  764,  784,  793,  795,  815,
         823,  824,  830,  839,  927, 

In [89]:
from sklearn.linear_model import Ridge
from sklearn import linear_model
n_users = 943
d = tfidf.shape[1] # data dimension
W = np.zeros((d, n_users))
b = np.zeros((1, n_users))

for n in range(n_users):    
    ids, scores = get_items_rated_by_user(rate_train, n)
    clf = Ridge(alpha=0.01, fit_intercept  = True)
    Xhat = tfidf[ids, :]
    clf.fit(Xhat, scores) 
    W[:, n] = clf.coef_
    b[0, n] = clf.intercept_

In [90]:
Yhat = tfidf.dot(W) + b

In [91]:
n = 10
np.set_printoptions(precision=2) # 2 digits after . 
ids, scores = get_items_rated_by_user(rate_test, n)
Yhat[n, ids]
print('Rated movies ids :', ids )
print('True ratings     :', scores)
print('Predicted ratings:', Yhat[ids, n])

Rated movies ids : [ 37 109 110 226 424 557 722 724 731 739]
True ratings     : [3 3 4 3 4 3 5 3 3 4]
Predicted ratings: [3.18 3.13 3.42 3.09 3.35 5.2  4.01 3.35 3.42 3.72]


In [95]:
def evaluate(Yhat, rates, W, b):
    se = 0
    cnt = 0
    for n in range(0,n_users):
        ids, scores_truth = get_items_rated_by_user(rates, n)
        scores_pred = Yhat[ids, n]
        e = scores_truth - scores_pred 
        se += (e*e).sum(axis = 0)
        cnt += e.size 
    return np.sqrt(se/cnt)

print( 'RMSE for training:', evaluate(Yhat, rate_train, W, b))
print( 'RMSE for test    :', evaluate(Yhat, rate_test, W, b))

RMSE for training: 0.908980456282672
RMSE for test    : 1.2703282700393035
