In [1]:
#Content-based recommendation System
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfTransformer
from model import cbcf

In [2]:
path_user = 'data_movies/u.user'
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(path_user, sep = '|', names = u_cols)
users.head(5)

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [3]:
print('Number of users:', users.shape[0])

Number of users: 943


In [4]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_base = pd.read_csv('data_movies/ua.base', sep = '\t', names = r_cols)
ratings_test = pd.read_csv('data_movies/ua.test', sep = '\t', names = r_cols)
rate_train = ratings_base.values
rate_test = ratings_test.values

In [5]:
print('Number of rate base:', rate_train.shape[0])
print('Number of rate test:', rate_test.shape[0])

Number of rate base: 90570
Number of rate test: 9430


In [6]:
i_cols = ['movie_id', 'movie_title', 'release_date', 'video release date', 'IMBd url', 
         'inknow', 'action', 'adventure', 'animation', 'children\'s', 'comedy', 
         'crime', 'documentary', 'drama', 'fantasy','film-noir', 'horror', 'musical',
         'mystery', 'sci-fi', 'thriller', 'war', 'western']
items = pd.read_csv('data_movies/u.item', sep = '|', names = i_cols, encoding = 'latin1')

In [7]:
n_items = items.shape[0]
print('Number of items:', n_items)

Number of items: 1682


In [8]:
X_train_counts = items.values[:, -19:]

In [9]:
transformer = TfidfTransformer(smooth_idf = True, norm = 'l2')
X = transformer.fit_transform(X_train_counts.tolist()).toarray()
X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.53676706, 0.65097024, ..., 0.53676706, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], shape=(1682, 19))

In [10]:
from sklearn.linear_model import Ridge
from sklearn import linear_model

def get_items_rated_by_user(rate_matrix, user_id):
    '''
    item indices rated by user_id 
    need to 1 to user_id since in the rate_matrix, id start from 1
    (in python, id start from 0)
    return (item_ids, scores)
    '''
    y = rate_matrix[:, 0]
    ids = np.where(y == user_id + 1)[0]
    item_ids = rate_matrix[ids, 1] - 1
    scores = rate_matrix[ids, 2]
    return item_ids, scores

def find_weights_and_bias(X, n_users, rate_train, n):
    d = X.shape[0]
    W = np.zeros((d, n_users))
    b = np.zeros(n_users)
    for n in range(n_users):
        ids, scores = get_items_rated_by_user(rate_train, n)
        model = Ridge(alpha = 0.01, fit_intercept = True)
        Xhat = X[ids, :]
        model.fit(Xhat, scores)
        W[:, n] = np.pad(model.coef_, (0, W.shape[0] - model.coef_.shape[0]), mode='constant')
        b[n] = model.intercept_
    return W, b
    

In [22]:
n = 10
W, b = find_weights_and_bias(X, users.shape[0], rate_train, n)
Yhat = X.T.dot(W) + b
np.set_printoptions(precision = 2)
ids, scores = get_items_rated_by_user(rate_test, n)
print('Rated movies ids :', ids )
print('True ratings:', scores)
print('Predicted ratings:', Yhat[n, ids])

Rated movies ids : [ 37 109 110 226 424 557 722 724 731 739]
True ratings: [3 3 4 3 4 3 5 3 3 4]
Predicted ratings: [3.19 2.82 4.37 1.52 2.61 3.57 4.19 5.69 3.85 3.64]


In [23]:
def evaluate(Yhat, rates, W, b, n_users, n):
    se = cnt = 0
    for n in range(n_users):
        ids, score_truth = get_items_rated_by_user(rates, n)
        scores_pred = Yhat[n, ids]
        e = score_truth - scores_pred
        se += (e*e).sum(axis = 0)
        cnt += e.size
        return np.sqrt(se/cnt)

In [24]:
print('RMSE for training: %.2f' %evaluate(Yhat, rate_train, W, b, users.shape[0], n))
print('RMSE for test: %.2f' %evaluate(Yhat, rate_test, W, b, users.shape[0], n))

RMSE for training: 2.06
RMSE for test: 1.60
