In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import Ridge
from sklearn import linear_model

In [2]:
u_cols =  ['user_id', 'sex', 'age', 'occupation', 'zip_code']
users = pd.read_csv('ml-1m/users.dat', sep='::', names=u_cols,
 encoding='latin-1')
n_users = users.shape[0]

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('mvl_can/1M_train_03.dat', sep=':', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('mvl_can/1M_test_03.dat', sep=':', names=r_cols, encoding='latin-1')

rate_train = ratings_base.as_matrix()
rate_test = ratings_test.as_matrix()

i_cols = ['movie id', 'title' ,'year', 'gen']
n_items = 3951

items = pd.read_csv('ml-1m/movies.dat', sep='::', names=i_cols, encoding='latin-1')
X = items.as_matrix()
X_train_count = np.full(shape = (n_items, 19), fill_value = 0)

  This is separate from the ipykernel package so we can avoid doing imports until


In [3]:
genresList = [
  "Action",
  "Adventure",
  "Animation",
  "Children",
  "Comedy",
  "Crime",
  "Documentary",
  "Drama",
  "Fantasy",
  "Film-Noir",
  "Horror",
  "Musical",
  "Mystery",
  "Romance",
  "Sci-Fi",
  "Thriller",
  "War",
  "Western",
  "(no genres listed)"
]

def setGenresMatrix(genres):
    movieGenresMatrix = []
    movieGenresList = genres.split('|')
    for x in genresList:
        if (x in movieGenresList):
            movieGenresMatrix.append(1)
        else:
            movieGenresMatrix.append(0)
    return movieGenresMatrix
for i in range(n_items):
#     iid = (np.where(X[:, 0] == i + 1)[0]).astype(int)

#     if (len(iid) > 0) or :
    X_train_count[i] = setGenresMatrix(X[i+1, 3])
X_train_count[0]

array([0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [4]:
transformer = TfidfTransformer(smooth_idf=True, norm ='l2')
tfidf = transformer.fit_transform(X_train_count.tolist()).toarray()


In [5]:
def get_items_rated_by_user(rate, user_id):
    y = rate[:,0]
    ids = np.where(y == user_id + 1)[0]
    item_ids = rate[ids, 1] - 1
    scores = rate[ids, 2]
    return (item_ids, scores)

In [6]:
def evaluate(Yhat, rates, W, b):
    se = 0
    cnt = 0
    for n in range(n_users):
        ids, scores_truth = get_items_rated_by_user(rates, n)
        scores_pred = Yhat[ids, n]
        e = scores_truth - scores_pred 
        se += (e*e).sum(axis = 0)
        cnt += e.size 
    return np.sqrt(se/cnt)

In [7]:
d = tfidf.shape[1] # data dimension
W = np.zeros((d, n_users))
b = np.zeros((1, n_users))

In [8]:
for n in range(n_users):    
    ids, scores = get_items_rated_by_user(rate_train, n)
    clf = Ridge(alpha= 98, fit_intercept  = True)
    Xhat = tfidf[ids, :]

    clf.fit(Xhat, scores)
    W[:, n] = clf.coef_
    b[0, n] = clf.intercept_
Yhat = tfidf.dot(W) + b
print(str(evaluate(Yhat, rate_test, W, b)))

1.0371836405845278


In [9]:
def recommend(Yhat, rate_train, user_id, limit):
    a = np.zeros((n_items,))
    recommended_items = []
    items_rated_by_user, score = get_items_rated_by_user(rate_train, user_id)
    for i in range(n_items):
        if i not in items_rated_by_user:
            a[i] = Yhat[i, user_id]
    if len(a) < limit:
        recommended_items = np.argsort(a)[-len(a):]
    else:
        recommended_items = np.argsort(a)[-limit:]
    return recommended_items

def evaluatePR(Yhat, Data_test, limit, data_size):
    sum_p = 0
    Pu = np.zeros((n_users,))
    for u in range(n_users):
        recommended_items = recommend(Yhat, rate_train, u, limit)
        ids = np.where(Data_test[:, 0] == u)[0]
        rated_items = Data_test[ids, 1]
        for i in recommended_items:
            if i in rated_items:
                Pu[u] += 1
        sum_p += Pu[u]
    p = sum_p/(n_users * limit)
    r = sum_p/(Data_test.shape[0] + 1)
    print('%s::%d::cosine_similarity::%r::%r\r\n' % (str(data_size), limit, p, r))
# for i in range(120, 501, 10):
#     evaluatePR(Yhat, rate_test, i, '1M')