In [1]:
# import necessary libraries
import util
import numpy as np
import sqlite3
import pandas as pd
import ast

from IPython.display import Image
from IPython.display import display

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

# Notebook-wide plot styling: default figure size plus font sizes for
# axis labels/titles, tick labels, legends and general text.
for rc_group, rc_options in (
    ("figure", {"figsize": (8, 6)}),
    ("axes", {"labelsize": 16, "titlesize": 16}),
    ("xtick", {"labelsize": 14}),
    ("ytick", {"labelsize": 14}),
    ("legend", {"fontsize": 14}),
    ("font", {"size": 14}),
):
    matplotlib.rc(rc_group, **rc_options)

In [2]:
# Load the rating matrix as a dense array, together with the movie-name and
# user-name arrays (the shapes printed below show rows = users, columns = movies).
M = util.load_sparse_csr("imdb_data/rating.npz").toarray()
movie_names = np.load("imdb_data/movies.npy")
user_names = np.load("imdb_data/users.npy")

# Single-argument print(...) emits the same text on Python 2 and also parses on Python 3.
print('%s %s %s' % (M.shape, movie_names.shape, user_names.shape))

(18060, 1603) (1603,) (18060,)


In [3]:
# Peek at the user-name array; print(...) with one argument behaves the same on Python 2 and 3.
print(user_names)

[u' (borkoboardo)' u'surfs_up1976' u'HumanoidOfFlesh' ...,
 u'Kaya Ozkaracalar' u'LJ27' u'KHayes666']


In [4]:
# Entries greater than zero are treated as observed ratings.
mask = M > 0
ratings_per_movie = mask.sum(axis=0)  # one count per column
ratings_per_user = mask.sum(axis=1)   # one count per row
print('%s %s %s' % (mask.shape, ratings_per_movie.shape, ratings_per_user.shape))
# How many movies / users have more than 5 observed ratings.
print(np.sum(ratings_per_movie > 5))
print(np.sum(ratings_per_user > 5))

(18060, 1603) (1603,) (18060,)
504
358


In [5]:
# Filter the rating matrix and the matching name arrays.
# NOTE(review): (5,5) is presumably the minimum rating count per user / per movie -- confirm in util.rating_filter.
new_rating_m, new_user_names, new_movie_names = util.rating_filter(
    M, user_names, movie_names, (5,5))

print('%s %s %s' % (new_rating_m.shape, new_user_names.shape, new_movie_names.shape))

(18060,) (1603,)
Original #rating 25658
Remaining #user 358 #movie 504
(358, 504)
Remaining #rating 4629
Sparsity for the rating matrix : 2.57%
(358, 504) (358,) (504,)


In [6]:
# Split the filtered ratings into train / validation / test matrices.
# NOTE(review): if util.split_dataset samples randomly, seed it so the counts below are reproducible -- confirm.
train, valid, test, user_mask = util.split_dataset(new_rating_m)
# Per split: shape, number of positive entries, number of rows whose ratings sum to zero.
for split in (train, valid, test):
    print('%s %s %s' % (split.shape, np.sum(split > 0), np.sum(np.sum(split, axis=1) == 0)))

Number of user delete 5
(353, 504) 2669 0
(353, 504) 1040 0
(353, 504) 911 0


# Content-Based

Turn each movie into a feature vector built from its plot descriptions (token counts followed by TF-IDF weighting).

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Database file handed to util.load_movie_plot via movie_to_vec
# (presumably a SQLite file, given the sqlite3 import -- confirm).
DB_NAME = 'imdb_data/imdb_final.db'

def movie_to_vec(dbname, movie_names):
    """Build token-count and TF-IDF matrices from the plots of the given movies.

    Input:
        dbname : database name handed to util.load_movie_plot
        movie_names : sequence of movie names; one output row per entry, in order
    Output:
        all_plots (list) : per movie, its plot texts concatenated, each followed by a space
        plot_counts : token counts produced by a CountVectorizer fitted on all_plots
        plot_tfidf : TF-IDF re-weighting of plot_counts
    """
    # assumes the loader returns a mapping: movie name -> iterable of plot strings -- TODO confirm
    plots_by_movie = util.load_movie_plot(dbname, movie_names)

    # One document per movie: every plot text with a trailing space, glued together.
    all_plots = [
        ''.join(text + ' ' for text in plots_by_movie[name])
        for name in movie_names
    ]

    # Tokenize into raw counts, then re-weight the counts with TF-IDF.
    vectorizer = CountVectorizer()
    plot_counts = vectorizer.fit_transform(all_plots)
    plot_tfidf = TfidfTransformer().fit_transform(plot_counts)

    return all_plots, plot_counts, plot_tfidf


# Vectorize the plots of the movies that survived filtering.
all_plots, plot_counts, plot_tfidf = movie_to_vec(DB_NAME, new_movie_names)
# Single-argument print(...) emits the same text on Python 2 and also parses on Python 3.
print('%s %s %s' % (len(all_plots), plot_counts.shape, plot_tfidf.shape))

504 (504, 9499) (504, 9499)


Content-based model: essentially a separate linear regression per user over the movie feature vectors, with an L2 penalty on the weights.

In [24]:
class ContentBased(object):
    """Per-user linear model over movie feature vectors, fitted by gradient descent.

    Each user has one weight row in ``theta``; the predicted rating of a movie
    is the dot product of that row with the movie's feature vector. Only
    entries of the rating matrix that are greater than zero contribute to the
    loss and the gradient.

    Attributes:
        movie_vec : dense (num_movie, num_feature) array of movie features
        theta : (num_user, num_feature) weight matrix, initialised to zeros
    """

    def __init__(self, num_user, movie_vec):
        """
        Input:
            num_user (int) : number of users (rows of theta)
            movie_vec : (num_movie, num_feature) features; either an object
                with a ``toarray()`` method (e.g. a scipy sparse matrix) or
                anything ``np.asarray`` can turn into a 2-D array
        """
        if hasattr(movie_vec, 'toarray'):
            movie_vec = movie_vec.toarray()
        self.movie_vec = np.asarray(movie_vec, dtype=float)
        self.theta = np.zeros((num_user, self.movie_vec.shape[1]))

    def _masked_residual(self, rating):
        """Return (prediction - rating), zeroed wherever rating is not > 0."""
        return (self.pred() - rating) * (rating > 0)

    def cal_loss(self, rating, lam=0.1):
        """Half the squared error on observed ratings plus the L2 penalty.

        Input:
            rating : (num_user, num_movie) matrix; entries > 0 are observed
            lam (float) : L2 regularisation strength
        Output:
            loss (float) : 0.5 * sum(residual^2) + lam / 2 * sum(theta^2)
        """
        residual = self._masked_residual(rating)
        data_term = np.sum(np.square(residual)) / 2.0
        penalty = lam / 2.0 * np.sum(np.square(self.theta))
        return data_term + penalty

    def cal_grad(self, rating, lam=0.1):
        """Gradient of the data term of the loss with respect to theta.

        The L2 term is NOT included here; ``train`` adds ``lam * theta``
        itself. ``lam`` is accepted but unused so existing callers keep working.

        Input:
            rating : (num_user, num_movie) matrix; entries > 0 are observed
        Output:
            grad : array with the same shape as theta
        """
        # One matrix product replaces the per-user loop, which materialised a
        # (num_feature, num_movie) temporary for every single user.
        return self._masked_residual(rating).dot(self.movie_vec)

    def pred(self):
        """Return the (num_user, num_movie) matrix of predicted ratings."""
        return self.movie_vec.dot(self.theta.T).T

    def train(self, rating, max_ite=100, learning_rate=0.2, lam=0.1, verbose=True):
        """Run ``max_ite`` steps of batch gradient descent on theta.

        Input:
            rating : (num_user, num_movie) matrix; entries > 0 are observed
            max_ite (int) : number of gradient steps
            learning_rate (float) : step size
            lam (float) : L2 regularisation strength
            verbose (bool) : print the loss before every step (default True)
        """
        for i in range(max_ite):
            if verbose:
                loss = self.cal_loss(rating, lam=lam)
                # Single-argument print(...) is valid on Python 2 and Python 3.
                print('Loss at iteration %d %f' % (i, loss))
            grad = self.cal_grad(rating, lam)
            # Data gradient plus the derivative of the L2 penalty.
            self.theta = self.theta - learning_rate * (grad + lam * self.theta)

# One weight row per training user; fit with 20 gradient-descent steps.
cb = ContentBased(num_user=train.shape[0], movie_vec=plot_tfidf)
cb.train(rating=train, max_ite=20)

Loss at iteration 0 63022.000000
Loss at iteration 1 26706.397676
Loss at iteration 2 15965.656951
Loss at iteration 3 11503.460561
Loss at iteration 4 9414.371119
Loss at iteration 5 8363.269184
Loss at iteration 6 7807.130677
Loss at iteration 7 7501.454908
Loss at iteration 8 7328.275278
Loss at iteration 9 7227.689778
Loss at iteration 10 7168.037830
Loss at iteration 11 7132.029843
Loss at iteration 12 7109.961976
Loss at iteration 13 7096.259142
Loss at iteration 14 7087.653074
Loss at iteration 15 7082.193940
Loss at iteration 16 7078.700537
Loss at iteration 17 7076.447616
Loss at iteration 18 7074.984595
Loss at iteration 19 7074.028598


In [29]:
# compute validation metric
def compute_mse(prediction, real):
    """
    Mean squared error restricted to the rated entries.

    Input:
        prediction (matrix) : predicted user ratings
        real (matrix) : actual user ratings; zero marks an empty rating
    Output:
        mse (double) : mean of the squared differences over non-zero entries of real
    """
    squared_error = np.square(real - prediction)
    # keep only the positions that actually carry a rating
    rated = real != 0
    return np.mean(squared_error[rated])


# Score the fitted model on the training and validation ratings.
predicted = cb.pred()  # computed once; the same prediction matrix serves both splits
mse_train = compute_mse(predicted, train)
mse_valid = compute_mse(predicted, valid)

# Single-argument print(...) emits the same text on Python 2 and also parses on Python 3.
print('MSE at train %f valid %f' % (mse_train, mse_valid))


MSE at train 3.082829 valid 22.690267


References
* https://www.coursera.org/learn/machine-learning/lecture/uG59z/content-based-recommendations