In [1]:
# import necessary libraries
import util
import numpy as np
import sqlite3
import pandas as pd
import ast

from IPython.display import Image
from IPython.display import display

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

matplotlib.rc("figure", figsize=(8,6))
matplotlib.rc("axes", labelsize=16, titlesize=16)
matplotlib.rc("xtick", labelsize=14)
matplotlib.rc("ytick", labelsize=14)
matplotlib.rc("legend", fontsize=14)
matplotlib.rc("font", size=14)

In [2]:
# load the rating matrix
M = util.load_sparse_csr("imdb_data/rating.npz").toarray()
movie_names = np.load("imdb_data/movies.npy")
user_names = np.load("imdb_data/users.npy")

print M.shape, movie_names.shape, user_names.shape

(18060, 1603) (1603,) (18060,)


In [3]:
print user_names

[u' (borkoboardo)' u'surfs_up1976' u'HumanoidOfFlesh' ...,
 u'Kaya Ozkaracalar' u'LJ27' u'KHayes666']


In [4]:
mask = M > 0
print mask.shape, mask.sum(axis=0).shape, mask.sum(axis=1).shape
print np.sum(mask.sum(axis=0) > 5)
print np.sum(mask.sum(axis=1) > 5)

(18060, 1603) (1603,) (18060,)
504
358


In [5]:
new_rating_m, new_user_names, new_movie_names = util.rating_filter(M, user_names, movie_names, (5,5))

print new_rating_m.shape, new_user_names.shape, new_movie_names.shape

(18060,) (1603,)
Original #rating 25658
Remaining #user 358 #movie 504
(358, 504)
Remaining #rating 4629
Sparsity for the rating matrix : 2.57%
(358, 504) (358,) (504,)


In [6]:
train, valid, test, user_mask = util.split_dataset(new_rating_m)
print train.shape, np.sum(train>0), np.sum(np.sum(train, axis=1) == 0)
print valid.shape, np.sum(valid>0), np.sum(np.sum(valid, axis=1) == 0)
print test.shape, np.sum(test>0), np.sum(np.sum(test, axis=1) == 0)

Number of user delete 5
(353, 504) 2669 0
(353, 504) 1040 0
(353, 504) 911 0


# Content-Based

Turn movie to feature vectors using their plot description

In [142]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

DB_NAME = 'imdb_data/imdb_final.db'

def movie_to_vec_plot(dbname, movie_names):
    plots = util.load_movie_plot(dbname, movie_names)
    all_plots = []
    for i in range(len(movie_names)):
        p = ''
        for plot in plots[movie_names[i]]:
            p += plot + ' '
        all_plots.append(p)
    # tokenizing
    count_vect = CountVectorizer()
    plot_counts = count_vect.fit_transform(all_plots)
    # tfidf
    tf_transformer = TfidfTransformer().fit(plot_counts)
    plot_tfidf = tf_transformer.transform(plot_counts)    
    
    return all_plots, plot_counts, plot_tfidf


all_plots, plot_counts, plot_tfidf = movie_to_vec_plot(DB_NAME, new_movie_names)
print len(all_plots), plot_counts.shape, plot_tfidf.shape

504 (504, 9499) (504, 9499)


Content based model, quite like linear regression with L2 norm

In [37]:
# compute validation metric
def compute_mse(prediction, real):
    """ 
    Input:
        prediction (matrix) : prediction of users' ratings
        real (matrix) : real user ratings
    Output:
        mse (double) : mean squared error
    """
    # rule out the empty rating
    return np.mean(((real - prediction)**2)[real.nonzero()])



In [153]:
class ContentBased:
    def __init__(self, num_user, movie_vec):
        if type(movie_vec) != np.ndarray:
            self.movie_vec = movie_vec.toarray()
        else:
            self.movie_vec = movie_vec
        self.theta = np.zeros((num_user, movie_vec.shape[1]))
        
    def cal_loss(self, rating, lam=0.1):
        pred = self.movie_vec.dot(self.theta.T).T
        mask = rating > 0
        loss = np.sum(np.square(pred - rating) * mask)      
        loss /= 2.0
        loss_l2 = loss + lam / 2.0 * np.sum(np.square(self.theta))
        
        return loss, loss_l2
    
    def cal_grad(self, rating, lam=0.1):
        grad = np.zeros(self.theta.shape)
        mask = rating > 0
        pred = self.movie_vec.dot(self.theta.T).T
        diff = pred - rating
        
        grad = (mask * diff).dot(self.movie_vec)
        
        return grad
    
    def pred(self):
        return self.movie_vec.dot(self.theta.T).T
        
    def train(self, rating, valid=None, max_ite=100, learning_rate=0.2, lam=0.1):
        for i in range(max_ite):
            loss, loss_l2 = self.cal_loss(rating, lam=lam)
            if i%10 == 0:
                if valid is None:
                    print 'Iteration %d train loss: %f' % (i, loss)
                else:
                    v_loss, v_loss_l2 = self.cal_loss(valid, lam=lam)
                    print 'Iteration %d train loss: %f valid loss: %f' % (i, loss, v_loss)
            grad = self.cal_grad(rating, lam)
            self.theta = self.theta - learning_rate * (grad + lam * self.theta)

cb = ContentBased(train.shape[0], plot_tfidf)
cb.train(train, valid=valid, max_ite=50, lam=0.1)

mse_train = compute_mse(cb.pred(), train)
mse_valid = compute_mse(cb.pred(), valid)

print 'MSE at train %f valid %f' % (mse_train, mse_valid)


Iteration 0 train loss: 63022.000000 valid loss: 26524.500000
Iteration 10 train loss: 4401.264583 valid loss: 11945.853378
Iteration 20 train loss: 4114.034755 valid loss: 11798.938670
Iteration 30 train loss: 4096.210943 valid loss: 11794.504433
Iteration 40 train loss: 4094.333932 valid loss: 11794.474821
MSE at train 3.067886 valid 22.681732


In [215]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVR
from sklearn.ensemble import ExtraTreesClassifier


def feature_selection(X, y, th='7*mean'):
    lsvr = LinearSVR(C=0.03).fit(X, y)
    model = SelectFromModel(lsvr, prefit=True, threshold='7*mean')
    X_new = model.transform(X)
    print X_new.shape
    print np.sum(lsvr.coef_ > 0)
    
    lsvr.fit(X_new, y)
    y_pred = lsvr.predict(X_new)
    print np.mean(np.square(y_pred - y))
    print model
    mask = model.get_support(indices=False)
    print np.sum(mask)

    return X_new

X = plot_tfidf
y = train.sum(axis=0) / ((train>0).sum(axis=0)+1e-4)
movie_vec_new = feature_selection(X, y)

(504, 71)
6464
10.428993589
SelectFromModel(estimator=LinearSVR(C=0.03, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0),
        prefit=True, threshold='7*mean')
0


In [181]:
cb = ContentBased(train.shape[0], movie_vec_new)
cb.train(train, valid=valid, max_ite=50, lam=0.1)

mse_train = compute_mse(cb.pred(), train)
mse_valid = compute_mse(cb.pred(), valid)

print 'MSE at train %f valid %f' % (mse_train, mse_valid)

Iteration 0 train loss: 63022.000000 valid loss: 26524.500000
Iteration 10 train loss: 13345.818953 valid loss: 8593.730959
Iteration 20 train loss: 10236.443355 valid loss: 7225.748653
Iteration 30 train loss: 9213.591353 valid loss: 6852.688442
Iteration 40 train loss: 8710.132396 valid loss: 6716.331497
MSE at train 6.306739 valid 12.803903


In [174]:
def movie_to_vec_genre(dbname, movie_names):
    movie_genres = {}
    all_genres = set()
    
    conn = sqlite3.connect(dbname)
    c = conn.cursor()

    for r in c.execute('''
        SELECT imdb_id, genres
        FROM movie
        '''):
        if r[0] in movie_names:
            gs = ast.literal_eval(r[1])
            if gs is None:
                continue
            all_genres |= set(gs)
            movie_genres[r[0]] = gs
    print len(all_genres), len(movie_genres)
    
    f_genres = list(all_genres)
    movie_vec = np.zeros((len(movie_names), len(f_genres)))
    for i in range(len(movie_names)):
        for g in movie_genres[movie_names[i]]:
            movie_vec[i,f_genres.index(g)] = 1.0
    return movie_vec, f_genres

movie_vec_genres, f_genres = movie_to_vec_genre(DB_NAME, new_movie_names)
print movie_vec_genres.shape
print f_genres

24 504
(504, 24)
[u'Mystery', u'Short', u'Sci-Fi', u'Crime', u'Drama', u'Animation', u'Music', u'Action', u'Comedy', u'Documentary', u'War', u'History', u'Romance', u'Family', u'Horror', u'Thriller', u'Film-Noir', u'Musical', u'Fantasy', u'Adventure', u'News', u'Sport', u'Biography', u'Western']


In [221]:
movie_vec_tmp = np.concatenate((movie_vec_genres, plot_tfidf.toarray()), axis=1)
movie_vec_tmp = feature_selection(movie_vec_tmp, y, th='mean')

print movie_vec_tmp.shape

cb = ContentBased(train.shape[0], movie_vec_genres)
cb.train(train, valid=valid, max_ite=50, learning_rate=0.01, lam=0.1)

mse_train = compute_mse(cb.pred(), train)
mse_valid = compute_mse(cb.pred(), valid)

print 'MSE at train %f valid %f' % (mse_train, mse_valid)

(504, 67)
6155
10.1418416261
SelectFromModel(estimator=LinearSVR(C=0.03, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0),
        prefit=True, threshold='7*mean')
0
(504, 67)
Iteration 0 train loss: 63022.000000 valid loss: 26524.500000
Iteration 10 train loss: 16612.620059 valid loss: 12012.547503
Iteration 20 train loss: 9326.644622 valid loss: 9502.152275
Iteration 30 train loss: 6656.762464 valid loss: 8633.983846
Iteration 40 train loss: 5330.488536 valid loss: 8279.680470
MSE at train 3.408181 valid 15.638814


Ref
* https://www.coursera.org/learn/machine-learning/lecture/uG59z/content-based-recommendations