In [1]:
# import necessary libraries
import util
import numpy as np
import sqlite3
import pandas as pd
import ast

from IPython.display import Image
from IPython.display import display

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

# Notebook-wide plotting defaults: 8x6 figures, size-16 axis labels and
# titles, size-14 tick labels, legend text and base font.
matplotlib.rcParams.update({
    "figure.figsize": (8, 6),
    "axes.labelsize": 16,
    "axes.titlesize": 16,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    "legend.fontsize": 14,
    "font.size": 14,
})

In [2]:
# load the rating matrix
# util.load_sparse_csr is a project helper not shown here (presumably it
# returns a scipy sparse CSR matrix -- TODO confirm); .toarray() densifies the
# result so the cells below can index it like a plain 2-D array.
M = util.load_sparse_csr("imdb_data/rating.npz").toarray()
# Label arrays that accompany the matrix. The shapes printed below show that M
# has one row per entry of user_names and one column per entry of movie_names.
movie_names = np.load("imdb_data/movies.npy")
user_names = np.load("imdb_data/users.npy")

# Sanity check: the three shapes must agree (rows <-> users, columns <-> movies).
print M.shape, movie_names.shape, user_names.shape

(18060, 1603) (1603,) (18060,)


In [3]:
# Peek at the user-name labels (NumPy abbreviates the printout of long arrays).
# NOTE(review): these look like real account names -- consider clearing this
# output before sharing the notebook.
print user_names

[u' (borkoboardo)' u'surfs_up1976' u'HumanoidOfFlesh' ...,
 u'Kaya Ozkaracalar' u'LJ27' u'KHayes666']


In [4]:
mask = M > 0
print mask.shape, mask.sum(axis=0).shape, mask.sum(axis=1).shape
print np.sum(mask.sum(axis=0) > 5)
print np.sum(mask.sum(axis=1) > 5)

(18060, 1603) (1603,) (18060,)
504
358


In [23]:
# Reduce the rating matrix and both label arrays with the project helper
# util.rating_filter (not shown here). The (5,5) argument is presumably the
# pair of per-user / per-movie rating-count thresholds -- TODO confirm against
# util; the helper's own printout reports how many users and movies remain.
new_rating_m, new_user_names, new_movie_names = util.rating_filter(M, user_names, movie_names, (5,5))

# Sanity check: the filtered matrix and the filtered label arrays must line up.
print new_rating_m.shape, new_user_names.shape, new_movie_names.shape

(18060,) (1603,)
Original #rating 25658
Remaining #user 358 #movie 504
(358, 504)
Remaining #rating 4629
Sparsity for the rating matrix : 2.57%
(358, 504) (358,) (504,)


In [33]:
def split_dataset(rating, seed=0, th=(0.6,0.2)):
    """Split a rating matrix into train / validation / test matrices, per user.

    Each user's observed ratings (entries > 0) are visited in a random order.
    The first three go to train, validation and test respectively, so every
    user with at least three ratings is represented in all three sets. The
    user's remaining ratings are then divided by position in that random
    order according to ``th``. Users left without a rating in any one of the
    three sets are removed from all three outputs.

    Args:
        rating: rating matrix #user * #movie; entries <= 0 mean "not rated".
        seed: random seed (seeds NumPy's global random generator).
        th: split threshold. th[0] is the fraction of a user's remaining
            ratings used for training, th[1] the fraction used for
            validation; the rest goes to test.

    Returns:
        train, valid, test, user_mask
        train, valid, test: float arrays with one row per kept user and
            rating.shape[1] columns, zero wherever no rating was assigned.
            Every rating of a kept user appears in exactly one of the three.
        user_mask: boolean array of length rating.shape[0]; True marks the
            users (rows of ``rating``) that were kept.
    """
    np.random.seed(seed)

    rating = np.asarray(rating)
    train = np.zeros(rating.shape)
    valid = np.zeros(rating.shape)
    test = np.zeros(rating.shape)

    for i in range(rating.shape[0]):
        # One full-width permutation per user, then keep only the rated
        # columns: this yields the user's ratings in random order.
        order = np.random.permutation(rating.shape[1])
        rated = order[rating[i, order] > 0]

        # Guarantee one rating in each set whenever the user has enough.
        for target, j in zip((train, valid, test), rated[:3]):
            target[i, j] = rating[i, j]

        # Divide what is left by position in the shuffled order. The split
        # must not depend on the column index itself, otherwise a movie's
        # set would be dictated by where it sits in the matrix.
        rest = rated[3:]
        n_rest = len(rest)
        cut_train = int(np.floor(th[0] * n_rest + 0.5))
        cut_valid = int(np.floor((th[0] + th[1]) * n_rest + 0.5))
        # Clamp so odd thresholds can never produce overlapping slices.
        cut_train = min(max(cut_train, 0), n_rest)
        cut_valid = min(max(cut_valid, cut_train), n_rest)

        to_train = rest[:cut_train]
        to_valid = rest[cut_train:cut_valid]
        to_test = rest[cut_valid:]
        train[i, to_train] = rating[i, to_train]
        valid[i, to_valid] = rating[i, to_valid]
        test[i, to_test] = rating[i, to_test]

    # remove no rating users: keep a user only if all three sets got a rating
    user_mask = ((np.sum(train, axis=1) != 0)
                 & (np.sum(valid, axis=1) != 0)
                 & (np.sum(test, axis=1) != 0))
    # Single-argument print(...) prints the same text on Python 2 and 3.
    print('Number of user delete %d' % (rating.shape[0] - np.sum(user_mask)))
    train = train[user_mask,:]
    test = test[user_mask,:]
    valid = valid[user_mask,:]

    return train, valid, test, user_mask

# Smoke-test the split with the default seed and thresholds. The returned
# (train, valid, test, user_mask) tuple is not stored; it is only shown as
# this cell's output.
split_dataset(new_rating_m)

Number of user delete 5


(array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  7.],
        [ 7.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  2.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
 array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
 array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
 array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
