## Lesson 3: Collaborative-based recommender systems: Factorization

In [2]:

#NETFLIX REAL 50.000.000 usuaris and 100.000 items
%autosave 150
%matplotlib inline
import pandas as pd
import numpy as np
import math
import matplotlib.pylab as plt

# Load Data set
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-1m/users.dat', sep='::', names=u_cols)

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', names=r_cols)

# the movies file contains columns indicating the movie's genres
# let's only load the first three columns of the file with usecols
m_cols = ['movie_id', 'title', 'release_date']
movies = pd.read_csv('ml-1m/movies.dat', sep='::', names=m_cols, usecols=range(3), encoding='latin-1')

# Construcció del DataFrame
data = pd.merge(pd.merge(ratings, users), movies)
data = data[['user_id','title', 'movie_id','rating','release_date','sex','age']]


print("La BD has "+ str(data.shape[0]) +" ratings")
print("La BD has ", data.user_id.nunique()," users")
print("La BD has ", data.movie_id.nunique(), " movies")
data.head()



def evaluate(estimate_f,data_train,data_test):
    """ RMSE-based predictive performance evaluation with pandas. """
    ids_to_estimate = zip(data_test.user_id, data_test.movie_id)
    estimated = np.array([estimate_f(u,i) if u in data_train.user_id else 3 for (u,i) in ids_to_estimate ])
    real = data_test.rating.values
    return compute_rmse(estimated, real)

def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))


## Divide the data in two sets: training and test
def assign_to_set(df):
    sampled_ids = np.random.choice(df.index,
                                   size=np.int64(np.ceil(df.index.size * 0.2)),
                                   replace=False)
    df.loc[sampled_ids, 'for_testing'] = True
    return df

data['for_testing'] = False
grouped = data.groupby('user_id', group_keys=False).apply(assign_to_set)
data_train = data[grouped.for_testing == False]
data_test = data[grouped.for_testing == True]
print(data_train.shape)
print(data_test.shape)
print(data_train.index & data_test.index)

print("Training data_set has "+ str(data_train.shape[0]) +" ratings")
print("Test data set has "+ str(data_test.shape[0]) +" ratings")
print("La BD has ", data.movie_id.nunique(), " movies")

Autosaving every 150 seconds




La BD has 1000209 ratings
La BD has  6040  users
La BD has  3706  movies
(797758, 8)
(202451, 8)
Int64Index([], dtype='int64')
Training data_set has 797758 ratings
Test data set has 202451 ratings
La BD has  3706  movies


### Factorization Matrix as Dimensionality Reduction

In [18]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

class SVD_CollaborativeFiltering:
    """ Collaborative filtering using a custom sim(u,u'). """
    
    def __init__(self,DataFrame, num_components=10):
        """ Constructor """
        self.df=DataFrame
        self.sim = pd.DataFrame(np.sum([0]),columns=data_train.user_id.unique(), index=data_train.user_id.unique())
        self.num_components=num_components

    def learn(self):
        """ Prepare data structures for estimation. Similarity matrix for users """
        allUsers=set(self.df['user_id'])
        
        svd = TruncatedSVD(n_components=self.num_components)
        urm = pd.pivot_table(self.df[['user_id','movie_id','rating']],columns='user_id',index='movie_id',values='rating',fill_value=0)
        X= np.float32(urm.values)
        
        #Lets continue.....
                
    def estimate(self, user_id, movie_id):
        movie_users = self.df[self.df['movie_id'] ==movie_id]
        #movie_users = movie_users.set_index('user_id')
        
        allUsers = movie_users.user_id
        
        a = movie_users.rating.values
        b = reco.sim[user_id-1][allUsers-1] 
        rating_num = np.sum(a*b)
        rating_den = np.sum(b)
        if rating_den==0: 
            if self.df.rating[self.df['movie_id']==movie_id].mean()>0:
                # return the mean movie rating if there is no similar for the computation
                return self.df.rating[self.df['movie_id']==movie_id].mean()
            else:
                # else return mean user rating 
                return self.df.rating[self.df['user_id']==user_id].mean()
        return rating_num/rating_den


In [7]:
reco = SVD_CollaborativeFiltering(data_train,num_components=40)
reco.learn()
reco.estimate(user_id=2,movie_id=1)

4.3244454192889794

In [None]:
print('RMSE for Collaborative Recomender: %s' % evaluate(reco.estimate,data_train,data_test))

### FACTORIZATION MODEL SVD

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

class SVD_CollaborativeFiltering:
    """ Collaborative filtering using a custom sim(u,u'). """
    
    def __init__(self,DataFrame, num_components=10,
                 item_fact_reg=0.0, 
                 user_fact_reg=0.0):
        """ Constructor """
        self.df = DataFrame
        self.num_components = num_components
        self.item_fact_reg = item_fact_reg
        self.user_fact_reg = user_fact_reg
        
        urm = pd.pivot_table(self.df[['user_id','movie_id','rating']],columns='movie_id',index='user_id',values='rating',fill_value=0)
        self.n_users, self.n_items = urm.shape
        self.ratings = np.float32(urm.values)
        
        user_index = np.arange(len(urm.index))
        self.users = dict(zip(user_index,urm.index ))
        self.users_index2id = dict(zip(urm.index,user_index)) 
        
        movie_index = np.arange(len(urm.columns))
        self.movies = dict(zip(movie_index,urm.columns ))   
        self.movies_index2id = dict(zip(urm.columns, movie_index))

        self.sample_row, self.sample_col = self.ratings.nonzero()
        self.n_samples = len(self.sample_row)
    
    def __sdg__(self):
        for idx in self.training_indices:
            u = self.sample_row[idx]
            i = self.sample_col[idx]
            user_id = self.users[u]
            item_id = self.movies[i]
            
            prediction = self.estimate(user_id, item_id)
            error = (self.ratings[u,i] - prediction) # error
            
            #Update latent factors
            self.user_vecs[u, :] += self.learning_rate * \
                                    (error * self.item_vecs[i, :] - \
                                     self.user_fact_reg * self.user_vecs[u,:])
            self.item_vecs[i, :] += self.learning_rate * \
                                    (error * self.user_vecs[u, :] - \
                                     self.item_fact_reg * self.item_vecs[i,:])
                
                
    def learn(self,n_iter = 10, learning_rate=0.001):
        """ Train the model. """
        self.train_mse =[]
        self.test_mse = []
        iter_diff = 0
        
        # initialize latent vectors
        self.user_vecs = np.random.normal(scale=1./self.num_components,\
                                          size=(self.n_users, self.num_components))
        self.item_vecs = np.random.normal(scale=1./self.num_components,
                                          size=(self.n_items, self.num_components))

        self.learning_rate = learning_rate
        
        ctr =1
        while ctr <= n_iter:
            if ctr % 1 == 0:
                print('Iteration: {}'.format(ctr))
            self.training_indices = np.arange(self.n_samples)
            #shuffle training samples
            np.random.shuffle(self.training_indices)
            self.__sdg__()
            ctr += 1
                
            print('\tTrain mse: %s' % evaluate(reco.estimate,data_train,data_train))
            print('\tTest mse: %s' % evaluate(reco.estimate,data_test,data_train))
    
                
    def estimate(self, user_id, movie_id):
        """ Single user and item prediction."""
        u = self.users_index2id[user_id]
        i = self.movies_index2id[movie_id]
        prediction =  self.user_vecs[u, :].dot(self.item_vecs[i, :].T)
        return prediction
    

In [None]:
reco = SVD_CollaborativeFiltering(data_train,num_components=40)
reco.learn(n_iter = 50)
reco.estimate(user_id=2,movie_id=1)