In [3]:
import pandas as pd

%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import numpy as np

In [4]:
# El CSV que nos descargamos no cuenta con los nombres de las columnas
# Podemos revisar los nombres de las columnas en el archivo readme que esta junto al link de descarga del set de datos
# Recuerda ubicar la carpeta descarga junto al archivo notebook que haz creado

In [5]:
# Data de usuarios
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,encoding='latin-1')

In [6]:
# Data de ratings
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,encoding='latin-1')

In [7]:
# Data de películas
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols,
encoding='latin-1')

In [8]:
# Ahora podemos revisar la data
print(users.shape)
users.head()

(943, 5)


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [9]:
print(ratings.shape)
ratings.head()

(100000, 4)


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [10]:
print(items.shape)
items.head()

(1682, 24)


Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


Este conjunto de datos contiene atributos de 1682 películas. Hay 24 columnas, de las cuales las últimas 19 columnas especifican el género de una película en particular. Estas son columnas binarias, es decir, un valor de 1 indica que la película pertenece a ese género, y 0 en caso contrario.

GroupLens ya ha dividido el conjunto de datos en tren y prueba, donde los datos de prueba tienen 10 calificaciones para cada usuario, es decir, 9,430 filas en total. Leeremos estos dos archivos en nuestro entorno Python.

In [11]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_train.shape, ratings_test.shape

((90570, 4), (9430, 4))

¡Finalmente es tiempo de construir nuestro motor recomendado!

In [12]:
n_users = ratings.user_id.unique().shape[0]
n_items = ratings.movie_id.unique().shape[0]

In [13]:
n_users

943

In [14]:
n_items

1682

Ahora, crearemos una matriz de elementos de usuario que se puede utilizar para calcular la similitud entre usuarios y elementos.

In [15]:
data_matrix = np.zeros((n_users, n_items))
for line in ratings.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]

In [16]:
data_matrix

array([[ 5.,  3.,  4., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  0., ...,  0.,  0.,  0.]])

Ahora, vamos a calcular la similitud. Podemos usar la función pairwise_distance de sklearn para calcular la similitud del coseno.

In [17]:
from sklearn.metrics.pairwise import pairwise_distances 
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [18]:
user_similarity

array([[ 0.        ,  0.83306902,  0.95254046, ...,  0.85138306,
         0.82049212,  0.60182526],
       [ 0.83306902,  0.        ,  0.88940868, ...,  0.83851522,
         0.82773219,  0.89420212],
       [ 0.95254046,  0.88940868,  0.        , ...,  0.89875744,
         0.86658385,  0.97344413],
       ..., 
       [ 0.85138306,  0.83851522,  0.89875744, ...,  0.        ,
         0.8983582 ,  0.90488042],
       [ 0.82049212,  0.82773219,  0.86658385, ...,  0.8983582 ,
         0.        ,  0.81753534],
       [ 0.60182526,  0.89420212,  0.97344413, ...,  0.90488042,
         0.81753534,  0.        ]])

In [19]:
item_similarity

array([[ 0.        ,  0.59761782,  0.66975521, ...,  1.        ,
         0.95281693,  0.95281693],
       [ 0.59761782,  0.        ,  0.72693082, ...,  1.        ,
         0.92170064,  0.92170064],
       [ 0.66975521,  0.72693082,  0.        , ...,  1.        ,
         1.        ,  0.90312495],
       ..., 
       [ 1.        ,  1.        ,  1.        , ...,  0.        ,
         1.        ,  1.        ],
       [ 0.95281693,  0.92170064,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       [ 0.95281693,  0.92170064,  0.90312495, ...,  1.        ,
         1.        ,  0.        ]])

Esto nos da la similitud en item-item y en usuario-usuario en una forma de matriz. El siguiente paso es hacer predicciones basadas en estas similitudes. Definamos una función para hacer precisamente eso.

In [20]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #Usamos np.newaxis para que mean_user_rating tenga el mismo formato que ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

Finalmente, haremos predicciones basadas en la similitud del usuario y la similitud del elemento.

In [21]:
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')

In [22]:
user_prediction

array([[ 2.06532606,  0.73430275,  0.62992381, ...,  0.39359041,
         0.39304874,  0.3927712 ],
       [ 1.76308836,  0.38404019,  0.19617889, ..., -0.08837789,
        -0.0869183 , -0.08671183],
       [ 1.79590398,  0.32904733,  0.15882885, ..., -0.13699223,
        -0.13496852, -0.13476488],
       ..., 
       [ 1.59151513,  0.27526889,  0.10219534, ..., -0.16735162,
        -0.16657451, -0.16641377],
       [ 1.81036267,  0.40479877,  0.27545013, ..., -0.00907358,
        -0.00846587, -0.00804858],
       [ 1.8384313 ,  0.47964837,  0.38496292, ...,  0.14686675,
         0.14629808,  0.14641455]])

In [23]:
item_prediction

array([[ 0.44627765,  0.475473  ,  0.50593755, ...,  0.58815455,
         0.5731069 ,  0.56669645],
       [ 0.10854432,  0.13295661,  0.12558851, ...,  0.13445801,
         0.13657587,  0.13711081],
       [ 0.08568497,  0.09169006,  0.08764343, ...,  0.08465892,
         0.08976784,  0.09084451],
       ..., 
       [ 0.03230047,  0.0450241 ,  0.04292449, ...,  0.05302764,
         0.0519099 ,  0.05228033],
       [ 0.15777917,  0.17409459,  0.18900003, ...,  0.19979296,
         0.19739388,  0.20003117],
       [ 0.24767207,  0.24489212,  0.28263031, ...,  0.34410424,
         0.33051406,  0.33102478]])

In [24]:
class MF():

    # Initializing the user-movie rating matrix, no. of latent features, alpha and beta.
    def __init__(self, R, K, alpha, beta, iterations):
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    # Initializing user-feature and movie-feature matrix 
    def train(self):
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initializing the bias terms
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # List of training samples
        self.samples = [
        (i, j, self.R[i, j])
        for i in range(self.num_users)
        for j in range(self.num_items)
        if self.R[i, j] > 0
        ]

        # Stochastic gradient descent for given number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
        self.sgd()
        mse = self.mse()
        training_process.append((i, mse))
        if (i+1) % 20 == 0:
            print("Iteration: %d ; error = %.4f" % (i+1, mse))

        return training_process

    # Computing total mean squared error
    def mse(self):
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)

    # Stochastic gradient descent to get optimized P and Q matrix
    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j,:])

    # Ratings for user i and moive j
    def get_rating(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    # Full user-movie rating matrix
    def full_matrix(self):
        return mf.b + mf.b_u[:,np.newaxis] + mf.b_i[np.newaxis:,] + mf.P.dot(mf.Q.T)

In [25]:
R= np.array(ratings.pivot(index = 'user_id', columns ='movie_id', values = 'rating').fillna(0))

In [26]:
mf = MF(R, K=20, alpha=0.001, beta=0.01, iterations=100)
training_process = mf.train()
print()
print("P x Q:")
print(mf.full_matrix())
print()

Iteration: 100 ; error = 337.5786

P x Q:
[[ 3.67473405  3.49268707  3.49382896 ...,  3.55342895  3.54414218
   3.53650051]
 [ 3.66284333  3.49573822  3.51642732 ...,  3.52998126  3.53348973
   3.54113179]
 [ 3.61528044  3.44786452  3.44298288 ...,  3.49618542  3.5157249
   3.48974374]
 ..., 
 [ 3.66591673  3.4964524   3.50346481 ...,  3.53724808  3.53914921
   3.54753461]
 [ 3.71448117  3.55134273  3.53902126 ...,  3.59465034  3.59003412
   3.59517227]
 [ 3.63255414  3.46383743  3.46567214 ...,  3.50749815  3.51067001
   3.50672208]]

