# Sistema de recomendación con filtrado colaborativo Item x Item
En este notebook se va a proceder a realizar un sistema de colaboración basado en filtrado colaborativo Item x Item usando como base el conjunto de datos creado en el notebook de análisis exploratorio del dataset movielens

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
# Importo el csv con los datos de las películas a analizar
user_x_ratings_movies = pd.read_csv('./data/user_x_ratings_movies.csv')
user_x_ratings_movies = user_x_ratings_movies.set_index("userId")

In [2]:
user_x_ratings_movies.info()
user_x_ratings_movies.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1894 entries, 116 to 138325
Columns: 24881 entries, 1 to 131258
dtypes: float64(24881)
memory usage: 359.5 MB


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,131231,131239,131241,131243,131248,131250,131252,131254,131256,131258
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
116,3.0,2.0,2.0,,,1.5,,1.0,1.5,2.0,...,,,,,,,,,,
156,5.0,5.0,2.0,3.0,3.0,4.0,4.0,,3.0,4.0,...,,,,,,,,,,
208,4.0,,,,,,,,,,...,,,,,,,,,,
298,4.0,3.0,3.0,,3.0,5.0,,,,4.0,...,,,,,,,,,,
359,5.0,,,,,5.0,,,,4.0,...,,,,,,,,,,


### 1. Calculamos el rating medio de cada usuario

In [3]:
user_x_ratings_movies["meanRating"] = user_x_ratings_movies.mean(axis = 1, skipna = True) 
user_x_ratings_movies

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,131239,131241,131243,131248,131250,131252,131254,131256,131258,meanRating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
116,3.0,2.0,2.0,,,1.5,,1.0,1.5,2.0,...,,,,,,,,,,2.140541
156,5.0,5.0,2.0,3.0,3.0,4.0,4.0,,3.0,4.0,...,,,,,,,,,,3.611290
208,4.0,,,,,,,,,,...,,,,,,,,,,3.663820
298,4.0,3.0,3.0,,3.0,5.0,,,,4.0,...,,,,,,,,,,2.850044
359,5.0,,,,,5.0,,,,4.0,...,,,,,,,,,,3.617692
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138162,4.0,3.0,,,4.0,4.0,3.0,,,,...,,,,,,,,,,3.891917
138208,3.0,2.0,2.0,2.0,2.0,3.0,3.0,1.0,,2.0,...,,,,,,,,,,2.659560
138254,4.0,3.5,2.0,,,4.5,3.0,,,4.0,...,,,,,,,,,,3.547308
138301,2.5,2.5,,,,3.5,,,,2.0,...,,,,,,,,,,3.337687


### 1. Se realiza el cálculo de la matriz con mean-centering

In [4]:
user_x_ratings_movies_mean = user_x_ratings_movies.loc[:, user_x_ratings_movies.columns != 'meanRating']
user_x_ratings_movies_mean

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,131231,131239,131241,131243,131248,131250,131252,131254,131256,131258
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
116,3.0,2.0,2.0,,,1.5,,1.0,1.5,2.0,...,,,,,,,,,,
156,5.0,5.0,2.0,3.0,3.0,4.0,4.0,,3.0,4.0,...,,,,,,,,,,
208,4.0,,,,,,,,,,...,,,,,,,,,,
298,4.0,3.0,3.0,,3.0,5.0,,,,4.0,...,,,,,,,,,,
359,5.0,,,,,5.0,,,,4.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138162,4.0,3.0,,,4.0,4.0,3.0,,,,...,,,,,,,,,,
138208,3.0,2.0,2.0,2.0,2.0,3.0,3.0,1.0,,2.0,...,,,,,,,,,,
138254,4.0,3.5,2.0,,,4.5,3.0,,,4.0,...,,,,,,,,,,
138301,2.5,2.5,,,,3.5,,,,2.0,...,,,,,,,,,,


In [5]:
for index, i in user_x_ratings_movies.loc[:, user_x_ratings_movies.columns != 'meanRating'].iterrows():
    user_x_ratings_movies_mean.loc[index] = i - user_x_ratings_movies.loc[index]["meanRating"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [6]:
user_x_ratings_movies_mean

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,131231,131239,131241,131243,131248,131250,131252,131254,131256,131258
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
116,0.859459,-0.140541,-0.140541,,,-0.640541,,-1.140541,-0.640541,-0.140541,...,,,,,,,,,,
156,1.388710,1.388710,-1.611290,-0.61129,-0.611290,0.388710,0.388710,,-0.611290,0.388710,...,,,,,,,,,,
208,0.336180,,,,,,,,,,...,,,,,,,,,,
298,1.149956,0.149956,0.149956,,0.149956,2.149956,,,,1.149956,...,,,,,,,,,,
359,1.382308,,,,,1.382308,,,,0.382308,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138162,0.108083,-0.891917,,,0.108083,0.108083,-0.891917,,,,...,,,,,,,,,,
138208,0.340440,-0.659560,-0.659560,-0.65956,-0.659560,0.340440,0.340440,-1.659560,,-0.659560,...,,,,,,,,,,
138254,0.452692,-0.047308,-1.547308,,,0.952692,-0.547308,,,0.452692,...,,,,,,,,,,
138301,-0.837687,-0.837687,,,,0.162313,,,,-1.337687,...,,,,,,,,,,


In [7]:
sim_values = csr_matrix(user_x_ratings_movies_mean.fillna(0).transpose().values)
sim_values = cosine_similarity(sim_values)
sim_values

array([[ 1.        , -0.12293643, -0.17556395, ...,  0.01790333,
         0.01790333,  0.00989441],
       [-0.12293643,  1.        ,  0.14934691, ...,  0.0410518 ,
         0.0410518 ,  0.0137811 ],
       [-0.17556395,  0.14934691,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.01790333,  0.0410518 ,  0.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 0.01790333,  0.0410518 ,  0.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 0.00989441,  0.0137811 ,  0.        , ...,  0.        ,
         0.        ,  1.        ]])

Creamos la matriz de similaridad

In [8]:
sim_matrix = pd.DataFrame(sim_values, index=user_x_ratings_movies_mean.columns, columns=user_x_ratings_movies_mean.columns)
sim_matrix

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,131231,131239,131241,131243,131248,131250,131252,131254,131256,131258
1,1.000000,-0.122936,-0.175564,-0.186482,-0.211674,0.310492,-0.099251,-0.121950,-0.215510,0.108034,...,0.000000,0.017903,0.017903,0.017903,0.017903,0.017903,0.017903,0.017903,0.017903,0.009894
2,-0.122936,1.000000,0.149347,0.061579,0.207424,-0.190972,0.163351,0.108667,0.147514,0.063632,...,0.000618,0.041052,0.041052,0.041052,0.041052,0.041052,0.041052,0.041052,0.041052,0.013781
3,-0.175564,0.149347,1.000000,0.143477,0.288437,-0.156021,0.150047,0.079661,0.175186,0.051667,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,-0.186482,0.061579,0.143477,1.000000,0.168426,-0.138685,0.104511,0.054113,0.086817,-0.038182,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,-0.211674,0.207424,0.288437,0.168426,1.000000,-0.213005,0.235800,0.158533,0.146136,0.015769,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131250,0.017903,0.041052,0.000000,0.000000,0.000000,0.007482,0.000000,0.000000,0.000000,0.000000,...,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000
131252,0.017903,0.041052,0.000000,0.000000,0.000000,0.007482,0.000000,0.000000,0.000000,0.000000,...,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000
131254,0.017903,0.041052,0.000000,0.000000,0.000000,0.007482,0.000000,0.000000,0.000000,0.000000,...,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000
131256,0.017903,0.041052,0.000000,0.000000,0.000000,0.007482,0.000000,0.000000,0.000000,0.000000,...,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000


In [9]:
#Esta es la manera de calcular el Adjusted cosine similarity de manera manual, pero computacionalmente es muy costoso y rudimentario, así que uso la función cosine_similarity de sklearn
'''columns = user_x_ratings_movies_mean.columns
count = 0
total = len(columns) * len(columns)

for i in columns:
    for j in columns:
        nominator = np.sum(user_x_ratings_movies_mean[str(i)] * user_x_ratings_movies_mean[str(j)])
        denom_a = np.sqrt(np.sum(np.power(user_x_ratings_movies_mean[str(i)], 2)))
        denom_b = np.sqrt(np.sum(np.power(user_x_ratings_movies_mean[str(j)], 2)))
        sim_matrix.loc[i][str(j)] = nominator / (denom_a * denom_b)
        count += 1
        print(f"Progress: {(count*100)/total}% ---- {count}/{total} ", end="\r")

sim_matrix.head(10)'''

'columns = user_x_ratings_movies_mean.columns\ncount = 0\ntotal = len(columns) * len(columns)\n\nfor i in columns:\n    for j in columns:\n        nominator = np.sum(user_x_ratings_movies_mean[str(i)] * user_x_ratings_movies_mean[str(j)])\n        denom_a = np.sqrt(np.sum(np.power(user_x_ratings_movies_mean[str(i)], 2)))\n        denom_b = np.sqrt(np.sum(np.power(user_x_ratings_movies_mean[str(j)], 2)))\n        sim_matrix.loc[i][str(j)] = nominator / (denom_a * denom_b)\n        count += 1\n        print(f"Progress: {(count*100)/total}% ---- {count}/{total} ", end="\r")\n\nsim_matrix.head(10)'

#### Cálculo de los ratings no valorados
Una vez creada la matriz de similaridad, se cogen n películas similares a la que se quiere calcular su rating y se calcula, dando así una predicción de rating a todas las péliculas no valoradas

In [10]:
'''predicted_ratings = pd.DataFrame(index=user_x_ratings_movies_mean.index, columns = user_x_ratings_movies_mean.columns)
predicted_ratings'''

'predicted_ratings = pd.DataFrame(index=user_x_ratings_movies_mean.index, columns = user_x_ratings_movies_mean.columns)\npredicted_ratings'

In [11]:
#Ahora hay que calcular los ratings para los valores nan, sería interesante pillar solo aquellos índices y columnas que retornen un valor nan pero claro eso puede  ser imposible...
#Hbará que mirar cómo continua el algoritmo a ver...

### Predicción de un rating
En las siguientes celdas se harán predicción de un rating que no ha sido valorado y predicción de uno que sí lo ha sido

Para esta predicción se van a coger los *k* vecinos más similares siendo *k* = 5, valor que para futuro se cambiará y se estudiará su mejor resultado

In [56]:
#Caso de la pélicula 5 para el usuario 116
knn5 = np.delete(sim_matrix["5"].nlargest(6).index.to_numpy(), 0)
knn_ratings5 = user_x_ratings_movies.loc[116][knn5].values
knn_cosine5 = sim_matrix.loc["5"][knn5].to_numpy()
print(knn5)
print(knn_ratings5)
print(knn_cosine5)

['355' '2953' '186' '1556' '432']
[1.5 0.5 1.5 0.5 2. ]
[0.37806171 0.37282806 0.36617871 0.35683325 0.35005656]


In [57]:
r_116_5 = np.sum(knn_ratings5 * knn_cosine5) / np.sum(knn_cosine5)
r_116_5

1.1959179186383695

In [58]:
#Caso de la pélicula 1 para el usuario 116
knn1 = np.delete(sim_matrix["1"].nlargest(6).index.to_numpy(), 0)
knn_ratings1 = user_x_ratings_movies.loc[116][knn1].values
knn_cosine1 = sim_matrix.loc["1"][knn1].to_numpy()
print(knn1)
print(knn_ratings1)
print(knn_cosine1)

['3114' '1198' '6377' '260' '4886']
[2.  4.  3.  4.5 3. ]
[0.74800919 0.62905137 0.60235081 0.57980231 0.5794559 ]


En este último experimento obtenido de un rating ya dado por el ususario vemos que el sistema ha predecido gratamente la valoración otorgada por el usuario 116 siendo el verdadero valor **3** y el predecido **3.24**

In [59]:
r_116_1 = np.sum(knn_ratings1 * knn_cosine1) / np.sum(knn_cosine1)
r_116_1

3.2391923160308127