In [1]:
# sistema de recomendación basado en ítems.
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# MovieLens "ml-latest-small" dataset: 100,000 ratings and 3,600 tag applications
# across 9,000 movies by 600 users.
# Source: http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

# NOTE(review): this silences every warning for the rest of the session, which can
# hide real problems; consider narrowing it to the specific categories involved.
warnings.filterwarnings('ignore')

# Load the raw ratings and the movie catalogue from the working directory.
df = pd.read_csv('ratings.csv')
movie_titles = pd.read_csv('movies.csv')

# Attach the movies.csv columns to every rating through the shared movieId key
# (pd.merge defaults to an inner join).
df = pd.merge(df, movie_titles, on='movieId')

print("<HEAD>")
print(df.head())
print("<DESCRIBE>")
print(df.describe())
print("<INFO>")
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() would append a stray "None" line after the summary.
df.info()


<HEAD>
   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3                             Mystery|Thriller  
4                       Crime|Mystery|Thriller  
<DESCRIBE>
              userId        movieId         rating     timestamp
count  100836.000000  100836.000000  100836.000000  1.008360e+05
mean      326.127564   19435.295718       3.501557  1.205946e+09
std       182.618491   35530.987199       1.042529  2.162610e+08
min    

In [3]:
# Per-title summary table: the mean rating of each title and how many ratings it
# received. Both columns come from the same title-level grouping, so the grouping
# is built once and reused.
# NOTE(review): the grouping key is the title string, so distinct movieIds that
# share a title would be pooled together - confirm titles are unique if that matters.
rating_groups = df.groupby('title')['rating']
ratings = rating_groups.mean().to_frame().assign(
    number_of_ratings=rating_groups.count()
)
print(ratings.head())


                                         rating  number_of_ratings
title                                                             
'71 (2014)                                  4.0                  1
'Hellboy': The Seeds of Creation (2004)     4.0                  1
'Round Midnight (1986)                      3.5                  2
'Salem's Lot (2004)                         5.0                  1
'Til There Was You (1997)                   4.0                  2


In [4]:
# Build the user x movie rating matrix: one row per userId, one column per title.
# Each column therefore holds every user's rating for that title, and a cell is
# NaN when the user has not rated the title. 'mean' is pivot_table's default
# aggregation; it is spelled out here so the behaviour is visible to the reader.
movie_matrix = df.pivot_table(
    values='rating',
    index='userId',
    columns='title',
    aggfunc='mean',
)
print(movie_matrix.head())

title   '71 (2014)  'Hellboy': The Seeds of Creation (2004)  \
userId                                                        
1              NaN                                      NaN   
2              NaN                                      NaN   
3              NaN                                      NaN   
4              NaN                                      NaN   
5              NaN                                      NaN   

title   'Round Midnight (1986)  'Salem's Lot (2004)  \
userId                                                
1                          NaN                  NaN   
2                          NaN                  NaN   
3                          NaN                  NaN   
4                          NaN                  NaN   
5                          NaN                  NaN   

title   'Til There Was You (1997)  'Tis the Season for Love (2015)  \
userId                                                               
1                             Na

In [5]:
# Rank titles by the number of ratings they received (most-rated first) so that
# two well-covered titles can be picked to work with.
most_rated = ratings.sort_values(by='number_of_ratings', ascending=False)
print(most_rated.head(10))


                                             rating  number_of_ratings
title                                                                 
Forrest Gump (1994)                        4.164134                329
Shawshank Redemption, The (1994)           4.429022                317
Pulp Fiction (1994)                        4.197068                307
Silence of the Lambs, The (1991)           4.161290                279
Matrix, The (1999)                         4.192446                278
Star Wars: Episode IV - A New Hope (1977)  4.231076                251
Jurassic Park (1993)                       3.750000                238
Braveheart (1995)                          4.031646                237
Terminator 2: Judgment Day (1991)          3.970982                224
Schindler's List (1993)                    4.225000                220


In [6]:
# Show the ratings given to two specific titles.
# Scenario: a user has watched Air Force One (1997) and Contact (1997).
# Each Series below is indexed by userId and holds that user's rating for the
# title (NaN when the user did not rate it).
AFO_user_raiting = movie_matrix.loc[:, 'Air Force One (1997)']
contact_user_raiting = movie_matrix.loc[:, 'Contact (1997)']

for user_ratings in (AFO_user_raiting, contact_user_raiting):
    print(user_ratings.head(50))

userId
1     NaN
2     NaN
3     NaN
4     NaN
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
10    NaN
11    4.0
12    NaN
13    NaN
14    NaN
15    NaN
16    NaN
17    NaN
18    3.0
19    NaN
20    NaN
21    NaN
22    NaN
23    NaN
24    NaN
25    NaN
26    NaN
27    NaN
28    2.0
29    NaN
30    NaN
31    NaN
32    NaN
33    NaN
34    NaN
35    NaN
36    NaN
37    NaN
38    NaN
39    NaN
40    NaN
41    NaN
42    4.0
43    NaN
44    NaN
45    4.0
46    NaN
47    NaN
48    NaN
49    NaN
50    NaN
Name: Air Force One (1997), dtype: float64
userId
1     NaN
2     NaN
3     NaN
4     NaN
5     NaN
6     NaN
7     5.0
8     NaN
9     NaN
10    NaN
11    5.0
12    NaN
13    NaN
14    NaN
15    NaN
16    NaN
17    NaN
18    NaN
19    3.0
20    NaN
21    NaN
22    NaN
23    NaN
24    NaN
25    NaN
26    NaN
27    4.0
28    3.5
29    NaN
30    NaN
31    NaN
32    NaN
33    NaN
34    NaN
35    NaN
36    NaN
37    NaN
38    NaN
39    5.0
40    NaN
41    2.0
42    NaN
43    NaN
44    NaN
45 

In [7]:
# Goal: find titles similar to Contact (1997) and Air Force One (1997) so they can
# be recommended to this user. Similarity is measured as the Pearson correlation
# between the ratings of each of these two titles and the ratings of every other
# title in the dataset; titles with a high correlation coefficient are treated as
# the most similar ones.

# Correlation of every title's ratings with the Air Force One ratings.
similar_to_air_force_one = movie_matrix.corrwith(AFO_user_raiting, method='pearson')
# Correlation of every title's ratings with the Contact ratings.
similar_to_contact = movie_matrix.corrwith(contact_user_raiting, method='pearson')

print(similar_to_air_force_one)
print('------------------------------------')
print(similar_to_contact)

title
'71 (2014)                                        NaN
'Hellboy': The Seeds of Creation (2004)           NaN
'Round Midnight (1986)                            NaN
'Salem's Lot (2004)                               NaN
'Til There Was You (1997)                         NaN
                                               ...   
eXistenZ (1999)                              0.129099
xXx (2002)                                  -0.188006
xXx: State of the Union (2005)                    NaN
¡Three Amigos! (1986)                       -0.187477
À nous la liberté (Freedom for Us) (1931)         NaN
Length: 9719, dtype: float64
------------------------------------
title
'71 (2014)                                        NaN
'Hellboy': The Seeds of Creation (2004)           NaN
'Round Midnight (1986)                            NaN
'Salem's Lot (2004)                               NaN
'Til There Was You (1997)                         NaN
                                               ...   
eXis

In [8]:
# Turn each correlation Series into a one-column DataFrame and drop the titles
# whose correlation is NaN.
#
# Series.to_frame(name=...) is used instead of pd.DataFrame(series, columns=[...])
# because it names the column regardless of whether the Series already has a name,
# and dropna() is chained instead of mutating the frame in place.
# Both frames use the same column label, 'Correlation', so they can be handled
# by the same downstream code.
corr_contac = similar_to_contact.to_frame(name='Correlation').dropna()
print('corr_contac')
print(corr_contac.head())

corr_AFO = similar_to_air_force_one.to_frame(name='Correlation').dropna()
print('corr_AFO')
print(corr_AFO.head())

corr_contac
                                   Correlation
title                                         
'burbs, The (1989)                    0.486761
(500) Days of Summer (2009)           0.634064
*batteries not included (1987)        0.868599
...And Justice for All (1979)         1.000000
10 Things I Hate About You (1999)    -0.102640
corr_AFO
                                correlation
title                                      
'burbs, The (1989)                 0.168459
(500) Days of Summer (2009)        0.086874
*batteries not included (1987)    -0.866025
10 Cloverfield Lane (2016)         0.192450
10 Items or Less (2006)           -1.000000


In [9]:
# Some titles have very few ratings and could end up being recommended simply
# because one or two people gave them 5 stars. To check for that, attach the
# number_of_ratings column from the `ratings` frame to each correlation table
# (joined on the shared title index).
#
# Both tables use the same column label, 'Correlation', and are built as a single
# chain: Series -> one-column frame -> drop NaN correlations -> join rating counts.
corr_contac = (
    similar_to_contact
    .to_frame(name='Correlation')
    .dropna()
    .join(ratings['number_of_ratings'])
)
print('------------- corr_contac -------------------')
print(corr_contac.head())

corr_AFO = (
    similar_to_air_force_one
    .to_frame(name='Correlation')
    .dropna()
    .join(ratings['number_of_ratings'])
)
print('-----------------  corr_AFO --------------------')
print(corr_AFO.head())

------------- corr_contac -------------------
                                   Correlation  number_of_ratings
title                                                            
'burbs, The (1989)                    0.486761                 17
(500) Days of Summer (2009)           0.634064                 42
*batteries not included (1987)        0.868599                  7
...And Justice for All (1979)         1.000000                  3
10 Things I Hate About You (1999)    -0.102640                 54
-----------------  corr_AFO --------------------
                                correlation  number_of_ratings
title                                                         
'burbs, The (1989)                 0.168459                 17
(500) Days of Summer (2009)        0.086874                 42
*batteries not included (1987)    -0.866025                  7
10 Cloverfield Lane (2016)         0.192450                 14
10 Items or Less (2006)           -1.000000                  3


In [10]:
# Finally, keep the titles most similar to each of the two test titles according
# to their correlation, restricted to titles with more than MIN_RATINGS ratings,
# ordered from highest to lowest correlation.

MIN_RATINGS = 100  # a title must have strictly more ratings than this to qualify
TOP_N = 10         # how many of the best-correlated titles to display


def _correlation_table(correlations, rating_counts):
    """Build a table with 'Correlation' and the rating count for each title.

    Parameters
    ----------
    correlations : pandas.Series
        Correlation values indexed by title; NaN entries are discarded.
    rating_counts : pandas.Series
        Per-title rating counts, joined onto the table by index.

    Returns
    -------
    pandas.DataFrame
        One row per title with a non-NaN correlation, with a 'Correlation'
        column plus the column contributed by `rating_counts`.
    """
    return (
        correlations
        .to_frame(name='Correlation')
        .dropna()
        .join(rating_counts)
    )


def _top_similar(table, min_ratings=MIN_RATINGS, top_n=TOP_N):
    """Return the `top_n` highest-correlation rows having more than `min_ratings` ratings."""
    popular = table[table['number_of_ratings'] > min_ratings]
    return popular.sort_values(by='Correlation', ascending=False).head(top_n)


corr_contac = _correlation_table(similar_to_contact, ratings['number_of_ratings'])
print('------------- corr_contac -------------------')
# Show only the titles rated by more than MIN_RATINGS users.
print(_top_similar(corr_contac))

corr_AFO = _correlation_table(similar_to_air_force_one, ratings['number_of_ratings'])
print('-----------------  corr_AFO --------------------')
print(_top_similar(corr_AFO))

------------- corr_contac -------------------
                                    Correlation  number_of_ratings
title                                                             
Sleepless in Seattle (1993)            0.689602                106
American Pie (1999)                    0.670109                103
Clear and Present Danger (1994)        0.641203                110
Firm, The (1993)                       0.640332                101
Bourne Identity, The (2002)            0.639769                112
Outbreak (1995)                        0.586934                101
E.T. the Extra-Terrestrial (1982)      0.569043                122
Apollo 13 (1995)                       0.563138                201
Die Hard: With a Vengeance (1995)      0.552904                144
Four Weddings and a Funeral (1994)     0.542013                103
-----------------  corr_AFO --------------------
                                 Correlation  number_of_ratings
title                                