In [1]:
# Imports
import re
import sys

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the movies and ratings CSV files from the ml-latest-small directory into dataframes
df_movies, df_ratings = (
    pd.read_csv(csv_path)
    for csv_path in ('ml-latest-small/movies.csv', 'ml-latest-small/ratings.csv')
)


In [3]:
# Display the movies dataframe to see how it is structured
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
# Replace the '|' separators between genres with spaces so each genre becomes its own token.
# A literal (non-regex) replacement catches every pipe, including one that is followed by a
# non-word character, and the vectorized .str accessor propagates missing values instead of
# raising the way a row-wise re.sub would.
df_movies['genres'] = df_movies['genres'].str.replace('|', ' ', regex=False)

# Create a new "content" column holding the title followed by the genres
df_movies['content'] = df_movies['title'] + ' ' + df_movies['genres']

# Replace the parentheses that enclose the release year with spaces
df_movies['content'] = df_movies['content'].str.replace(r'[()]', ' ', regex=True)

# "content" now holds the title, the year and the genres.

In [5]:
# Display the dataframe again to check the space-separated genres and the new content column
df_movies

Unnamed: 0,movieId,title,genres,content
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,Toy Story 1995 Adventure Animation Children ...
1,2,Jumanji (1995),Adventure Children Fantasy,Jumanji 1995 Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance,Grumpier Old Men 1995 Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance,Waiting to Exhale 1995 Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995 Comedy
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy,Black Butler: Book of the Atlantic 2017 Acti...
9738,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy,No Game No Life: Zero 2017 Animation Comedy ...
9739,193585,Flint (2017),Drama,Flint 2017 Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation,Bungo Stray Dogs: Dead Apple 2018 Action Ani...


In [7]:
# Keep the content column on its own; this is the text that is vectorized below
df_movies_content = df_movies['content']
df_movies_content

0       Toy Story  1995  Adventure Animation Children ...
1               Jumanji  1995  Adventure Children Fantasy
2                  Grumpier Old Men  1995  Comedy Romance
3           Waiting to Exhale  1995  Comedy Drama Romance
4               Father of the Bride Part II  1995  Comedy
                              ...                        
9737    Black Butler: Book of the Atlantic  2017  Acti...
9738    No Game No Life: Zero  2017  Animation Comedy ...
9739                                   Flint  2017  Drama
9740    Bungo Stray Dogs: Dead Apple  2018  Action Ani...
9741           Andrew Dice Clay: Dice Rules  1991  Comedy
Name: content, Length: 9742, dtype: object

In [8]:
# Bag-of-words representation with TF-IDF weighting.
# Create the vectorizer, dropping English stop words (the, a, an, etc.).
tf_idf = TfidfVectorizer(stop_words='english')

In [9]:
# Fit the vectorizer on the content column and transform it into a TF-IDF matrix
df_movies_content_tfidf = tf_idf.fit_transform(df_movies_content)

In [12]:
# The resulting matrix has one row per movie
# and one column per distinct term the vectorizer kept from content
df_movies_content_tfidf.shape

(9742, 9060)

In [15]:
# Compute the pairwise cosine similarity between the TF-IDF vectors of all movies
cosine_sim = cosine_similarity(df_movies_content_tfidf)

In [16]:
# Inspect the movie-to-movie similarity matrix
cosine_sim

array([[1.        , 0.35797045, 0.12863151, ..., 0.        , 0.05346934,
        0.01147169],
       [0.35797045, 1.        , 0.12368123, ..., 0.        , 0.        ,
        0.        ],
       [0.12863151, 0.12368123, 1.        , ..., 0.        , 0.        ,
        0.01044372],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.05346934, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.01147169, 0.        , 0.01044372, ..., 0.        , 0.        ,
        1.        ]])

In [17]:
# Wrap the similarities in a dataframe. The matrix already stored in cosine_sim is reused
# rather than recomputing the full movie-by-movie cosine similarity a second time.
df_tfidf_cos = pd.DataFrame(cosine_sim)

In [18]:
# Series that maps each row index of the movies dataframe to its movieId
index_to_movie_id = df_movies['movieId']

In [19]:
# Label the columns with the movieIds as strings, in row order.
# The labels are taken straight from the movieId series instead of being looked up from the
# current column labels, so re-running this cell gives the same result instead of a KeyError.
df_tfidf_cos.columns = index_to_movie_id.astype(str).tolist()

In [21]:
# Label the rows with the movieIds, in row order.
# The labels are taken straight from the movieId series instead of being looked up from the
# current index, so re-running this cell cannot silently remap already-assigned movieIds.
df_tfidf_cos.index = index_to_movie_id.tolist()

In [22]:
# Inspect the similarity dataframe, now labelled by movieId on both axes
df_tfidf_cos

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,1.000000,0.357970,0.128632,0.133429,0.148409,0.153483,0.175459,0.241260,0.130653,0.197455,...,0.101123,0.058234,0.020957,0.064036,0.0,0.127226,0.162742,0.000000,0.053469,0.011472
2,0.357970,1.000000,0.123681,0.128294,0.142697,0.173904,0.168707,0.273361,0.148037,0.223727,...,0.000000,0.000000,0.000000,0.000000,0.0,0.058855,0.075285,0.000000,0.000000,0.000000
3,0.128632,0.123681,1.000000,0.159453,0.135110,0.139729,0.209681,0.106949,0.118945,0.125547,...,0.019497,0.000000,0.019079,0.000000,0.0,0.014515,0.018567,0.000000,0.000000,0.010444
4,0.133429,0.128294,0.159453,1.000000,0.140149,0.144940,0.217501,0.110938,0.123381,0.130229,...,0.020224,0.012600,0.036667,0.000000,0.0,0.015056,0.019259,0.022177,0.000000,0.010833
5,0.148409,0.142697,0.135110,0.140149,1.000000,0.161213,0.184296,0.123393,0.137233,0.144850,...,0.022495,0.000000,0.022013,0.000000,0.0,0.016747,0.021422,0.000000,0.000000,0.012049
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.127226,0.058855,0.014515,0.015056,0.016747,0.038196,0.019799,0.000000,0.032515,0.034319,...,0.105245,0.043404,0.015620,0.047729,0.0,1.000000,0.228533,0.144810,0.059893,0.008550
193583,0.162742,0.075285,0.018567,0.019259,0.021422,0.000000,0.025326,0.000000,0.000000,0.000000,...,0.096411,0.055520,0.019981,0.061053,0.0,0.228533,1.000000,0.185235,0.050978,0.010937
193585,0.000000,0.000000,0.000000,0.022177,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.017178,0.023008,0.000000,0.0,0.144810,0.185235,1.000000,0.000000,0.000000
193587,0.053469,0.000000,0.000000,0.000000,0.000000,0.034418,0.000000,0.000000,0.029299,0.030925,...,0.080452,0.039111,0.000000,0.043008,0.0,0.059893,0.050978,0.000000,1.000000,0.000000


In [42]:
# Movies most similar to one specific movie (the first row of the similarity dataframe).
# Position 0 of the descending ranking is skipped; it is expected to be the movie's
# similarity with itself.
(
    df_tfidf_cos
    .iloc[0]
    .sort_values(ascending=False)
    .iloc[1:11]
)

3114     0.880446
78499    0.821047
4929     0.538018
3400     0.456423
27       0.421651
5843     0.400561
2161     0.385685
45074    0.365351
2        0.357970
13       0.357436
Name: 1, dtype: float64

In [91]:
# Show the row of the movie with a given movieId
df_movies.loc[df_movies['movieId'] == 3400]

Unnamed: 0,movieId,title,genres,content
2539,3400,We're Back! A Dinosaur's Story (1993),Adventure Animation Children Fantasy,We're Back! A Dinosaur's Story 1993 Adventur...


In [43]:
# Build the user profile.
# The goal is to recommend movies to a user based on the movies they have already rated,
# using the ratings dataframe loaded earlier.
# The id of the user to analyze is the value compared against userId below.
df_user_ratings = df_ratings.loc[df_ratings['userId'] == 100]

In [44]:
# Join the movies dataframe with the selected user's ratings on movieId.
# reset_index() keeps each movie's original row index in an "index" column, so the result
# holds the movie data, its row index and the rating the user gave it.
df_user_movies = pd.merge(df_movies.reset_index(), df_user_ratings, on='movieId')

In [45]:
# Inspect the movies rated by the selected user
df_user_movies

Unnamed: 0,index,movieId,title,genres,content,userId,rating,timestamp
0,2,3,Grumpier Old Men (1995),Comedy Romance,Grumpier Old Men 1995 Comedy Romance,100,3.5,1100183804
1,10,11,"American President, The (1995)",Comedy Drama Romance,"American President, The 1995 Comedy Drama Ro...",100,4.0,1100184041
2,15,16,Casino (1995),Crime Drama,Casino 1995 Crime Drama,100,4.5,1100185959
3,16,17,Sense and Sensibility (1995),Drama Romance,Sense and Sensibility 1995 Drama Romance,100,4.5,1100184147
4,18,19,Ace Ventura: When Nature Calls (1995),Comedy,Ace Ventura: When Nature Calls 1995 Comedy,100,1.0,1100183757
...,...,...,...,...,...,...,...,...
143,4566,6785,Seven Brides for Seven Brothers (1954),Comedy Musical Romance Western,Seven Brides for Seven Brothers 1954 Comedy ...,100,4.5,1100184338
144,4614,6873,Intolerable Cruelty (2003),Comedy Romance,Intolerable Cruelty 2003 Comedy Romance,100,4.0,1100186781
145,4797,7149,Something's Gotta Give (2003),Comedy Drama Romance,Something's Gotta Give 2003 Comedy Drama Rom...,100,3.5,1100184159
146,5223,8529,"Terminal, The (2004)",Comedy Drama Romance,"Terminal, The 2004 Comedy Drama Romance",100,4.0,1100184289


In [46]:
# Add a weight column: the rating divided by 5
df_user_movies['weight'] = df_user_movies['rating'].div(5.0)

In [47]:
# Inspect the user's movies again, now including the weight column
df_user_movies

Unnamed: 0,index,movieId,title,genres,content,userId,rating,timestamp,weight
0,2,3,Grumpier Old Men (1995),Comedy Romance,Grumpier Old Men 1995 Comedy Romance,100,3.5,1100183804,0.7
1,10,11,"American President, The (1995)",Comedy Drama Romance,"American President, The 1995 Comedy Drama Ro...",100,4.0,1100184041,0.8
2,15,16,Casino (1995),Crime Drama,Casino 1995 Crime Drama,100,4.5,1100185959,0.9
3,16,17,Sense and Sensibility (1995),Drama Romance,Sense and Sensibility 1995 Drama Romance,100,4.5,1100184147,0.9
4,18,19,Ace Ventura: When Nature Calls (1995),Comedy,Ace Ventura: When Nature Calls 1995 Comedy,100,1.0,1100183757,0.2
...,...,...,...,...,...,...,...,...,...
143,4566,6785,Seven Brides for Seven Brothers (1954),Comedy Musical Romance Western,Seven Brides for Seven Brothers 1954 Comedy ...,100,4.5,1100184338,0.9
144,4614,6873,Intolerable Cruelty (2003),Comedy Romance,Intolerable Cruelty 2003 Comedy Romance,100,4.0,1100186781,0.8
145,4797,7149,Something's Gotta Give (2003),Comedy Drama Romance,Something's Gotta Give 2003 Comedy Drama Rom...,100,3.5,1100184159,0.7
146,5223,8529,"Terminal, The (2004)",Comedy Drama Romance,"Terminal, The 2004 Comedy Drama Romance",100,4.0,1100184289,0.8


In [50]:
# User profile.
# Weighted sum of the TF-IDF vectors of the movies the user rated: the rows for those movies
# are selected, transposed to (terms x movies), and multiplied by the per-movie weights.
rated_rows = df_user_movies['index'].values
rating_weights = df_user_movies['weight'].values
user_profile = df_movies_content_tfidf[rated_rows].toarray().T.dot(rating_weights)

In [95]:
#user_profile

In [69]:
# Raise numpy's summarization threshold so arrays are printed in full instead of being
# abbreviated with "...". sys is imported in the imports cell at the top of the notebook.
# NOTE: this is a global setting and affects every array displayed after this cell.
np.set_printoptions(threshold=sys.maxsize)

user_profile.shape

(9060,)

In [101]:
# Check the container type of the user profile
type(user_profile)

numpy.ndarray

In [73]:
# Cosine similarity between the user profile and the TF-IDF vector of every movie.
# The 1-D profile is given a leading axis so it is passed as a single-row matrix.
C = cosine_similarity(user_profile[np.newaxis, :], df_movies_content_tfidf)

In [107]:
#C

In [74]:
# Rank the movies by similarity: argsort gives the row indices in ascending order of
# similarity, and flipping each row left-to-right turns that into descending order.
# Only the indices are kept, not the similarity values.
R = np.fliplr(np.argsort(C))

In [108]:
# Check the shape of the ranking array
R.shape

(1, 9742)

In [75]:
# Keep the ranked movie row indices, leaving out the movies the user has already rated.
# The rated row indices are collected into a set once, instead of re-reading the dataframe
# column and scanning the whole array on every iteration of the comprehension.
seen_rows = set(df_user_movies['index'].values)
recommendations = [i for i in R[0] if i not in seen_rows]

In [81]:
# Titles of the ten best-ranked recommendations.
# The recommendations are row positions produced by argsort, so they are looked up
# positionally with .iloc, and only the first ten are selected instead of materializing
# the whole ranking and then taking its head.
df_movies['title'].iloc[recommendations[:10]]

2178          Romance (1999)
4544    Anything Else (2003)
250          Only You (1994)
216              I.Q. (1994)
529          Two Much (1995)
632     She's the One (1996)
8875           5 to 7 (2014)
8253          What If (2013)
5352             P.S. (2004)
9299        All Yours (2016)
Name: title, dtype: object