In [1]:
import requests
import pandas as pd
import numpy as np
import seaborn as sns
from bs4 import BeautifulSoup

try:
    import surprise
except ModuleNotFoundError:
    !pip install scikit-surprise
    import surprise
try:
    import scipy as sp
except ModuleNotFoundError:
    !pip install scipy
    import scipy as sp

from sklearn.metrics.pairwise import cosine_similarity



In [42]:
class Procesos:
    def __init__(self):
        self.cargaDocumentos()
    def cargaDocumentos(self):
        self.df_links = pd.read_csv('csv/links.csv')
        self.df_links = self.df_links.dropna()
        self.df_movies = pd.read_csv('csv/movies.csv')
        self.df_movies = self.df_movies.dropna()
        self.df_ratings = pd.read_csv('csv/ratings.csv')
        self.df_ratings = self.df_ratings.dropna()
        self.df_tags = pd.read_csv('csv/tags.csv')
        self.df_tags = self.df_tags.dropna()
        self.df_movies_ratings = self.df_ratings.merge(self.df_movies)[['userId','movieId','title', 'rating','genres']]
        
        self.df_movies_ratings_tags = pd.merge(self.df_movies_ratings, self.df_tags, how='outer')[['userId','movieId','title', 'rating','genres', 'tag']]
        self.df_movies_ratings_tags["tag"] = self.df_movies_ratings_tags["tag"].str.lower()
        #self.df_movies_ratings_tags.fillna("vacio", inplace = True)
        
        self.ratings_table = self.df_movies_ratings.pivot_table(index='userId', columns='title', values='rating')
        #para cambiar los NAN por 0:
        self.ratings_table.fillna(0, inplace=True)
    def recomedacionPorValoracionOtrosUsuarios(self, nombrePelicula, n_similares):
        sparse_rating = sp.sparse.csr_matrix(self.ratings_table)
        similitud_usuarios = cosine_similarity(sparse_rating)
        #se hace con la transpuesta de la matriz creada anteriormente
        similitud_movies = cosine_similarity(sparse_rating.T)
        df_similitud_usuarios = pd.DataFrame(similitud_usuarios, index=self.ratings_table.index, columns=self.ratings_table.index)
        df_similitud_movies = pd.DataFrame(similitud_movies, index=self.ratings_table.columns, columns=self.ratings_table.columns)
        #le sumamos uno a n_similares porque la primera siempre es la propia pelicula y nos la saltamos
        n_similares+=1
        contador = 1
        print('Peliculas similares a ' + nombrePelicula + ':')
        print()
        for movie in df_similitud_movies[nombrePelicula].sort_values(ascending=False).index[1:n_similares]:
            print(str(contador) + ' - ' + str(movie))
            contador+=1
    def recomedacionPorGenero(self, nombrePelicula, n_similares):
        genres = list(set([genre for genres in self.df_movies["genres"].str.split("|") for genre in genres]))
        genre_matrix = []
        for index, row in self.df_movies.iterrows():
            genre_list = row["genres"].split("|")
            genre_vector = [1 if genre in genre_list else 0 for genre in genres]
            genre_matrix.append(genre_vector)
        genre_matrix = pd.DataFrame(genre_matrix, columns=genres)
        contador = 1
        selected_movie = self.df_movies[self.df_movies["title"] == nombrePelicula]
        selected_movie_index = selected_movie.index[0]
        #sacamos las similitudes de los generos
        similarities = cosine_similarity(genre_matrix[selected_movie_index:selected_movie_index+1], genre_matrix).flatten()
        #las metemos en una tupla y las ordenamos de mayor a menor 
        movie_list = [(index, similarity) for index, similarity in enumerate(similarities)]
        movie_list.sort(key=lambda x: x[1], reverse=True)

        print('Peliculas similares a ' + nombrePelicula + ':')
        print()
        #la bandera nos sirve para saltarnos la propia peli que buscamos
        #siempre esta a false y si nos encontramos la peli que estamos buscando la activamos a True
        #si esta en True al finalizar el bucle significa que ha saltado el titulo que buscabamos para no repetirse a si mismo 
        #y por lo tanto hay que añadir uno mas para llegar al numero deseado por el usuario
        bandera=False
        for movie in movie_list[0:n_similares]:
            if(nombrePelicula != self.df_movies.iloc[movie[0]]["title"]):
                print(str(contador)+' - ' +self.df_movies.iloc[movie[0]]["title"])
                contador+=1
            else:
                bandera=True
        if(bandera):
            #print('bandera')
            mov=movie_list[n_similares][0]
            print(str(contador)+' - ' +self.df_movies.iloc[mov]["title"])
    def predecirRatingDeUserAPeliculaPorSusGeneros(self, nombrePelicula, user_id):
        yaVotado = self.df_movies_ratings[(self.df_movies_ratings['title']==nombrePelicula) & (self.df_movies_ratings['userId']==user_id)]["rating"].unique()
        if(len(yaVotado)!=0):
            prediction = yaVotado[0]
            print()
            print("La prediccion para " + nombrePelicula+" es: " + str(prediction))
            #return prediction
        else:
            # obtener géneros de la película a predecir
            movie_genres = self.df_movies_ratings[self.df_movies_ratings['title']==nombrePelicula]["genres"].unique()
            generosPeli = movie_genres[0].split("|")
            # filtrar valoraciones del usuario para peliculas con generos en comun
            user_ratings_ID = self.df_movies_ratings[self.df_movies_ratings['userId'] == user_id]
            user_ratings = user_ratings_ID.loc[user_ratings_ID['genres'].str.split('|').apply(lambda x: any(i in x for i in generosPeli))]
            # calcular la media de valoraciones del usuario para las peliculas con generos en comun
            if user_ratings.empty:
                print("La lista es empty")
                #return None
            else:
                #prediction = user_ratings_ID['rating'].mean()
                prediction = format(user_ratings['rating'].mean(), '.3f')
                print()
                print("La prediccion para " + nombrePelicula + " es: " + str(prediction))
                #return prediction
    def recomedacionPorTags(self, nombrePelicula, n_similares):
        count_matrix = self.df_movies_ratings_tags.pivot_table(index='movieId', columns='tag', values='userId')
        #count_matrix = self.df_movies_ratings_tags.pivot_table(index='movieId', columns='tag', values='rating')
        count_matrix.fillna(0, inplace=True)
        sparse_rating = sp.sparse.csr_matrix(count_matrix)
        #print(sparse_rating)    
        selected_movie = self.df_movies[self.df_movies["title"] == nombrePelicula]["movieId"].values[0]
        #print(selected_movie)

        #encontramos el id de la pelicula en la matriz
        selected_movie_index = count_matrix.index.get_loc(selected_movie)

        similarities = cosine_similarity(sparse_rating, sparse_rating[selected_movie_index])

        movie_list = [(index, similarity) for index, similarity in enumerate(similarities)]
        movie_list.sort(key=lambda x: x[1], reverse=True)
        
        print('Peliculas similares a ' + nombrePelicula + ':')
        print()
        bandera=False
        contador = 1
        for movie in movie_list[0:n_similares]:
            if(nombrePelicula != self.df_movies.iloc[movie[0]]["title"]):
                print(str(contador)+' - ' +self.df_movies.iloc[movie[0]]["title"])
                contador+=1
            else:
                bandera=True
        if(bandera):
            mov=movie_list[n_similares][0]
            print(str(contador)+' - ' +self.df_movies.iloc[mov]["title"])
        

            

p = Procesos()

#Extra
p.recomedacionPorValoracionOtrosUsuarios("Star Wars: Episode IV - A New Hope (1977)",10)

#Recomendaciones en base a caracteristicas sueltas
p.recomedacionPorGenero("Star Wars: Episode IV - A New Hope (1977)",10)
p.recomedacionPorTags("Star Wars: Episode IV - A New Hope (1977)", 10)

#Prediccion de rating mediante un usuario dado
p.predecirRatingDeUserAPeliculaPorSusGeneros("Star Wars: Episode IV - A New Hope (1977)",1)

#Recomendacion en base a un usuario y caracteristicas


Peliculas similares a Star Wars: Episode IV - A New Hope (1977):

1 - Star Wars: Episode V - The Empire Strikes Back (1980)
2 - Star Wars: Episode VI - Return of the Jedi (1983)
3 - Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
4 - Matrix, The (1999)
5 - Indiana Jones and the Last Crusade (1989)
6 - Back to the Future (1985)
7 - Star Wars: Episode I - The Phantom Menace (1999)
8 - Terminator, The (1984)
9 - Godfather, The (1972)
10 - Saving Private Ryan (1998)
Peliculas similares a Star Wars: Episode IV - A New Hope (1977):

1 - Waterworld (1995)
2 - Stargate (1994)
3 - Demolition Man (1993)
4 - Star Wars: Episode V - The Empire Strikes Back (1980)
5 - Star Wars: Episode VI - Return of the Jedi (1983)
6 - Star Trek III: The Search for Spock (1984)
7 - Lost in Space (1998)
8 - Rocketeer, The (1991)
9 - Tron (1982)
10 - Six-String Samurai (1998)
Peliculas similares a Star Wars: Episode IV - A New Hope (1977):

1 - Misérables, Les (1995)
2 - Double Happine

In [12]:
prueba = Procesos()
prueba.df_movies_ratings_tags

Unnamed: 0,userId,movieId,title,rating,genres,tag
0,1,1,Toy Story (1995),4.0,Adventure|Animation|Children|Comedy|Fantasy,
1,5,1,Toy Story (1995),4.0,Adventure|Animation|Children|Comedy|Fantasy,
2,7,1,Toy Story (1995),4.5,Adventure|Animation|Children|Comedy|Fantasy,
3,15,1,Toy Story (1995),2.5,Adventure|Animation|Children|Comedy|Fantasy,
4,17,1,Toy Story (1995),4.5,Adventure|Animation|Children|Comedy|Fantasy,
...,...,...,...,...,...,...
102879,573,6016,,,,not seen
102880,573,6157,,,,bad
102881,573,6157,,,,ben affleck
102882,600,273,,,,gothic


In [76]:
def predecirRatingDeUserAPeliculaPorSusTags(nombrePelicula, user_id):
    yaVotado = prueba.df_movies_ratings[(prueba.df_movies_ratings['title']==nombrePelicula) & (prueba.df_movies_ratings['userId']==user_id)]["rating"].unique()
    if(len(yaVotado)!=0):
        prediction = yaVotado[0]
        print()
        print("La prediccion para " + nombrePelicula+" es: " + str(prediction))
        #return prediction
    else:
        # obtener tags de la película a predecir
        tagsPeli = []
        movie_tags = prueba.df_movies_ratings_tags[prueba.df_movies_ratings_tags['title']==nombrePelicula]["tag"].unique()
        for m in movie_tags:
            tagsPeli.append(m)
        #print(tagsPeli)
        filtroEnBaseMovieId=prueba.df_movies_ratings[prueba.df_movies_ratings['userId']==user_id]
        filtroMergeandoTags=filtroEnBaseMovieId.merge(prueba.df_tags)[['userId','movieId','title', 'rating', 'tag']]
        user_ratings = filtroMergeandoTags[filtroMergeandoTags['tag'].isin(tagsPeli)]
        # calcular la media de valoraciones del usuario para las peliculas con generos en comun
        if user_ratings.empty:
            print("La lista es empty")
            #return None
        else:
            #prediction = user_ratings_ID['rating'].mean()
            prediction = format(user_ratings['rating'].mean(), '.3f')
            print()
            print("La prediccion para " + nombrePelicula + " es: " + str(prediction))
            #return prediction
predecirRatingDeUserAPeliculaPorSusTags("City of God (Cidade de Deus) (2002)", 1)

La lista es empty


In [93]:
filtroMergeandoTags=prueba.df_movies_ratings_tags[['userId','movieId','title', 'rating', 'tag']]

Unnamed: 0,userId,movieId,title,rating,tag
0,1,1,Toy Story (1995),4.0,
1,5,1,Toy Story (1995),4.0,
2,7,1,Toy Story (1995),4.5,
3,15,1,Toy Story (1995),2.5,
4,17,1,Toy Story (1995),4.5,
...,...,...,...,...,...
102879,573,6016,,,not seen
102880,573,6157,,,bad
102881,573,6157,,,ben affleck
102882,600,273,,,gothic


In [97]:
filtroEnBaseMovieId=prueba.df_movies_ratings_tags[prueba.df_movies_ratings_tags['userId']==472]
filtroEnBaseMovieId = filtroEnBaseMovieId.sort_values(by='movieId', ascending=True)
filtroEnBaseMovieId

Unnamed: 0,userId,movieId,title,rating,genres,tag
742,472,50,"Usual Suspects, The (1995)",5.0,Crime|Mystery|Thriller,
29210,472,135,Down Periscope (1996),3.5,Comedy,
16953,472,318,"Shawshank Redemption, The (1994)",5.0,Crime|Drama,
33964,472,765,Jack (1996),4.0,Comedy|Drama,
46346,472,858,"Godfather, The (1972)",5.0,Crime|Drama,
49865,472,1193,One Flew Over the Cuckoo's Nest (1975),4.5,Drama,
51609,472,1221,"Godfather: Part II, The (1974)",5.0,Crime|Drama,
44735,472,1693,Amistad (1997),4.0,Drama|Mystery,
22602,472,1892,"Perfect Murder, A (1998)",4.0,Thriller,
62123,472,2134,Weird Science (1985),3.5,Comedy|Fantasy|Sci-Fi,


In [66]:
prueba.df_tags[prueba.df_tags["userId"]==474]

Unnamed: 0,userId,movieId,tag,timestamp
981,474,1,pixar,1137206825
982,474,2,game,1137375552
983,474,5,pregnancy,1137373903
984,474,5,remake,1137373903
985,474,7,remake,1137375642
...,...,...,...,...
2483,474,40819,Johnny Cash,1137200595
2484,474,41566,C.S. Lewis,1137181617
2485,474,41997,In Netflix queue,1137179603
2486,474,42002,In Netflix queue,1137202150


In [64]:
tagsPeli = []
movie_tags = prueba.df_movies_ratings_tags[prueba.df_movies_ratings_tags['title']=="City of God (Cidade de Deus) (2002)"]["tag"].unique()
for m in movie_tags:
    tagsPeli.append(m)
#print(tagsPeli)
filtroEnBaseMovieId=prueba.df_movies_ratings[prueba.df_movies_ratings['userId']==474]
filtroMergeandoTags=filtroEnBaseMovieId.merge(prueba.df_tags)[['userId','movieId','title', 'rating', 'tag']]
user_ratings = filtroMergeandoTags[filtroMergeandoTags['tag'].isin(tagsPeli)]
user_ratings


Unnamed: 0,userId,movieId,title,rating,tag
3,474,101,Bottle Rocket (1996),3.5,crime
50,474,1089,Reservoir Dogs (1992),4.0,violence
123,474,2959,Fight Club (1999),4.0,violence
198,474,1179,"Grifters, The (1990)",3.5,crime
212,474,1304,Butch Cassidy and the Sundance Kid (1969),3.5,crime
373,474,5989,Catch Me If You Can (2002),4.0,crime
486,474,6016,City of God (Cidade de Deus) (2002),4.0,crime
487,474,6016,City of God (Cidade de Deus) (2002),4.0,photography
488,474,6016,City of God (Cidade de Deus) (2002),4.0,violence
514,474,5266,Panic Room (2002),4.0,crime


In [99]:
result = prueba.df_movies_ratings.join(prueba.df_tags, on=['userId', 'movieId'], how='left')
result

ValueError: len(left_on) must equal the number of levels in the index of "right"