In [1]:
import requests
import pandas as pd
import numpy as np
import seaborn as sns
from bs4 import BeautifulSoup

try:
    import surprise
except ModuleNotFoundError:
    !pip install scikit-surprise
    import surprise
try:
    import scipy as sp
except ModuleNotFoundError:
    !pip install scipy
    import scipy as sp

from sklearn.metrics.pairwise import cosine_similarity



In [91]:
class Procesos:
    def __init__(self):
        self.cargaDocumentos()
    def cargaDocumentos(self):
        self.df_links = pd.read_csv('csv/links.csv')
        self.df_links = self.df_links.dropna()
        self.df_movies = pd.read_csv('csv/movies.csv')
        self.df_movies = self.df_movies.dropna()
        self.df_ratings = pd.read_csv('csv/ratings.csv')
        self.df_ratings = self.df_ratings.dropna()
        self.df_tags = pd.read_csv('csv/tags.csv')
        self.df_tags = self.df_tags.dropna()
        self.df_movies_ratings = self.df_ratings.merge(self.df_movies)[['userId','movieId','title', 'rating','genres']]
        
        self.df_movies_ratings_tags = pd.merge(self.df_movies_ratings, self.df_tags, how='left')[['userId','movieId','title', 'rating','genres', 'tag']]
        #self.df_movies_ratings_tags.fillna("vacio", inplace = True)
        
        self.ratings_table = self.df_movies_ratings.pivot_table(index='userId', columns='title', values='rating')
        #para cambiar los NAN por 0:
        self.ratings_table.fillna(0, inplace=True)
    def recomedacionPorValoracionOtrosUsuarios(self, title, n_similares):
        sparse_rating = sp.sparse.csr_matrix(self.ratings_table)
        similitud_usuarios = cosine_similarity(sparse_rating)
        #se hace con la transpuesta de la matriz creada anteriormente
        similitud_movies = cosine_similarity(sparse_rating.T)
        df_similitud_usuarios = pd.DataFrame(similitud_usuarios, index=self.ratings_table.index, columns=self.ratings_table.index)
        df_similitud_movies = pd.DataFrame(similitud_movies, index=self.ratings_table.columns, columns=self.ratings_table.columns)
        #le sumamos uno a n_similares porque la primera siempre es la propia pelicula y nos la saltamos
        n_similares+=1
        contador = 1
        print('Peliculas similares a ' + title + ':')
        print()
        for movie in df_similitud_movies[title].sort_values(ascending=False).index[1:n_similares]:
            print(str(contador) + ' - ' + str(movie))
            contador+=1
    def recomedacionPorGenero(self, title, n_similares):
        genres = list(set([genre for genres in self.df_movies["genres"].str.split("|") for genre in genres]))
        genre_matrix = []
        for index, row in self.df_movies.iterrows():
            genre_list = row["genres"].split("|")
            genre_vector = [1 if genre in genre_list else 0 for genre in genres]
            genre_matrix.append(genre_vector)
        genre_matrix = pd.DataFrame(genre_matrix, columns=genres)
        contador = 1
        selected_movie = self.df_movies[self.df_movies["title"] == title]
        selected_movie_index = selected_movie.index[0]
        #sacamos las similitudes de los generos
        similarities = cosine_similarity(genre_matrix[selected_movie_index:selected_movie_index+1], genre_matrix).flatten()
        #las metemos en una tupla y las ordenamos de mayor a menor 
        movie_list = [(index, similarity) for index, similarity in enumerate(similarities)]
        movie_list.sort(key=lambda x: x[1], reverse=True)

        print('Peliculas similares a ' + title + ':')
        print()
        #la bandera nos sirve para saltarnos la propia peli que buscamos
        #siempre esta a false y si nos encontramos la peli que estamos buscando la activamos a True
        #si esta en True al finalizar el bucle significa que ha saltado el titulo que buscabamos para no repetirse a si mismo 
        #y por lo tanto hay que añadir uno mas para llegar al numero deseado por el usuario
        bandera=False
        for movie in movie_list[0:n_similares]:
            if(title != self.df_movies.iloc[movie[0]]["title"]):
                print(str(contador)+' - ' +self.df_movies.iloc[movie[0]]["title"])
                contador+=1
            else:
                bandera=True
        if(bandera):
            #print('bandera')
            mov=movie_list[n_similares][0]
            print(str(contador)+' - ' +self.df_movies.iloc[mov]["title"])
    def predecirRatingDeUserAPeliculaPorSusGeneros(self, nombrePelicula, user_id):
        yaVotado = self.df_movies_ratings[(self.df_movies_ratings['title']==nombrePelicula) & (self.df_movies_ratings['userId']==user_id)]["rating"].unique()
        if(len(yaVotado)!=0):
            prediction = yaVotado[0]
            print()
            print("La prediccion para " + nombrePelicula+" es: " + str(prediction))
            #return prediction
        else:
            # obtener géneros de la película a predecir
            movie_genres = self.df_movies_ratings[self.df_movies_ratings['title']==nombrePelicula]["genres"].unique()
            generosPeli = movie_genres[0].split("|")
            # filtrar valoraciones del usuario para peliculas con generos en comun
            user_ratings_ID = self.df_movies_ratings[self.df_movies_ratings['userId'] == user_id]
            user_ratings = user_ratings_ID.loc[user_ratings_ID['genres'].str.split('|').apply(lambda x: any(i in x for i in generosPeli))]
            # calcular la media de valoraciones del usuario para las peliculas con generos en comun
            if user_ratings.empty:
                print("La lista es empty")
                #return None
            else:
                #prediction = user_ratings_ID['rating'].mean()
                prediction = format(user_ratings['rating'].mean(), '.3f')
                print()
                print("La prediccion para " + nombrePelicula + " es: " + str(prediction))
                #return prediction
            

p = Procesos()
p.recomedacionPorValoracionOtrosUsuarios("Star Wars: Episode IV - A New Hope (1977)",10)
p.recomedacionPorGenero("Star Wars: Episode IV - A New Hope (1977)",10)
p.predecirRatingDeUserAPeliculaPorSusGeneros("Star Wars: Episode IV - A New Hope (1977)",1)

Peliculas similares a Star Wars: Episode IV - A New Hope (1977):

1 - Star Wars: Episode V - The Empire Strikes Back (1980)
2 - Star Wars: Episode VI - Return of the Jedi (1983)
3 - Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
4 - Matrix, The (1999)
5 - Indiana Jones and the Last Crusade (1989)
6 - Back to the Future (1985)
7 - Star Wars: Episode I - The Phantom Menace (1999)
8 - Terminator, The (1984)
9 - Godfather, The (1972)
10 - Saving Private Ryan (1998)
      Adventure  Drama  Musical  War  Sci-Fi  Children  Thriller  Documentary  \
0             1      0        0    0       0         1         0            0   
1             1      0        0    0       0         1         0            0   
2             0      0        0    0       0         0         0            0   
3             0      1        0    0       0         0         0            0   
4             0      0        0    0       0         0         0            0   
...         ... 

In [40]:
prueba = Procesos()
prueba.df_movies_ratings

Unnamed: 0,userId,movieId,title,rating,genres
0,1,1,Toy Story (1995),4.0,Adventure|Animation|Children|Comedy|Fantasy
1,5,1,Toy Story (1995),4.0,Adventure|Animation|Children|Comedy|Fantasy
2,7,1,Toy Story (1995),4.5,Adventure|Animation|Children|Comedy|Fantasy
3,15,1,Toy Story (1995),2.5,Adventure|Animation|Children|Comedy|Fantasy
4,17,1,Toy Story (1995),4.5,Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...
100831,610,160341,Bloodmoon (1997),2.5,Action|Thriller
100832,610,160527,Sympathy for the Underdog (1971),4.5,Action|Crime|Drama
100833,610,160836,Hazard (2005),3.0,Action|Drama|Thriller
100834,610,163937,Blair Witch (2016),3.5,Horror|Thriller


In [41]:
prueba.df_movies_ratings['rating'].unique()

array([4. , 4.5, 2.5, 3.5, 3. , 5. , 0.5, 2. , 1.5, 1. ])

In [None]:
from scipy.sparse import csr_matrix
count_matrix = csr_matrix(count_matrix)

from scipy.sparse.linalg import cosmul
cosine_sim = cosmul(count_matrix, count_matrix)




KeyError: "None of [Index([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,\n       ...\n       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],\n      dtype='object', length=102677)] are in the [columns]"

In [168]:
prueba.df_movies_ratings_tags[prueba.df_movies_ratings_tags['tag']=='']

KeyError: "None of [Index([       nan,        nan,        nan,        nan,        nan,        nan,\n              nan,        nan,        nan,        nan,        nan,        nan,\n              nan,        nan,        nan,        nan,        nan,        nan,\n              nan,        nan,        nan,        nan,        nan,        nan,\n              nan,        nan,        nan,        nan,        nan,        nan,\n              nan,        nan,        nan,        nan,        nan,        nan,\n              nan,        nan,        nan,        nan,        nan,        nan,\n              nan,        nan,        nan,        nan,        nan,        nan,\n              nan,        nan,        nan,        nan,        nan, 'Scotland',\n              nan,        nan,        nan,        nan,        nan,        nan],\n      dtype='object')] are in the [columns]"

In [153]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import pandas as pd
import numpy as np
#from scipy.sparse.linalg import inv


def recomendarTags(movie_title, user_id, n_similares):
    # Crea una matriz de conteo de tags
    #tags = prueba.df_movies_ratings_tags['tag']
    #count_vectorizer = CountVectorizer()
    #tag_matrix = count_vectorizer.fit_transform(prueba.df_movies_ratings_tags['tag'].fillna(""))
    #tag_matrix = csr_matrix(tag_matrix)
    tagges = list(prueba.df_movies_ratings_tags["tag"].unique())
    #print(tagges)
    count_matrix = prueba.df_movies_ratings_tags.pivot_table(index='userId', columns='tag', values='movieId')
    #####count_matrix = prueba.df_movies_ratings_tags.pivot_table(index='movieId', columns='tag', values='rating')
    count_matrix.fillna(0, inplace=True)
    print(count_matrix)
    
    sparse_rating = sp.sparse.csr_matrix(count_matrix)
    #cosine_sim = cosine_similarity(sparse_rating)
    print(sparse_rating)
    
    selected_movie = prueba.df_movies[prueba.df_movies["title"] == movie_title]
    selected_movie_index = selected_movie.index[0]
    
    genre_matrix = pd.DataFrame(sparse_rating, columns=tagges)
    

    similarities = cosine_similarity(genre_matrix[selected_movie_index:selected_movie_index+1], genre_matrix).flatten()
    #las metemos en una tupla y las ordenamos de mayor a menor 
    movie_list = [(index, similarity) for index, similarity in enumerate(similarities)]
    movie_list.sort(key=lambda x: x[1], reverse=True)
    print(movie_list)
    #print(cosine_sim)
    
    #count_matrix = count.fit_transform(tags)
    # Calcula la similitud coseno entre las películas
    #cosine_sim = cosine_similarity(tag_matrix)
    #######cosine_sim = cosine_similarity(tag_matrix, tag_matrix)
    # Obtiene el índice de la película dada
    #movie_index = prueba.df_movies_ratings_tags[prueba.df_movies_ratings_tags['title'] == movie_title].index[0]
    # Ordena las similitudes en orden descendente
    # Selecciona las películas recomendadas
    print('Peliculas similares a ' + movie_title + ':')
    print()
    #la bandera nos sirve para saltarnos la propia peli que buscamos
    #siempre esta a false y si nos encontramos la peli que estamos buscando la activamos a True
    #si esta en True al finalizar el bucle significa que ha saltado el titulo que buscabamos para no repetirse a si mismo 
    #y por lo tanto hay que añadir uno mas para llegar al numero deseado por el usuario
    bandera=False
    contador = 1
    for movie in movie_list[0:n_similares]:
        if(movie_title != prueba.df_movies.iloc[movie[0]]["title"]):
            print(str(contador)+' - ' +prueba.df_movies.iloc[movie[0]]["title"])
            contador+=1
        else:
            bandera=True
    if(bandera):
        #print('bandera')
        mov=movie_list[n_similares][0]
        print(str(contador)+' - ' +prueba.df_movies.iloc[mov]["title"])

#Ejemplo de uso
recomendations = recomendarTags("Toy Story (1995)", 1, 10)

tag     "artsy"  06 Oscar Nominated Best Movie - Animation  1900s   1920s  \
userId                                                                      
2           0.0                                   0.000000    0.0     0.0   
7           0.0                                   0.000000    0.0     0.0   
18          0.0                                   0.000000    0.0     0.0   
21          0.0                                   0.000000    0.0     0.0   
49          0.0                                   0.000000    0.0     0.0   
62          0.0                                   0.000000    0.0     0.0   
63          0.0                                   0.000000    0.0     0.0   
76          0.0                                   0.000000    0.0     0.0   
103         0.0                                   0.000000    0.0     0.0   
106         0.0                                   0.000000    0.0     0.0   
112         0.0                                   0.000000    0.0     0.0   

ValueError: Shape of passed values is (54, 1), indices imply (54, 1544)

In [78]:
prueba.df_movies_ratings_tags[(prueba.df_movies_ratings_tags['tag']=='Al Pacino')]

Unnamed: 0,userId,movieId,title,rating,genres,tag
40917,62,2023,"Godfather: Part III, The (1990)",5.0,Crime|Drama|Mystery|Thriller,Al Pacino
51505,18,1221,"Godfather: Part II, The (1974)",5.0,Crime|Drama,Al Pacino
52404,18,431,Carlito's Way (1993),4.0,Crime|Drama,Al Pacino
54078,424,4262,Scarface (1983),3.5,Action|Crime|Drama,Al Pacino
67328,62,5388,Insomnia (2002),4.5,Action|Crime|Drama|Mystery|Thriller,Al Pacino


In [75]:
prueba.df_tags[prueba.df_tags['movieId']==8622]

Unnamed: 0,userId,movieId,tag,timestamp
2360,474,8622,politics,1138040083
2361,474,8622,terrorism,1138040083


In [171]:
def recommend_tags(movie_title, user_id):
    # Create a pivot table with the movie IDs as values and the user IDs and tags as indexes
    count_matrix = prueba.df_movies_ratings_tags.pivot_table(index='userId', columns='tag', values='movieId')
    # Replace any NaN values with 0
    count_matrix.fillna(0, inplace=True)
    # Create a sparse matrix from the pivot table
    sparse_rating = sp.sparse.csr_matrix(count_matrix)
    # Select the movie of interest using its title
    selected_movie = prueba.df_movies_ratings_tags[prueba.df_movies_ratings_tags["title"] == movie_title]
    # Get the index of the selected movie
    selected_movie_index = selected_movie.index[0]
    # Create a cosine similarity model using the sparse matrix
    cosine_sim = cosine_similarity(sparse_rating, sparse_rating[selected_movie_index])
    # Get the indices of the most similar movies
    similar_movie_indices = cosine_sim.argsort()[-2:-11:-1]
    # Create a DataFrame with the most similar movies and their titles
    similar_movies = pd.DataFrame(columns=['title'])
    for i in similar_movie_indices:
        similar_movies = similar_movies.append({'title': prueba.df_movies_ratings_tags.loc[i, 'title']}, ignore_index=True)
    return similar_movies


recomendations = recommend_tags("Toy Story (1995)", 1)
print(recomendations)

                                              title
0  0    Toy Story (1995)
Name: title, dtype: object
1  0    Toy Story (1995)
Name: title, dtype: object
2  0    Toy Story (1995)
Name: title, dtype: object
3  0    Toy Story (1995)
Name: title, dtype: object
4  0    Toy Story (1995)
Name: title, dtype: object
5  0    Toy Story (1995)
Name: title, dtype: object
6  0    Toy Story (1995)
Name: title, dtype: object
7  0    Toy Story (1995)
Name: title, dtype: object
8  0    Toy Story (1995)
Name: title, dtype: object


  similar_movies = similar_movies.append({'title': prueba.df_movies_ratings_tags.loc[i, 'title']}, ignore_index=True)
  similar_movies = similar_movies.append({'title': prueba.df_movies_ratings_tags.loc[i, 'title']}, ignore_index=True)
  similar_movies = similar_movies.append({'title': prueba.df_movies_ratings_tags.loc[i, 'title']}, ignore_index=True)
  similar_movies = similar_movies.append({'title': prueba.df_movies_ratings_tags.loc[i, 'title']}, ignore_index=True)
  similar_movies = similar_movies.append({'title': prueba.df_movies_ratings_tags.loc[i, 'title']}, ignore_index=True)
  similar_movies = similar_movies.append({'title': prueba.df_movies_ratings_tags.loc[i, 'title']}, ignore_index=True)
  similar_movies = similar_movies.append({'title': prueba.df_movies_ratings_tags.loc[i, 'title']}, ignore_index=True)
  similar_movies = similar_movies.append({'title': prueba.df_movies_ratings_tags.loc[i, 'title']}, ignore_index=True)
  similar_movies = similar_movies.append({'title': prueb

replicar codigo del coseno para los tags sacando en una lista todos los tags que hay para poder generar una matriz de ella

In [None]:
df_matriz_tags = df.pivot_table(index='movieId', columns='tag', values='timestamp', aggfunc='count').fillna(0)

# Calcular la similaridad coseno entre todas las películas
cosine_sim = cosine_similarity(df_matriz_tags)

# Función para recomendar películas en base a una película dada
def recommend_movies(movie_name, cosine_sim=cosine_sim):
    movie_index = df_matriz_tags.index[df_matriz_tags.index.str.contains(movie_name)].tolist()[0]
    similar_movies = sorted(list(enumerate(cosine_sim[movie_index])), key=lambda x:x[1], reverse=True)
    similar_movies = similar_movies[1:11]
    movie_indices = [i[0] for i in similar_movies]
    return df_matriz_tags.index[movie_indices]