<a href="https://colab.research.google.com/github/Geuens/curso_ai/blob/main/Copy_of_Recomendador_peliculas_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gradio
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import json
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
import pickle
# NLTK Stem y stopwords
nltk.download('stopwords')
nltk.download('punkt')

def explore_data(movies):
    """Plot an exploratory overview of the most-voted movies.

    Keeps only the rows whose ``vote_count`` is strictly above the 0.8
    quantile of that column and shows two matplotlib figures for them:
    a scatter plot of ``vote_average`` against ``vote_count`` and a
    horizontal bar chart of the six rows with the highest ``popularity``.

    Args:
        movies: DataFrame providing the ``vote_count``, ``vote_average``,
            ``popularity`` and ``title`` columns. It is not modified.

    Returns:
        None. Both figures are displayed through ``plt.show()``.
    """
    percentil = movies['vote_count'].quantile(0.8)

    # Filter first and copy only the surviving rows, so later changes to
    # d_movies can never touch the caller's frame.
    d_movies = movies.loc[movies['vote_count'] > percentil].copy()

    # How ratings are distributed against the number of votes.
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='vote_count', y='vote_average', data=d_movies)
    plt.title('Distribución de Promedio de Votos')
    plt.xlabel('Conteo')
    plt.ylabel('Promedio de Votos')
    plt.show()

    # The six most popular movies among the filtered rows.
    populares = d_movies.sort_values('popularity', ascending=False).head(6)

    plt.figure(figsize=(12, 4))
    plt.barh(populares['title'], populares['popularity'], align='center', color='skyblue')
    # barh draws the first row at the bottom; invert so the most popular is on top.
    plt.gca().invert_yaxis()
    plt.xlabel("Popularidad")
    plt.title("Películas Populares")
    plt.show()

# The Bayesian weighted rating (the formula used by IMDb) balances a movie's average rating against its number of votes.
# v is the number of votes for the movie (vote_count).
# m is the minimum-votes threshold; in our case the 0.8 quantile of vote_count (the `percentil` argument).
# R is the movie's average rating (vote_average).
# C is the mean rating across all movies: movies['vote_average'].mean()
def rating_ponderado(data, percentil, C):
    """Return the weighted rating ``v/(v+m) * R + m/(v+m) * C``.

    Args:
        data: Mapping-like object (a DataFrame row, a DataFrame or a dict)
            exposing ``'vote_count'`` (v) and ``'vote_average'`` (R).
        percentil: The vote-count threshold m.
        C: The rating the result is pulled towards when v is small
            relative to m.

    Returns:
        The weighted rating, of whatever numeric type the arithmetic on
        the inputs produces.
    """
    votes = data['vote_count']
    average = data['vote_average']
    total = votes + percentil

    own_weight = votes / total
    prior_weight = percentil / total
    return own_weight * average + prior_weight * C

# Mostramos las primeras 10 con la columnas
#print(movies[['title','score', 'vote_count', 'vote_average']].head(10))

# print(movies['overview'].head(5))

# Stemmer and stop-word set shared by every preprocess() call; filled lazily
# by _stemming_resources() so they are built once instead of once per text.
_PREPROCESS_RESOURCES = {}


def _stemming_resources():
    """Return the shared ``(stemmer, stop_words)`` pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        # Build everything before publishing it, so a failure while reading
        # the stop-word corpus cannot leave a half-filled cache behind.
        stop_words = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES.update(stemmer=PorterStemmer(), stop_words=stop_words)
    return _PREPROCESS_RESOURCES['stemmer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess(text):
    """Tokenize ``text``, drop stop words and non-alphabetic tokens, and stem the rest.

    Stemming may not be strictly necessary, but it reduces dimensionality
    and acts as a form of normalization.

    Args:
        text: The string to process.

    Returns:
        The stemmed tokens that survived the filter, joined by single
        spaces (an empty string when none survive).
    """
    stemmer, stop_words = _stemming_resources()

    words = nltk.word_tokenize(text)

    # Stop-word membership is tested on the lower-cased token.
    return ' '.join(
        stemmer.stem(word)
        for word in words
        if word.isalpha() and word.lower() not in stop_words
    )

def _names_from_json(column):
    """Turn a column of JSON-encoded lists of dicts into comma-joined ``name`` strings."""
    return column.apply(
        lambda raw: ",".join(entry['name'] for entry in json.loads(raw))
    )


def load_data(data_dir='drive/MyDrive/Colab Notebooks/data/TMDB_5000_Movie'):
    """Load the movie CSV files and build the DataFrame used by the recommenders.

    Reads ``tmdb_5000_credits.csv``, ``tmdb_5000_movies.csv`` and
    ``links.csv`` from ``data_dir``, merges them on the movie id and derives
    the text and score columns.

    Args:
        data_dir: Directory containing the three CSV files. Defaults to the
            Google Drive location the notebook was written for.

    Returns:
        A DataFrame with one row per merged movie in which:
          * ``overview`` has been passed through ``preprocess`` (missing
            values are treated as empty strings);
          * ``cast``, ``genres`` and ``keywords`` hold comma-joined names
            instead of JSON text;
          * ``combined_text`` is overview, cast, genres and keywords joined
            by single spaces;
          * ``score`` is the weighted rating from ``rating_ponderado``.
        The ``crew`` column is dropped. Rows are NOT sorted by score.
    """
    import os  # function-scope import: only needed to build the file paths

    credits_df = pd.read_csv(os.path.join(data_dir, 'tmdb_5000_credits.csv'))
    movies = pd.read_csv(os.path.join(data_dir, 'tmdb_5000_movies.csv'))

    # links.csv is used as the id-equivalence table: it is joined through its
    # tmdbId column and its remaining columns are kept on the movie rows.
    links = pd.read_csv(os.path.join(data_dir, 'links.csv')).drop('movieId', axis=1)

    movies = movies.merge(links, left_on='id', right_on='tmdbId', how='left')
    movies = movies.drop('tmdbId', axis=1)

    # Rename the credits columns positionally so the movie id column is
    # called 'id' like in `movies`; its title column would duplicate ours.
    credits_df.columns = ['id', 'title', 'cast', 'crew']
    credits_df = credits_df.drop(['title'], axis=1)
    movies = movies.merge(credits_df, on='id')

    # Missing overviews become empty strings before preprocessing each one.
    movies['overview'] = movies['overview'].fillna('').apply(preprocess)

    # crew is not used by the recommender; dropping it lightens the frame.
    movies = movies.drop('crew', axis=1)

    # Replace the JSON text of each column by the comma-joined "name" values.
    for column in ('cast', 'genres', 'keywords'):
        movies[column] = _names_from_json(movies[column])

    # Single text field that the content-based recommender vectorises.
    movies['combined_text'] = (
        movies['overview'] + ' ' + movies['cast'] + ' '
        + movies['genres'] + ' ' + movies['keywords']
    )

    # Weighted rating: C is the mean rating over all movies and the 0.8
    # quantile of vote_count is the vote threshold. rating_ponderado is plain
    # arithmetic, so it is applied to whole columns at once.
    C = movies['vote_average'].mean()
    percentil = movies['vote_count'].quantile(0.8)
    movies['score'] = rating_ponderado(movies, percentil, C)

    return movies

def simple_recommendator(movies, reference_movie_index, top_n=10):
    """Recommend the movies whose overview is most similar to a reference movie.

    Similarity is the cosine similarity
    (https://es.wikipedia.org/wiki/Similitud_coseno) between TF-IDF vectors
    of the ``overview`` column. Stop words are not removed here; the
    overviews produced by ``load_data`` have already been through
    ``preprocess``.

    Args:
        movies: DataFrame with an ``overview`` text column.
        reference_movie_index: Position of the reference movie in ``movies``
            once sorted by index. Negative values count from the end.
        top_n: Maximum number of recommendations to return.

    Returns:
        The rows of ``movies`` (sorted by index) for the ``top_n`` most
        similar movies, most similar first, never including the reference.
    """
    # Sort BEFORE vectorising so that row i of the TF-IDF matrix and
    # movies.iloc[i] always describe the same movie.
    movies = movies.sort_index()

    tfidf_matrix = TfidfVectorizer().fit_transform(movies['overview'])

    # Normalise a negative position so it can be compared with argsort output.
    if reference_movie_index < 0:
        reference_movie_index += tfidf_matrix.shape[0]

    # Only the reference row is compared against every movie; the full
    # movie-by-movie similarity matrix is never needed.
    cosine_sim_scores = cosine_similarity(tfidf_matrix[reference_movie_index], tfidf_matrix)

    # Positions ordered from most to least similar, with the reference movie
    # removed explicitly: it is not guaranteed to rank first (ties, or an
    # empty overview whose similarity to everything is 0).
    ranked = cosine_sim_scores.argsort()[0][::-1]
    ranked = ranked[ranked != reference_movie_index]

    return movies.iloc[ranked[:top_n]]

# The recommendations now line up much better with the movie Avatar!
#print(recommended_movies.sort_values('score', ascending=False)['title'])

def save_to_excel(movies, path='excel_movies.xlsx'):
    """Write ``movies`` to an Excel file, including its index.

    Args:
        movies: Object exposing ``to_excel`` (a DataFrame).
        path: Destination file. Defaults to ``excel_movies.xlsx`` in the
            current working directory.
    """
    movies.to_excel(path, index=True)


# Module-level dataset: train_recommendation() and recommendation() read this
# global, and the dropdown options further down are built from it.
# NOTE: load_data() reads the CSV files, so this runs as soon as the cell executes.
movies = load_data()

def train_recommendation():
    """Vectorise ``movies['combined_text']`` with TF-IDF, persist and return the matrix.

    Reads the module-level ``movies`` frame. The fitted matrix is handed to
    ``save_recomendation`` before being returned.

    Returns:
        The TF-IDF matrix, one row per row of ``movies``.
    """
    combined_text = movies['combined_text']
    matrix = TfidfVectorizer().fit_transform(combined_text)

    # Persist the matrix so later calls can load it instead of refitting.
    save_recomendation(matrix)
    return matrix

def save_recomendation(tfidf_matrix):
    """Pickle ``tfidf_matrix`` into ``tfidf_matrix.pickle`` in the working directory.

    Any existing file of that name is overwritten. The highest pickle
    protocol available is used.
    """
    with open('tfidf_matrix.pickle', 'wb') as out_file:
        pickler = pickle.Pickler(out_file, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(tfidf_matrix)

def load_tfidf_matrix():
    """Return the object pickled in ``tfidf_matrix.pickle``.

    The file is looked up in the working directory; opening it raises the
    usual ``open`` errors when it is missing.

    NOTE(review): unpickling executes whatever the file encodes, so this
    must only ever read a file this program wrote itself.
    """
    with open('tfidf_matrix.pickle', 'rb') as in_file:
        return pickle.Unpickler(in_file).load()

def recommendation(reference_movie_index, top_n=10):
    """Return the titles of the movies most similar to a reference movie.

    Similarity is the cosine similarity between rows of the TF-IDF matrix of
    ``movies['combined_text']``. The matrix is loaded from the pickle cache
    when possible and rebuilt with ``train_recommendation`` otherwise.

    Args:
        reference_movie_index: Position of the reference movie in the
            module-level ``movies`` frame. Negative values count from the
            end. ``None`` means "nothing selected".
        top_n: Maximum number of titles to return.

    Returns:
        The ``title`` values of the ``top_n`` most similar movies, most
        similar first and never including the reference movie. Empty when
        ``reference_movie_index`` is ``None``.
    """
    if reference_movie_index is None:
        # No movie chosen (presumably what the dropdown yields when it is
        # submitted empty): nothing to recommend.
        return movies['title'].iloc[:0]

    # The cache is best-effort: whatever goes wrong while loading it (missing
    # file, truncated or incompatible pickle) is recoverable by retraining.
    # A matrix whose row count differs from `movies` was built from another
    # dataset and would silently misalign rows, so it is rejected as well.
    try:
        tfidf_matrix = load_tfidf_matrix()
        cache_is_valid = tfidf_matrix.shape[0] == len(movies)
    except Exception:
        cache_is_valid = False
    if not cache_is_valid:
        tfidf_matrix = train_recommendation()

    # Normalise a negative position so it can be compared with argsort output.
    if reference_movie_index < 0:
        reference_movie_index += tfidf_matrix.shape[0]

    cosine_sim_scores = cosine_similarity(tfidf_matrix[reference_movie_index], tfidf_matrix)

    # Positions ordered from most to least similar, with the reference movie
    # removed explicitly rather than assuming it always ranks first.
    ranked = cosine_sim_scores.argsort()[0][::-1]
    ranked = ranked[ranked != reference_movie_index]

    recommended_movies = movies.iloc[ranked[:top_n]]
    return recommended_movies['title']
#print(recommendation(0)['title'])

# Dropdown choices as (title, value) tuples. The value is the movie's POSITION
# in `movies` (not its index label), because recommendation() uses it to
# address rows positionally in both the TF-IDF matrix and `movies.iloc`.
movie_options = [(title, position) for position, title in enumerate(movies['title'])]

# One dropdown in, the recommended titles rendered as text out.
demo = gr.Interface(
    recommendation,
    [
        gr.Dropdown(
            movie_options, label="Película", info="Selecciona una película que te haya gustado"
        ),
    ],
    "text",
)


demo.launch()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://86c8eaafb056ecbd3b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


