In [141]:
#Importo librerías
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jugas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jugas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [142]:
df_modelo = pd.read_parquet("C:/Users/jugas/Proyecto/Proyecto/data/df_modelo.parquet")

In [143]:
df_modelo.head()

Unnamed: 0,genres,id,original_language,overview,popularity,title,vote_average,release_year,return
4,"[Crime, Comedy]",5.0,en,It's Ted the Bellhop's first night on the job....,9.026586,Four Rooms,6.5,1995.0,1.075
11,"[Animation, Family]",12.0,en,"Nemo, an adventurous young clownfish, is unexp...",25.497794,Finding Nemo,7.6,2003.0,10.00357
12,"[Comedy, Drama, Romance]",13.0,en,A man with a low IQ has accomplished great thi...,48.307194,Forrest Gump,8.2,1994.0,12.32628
13,[Drama],14.0,en,"Lester Burnham, a depressed suburban father in...",20.726578,American Beauty,7.9,1999.0,23.753107
15,"[Drama, Crime, Music]",16.0,en,"Selma, a Czech immigrant on the verge of blind...",10.684806,Dancer in the Dark,7.7,2000.0,3.127491


In [144]:
df_modelo['genres'] = df_modelo['genres'].astype(str)  # Convierto a str

In [145]:
##Se separan los géneros y se convierten en palabras individuales
df_modelo['genres'] = df_modelo['genres'].fillna('').apply(lambda x: ' '.join(x.replace(',', ' ').replace('-', '').lower().split()))
     

In [146]:
df_modelo.shape

(26256, 9)

In [147]:
#Hago un conteo de los NaN
df_modelo.isna().sum()

genres                 0
id                     0
original_language      0
overview             146
popularity             0
title                  0
vote_average           0
release_year           0
return                 0
dtype: int64

In [148]:
#Elimino los NaN
df_modelo = df_modelo.dropna()

In [149]:
#Dejo solo los registros con id distinto de 0
df_modelo = df_modelo[df_modelo['id']!=0]

In [150]:
#Reseteo el indice 
df_modelo.reset_index(drop=True, inplace=True)

In [151]:
df_modelo.head()

Unnamed: 0,genres,id,original_language,overview,popularity,title,vote_average,release_year,return
0,['crime' 'comedy'],5.0,en,It's Ted the Bellhop's first night on the job....,9.026586,Four Rooms,6.5,1995.0,1.075
1,['animation' 'family'],12.0,en,"Nemo, an adventurous young clownfish, is unexp...",25.497794,Finding Nemo,7.6,2003.0,10.00357
2,['comedy' 'drama' 'romance'],13.0,en,A man with a low IQ has accomplished great thi...,48.307194,Forrest Gump,8.2,1994.0,12.32628
3,['drama'],14.0,en,"Lester Burnham, a depressed suburban father in...",20.726578,American Beauty,7.9,1999.0,23.753107
4,['drama' 'crime' 'music'],16.0,en,"Selma, a Czech immigrant on the verge of blind...",10.684806,Dancer in the Dark,7.7,2000.0,3.127491


In [152]:
#Cambio el tipo de dato de la columna id a int
df_modelo['id'] = df_modelo['id'].astype(int)

In [153]:
df_modelo['title'].value_counts()

title
Home                              6
Captive                           5
Cinderella                        5
A Christmas Carol                 5
Beauty and the Beast              5
                                 ..
Space Mutiny                      1
Starchaser: The Legend of Orin    1
Body Parts                        1
Soccer Dog: The Movie             1
The Truth Is in the Stars         1
Name: count, Length: 24978, dtype: int64

In [154]:
# Creo una instancia de la clase TfidfVectorizer

Vectorizacion = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))

# Aplico la transformación TF-IDF al texto contenido en las columnas "overview", "genres" y "title" y "original_language"
matriz_vectorizada = Vectorizacion.fit_transform(df_modelo['overview'] + ' ' + df_modelo['genres'] + ' ' + df_modelo['title'] + ' ' + df_modelo['original_language'])
     

In [155]:
df_modelo.head()

Unnamed: 0,genres,id,original_language,overview,popularity,title,vote_average,release_year,return
0,['crime' 'comedy'],5,en,It's Ted the Bellhop's first night on the job....,9.026586,Four Rooms,6.5,1995.0,1.075
1,['animation' 'family'],12,en,"Nemo, an adventurous young clownfish, is unexp...",25.497794,Finding Nemo,7.6,2003.0,10.00357
2,['comedy' 'drama' 'romance'],13,en,A man with a low IQ has accomplished great thi...,48.307194,Forrest Gump,8.2,1994.0,12.32628
3,['drama'],14,en,"Lester Burnham, a depressed suburban father in...",20.726578,American Beauty,7.9,1999.0,23.753107
4,['drama' 'crime' 'music'],16,en,"Selma, a Czech immigrant on the verge of blind...",10.684806,Dancer in the Dark,7.7,2000.0,3.127491


In [156]:
def recomendacion(titulo):


    # Convertir el título ingresado a minúsculas
    titulo = titulo.lower()

    # Crear un índice con todos los títulos en minúsculas
    indices = pd.Series(df_modelo.index, index=df_modelo['title'].str.lower()).drop_duplicates()

    # Resto del código sin cambios
    if titulo not in indices:
        return 'La película ingresada no se encuentra en la base de datos'

    # ... (resto de la función)

    # Obtiene el índice de la primera aparición del título
    ind = indices[titulo]

    # Calcula la similitud coseno
    cosine_sim = cosine_similarity(matriz_vectorizada[ind], matriz_vectorizada).flatten()

    # Obtiene los índices de las películas más similares
    simil = sorted(enumerate(cosine_sim), key=lambda x: x[1], reverse=True)[1:6]
    valid_ind = [i[0] for i in simil if i[0] < len(df_modelo)]

    # Obtiene los títulos de las películas recomendadas
    recomendaciones = df_modelo.iloc[valid_ind]['title'].tolist()

    return recomendaciones

In [157]:
recomendacion("braveheart")

['Certified Copy',
 'Stalingrad',
 'Henry V',
 'Johnny Mad Dog',
 'We Were Soldiers']

In [158]:
df_modelo.to_parquet("C:/Users/jugas/Proyecto/Proyecto/data/modelo_consulta.parquet")