In [30]:
import pandas as pd

# Cargar el dataset
df = pd.read_parquet('movies_dataset.parquet')



In [31]:
# Verificar las columnas disponibles
df.columns

Index(['belongs_to_collection', 'budget', 'genres', 'id', 'original_language',
       'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'cast', 'crew', 'mes_numero', 'mes'],
      dtype='object')

In [32]:
# Asegurarnos de que las columnas necesarias existen y no tienen valores nulos
df['title'] = df['title'].fillna('').astype(str)
df['genres'] = df['genres'].fillna('').astype(str)
df['overview'] = df['overview'].fillna('').astype(str)
df['id'] = df['id'].fillna('').astype(str)

In [33]:
# Crear una columna combinada de características relevantes para la recomendación
df['combined_features'] = df['title'] + ' ' + df['overview'] + ' ' + df['genres']

In [34]:
# Verificar la cantidad de valores nulos en cada columna
null_counts = df.isnull().sum()
print("Valores nulos después de la limpieza:\n", null_counts)

Valores nulos después de la limpieza:
 belongs_to_collection    41038
budget                       0
genres                       0
id                           0
original_language           11
overview                     0
popularity                   3
production_companies         3
production_countries         3
release_date                87
revenue                      3
runtime                    260
spoken_languages             3
status                      84
tagline                  25099
title                        0
vote_average                 3
vote_count                   3
cast                     37009
crew                     41047
mes_numero                  87
mes                         87
combined_features            0
dtype: int64


In [35]:
null_counts = df.isnull().sum()


In [36]:
print("Valores nulos después de la limpieza:\n", null_counts)


Valores nulos después de la limpieza:
 belongs_to_collection    41038
budget                       0
genres                       0
id                           0
original_language           11
overview                     0
popularity                   3
production_companies         3
production_countries         3
release_date                87
revenue                      3
runtime                    260
spoken_languages             3
status                      84
tagline                  25099
title                        0
vote_average                 3
vote_count                   3
cast                     37009
crew                     41047
mes_numero                  87
mes                         87
combined_features            0
dtype: int64


In [37]:
# Rellenar valores nulos en las columnas relevantes
for col in ['title', 'genres', 'overview']:
    df[col] = df[col].fillna('')

In [38]:
# Verificar nuevamente la cantidad de valores nulos en cada columna
null_counts = df.isnull().sum()
print("Valores nulos después de la limpieza:\n", null_counts)

Valores nulos después de la limpieza:
 belongs_to_collection    41038
budget                       0
genres                       0
id                           0
original_language           11
overview                     0
popularity                   3
production_companies         3
production_countries         3
release_date                87
revenue                      3
runtime                    260
spoken_languages             3
status                      84
tagline                  25099
title                        0
vote_average                 3
vote_count                   3
cast                     37009
crew                     41047
mes_numero                  87
mes                         87
combined_features            0
dtype: int64


In [39]:
# Seleccionar solo las columnas necesarias para el modelo
df_reduced = df[['title', 'id', 'genres', 'overview', 'combined_features']]

In [40]:
df_reduced

Unnamed: 0,title,id,genres,overview,combined_features
0,Toy Story,862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...","Toy Story Led by Woody, Andy's toys live happi..."
1,Jumanji,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,Jumanji When siblings Judy and Peter discover ...
2,Grumpier Old Men,15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,Grumpier Old Men A family wedding reignites th...
3,Waiting to Exhale,31357,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...","Waiting to Exhale Cheated on, mistreated and s..."
4,Father of the Bride Part II,11862,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,Father of the Bride Part II Just when George B...
...,...,...,...,...,...
45533,Subdue,439050,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",Rising and falling between a man and woman.,Subdue Rising and falling between a man and wo...
45534,Century of Birthing,111109,"[{'id': 18, 'name': 'Drama'}]",An artist struggles to finish his work while a...,Century of Birthing An artist struggles to fin...
45535,Betrayal,67758,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...","When one of her hits goes wrong, a professiona...","Betrayal When one of her hits goes wrong, a pr..."
45536,Satan Triumphant,227506,[],"In a small town live two brothers, one a minis...",Satan Triumphant In a small town live two brot...


In [41]:
# Función para desanidar la columna 'genres'
def extract_genres(genres):
    try:
        genres_list = ast.literal_eval(genres)
        return ' '.join([genre['name'] for genre in genres_list])
    except:
        return ''


In [42]:
# Aplicar la función a la columna 'genres'
df['genres'] = df['genres'].apply(extract_genres)

In [43]:
# Crear una columna combinada de características relevantes para la recomendación
df['combined_features'] = df['title'] + ' ' + df['overview'] + ' ' + df['genres']


In [44]:
# Verificar nuevamente la cantidad de valores nulos en cada columna
null_counts = df.isnull().sum()
print("Valores nulos después de la limpieza:\n", null_counts)

Valores nulos después de la limpieza:
 belongs_to_collection    41038
budget                       0
genres                       0
id                           0
original_language           11
overview                     0
popularity                   3
production_companies         3
production_countries         3
release_date                87
revenue                      3
runtime                    260
spoken_languages             3
status                      84
tagline                  25099
title                        0
vote_average                 3
vote_count                   3
cast                     37009
crew                     41047
mes_numero                  87
mes                         87
combined_features            0
dtype: int64


In [45]:
# Seleccionar solo las columnas necesarias para el modelo
df_reduced = df[['title', 'id', 'genres', 'overview', 'combined_features']]

In [46]:
df_reduced

Unnamed: 0,title,id,genres,overview,combined_features
0,Toy Story,862,,"Led by Woody, Andy's toys live happily in his ...","Toy Story Led by Woody, Andy's toys live happi..."
1,Jumanji,8844,,When siblings Judy and Peter discover an encha...,Jumanji When siblings Judy and Peter discover ...
2,Grumpier Old Men,15602,,A family wedding reignites the ancient feud be...,Grumpier Old Men A family wedding reignites th...
3,Waiting to Exhale,31357,,"Cheated on, mistreated and stepped on, the wom...","Waiting to Exhale Cheated on, mistreated and s..."
4,Father of the Bride Part II,11862,,Just when George Banks has recovered from his ...,Father of the Bride Part II Just when George B...
...,...,...,...,...,...
45533,Subdue,439050,,Rising and falling between a man and woman.,Subdue Rising and falling between a man and wo...
45534,Century of Birthing,111109,,An artist struggles to finish his work while a...,Century of Birthing An artist struggles to fin...
45535,Betrayal,67758,,"When one of her hits goes wrong, a professiona...","Betrayal When one of her hits goes wrong, a pr..."
45536,Satan Triumphant,227506,,"In a small town live two brothers, one a minis...",Satan Triumphant In a small town live two brot...


In [47]:
# Guardar el dataframe reducido
df_reduced.to_parquet('movies_dataset_reduced.parquet')

In [48]:
# Seleccionar una muestra aleatoria del dataset
df_sample = df.sample(n=5000, random_state=42)  # Ajusta el tamaño de la muestra según sea necesario

In [None]:
# Seleccionar solo las columnas necesarias para el modelo
df_reduced = df_sample[['title', 'id', 'genres', 'overview', 'combined_features']]

In [49]:
# Guardar el dataframe reducido
df_reduced.to_parquet('movies_dataset_reduced_sample.parquet')

Preparación Modelo ML

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [51]:
# Cargar el dataframe reducido
df_reduced = pd.read_parquet('movies_dataset_reduced_sample.parquet')


In [52]:
# Crear el vectorizador TF-IDF
tfidf = TfidfVectorizer(stop_words='english')

In [53]:
# Aplicar el vectorizador a las características combinadas
tfidf_matrix = tfidf.fit_transform(df_reduced['combined_features'])


In [54]:
# Calcular la similitud del coseno entre todas las películas
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [55]:
# Crear una serie con los índices de las películas
indices = pd.Series(df_reduced.index, index=df_reduced['title']).drop_duplicates()

In [56]:
def recomendacion(titulo, cosine_sim=cosine_sim):
    if titulo not in indices:
        return "Película no encontrada en el dataset"
    
    # Obtener el índice de la película que coincide con el título
    idx = indices[titulo]
    
    # Obtener los puntajes de similitud de todas las películas con esa película
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Ordenar las películas basadas en los puntajes de similitud
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Obtener los índices de las 5 películas más similares
    sim_scores = sim_scores[1:6]
    
    # Obtener los títulos de las 5 películas más similares
    movie_indices = [i[0] for i in sim_scores]
    
    return df_reduced['title'].iloc[movie_indices].tolist()


In [57]:
# Probar la función con un título de película
titulo_a_probar = "Toy Story"  # Cambia este valor al título de la película que quieras probar
resultado = recomendacion(titulo_a_probar)
print(resultado)

['Toy Story 3', 'Toy Story 2', 'The 40 Year Old Virgin', 'Small Fry', "Andy Hardy's Blonde Trouble"]
