In [28]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available locally before it is read below
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Juli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
# Load the movie metadata from a Parquet dataset (path is relative to the working directory)
data = pd.read_parquet("./Datasets/Data para modelo")
# Bare last expression so the notebook renders the frame as a table
data

Unnamed: 0,overview,title,genres_names,actors_names,directors_names
0,"College coeds in New York City, Al, the son of...",Down to You,"Comedy, Drama, Family, Romance","Freddie Prinze Jr., Julia Stiles, Selma Blair,...",Kris Isacsson
1,The President of the United States must deal w...,Deterrence,"Action, Drama, Mystery, Thriller","Kevin Pollak, Timothy Hutton, Sheryl Lee Ralph...",Rod Lurie
2,Brian Hooks plays a character who is just rele...,3 Strikes,"Action, Comedy, Romance","Brian Hooks, N'Bushe Wright, Faizon Love, E-40...",DJ Pooh
3,An avid detective is assigned to investigate t...,Chain of Fools,"Action, Comedy, Crime","Salma Hayek, Jeff Goldblum, Elijah Wood, David...","Pontus Löwenhielm, Patrick von Krusenstjerna"
4,The recently deceased Mona Dearly (Bette Midle...,Drowning Mona,"Comedy, Crime, Mystery","Danny DeVito, Bette Midler, Neve Campbell, Jam...",Nick Gomez
...,...,...,...,...,...
7575,A stranger named Silas flees from a devastatin...,The Final Storm,"Action, Mystery, Thriller, Horror","Lauren Holly, Luke Perry, Steve Bacic, Cole He...",Uwe Boll
7576,"Pretty, popular, and slim high-schooler Aly Sc...",To Be Fat Like Me,"Drama, Family, TV Movie","Kaley Cuoco, Caroline Rhea, Melissa Halstrom, ...",Douglas Barr
7577,Hyperactive teenager Kelly is enrolled into a ...,Cadet Kelly,Comedy,"Hilary Duff, Christy Carlson Romano, Gary Cole...",Larry Shaw
7578,"It's Halloween in the 100 Acre Wood, and Roo's...",Pooh's Heffalump Halloween Movie,"Animation, Family","Jimmy Bennett, Peter Cullen, Jim Cummings, Joh...","Saul Blinkoff, Elliot M. Bour"


In [30]:
# Summarise column dtypes and non-null counts to check for missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7579 entries, 0 to 7579
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   overview         7579 non-null   object
 1   title            7579 non-null   object
 2   genres_names     7579 non-null   object
 3   actors_names     7579 non-null   object
 4   directors_names  7579 non-null   object
dtypes: object(5)
memory usage: 355.3+ KB


In [31]:
# Data preprocessing
# Merge the genre, cast, director and overview text of each movie into a
# single space-separated string stored in the 'combined' column
data['combined'] = data.loc[
    :, ['genres_names', 'actors_names', 'directors_names', 'overview']
].apply(lambda fila: ' '.join(fila), axis=1)

In [32]:
# English stop words from the NLTK corpus, kept as a plain list
stop = list(stopwords.words('english'))

In [33]:
import re

# Drop every character that is neither a word character nor whitespace;
# each value is coerced to str first so non-string entries do not raise
data['combined'] = data['combined'].map(
    lambda texto: re.sub(r'[^\w\s]', '', str(texto))
)

In [34]:
# Create the TF-IDF vectorizer with the stop-word list already applied,
# then fit it on the 'combined' text and transform that text into a TF-IDF matrix

vectorizer = TfidfVectorizer(stop_words=stop)
tfidf_matrix = vectorizer.fit_transform(data['combined'])

In [35]:
# Inspect the learned vocabulary without flooding the cell output:
# report its size and a short sample instead of printing every term.
features = vectorizer.get_feature_names_out().tolist()
print(f"{len(features)} features; sample: {features[:20]}")



In [36]:
# Compute the pairwise cosine similarity between all movies; with a single
# argument scikit-learn compares the matrix against itself
cosine_sim = cosine_similarity(tfidf_matrix)

In [38]:
# Wrap the similarity array in a DataFrame (default integer row/column labels)
cosine_sim_df = pd.DataFrame(cosine_sim)

# Return the top_n most similar movies for every movie (the movie itself is excluded)
def obtener_top_similitudes(sim_matrix, top_n=5):
    """Map each item position to the positions of its most similar items.

    Parameters
    ----------
    sim_matrix : array-like of shape (n, n)
        Pairwise similarity scores given as a NumPy array, a pandas
        DataFrame or nested lists. Row ``i`` holds the similarity of item
        ``i`` to every item. Rows are always read by position, so any
        DataFrame index/column labels are ignored.
    top_n : int, default 5
        Maximum number of neighbours to keep per item.

    Returns
    -------
    dict[int, list[int]]
        For every row position, the positions of the ``top_n`` highest
        scoring other items, ordered by decreasing similarity. Ties are
        broken by ascending position and NaN scores rank last. The values
        are 0-based positions (suitable for ``.iloc``), not index labels.
    """
    # Convert once to a float array so every row is accessed positionally,
    # regardless of whether a DataFrame or a plain array was passed in.
    scores = np.asarray(sim_matrix, dtype=float)

    recomendaciones = {}
    for idx, fila in enumerate(scores):
        # Stable sort on the negated scores gives descending order with
        # deterministic tie-breaking (lower position first).
        orden = np.argsort(-fila, kind="stable")

        # Remove the item itself from its own ranking.
        orden = orden[orden != idx]

        # Keep the best top_n positions as plain Python ints.
        recomendaciones[idx] = orden[:top_n].tolist()

    return recomendaciones

# Get the 5 most similar movies for every movie (top_n is left at its default)
recomendaciones = obtener_top_similitudes(cosine_sim_df)

# Example: look up the recommendations for the movie at index 0
pelicula_index = 0
print(f"Películas similares a la película {pelicula_index}: {recomendaciones[pelicula_index]}")


Películas similares a la película 0: [283, 7203, 188, 1838, 253]


In [39]:
import joblib

# Save the recomendaciones dict to a PKL file in the working directory
joblib.dump(recomendaciones, 'recomendaciones.pkl')

['recomendaciones.pkl']