In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
df = pd.read_parquet('../datasets/movie_dataset.parquet')

In [3]:
df.shape

(45379, 20)

Para evitar catastrofes, se limitaran los datos

In [4]:
short_df = df.sample(n=1000, random_state=42)

In [5]:
short_df.reset_index(drop=True, inplace=True)

In [6]:
short_df.isnull().sum()

belongs_to_collection    897
budget                     0
genres                     0
id                         0
original_language          0
overview                  18
popularity                 0
production_companies       0
production_countries       0
release_date               0
revenue                    0
runtime                    5
spoken_languages           0
status                     4
tagline                  537
title                      0
vote_average               0
vote_count                 0
release_year               0
return                     0
dtype: int64

In [7]:
short_df.dropna(subset=["title", "genres", "tagline", "overview"], inplace=True)

In [8]:
# Aplicar la técnica TF-IDF para representar los textos en forma numérica
tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
tfidf_matrix = tfidf.fit_transform(short_df["title"] + " " + str(short_df["genres"]) + " " + short_df["tagline"] + " " + short_df["overview"])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [13]:
with open('similarity_matrix.pickle', 'wb') as f:
    pickle.dump(cosine_sim, f)

In [33]:
def recommendation(title):
    """
    Recibe un título de película y devuelve una lista con los 5 títulos de películas más similares.

    Parámetros:
    - title (str): Título de la película.

    Retorna:
    - list: Lista con los títulos de las 5 películas más similares.

    """

    # Buscar la fila correspondiente al título de la película
    idx = short_df.index[short_df["title"].str.lower() == title.lower()].tolist()
    if len(idx) == 0:
        return "Película no encontrada"
    else:
        idx = idx[0]
    
    # Calcular la similitud de la película con todas las demás películas
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Ordenar las películas según su similitud y seleccionar las 5 más similares
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:6]
    
    # Obtener los índices de las películas recomendadas
    movie_indices = [i[0] for i in sim_scores]
    
    # Devolver los títulos de las películas recomendadas
    return list(short_df["title"].iloc[movie_indices])

In [18]:
short_df['title']

0              Extraction
2           The Glass Key
3       The Sunshine Boys
4                No Mercy
6                    Push
              ...        
992             Wild Hogs
993                Choose
995    The Pick-up Artist
996              Sky High
999               Ed Wood
Name: title, Length: 463, dtype: object

In [38]:
recommendation('Push')

['One A.M.', 'Lovers and Liars', 'Premam', "Hell's Highway", 'So Big!']

In [39]:
short_df.loc[short_df['title'] == 'Push']

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,release_year,return
6,,38000000.0,[Action],13455,en,"After his father, an assassin, is brutally mur...",11.596222,"[Icon Productions, Infinity Features Entertain...","[United States of America, Canada]",2009-02-06,45465299.0,111.0,[English],Released,One push can change everything.,Push,5.9,671.0,2009,1.196455


In [40]:
short_df.loc[short_df['title'] == 'One A.M.']

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,release_year,return
50,,0.0,[Comedy],53418,en,A drunken homeowner has a difficult time getti...,1.472505,[Lone Star Corporation],[United States of America],1916-08-07,0.0,22.0,[English],Released,An inspiring and fantastic one-man show!,One A.M.,6.3,17.0,1916,0.0


Bueno, al menos recomienda

In [41]:
short_df.to_parquet('../datasets/short_df.parquet', index=False)