In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords

In [3]:
# Load the modelling dataset from a Parquet file.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs
# where that exact file exists — consider a configurable data directory.
df1 = pd.read_parquet('D:/2024/HenryData/Py_Individual/PI_Recomendacion/Datasets/df_model.parquet') 

In [4]:
# Preview the first two rows of the loaded frame.
df1.head(2)

Unnamed: 0,id,title,genres,overview,actor,director,belongs_to_collection
0,862,Toy Story,"Animation, Comedy, Family","Led by Woody, Andy's toys live happily in his ...","Tom Hanks, Tim Allen, Don Rickles",John Lasseter,Toy Story Collection
1,8844,Jumanji,"Adventure, Fantasy, Family",When siblings Judy and Peter discover an encha...,"Robin Williams, Jonathan Hyde, Kirsten Dunst",Joe Johnston,No collecction


Muestra representativa para desplegar el modelo

In [5]:
# Keep only the rows whose collection is not the 'No collecction' placeholder.
# The misspelling is the literal value stored in the data, so it must match exactly.
# NOTE(review): df_filtrado is only inspected in the following cells; the later
# cells shown here sample from df1 instead — confirm which frame was intended.
df_filtrado = df1[df1['belongs_to_collection'] != 'No collecction'] 

In [6]:
# Row and column count of the filtered frame.
df_filtrado.shape

(2554, 7)

In [7]:
# Preview the filtered rows; the filter above does not reset the index.
df_filtrado.head()

Unnamed: 0,id,title,genres,overview,actor,director,belongs_to_collection
0,862,Toy Story,"Animation, Comedy, Family","Led by Woody, Andy's toys live happily in his ...","Tom Hanks, Tim Allen, Don Rickles",John Lasseter,Toy Story Collection
2,15602,Grumpier Old Men,"Romance, Comedy",A family wedding reignites the ancient feud be...,"Walter Matthau, Jack Lemmon, Ann-Margret",Howard Deutch,Grumpy Old Men Collection
4,11862,Father of the Bride Part II,Comedy,Just when George Banks has recovered from his ...,"Steve Martin, Diane Keaton, Martin Short",Charles Shyer,Father of the Bride Collection
9,710,GoldenEye,"Adventure, Action, Thriller",James Bond must unmask the mysterious head of ...,"Pierce Brosnan, Sean Bean, Izabella Scorupco",Martin Campbell,James Bond Collection
12,21032,Balto,"Family, Animation, Adventure",An outcast half-wolf risks his life to prevent...,"Kevin Bacon, Bob Hoskins, Bridget Fonda",Simon Wells,Balto Collection


In [8]:
# Draw a 10% random sample of df1 (fixed seed so the sample is reproducible)
# and reset the index, discarding the original row labels.
df_muestra = df1.sample(frac=0.1, random_state=42).reset_index(drop=True)

In [9]:
# Preview the sampled rows.
df_muestra.head()

Unnamed: 0,id,title,genres,overview,actor,director,belongs_to_collection
0,19105,Iron Maiden: Flight 666,"Documentary, Music",A chronological account of the heavy metal ban...,"Bruce Dickinson, Steve Harris, Nicko McBrain","Sam Dunn, Scot McFadyen",No collecction
1,114150,Pitch Perfect,"Comedy, Music, Romance",College student Beca knows she does not want t...,"Anna Kendrick, Skylar Astin, Brittany Snow",Jason Moore,Pitch Perfect Collection
2,9768,Cry-Baby,"Comedy, Music",A prim and proper schoolgirl goes against her ...,"Johnny Depp, Amy Locane, Polly Bergen",John Waters,No collecction
3,17113,The Ballad of Jack and Rose,Drama,Jack Slavin is an environmentalist with a hear...,"Daniel Day-Lewis, Camilla Belle, Catherine Keener",Rebecca Miller,No collecction
4,442087,Mike Birbiglia: Thank God for Jokes,Comedy,Mike Birbiglia declares that a joke should nev...,"Mike Birbiglia, Jimmy Kimmel","Mike Birbiglia, Seth Barrish",No collecction


In [10]:
# Work on a copy so that columns added to df later do not modify df_muestra.
df=df_muestra.copy()

In [11]:
# Row and column count of the working frame.
df.shape

(2163, 7)

In [12]:
# Initialise a TF-IDF vectorizer that drops English stop words.
tfidf = TfidfVectorizer(stop_words='english')

# Build the TF-IDF matrix for the 'overview' column.
# NOTE(review): tfidf_matrix is not used by the later cells shown here (the
# similarity is computed from combined_matrix) — confirm whether it is still needed.
tfidf_matrix = tfidf.fit_transform(df['overview'])

In [13]:
# Merge the genre, actor, director and collection fields of a row into one text.
def combine_features(row):
    """Join a row's descriptive fields into a single space-separated string.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must provide the keys 'genres', 'actor', 'director' and
        'belongs_to_collection'.

    Returns
    -------
    str
        The four fields joined by single spaces, in the order listed above.
        A missing value (None / NaN / pd.NA) contributes an empty string
        instead of raising TypeError, which plain ``+`` concatenation would do.
    """
    columns = ('genres', 'actor', 'director', 'belongs_to_collection')
    # str() keeps the output unchanged for string fields and tolerates
    # non-string scalars; missing values become '' so the separators are kept.
    return " ".join("" if pd.isna(row[col]) else str(row[col]) for col in columns)

# Build the combined-feature text for every row and store it as a new column on df.
df['combined_features'] = df.apply(combine_features, axis=1)

In [14]:
# Vectorise the combined-feature text with TF-IDF, dropping English stop words
# and capping the vocabulary at 5000 terms.
tfidf_combined = TfidfVectorizer(max_features=5000, stop_words='english')
combined_matrix = tfidf_combined.fit_transform(df['combined_features'])

In [15]:
# Compute the pairwise cosine similarity between all rows of the
# combined-feature TF-IDF matrix.
cosine_sim = cosine_similarity(combined_matrix)

In [16]:
def get_recommendations_1(title, cosine_sim, data=None, top_n=5):
    """Return the titles most similar to ``title`` according to ``cosine_sim``.

    Parameters
    ----------
    title : str
        Movie title to look up; surrounding whitespace and letter case are ignored.
    cosine_sim : 2-D array-like
        Similarity matrix whose row/column ``i`` corresponds to the ``i``-th row
        (by position) of the movie DataFrame.
    data : pandas.DataFrame, optional
        DataFrame with a 'title' column. Defaults to the notebook-level ``df``,
        so existing two-argument calls behave as before.
    top_n : int, optional
        Maximum number of recommendations to return (default 5).

    Returns
    -------
    list of str
        Up to ``top_n`` titles ordered by decreasing similarity, never including
        the queried movie itself.
    str
        A "not found" message when no title matches, or ``"Error: ..."`` when the
        lookup fails (for example when ``cosine_sim`` has fewer rows than the
        DataFrame). These string returns are kept for backward compatibility.
    """
    # Fall back to the notebook-level DataFrame when none is supplied.
    movies = df if data is None else data

    # Normalise the title so the comparison is case-insensitive.
    title = title.strip().lower()

    # Boolean mask of matching rows; rows with a missing title count as non-matches.
    is_match = movies['title'].str.lower().eq(title).fillna(False).to_numpy(dtype=bool)

    if not is_match.any():
        return f"No se encontró la película '{title.capitalize()}' en el DataFrame."

    # Deliberate best-effort: any failure below is reported as a string, not raised.
    try:
        # Use the row *position* of the first match, not its index label, so the
        # lookup stays aligned with cosine_sim even when the index is not 0..n-1.
        pos = int(is_match.argmax())
        scores = cosine_sim[pos]

        # Rank every row by similarity; sorted() is stable, so ties keep row order.
        ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)

        # Drop the queried movie explicitly rather than assuming it ranks first:
        # on a tie, an earlier row with the same score would otherwise take its place.
        limit = max(int(top_n), 0)
        top_positions = [i for i in ranked if i != pos][:limit]

        return movies['title'].iloc[top_positions].tolist()

    except Exception as e:
        return f"Error: {str(e)}"

In [17]:
# Example query: recommendations for 'Batman Returns'.
get_recommendations_1('Batman Returns', cosine_sim)

['Matilda',
 'Romancing the Stone',
 'Big Fish',
 "What's the Worst That Could Happen?",
 'Solitary Man']

In [25]:
# Example query: recommendations for 'Pitch Perfect'.
get_recommendations_1('Pitch Perfect', cosine_sim)

['Up in the Air', 'Mr. Right', 'Cake', 'The Voices', 'The Colour of Magic']