# Sistema de Recomendación

In [47]:
#Importamos las librerias necesarias.
import pandas as pd
import numpy as np
import difflib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [48]:
#Importamos el dataframe.
archivo = pd.read_csv(r"C:\Users\miche\OneDrive\Escritorio\Proyecto_MLOps\DataSets\dataframe_final.csv", parse_dates=["release_date"], thousands=".")
df = pd.DataFrame(archivo)

In [49]:
#Verificamos que la importacion del dataset haya sido correcta.
df.head(2)

Unnamed: 0,id_collection,name_collection,budget,id_genres,name_genres,id,original_language,overview,popularity,id_production_companies,...,revenue,runtime,name_spoken_languages,status,tagline,title,vote_average,vote_count,cast,director
0,10194.0,Toy Story Collection,300000000,"[16, 35, 10751]","['Animation', 'Comedy', 'Family']",862,en,"Led by Woody, Andy's toys live happily in his ...",21946943.0,[3],...,3735540330,810.0,['English'],Released,,Toy Story,77,54150,"['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...",['John Lasseter']
1,,,650000000,"[12, 14, 10751]","['Adventure', 'Fantasy', 'Family']",8844,en,When siblings Judy and Peter discover an encha...,17015539.0,"[559, 2550, 10201]",...,2627972490,1040.0,"['English', 'Français']",Released,Roll the dice and unleash the excitement!,Jumanji,69,24130,"['Robin Williams', 'Jonathan Hyde', 'Kirsten D...",['Joe Johnston']


In [50]:
#Vemos las columnas que necesitamos
df.columns

Index(['id_collection', 'name_collection', 'budget', 'id_genres',
       'name_genres', 'id', 'original_language', 'overview', 'popularity',
       'id_production_companies', 'name_production_companies',
       'name_production_countries', 'release_date', 'release_year',
       'release_month', 'return', 'release_day', 'revenue', 'runtime',
       'name_spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'cast', 'director'],
      dtype='object')

In [51]:
#Creamos un nuevo df con solo las columnas que vamos a necesitas
df = df[["id_collection", "title", "name_genres", "overview"]]

In [52]:
#Realizmos un muestreo
df = df[0:5000]

In [53]:
#Hacemos el replace de los caracteres que no necesitamos dentro de la columna name_genres
df["name_genres"] =df["name_genres"].str.replace("[", "")
df["name_genres"] =df["name_genres"].str.replace("]", "")
df["name_genres"] =df["name_genres"].str.replace("'", "")

#Verificamos el df
df

  df["name_genres"] =df["name_genres"].str.replace("[", "")
  df["name_genres"] =df["name_genres"].str.replace("]", "")


Unnamed: 0,id_collection,title,name_genres,overview
0,10194.0,Toy Story,"Animation, Comedy, Family","Led by Woody, Andy's toys live happily in his ..."
1,,Jumanji,"Adventure, Fantasy, Family",When siblings Judy and Peter discover an encha...
2,119050.0,Grumpier Old Men,"Romance, Comedy",A family wedding reignites the ancient feud be...
3,,Waiting to Exhale,"Comedy, Drama, Romance","Cheated on, mistreated and stepped on, the wom..."
4,96871.0,Father of the Bride Part II,Comedy,Just when George Banks has recovered from his ...
...,...,...,...,...
4995,,How to Kill Your Neighbor's Dog,"Comedy, Drama","The story of Peter McGowan, a chain-smoking, i..."
4996,,The Last Man,Comedy,Apocalyptic comedy finds a socially-challenged...
4997,,Maryam,Drama,An Iranian-born teenager living in suburban Ne...
4998,,Mean Machine,"Comedy, Drama",Disgraced ex-England captain (Danny 'Mean Mach...


In [54]:
#Creamos una nueva columna llamada Tag, la cual se compone de overview + namge_genres
df["tags"] = df.overview+df.name_genres

In [55]:
#Procedemos a eliminar las columnas que no vamos a utilizar.
df = df.drop(columns=["overview", "name_genres"])


In [56]:
#Tomamos una muestra para bajar el dataframe y pueda ser utilizado, obtenemos una muestra de 25%
df.shape


(5000, 3)

In [57]:
#instanciamos nuestro modelo
cv = CountVectorizer(max_features=5000, stop_words="english")

In [58]:
#Generamos el nuevo_df
nuevo_df = cv.fit_transform(df["tags"].values.astype("U")).toarray()

In [59]:
#Vemos la forma del df
nuevo_df.shape

(5000, 5000)

In [60]:
#Creamos la similaridad en base al nuevo df
similaridad = cosine_similarity(nuevo_df) #50% del data set 4m 6.6s 

In [61]:
#Comprobamos la proximidad de cada elemento
similaridad

array([[1.        , 0.04270814, 0.0520051 , ..., 0.        , 0.04484485,
        0.02437575],
       [0.04270814, 1.        , 0.08495482, ..., 0.03698634, 0.02441931,
        0.        ],
       [0.0520051 , 0.08495482, 1.        , ..., 0.        , 0.02973505,
        0.09697623],
       ...,
       [0.        , 0.03698634, 0.        , ..., 1.        , 0.03883678,
        0.1266601 ],
       [0.04484485, 0.02441931, 0.02973505, ..., 0.03883678, 1.        ,
        0.05574947],
       [0.02437575, 0.        , 0.09697623, ..., 0.1266601 , 0.05574947,
        1.        ]])

In [62]:
#Creamos la funcion
def recomendacion(titulo: str):
    titulo=titulo.title() #Cuando se ingrese el titulo lo convertira en las primeras letras de cada palabra en mayuscula
    indice = df[df["title"] == titulo].index[0] #El indice de la pelicula
    distancia = sorted(list(enumerate(similaridad[indice])), reverse=True, key=lambda vector:vector[1]) #Creamos una nueva variable para encontrar la similaridad en base al indice de la pelicula, ponerla en orden descendente
    lista_peliculas = [] #Creamos una lista vacia
    for i in distancia[0:5]: #Iteramos en la variable distancia del indice 0 al 5, la cual contiene los valores aproximados al titulo.
        lista_peliculas.append(df.iloc[i[0]].title) #agregamos a la lista los 5 valores
    
    return lista_peliculas #retornamos los 5 valores en formato de lista.

In [63]:
#Probamos la funcion
recomendacion("Toy Story")

['Toy Story', 'Toy Story 2', 'Man on the Moon', 'Condorman', 'Window to Paris']

In [64]:
#Exportamos los archivos en formato pkl los cuales contienen el modelo de Machine Learning
pickle.dump(df, open("lista_peliculas.pkl", "wb"))
pickle.dump(similaridad, open("similaridad.pkl", "wb"))

In [65]:
#Probamos que funcione el archivo lista_peliculas.pkl
pickle.load(open("lista_peliculas.pkl","rb"))

Unnamed: 0,id_collection,title,tags
0,10194.0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,,Jumanji,When siblings Judy and Peter discover an encha...
2,119050.0,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,96871.0,Father of the Bride Part II,Just when George Banks has recovered from his ...
...,...,...,...
4995,,How to Kill Your Neighbor's Dog,"The story of Peter McGowan, a chain-smoking, i..."
4996,,The Last Man,Apocalyptic comedy finds a socially-challenged...
4997,,Maryam,An Iranian-born teenager living in suburban Ne...
4998,,Mean Machine,Disgraced ex-England captain (Danny 'Mean Mach...


In [66]:
#Probamos que funcione el archivo similaridad.pkl
pickle.load(open("similaridad.pkl", "rb"))

array([[1.        , 0.04270814, 0.0520051 , ..., 0.        , 0.04484485,
        0.02437575],
       [0.04270814, 1.        , 0.08495482, ..., 0.03698634, 0.02441931,
        0.        ],
       [0.0520051 , 0.08495482, 1.        , ..., 0.        , 0.02973505,
        0.09697623],
       ...,
       [0.        , 0.03698634, 0.        , ..., 1.        , 0.03883678,
        0.1266601 ],
       [0.04484485, 0.02441931, 0.02973505, ..., 0.03883678, 1.        ,
        0.05574947],
       [0.02437575, 0.        , 0.09697623, ..., 0.1266601 , 0.05574947,
        1.        ]])