In [2]:
import pandas as pd

In [3]:
# Load the cleaned movies dataset (movies_clean.csv) into a pandas DataFrame.
# Later cells read its 'genres', 'cast', 'overview', 'title' and 'id' columns.
movies = pd.read_csv("movies_clean.csv")

In [4]:
# Normalise the 'genres' column so it can be folded into the "tags" feature.
# The values appear to be stringified lists (TODO confirm against the CSV):
# drop the surrounding brackets, remove spaces and quotes so every genre is a
# single token, then turn the comma separators into spaces.
# This yields the same strings as the former split / per-item replace / join
# round trip, but stays on the .str accessor, so missing values remain NaN
# instead of raising. regex=False makes every replacement a literal one
# regardless of the pandas version's default.
movies['genres'] = (
    movies['genres']
    .str.strip('[]')
    .str.replace(' ', '', regex=False)
    .str.replace("'", '', regex=False)
    .str.replace(',', ' ', regex=False)
)

In [5]:
# Normalise the 'cast' column so it can be folded into the "tags" feature:
# strip the surrounding brackets, remove spaces and quotes (each name becomes
# one token), then replace the comma separators with spaces.
movies['cast'] = (
    movies['cast']
    .str.strip('[]')
    .str.replace(' ', '')
    .str.replace("'", '')
    .str.replace(',', ' ')
)

In [6]:
# Build the "tags" text feature used by the ML model: overview, cast, genres
# and title joined with single spaces. A missing value in any of the four
# columns makes the resulting tag missing as well.
movies['tags'] = movies['overview'].str.cat(
    [movies['cast'], movies['genres'], movies['title']],
    sep=' ',
)

In [7]:
# Preview the combined "tags" column.
movies['tags']

0        Minions Stuart, Kevin and Bob are recruited by...
1        An Amazon princess comes to the world of Man t...
2        A live-action adaptation of Disney's version o...
3        After being coerced into working for a crime b...
4        The special bond that develops between plus-si...
                               ...                        
45340     Directed by Vasilis Vafeas   The Love of Ulysses
45341    This film records a 12 day ritual performed by...
45342    Wealthy family in the 1940s have failed to cha...
45343                         Directed by   The Last Ferry
45344    Robin Williams stand up special.   Robin Willi...
Name: tags, Length: 45345, dtype: object

In [8]:
# Keep only the id, title and tags columns, then report the resulting shape
# and the number of missing values per column.
movies = movies[['id', 'title', 'tags']]
print(movies.shape, movies.isna().sum(), sep='\n')

(45345, 3)
id         0
title      0
tags     941
dtype: int64


In [9]:
# Drop every row that contains a missing value.
# Reassigning (instead of inplace=True) avoids mutating a column subset of the
# original frame, and reset_index(drop=True) renumbers the rows 0..n-1 so that
# an index label can safely be used as a row position into arrays built from
# this frame (e.g. the similarity matrix).
movies = movies.dropna().reset_index(drop=True)

In [10]:
# Check the shape after removing the rows with missing values.
movies.shape

(44404, 3)

In [None]:
# movies.to_csv('movies_ML.csv', index=False) # Se guarda en un nuevo archivo 'movies_ML.csv' para su utilización para el modelo.

In [2]:
# movies = pd.read_csv("movies_ML.csv") # Se carga el dataset movies_ML.csv a un dataframe en pandas.

In [3]:
# Only the first 3000 rows are used in this project; per the author, the
# resulting file becomes too heavy when every row is kept.
movies = movies.head(3000)

In [4]:
# Create the TF-IDF vectoriser that turns the "tags" text into numeric features.
# max_features=5000 limits the vocabulary to 5000 terms.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)

In [5]:
# Fit the vectoriser on the "tags" column and transform it in a single step.
data_vectorizada = tfidf.fit_transform(movies['tags'].to_numpy())

In [6]:
# Inspect the vocabulary terms learned by the vectoriser.
tfidf.get_feature_names_out()

array(['000', '007', '10', ..., 'zooeydeschanel', 'zoëbell', 'zoëkravitz'],
      dtype=object)

In [7]:
# Materialise the vectorised data as a dense DataFrame that carries the same
# index labels as `movies`.
# Note: this rebinds data_vectorizada from the vectoriser output to a
# DataFrame, so the cell cannot be re-run on its own.
data_vectorizada = pd.DataFrame(
    data_vectorizada.toarray(),
    index=movies.index.tolist(),
)

In [8]:
# Preview the first three rows of the vectorised data.
data_vectorizada.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Rows x vocabulary terms of the vectorised data.
data_vectorizada.shape

(3000, 5000)

In [10]:
# Project the TF-IDF features with TruncatedSVD.
# random_state pins the randomized solver so that re-running the notebook
# yields the same projection (and therefore the same similarity matrix).
# NOTE(review): n_components=3000 equals the number of rows kept above, so the
# output has as many columns as rows and no information is discarded; a
# smaller value is needed for an actual dimensionality reduction.
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=3000, random_state=42)

reduced_data = svd.fit_transform(data_vectorizada)
reduced_data.shape

(3000, 3000)

In [11]:
# Cumulative share of variance explained by the SVD components.
svd.explained_variance_ratio_.cumsum()

array([0.00455547, 0.01160729, 0.01751131, ..., 0.99997673, 0.99998842,
       1.        ])

In [12]:
# Compute the pairwise cosine similarity between the rows of reduced_data.
# The resulting square matrix is what recomendacion() reads its scores from.
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(reduced_data)
# Display the matrix.
similarity

array([[1.        , 0.02420195, 0.01977927, ..., 0.03627507, 0.00951389,
        0.00452038],
       [0.02420195, 1.        , 0.04142777, ..., 0.04216908, 0.01831083,
        0.01617157],
       [0.01977927, 0.04142777, 1.        , ..., 0.02749734, 0.01663158,
        0.02039452],
       ...,
       [0.03627507, 0.04216908, 0.02749734, ..., 1.        , 0.02345273,
        0.07057717],
       [0.00951389, 0.01831083, 0.01663158, ..., 0.02345273, 1.        ,
        0.01495169],
       [0.00452038, 0.01617157, 0.02039452, ..., 0.07057717, 0.01495169,
        1.        ]])

In [16]:
def recomendacion(movie, catalog=None, similarity_matrix=None, top_n=5):
    '''Return the titles most similar to ``movie``.

    Args:
        movie: Exact title to look up in the ``title`` column.
        catalog: DataFrame with a ``title`` column whose row order matches the
            rows/columns of ``similarity_matrix``. Defaults to the
            notebook-level ``movies`` frame.
        similarity_matrix: Square array of pairwise similarity scores.
            Defaults to the notebook-level ``similarity`` array.
        top_n: Maximum number of titles to return (default 5).

    Returns:
        dict: ``{'lista_recomendada': [...]}`` holding up to ``top_n`` titles
        ordered from most to least similar; the queried row itself is never
        included. If several rows share the title, the first one is used.

    Raises:
        IndexError: If no row of ``catalog`` has the given title.
    '''
    if catalog is None:
        catalog = movies
    if similarity_matrix is None:
        similarity_matrix = similarity

    # Use the row *position*, not the index label: the similarity matrix is
    # positional, and the frame's labels may have gaps (e.g. after dropna).
    matches = (catalog['title'] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"Movie title not found: {movie!r}")
    position = int(matches[0])

    scores = similarity_matrix[position]
    # Exclude the queried row explicitly instead of assuming it sorts first;
    # another row with an identical score could otherwise take its place.
    ranked = sorted(
        (idx for idx in range(len(scores)) if idx != position),
        key=lambda idx: scores[idx],
        reverse=True,
    )[:top_n]

    titles = catalog['title']
    return {'lista_recomendada': [titles.iloc[idx] for idx in ranked]}

In [22]:
# Example: recommendations for the title 'Thor'.
recomendacion('Thor')

{'lista_recomendada': ['Thor: The Dark World',
  'Thor: Ragnarok',
  'The Avengers',
  'Avengers: Age of Ultron',
  'After Earth']}

In [23]:
# Persist the similarity matrix to a .pkl file for later use.
# Reminder: only unpickle files from trusted sources.
import pickle
with open('model_similarity.pkl', 'wb') as archivo:
    pickle.dump(similarity, archivo)