In [3]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.decomposition import TruncatedSVD

# Load the movie-details CSV; the file is pipe-delimited and UTF-8 encoded.
# NOTE(review): this is an absolute local path — adjust it to wherever the
# CSV lives on your machine.
Movies_Details = pd.read_csv(
    'C:/tmdb_movies_detail.csv',
    sep='|',
    encoding='utf-8',
)

In [4]:
# Preview the first rows to check that the file was parsed into the expected columns.
Movies_Details.head()

Unnamed: 0,movieId,genres,title,original_language,overview,popularity,release_date,runtime,vote_average,vote_count
0,19995,"['Action', 'Adventure', 'Fantasy', 'Science Fi...",Avatar,en,"In the 22nd century, a paraplegic Marine is di...",150.437577,2009-12-10,162.0,7.2,11800
1,285,"['Adventure', 'Fantasy', 'Action']",Pirates of the Caribbean: At World's End,en,"Captain Barbossa, long believed to be dead, ha...",139.082615,2007-05-19,169.0,6.9,4500
2,206647,"['Action', 'Adventure', 'Crime']",Spectre,en,A cryptic message from Bond’s past sends him o...,107.376788,2015-10-26,148.0,6.3,4466
3,49026,"['Action', 'Crime', 'Drama', 'Thriller']",The Dark Knight Rises,en,Following the death of District Attorney Harve...,112.31295,2012-07-16,165.0,7.6,9106
4,49529,"['Action', 'Adventure', 'Science Fiction']",John Carter,en,"John Carter is a war-weary, former military ca...",43.926995,2012-03-07,132.0,6.1,2124


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Some movies may have no overview at all; replace missing values with ''
# so that the vectorizer only ever receives strings.
Movies_Details['overview'] = Movies_Details['overview'].fillna('')

# Configure the TF-IDF vectorizer that will turn each overview into a row of
# a TF-IDF matrix: accents are stripped, English stop words are removed,
# tokens are runs of word characters, and 1- to 3-word n-grams are kept as
# long as they occur in at least 3 overviews.
Vectorized_Result = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

In [7]:
# Learn the vocabulary from the overviews and encode every overview as one
# row of the resulting TF-IDF matrix (row i corresponds to row i of Movies_Details).
Vectorized_Matrix = Vectorized_Result.fit_transform(Movies_Details['overview'])

In [9]:
# Compute the similarity scores: sigmoid_kernel takes the TF-IDF vectors of two movie overviews and returns their similarity.
# NOTE: the sigmoid kernel is tanh-based, so scores lie in (-1, 1) rather than in the 0-to-1 range.
from sklearn.metrics.pairwise import sigmoid_kernel

# The same matrix is passed twice because we want every overview compared
# against every overview (all pairwise combinations).
Sigmoid_Result = sigmoid_kernel(Vectorized_Matrix,Vectorized_Matrix)

In [10]:
# Lookup table from movie title to its row index in Movies_Details.
Indices = pd.Series(Movies_Details.index, index=Movies_Details['title'])

# Series.drop_duplicates() compares the VALUES (the row indices, which are
# already unique), so it never removed repeated titles. De-duplicate on the
# title labels instead, keeping the first movie listed for each title, so
# that Indices[title] always yields a single row index.
Indices = Indices[~Indices.index.duplicated(keep='first')]

In [11]:
# Inspect the title -> row-index lookup table.
Indices

title
Avatar                                         0
Pirates of the Caribbean: At World's End       1
Spectre                                        2
The Dark Knight Rises                          3
John Carter                                    4
                                            ... 
El Mariachi                                 4798
Newlyweds                                   4799
Signed, Sealed, Delivered                   4800
Shanghai Calling                            4801
My Date with Drew                           4802
Length: 4803, dtype: int64

In [13]:
def give_rec(title, Sigmoid_Result=Sigmoid_Result, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact movie title, used as a key into ``Indices``.
    Sigmoid_Result : array-like, optional
        Pairwise similarity matrix; row ``i`` holds the similarity of movie
        ``i`` against every movie. Defaults to the matrix that exists when
        this cell is executed.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Titles of up to ``top_n`` movies, ordered from most to least similar.
        The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``Indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Row index of the requested movie. When several movies share the same
    # title the lookup returns a Series instead of a scalar; use the first.
    match = Indices[title]
    idx = int(match.iloc[0]) if isinstance(match, pd.Series) else int(match)

    # Similarity of the requested movie against every movie.
    scores = np.asarray(Sigmoid_Result[idx]).ravel()

    # Rank from most to least similar. A stable sort keeps tied scores in
    # row order, matching sorted(..., reverse=True) on (index, score) pairs.
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the movie itself explicitly instead of assuming it sorts first:
    # with tied scores (e.g. an empty overview) it may not be at position 0.
    ranked = ranked[ranked != idx][:top_n]

    # Map the selected row positions back to titles in the original table.
    return Movies_Details['title'].iloc[ranked]

In [14]:
# Example: recommendations for a single title.
give_rec('Batman v Superman: Dawn of Justice')

3819                 Defendor
1359                   Batman
3       The Dark Knight Rises
428            Batman Returns
1720                 Kick-Ass
210            Batman & Robin
299            Batman Forever
119             Batman Begins
2638               Metropolis
589            Dracula Untold
Name: title, dtype: object