In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Locations of the two TMDb 5000 CSV files.
# NOTE(review): these are absolute paths on one machine; consider a configurable data dir.
movies_csv = r'C:\Users\Cliente\Desktop\Python\TMDb\data\tmdb_5000_movies.csv'
credits_csv = r'C:\Users\Cliente\Desktop\Python\TMDb\data\tmdb_5000_credits.csv'

# Read each file into its own DataFrame.
movies = pd.read_csv(movies_csv)
credits = pd.read_csv(credits_csv)

# List the available columns, then preview the first three rows.
print(movies.columns)
movies.head(3)

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466


In [7]:
# Look at the 'genres' value of the first row: its Python type, then its content.
first_genres = movies['genres'].iloc[0]
print(type(first_genres))
print(first_genres)

<class 'str'>
[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]


In [8]:
import ast

def parse_genres(x):
    """Turn a raw ``genres`` cell into a list of genre names.

    Args:
        x: Usually a string holding a Python-literal list of dicts that each
            carry a ``'name'`` key, e.g. ``'[{"id": 28, "name": "Action"}]'``.
            A list of strings is treated as already parsed.

    Returns:
        The list of ``'name'`` values, in order. Returns ``[]`` when ``x``
        cannot be parsed or does not have the expected structure (for
        example a missing value, malformed text, or dicts without ``'name'``).
    """
    # Already-parsed input (e.g. the cell was executed twice): hand back a copy
    # instead of failing in literal_eval and wiping the genres to [].
    if isinstance(x, list) and all(isinstance(name, str) for name in x):
        return list(x)

    try:
        genres = ast.literal_eval(x)
        return [d['name'] for d in genres]
    except (ValueError, SyntaxError, TypeError, KeyError,
            MemoryError, RecursionError):
        # Best-effort parsing: only the errors that malformed or unexpected
        # data can trigger are swallowed, so KeyboardInterrupt and genuine
        # programming errors still surface.
        return []

# Convert every raw 'genres' value with parse_genres and store the result
# back in the same column.
parsed_genres = movies['genres'].map(parse_genres)
movies['genres'] = parsed_genres

# Check the result
movies.loc[:, ['title', 'genres']].head(5)

Unnamed: 0,title,genres
0,Avatar,"[Action, Adventure, Fantasy, Science Fiction]"
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]"
2,Spectre,"[Action, Adventure, Crime]"
3,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]"
4,John Carter,"[Action, Adventure, Science Fiction]"


In [9]:
# Replace missing synopses (NaN) with an empty string.
movies['overview'] = movies['overview'].fillna('')

# Build the text used for vectorisation: the genre names joined by spaces,
# one separating space, then the overview.
genre_text = movies['genres'].apply(' '.join)
movies['metadata'] = genre_text + ' ' + movies['overview']

# Check the result
movies.loc[:, ['title', 'metadata']].head(3)

Unnamed: 0,title,metadata
0,Avatar,Action Adventure Fantasy Science Fiction In th...
1,Pirates of the Caribbean: At World's End,"Adventure Fantasy Action Captain Barbossa, lon..."
2,Spectre,Action Adventure Crime A cryptic message from ...


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF vectorizer that drops English stop words.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the metadata column and vectorise it in one pass.
corpus = movies['metadata']
tfidf_matrix = tfidf.fit_transform(corpus)

# Pairwise cosine similarity between all movies; with a single argument the
# matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

sim_shape = cosine_sim.shape
print("Matriz de similaridade calculada. Shape:", sim_shape)

Matriz de similaridade calculada. Shape: (4803, 4803)


In [11]:
# Series mapping each movie title to its row label in `movies`; the
# recommenders use that label as the row of `cosine_sim`.
# NOTE(review): this relies on `movies` keeping its default 0..n-1 index.
indices = pd.Series(movies.index, index=movies['title'])

# Keep only the first row for each title. Series.drop_duplicates() compares
# the *values* (the row labels, which are all distinct), so it never removed
# repeated titles; a lookup on a repeated title then returned a Series
# instead of a single row number.
indices = indices[~indices.index.duplicated(keep='first')]

# Example: which row holds "The Dark Knight Rises"?
print(indices['The Dark Knight Rises'])

3


In [12]:
def recomendar_filmes(titulo, num_recomendacoes=5):
    """Return the titles of the movies most similar to ``titulo``.

    Reads the module-level ``indices`` (title -> row number), ``cosine_sim``
    (pairwise similarity matrix) and ``movies`` (DataFrame with a ``'title'``
    column).

    Args:
        titulo: Movie title, exactly as stored in ``indices``.
        num_recomendacoes: Maximum number of titles to return; values <= 0
            give an empty result.

    Returns:
        The selected entries of ``movies['title']``, ordered from most to
        least similar. The queried movie itself is never included.

    Raises:
        KeyError: If ``titulo`` is not a key of ``indices``.
    """
    idx = indices[titulo]
    # A title that occurs more than once in `indices` yields a Series;
    # use its first occurrence so a single similarity row is read.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Rank every other movie by similarity, highest first. The queried movie
    # is excluded by position rather than by assuming it sorts first: a movie
    # with an all-zero vector, or one tied with others, would not sort first.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Clamp at zero so a negative request cannot become a "drop from the end" slice.
    wanted = max(num_recomendacoes, 0)
    filmes_indices = [pos for pos, _ in ranked[:wanted]]

    return movies['title'].iloc[filmes_indices]

# Try the function on a sample title
recomendar_filmes('The Dark Knight Rises')

65                              The Dark Knight
299                              Batman Forever
428                              Batman Returns
1359                                     Batman
3854    Batman: The Dark Knight Returns, Part 2
Name: title, dtype: object

In [13]:
def recomendar_filmes_detalhado(titulo, num_recomendacoes=5):
    """Return details of the movies most similar to ``titulo``.

    Reads the module-level ``indices`` (title -> row number), ``cosine_sim``
    (pairwise similarity matrix) and ``movies``.

    Args:
        titulo: Movie title, exactly as stored in ``indices``.
        num_recomendacoes: Maximum number of rows to return; values <= 0
            give an empty result.

    Returns:
        A DataFrame with the columns ``title``, ``release_date``,
        ``vote_average`` and ``vote_count``, ordered from most to least
        similar. The queried movie itself is never included.

    Raises:
        KeyError: If ``titulo`` is not a key of ``indices``.
    """
    idx = indices[titulo]
    # A title that occurs more than once in `indices` yields a Series;
    # use its first occurrence so a single similarity row is read.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Rank every other movie by similarity, highest first, excluding the
    # queried movie by position instead of assuming it sorts first.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Clamp at zero so a negative request cannot become a "drop from the end" slice.
    wanted = max(num_recomendacoes, 0)
    filmes_indices = [pos for pos, _ in ranked[:wanted]]

    # The collected numbers are row positions of `cosine_sim`, so select rows
    # positionally.
    colunas = ['title', 'release_date', 'vote_average', 'vote_count']
    return movies[colunas].iloc[filmes_indices]

In [14]:
# Show the detailed recommendations (default: up to 5) for a sample title.
recomendar_filmes_detalhado('The Dark Knight Rises')

Unnamed: 0,title,release_date,vote_average,vote_count
65,The Dark Knight,2008-07-16,8.2,12002
299,Batman Forever,1995-05-31,5.2,1498
428,Batman Returns,1992-06-19,6.6,1673
1359,Batman,1989-06-23,7.0,2096
3854,"Batman: The Dark Knight Returns, Part 2",2013-01-18,7.9,419


In [15]:
def recomendar_filmes_detalhado_filtrado(titulo, num_recomendacoes=5, min_votos=50):
    """Return details of the most similar movies that have enough votes.

    Reads the module-level ``indices`` (title -> row number), ``cosine_sim``
    (pairwise similarity matrix) and ``movies``.

    Args:
        titulo: Movie title, exactly as stored in ``indices``.
        num_recomendacoes: Maximum number of rows to return; values <= 0
            give an empty result.
        min_votos: Minimum ``vote_count`` a movie needs to be recommended.

    Returns:
        A DataFrame with the columns ``title``, ``release_date``,
        ``vote_average`` and ``vote_count``, ordered from most to least
        similar. The queried movie itself is never included. Fewer than
        ``num_recomendacoes`` rows are returned when not enough movies reach
        ``min_votos``.

    Raises:
        KeyError: If ``titulo`` is not a key of ``indices``.
    """
    idx = indices[titulo]
    # A title that occurs more than once in `indices` yields a Series;
    # use its first occurrence so a single similarity row is read.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # All movies ordered from most to least similar.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    votos = movies['vote_count']
    recomendados = []
    for pos, _score in ranked:
        # Test the quota before adding anything: checking `==` after the append
        # never fired for num_recomendacoes <= 0 and returned every match.
        if len(recomendados) >= num_recomendacoes:
            break
        # Skip the queried movie by position rather than assuming it sorts first.
        if pos == idx:
            continue
        if votos.iloc[pos] >= min_votos:
            recomendados.append(pos)

    # The collected numbers are row positions of `cosine_sim`, so select rows
    # positionally.
    colunas = ['title', 'release_date', 'vote_average', 'vote_count']
    return movies[colunas].iloc[recomendados]

In [16]:
# Show the vote-filtered recommendations (defaults: up to 5 rows, min_votos=50) for a sample title.
recomendar_filmes_detalhado_filtrado('The Dark Knight Rises')

Unnamed: 0,title,release_date,vote_average,vote_count
65,The Dark Knight,2008-07-16,8.2,12002
299,Batman Forever,1995-05-31,5.2,1498
428,Batman Returns,1992-06-19,6.6,1673
1359,Batman,1989-06-23,7.0,2096
3854,"Batman: The Dark Knight Returns, Part 2",2013-01-18,7.9,419
