In [None]:
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Load the datasets: one row per movie, plus the genre id -> name lookup table.
df_overview = pd.read_csv('data/movies_overview.csv')
movies_genres = pd.read_csv('data/movies_genres.csv')
genre_dict = dict(zip(movies_genres['id'], movies_genres['name']))


def parse_genre_ids(raw_ids, id_to_name):
    """Turn a raw ``genre_ids`` cell into a list of genre names.

    Args:
        raw_ids: Cell value from the ``genre_ids`` column. The parsing below
            expects a bracketed, comma-separated string such as "[28, 12]".
            Any non-string value (e.g. NaN for an empty CSV cell) yields [].
        id_to_name: Mapping from integer genre id to genre name.

    Returns:
        Genre names in the order their ids appear; non-numeric tokens are
        skipped.

    Raises:
        KeyError: If a numeric id is not present in ``id_to_name``.
    """
    if not isinstance(raw_ids, str):
        return []
    tokens = (token.strip() for token in raw_ids.strip("[]").split(","))
    return [id_to_name[int(token)] for token in tokens if token.isdigit()]


# Convert the genre id column into lists of genre names.
df_overview['genres'] = df_overview['genre_ids'].apply(parse_genre_ids, args=(genre_dict,))

# Drop rows with a missing synopsis.
df_overview = df_overview.dropna(subset=['overview'])
movies_with_overview = len(df_overview)

# Drop whitespace-only synopses from the dataframe itself (not only from the
# list), so that `overviews` stays row-aligned with `df_overview`. The topics
# returned by fit_transform are later assigned back as a column and must have
# exactly one entry per remaining row.
has_text = df_overview['overview'].astype(str).str.strip().ne("")
df_overview = df_overview.loc[has_text].copy()
overviews = df_overview['overview'].astype(str).tolist()
assert len(overviews) == len(df_overview), "overviews must stay aligned with df_overview"

print("\nDataset carregado.")
print(f"- Total de filmes: {movies_with_overview}")
print(f"- Sinopses válidas: {len(overviews)}")
print(f"- Gêneros disponíveis: {len(genre_dict)}")

  from .autonotebook import tqdm as notebook_tqdm



Dataset carregado com sucesso!
- Total de filmes: 9980
- Sinopses válidas: 9980
- Gêneros disponíveis: 19


In [None]:
# Sentence-embedding model used to encode the synopses.
embedding_model_name = 'all-MiniLM-L6-v2'
print(f"Carregando modelo de embedding: {embedding_model_name}...")
sentence_model = SentenceTransformer(embedding_model_name)

# Count vectorizer handed to BERTopic for the topic term representation.
vectorizer_options = {
    "stop_words": "english",
    "min_df": 2,
    "max_df": 0.9,
    "ngram_range": (1, 3),
}
vectorizer_model = CountVectorizer(**vectorizer_options)


Carregando modelo de embedding: all-MiniLM-L6-v2...


In [None]:
print("Inicializando e treinando o modelo BERTopic")

# Settings for the topic model; it reuses the embedding model and vectorizer
# built in the previous cell.
bertopic_options = {
    "embedding_model": sentence_model,
    "vectorizer_model": vectorizer_model,
    "language": "english",
    "calculate_probabilities": True,
    "verbose": True,
    # "min_topic_size": 20,  # left disabled
    # NOTE(review): 19 matches the number of genres reported when the data was
    # loaded; presumably intentional - confirm. The captured run shows 18
    # topics once the -1 outlier topic is removed.
    "nr_topics": 19,
}
topic_model = BERTopic(**bertopic_options)

# Fit on the synopses; returns one topic id per document plus probabilities.
topics, probabilities = topic_model.fit_transform(overviews)

2025-06-06 10:26:00,340 - BERTopic - Embedding - Transforming documents to embeddings.


Inicializando e treinando o modelo BERTopic...


Batches: 100%|██████████| 312/312 [04:46<00:00,  1.09it/s]
2025-06-06 10:30:47,398 - BERTopic - Embedding - Completed ✓
2025-06-06 10:30:47,399 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-06 10:31:30,555 - BERTopic - Dimensionality - Completed ✓
2025-06-06 10:31:30,559 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-06 10:31:38,204 - BERTopic - Cluster - Completed ✓
2025-06-06 10:31:38,206 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-06-06 10:31:39,904 - BERTopic - Representation - Completed ✓
2025-06-06 10:31:39,907 - BERTopic - Topic reduction - Reducing number of topics
2025-06-06 10:31:39,936 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-06 10:31:41,605 - BERTopic - Representation - Completed ✓
2025-06-06 10:31:41,609 - BERTopic - Topic reduction - Reduced number of topics from 109 to 19


In [None]:
# Store each document's assigned topic as a dataframe column.
df_overview['topic'] = topics

print("\nAnalisando resultados...")

# Per-topic summary, with the outlier topic (-1) removed.
topic_info = topic_model.get_topic_info().loc[lambda info: info['Topic'] != -1]
print(f"\nTópicos identificados ({len(topic_info)}):")
print(topic_info[['Topic', 'Count', 'Name']].head(10))

# Map every topic id to a comma-separated string of its keywords.
topic_keywords = {
    topic_id: ", ".join(term for term, _score in topic_model.get_topic(topic_id))
    for topic_id in topic_info['Topic']
}

# Relate topics to genres.
print("\nRelacionando tópicos com gêneros:")

def get_top_genres(group, n=3):
    """Return the ``n`` most frequent genres across a group of genre lists.

    Args:
        group: Iterable of genre lists (one list per movie in the topic).
        n: Maximum number of genres to return.

    Returns:
        Genre names ordered from most to least frequent.
    """
    tally = Counter()
    for movie_genres in group:
        for genre in movie_genres:
            tally[genre] += 1
    return [name for name, _count in tally.most_common(n)]

# Most frequent genres within each topic.
topic_genres = df_overview.groupby('topic')['genres'].apply(get_top_genres).reset_index()

# Join the topic summary with its genres, then add the keyword and genre
# strings and keep only the report columns.
topic_analysis = (
    topic_info[['Topic', 'Count', 'Name']]
    .merge(topic_genres, left_on='Topic', right_on='topic')
    .assign(**{
        'Keywords': lambda merged: merged['Topic'].map(topic_keywords),
        'Top Genres': lambda merged: merged['genres'].apply(", ".join),
    })
    [['Topic', 'Count', 'Top Genres', 'Keywords', 'Name']]
)

print("\nTópicos com gêneros associados:")
print(topic_analysis.head(10))

print("\nAnálise de Topic Modeling com BERTopic concluída.")


Analisando resultados...

Tópicos identificados (18):
    Topic  Count                                  Name
1       0    665             0_agent_police_murder_cia
2       1    664            1_woman_mother_love_father
3       2    471             2_coach_charlie_love_jack
4       3    370               3_king_world_goku_china
5       4    319           4_virus_group_robot_zombies
6       5    314           5_christmas_dog_santa_named
7       6    290             6_earth_planet_crew_space
8       7    242              7_music_singer_rock_band
9       8    235             8_italy_italian_jean_wife
10      9    233  9_vampire_vampires_halloween_dracula

Relacionando tópicos com gêneros...

Tópicos com gêneros associados:
   Topic  Count                          Top Genres  \
0      0    665             Thriller, Crime, Action   
1      1    664              Drama, Comedy, Romance   
2      2    471              Drama, Comedy, Romance   
3      3    370            Action, Adventure, Dram

In [6]:

# Build the topic visualization figure from the fitted model and render it.
fig_topics = topic_model.visualize_topics()
fig_topics.show()


In [7]:
# Build the topic hierarchy figure from the fitted model and render it.
fig_hierarchy = topic_model.visualize_hierarchy()
fig_hierarchy.show()


In [8]:
# Build the per-topic bar chart, limited to 10 topics via top_n_topics, and render it.
fig_barchart = topic_model.visualize_barchart(top_n_topics=10) 
fig_barchart.show()


In [9]:
# Persist the fitted model (including its c-TF-IDF data) to disk.
model_path = Path("models/modelo_bertopic_filmes.pkl")

# Create the target directory first: on a fresh checkout `models/` may not
# exist, and the save would otherwise fail only after the multi-minute
# training run has finished.
model_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): pickle files execute arbitrary code when loaded, so only load
# this artifact from a trusted source; consider BERTopic's "safetensors"
# serialization if the model is going to be shared.
topic_model.save(str(model_path), serialization="pickle", save_ctfidf=True)

