In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from bertopic import BERTopic

# carregando os dados
overview_df = pd.read_csv("data/movies_overview.csv")
genres_df = pd.read_csv("data/movies_genres.csv")

# juntando os generos
genre_map = dict(zip(genres_df["id"], genres_df["name"]))

def decode_genres(genre_ids_str):
    import ast
    try:
        genre_ids = ast.literal_eval(genre_ids_str)
        return ", ".join([genre_map.get(gid, "") for gid in genre_ids])
    except:
        return ""

overview_df["genres"] = overview_df["genre_ids"].apply(decode_genres)



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# fine tuning
# criando pares de sinopses (exemplo: mesma sinopse com mesmo gênero)
examples = [
    InputExample(texts=[row["overview"], row["overview"]], label=1.0)
    for _, row in overview_df.iterrows()
    if isinstance(row["overview"], str) and row["overview"].strip() != ""
]

# modelo base para fine tuning
base_model = SentenceTransformer("all-MiniLM-L6-v2")

#dividindo os dados
train_dataloader = DataLoader(examples, shuffle=True, batch_size=32)
train_loss = losses.CosineSimilarityLoss(model=base_model)

# treinando
base_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100
)

# Salvar o modelo
base_model.save("fine_tuned_imdb_model")


                                                                     

Step,Training Loss


In [3]:
# gerando embeddings
valid_overviews = overview_df["overview"].dropna().tolist()
embeddings = base_model.encode(valid_overviews, show_progress_bar=True)



Batches: 100%|██████████| 312/312 [00:37<00:00,  8.23it/s]


In [4]:
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer

#umap para redução de dimensionalidade
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

#  vetorizador para remover stopwords 
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 3))

topic_model = BERTopic(
    embedding_model=base_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    verbose=True
)


In [5]:
# treinando o modelo BERTopic
topics, probs = topic_model.fit_transform(valid_overviews, embeddings)

# 6. visualizações e resultados
topic_model.visualize_topics().show()
topic_model.visualize_barchart(top_n_topics=10).show()
topic_model.save("bertopic_imdb_model")


2025-06-08 21:06:52,988 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-08 21:07:25,262 - BERTopic - Dimensionality - Completed ✓
2025-06-08 21:07:25,262 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-08 21:07:25,730 - BERTopic - Cluster - Completed ✓
2025-06-08 21:07:25,730 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-08 21:07:26,465 - BERTopic - Representation - Completed ✓




In [6]:
# visualização hierárquica dos tópicos
hierarchical_topics = topic_model.hierarchical_topics(valid_overviews)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics).show(height=800, width=1200)


100%|██████████| 62/62 [00:00<00:00, 456.16it/s]


In [7]:
import ast

movies_overview = pd.read_csv('data/movies_overview.csv')
movies_genres = pd.read_csv('data/movies_genres.csv')

# convertendo 'genre_ids' de string para lista
movies_overview['genre_ids'] = movies_overview['genre_ids'].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else [])

# dicionário de gêneros
genre_dict = movies_genres.set_index('id')['name'].to_dict()

# coluna com nomes dos gêneros
movies_overview['genres'] = movies_overview['genre_ids'].apply(
    lambda ids: [genre_dict[id] for id in ids if id in genre_dict]
)

# remoção de linhas sem overview
movies_overview = movies_overview.dropna(subset=['overview']).reset_index(drop=True)

In [8]:
from bertopic.plotting import visualize_topics_per_class


#informações dos tópicos
topic_info = topic_model.get_topic_info()

#  mapeamentos
topic_name_map = dict(zip(topic_info['Topic'], topic_info['Name']))

# extração de palavras-chave 
def get_topic_words(topic):
    try:
        topic_words = topic_model.get_topic(topic)
        if isinstance(topic_words[0], tuple):
            return [word for word, _ in topic_words[:5]]
        else:
            return topic_words[:5]
    except:
        return ["N/A"]

# criação do dataframe para tópicos por classe
data = []
for idx, (topic, genres) in enumerate(zip(topics, movies_overview['genres'])):
    for genre in genres:
        data.append({
            'Genre': genre,
            'Topic': topic,
            'Name': topic_name_map.get(topic, f"Topic_{topic}"),
            'Words': ", ".join(get_topic_words(topic))
        })

topics_per_class = pd.DataFrame(data)

# cálculo das frequências dos tópicos por gênero
counts = topics_per_class.groupby(['Genre', 'Topic', 'Name', 'Words']).size().reset_index(name='Count')
genre_totals = counts.groupby('Genre')['Count'].sum()
counts['Frequency'] = counts.apply(lambda x: x['Count'] / genre_totals[x['Genre']], axis=1)


try:
    #método padrão primeiro
    fig = topic_model.visualize_topics_per_class(
        counts.rename(columns={'Genre': 'Class'}),
        normalize_frequency=True,
        title='Distribuição de Tópicos por Gênero',
        height=600,
        width=1200
    )
    fig.show()
except Exception as e:
    print(f"Usando visualização alternativa devido a: {str(e)}")
    
    # visualização alternativa 
    import plotly.express as px
    
    top_genres = counts.groupby('Genre')['Count'].sum().nlargest(15).index
    filtered = counts[counts['Genre'].isin(top_genres)]
    
    fig = px.bar(
        filtered.sort_values(['Genre', 'Frequency'], ascending=[True, False]),
        x='Genre',
        y='Frequency',
        color='Name',
        hover_data=['Words'],
        facet_row='Name',
        height=1200,
        title='Distribuição de Tópicos por Gênero (Top 15)'
    )
    fig.update_layout(showlegend=False)
    fig.show()

In [9]:
#matriz de similaridade entre tópicos
topic_model.visualize_heatmap().show()


In [10]:
from collections import Counter


df_overview = movies_overview
df_overview['topic'] = topics



In [11]:
topic_keywords = {}
for topic_id in topic_info['Topic']:
    keywords = [word for word, _ in topic_model.get_topic(topic_id)]
    topic_keywords[topic_id] = ", ".join(keywords)

# relacionar tópicos com gêneros
print("\nRelacionando tópicos com gêneros:")

# função para encontrar gêneros mais comuns por tópico
def get_top_genres(group, n=3):
    all_genres = [genre for sublist in group for genre in sublist]
    counter = Counter(all_genres)
    return [genre for genre, _ in counter.most_common(n)]

# calcular top gêneros por tópico
topic_genres = (
    df_overview.groupby('topic')['genres']
    .apply(get_top_genres)
    .reset_index()
)

# combinar as infos
topic_analysis = pd.merge(
    topic_info[['Topic', 'Count', 'Name']],
    topic_genres,
    left_on='Topic',
    right_on='topic'
)

topic_analysis['Keywords'] = topic_analysis['Topic'].map(topic_keywords)
topic_analysis['Top Genres'] = topic_analysis['genres'].apply(lambda x: ", ".join(x))
topic_analysis = topic_analysis[['Topic', 'Count', 'Top Genres', 'Keywords', 'Name']]

print("\nTópicos com gêneros associados:")
print(topic_analysis.head(10))


Relacionando tópicos com gêneros:

Tópicos com gêneros associados:
   Topic  Count                          Top Genres  \
0     -1   6553             Drama, Comedy, Thriller   
1      0    557              Drama, Comedy, Romance   
2      1    389          Adventure, Action, Fantasy   
3      2    314             Crime, Thriller, Action   
4      3    140  Science Fiction, Action, Adventure   
5      4    134                 Drama, War, History   
6      5    128            Horror, Thriller, Comedy   
7      6     98         Thriller, Action, Adventure   
8      7     88   Science Fiction, Horror, Thriller   
9      8     87              Romance, Comedy, Drama   

                                            Keywords  \
0  life, young, new, family, world, man, old, lov...   
1  life, film, story, star, music, world, band, f...   
2  king, ancient, goku, power, evil, dragon, warr...   
3  police, cop, killer, case, officer, murder, dr...   
4  earth, planet, space, alien, crew, mission,