In [1]:
from umap import UMAP
from datasets import load_dataset
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download (or load from the local cache) the MLMA hate-speech dataset.
dataset = load_dataset("nedjmaou/MLMA_hate_speech")

# Materialise the tweet column as a plain list of strings. Depending on the
# installed `datasets` version, column access may return a lazy column object
# rather than a list; `list(...)` gives the topic model the same concrete
# input either way.
tweets = list(dataset['train']['tweet'])

In [4]:
# Dimensionality reduction. `random_state` is pinned so that repeated runs of
# this notebook produce the same projection (and therefore the same topics).
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Seeded c-TF-IDF: the listed seed words have their weight multiplied by
# `seed_multiplier`, making them more likely to surface in the topic
# representations. Any number of seed words may be supplied.
# NOTE(review): these seed words look unrelated to a hate-speech tweet
# corpus -- confirm they are the intended ones for this dataset.
ctfidf_model = ClassTfidfTransformer(
    seed_words=["agent", "robot", "behavior", "policies", "environment"],
    seed_multiplier=2,
    reduce_frequent_words=True,
)

# Sentence encoder used to embed the documents.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Density-based clustering of the reduced embeddings.
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Bag-of-words step that feeds c-TF-IDF.
# NOTE(review): only English stop words are removed, while the generated
# labels below include French and Arabic function words -- confirm that is
# acceptable for this corpus.
vectorizer_model = CountVectorizer(
    min_df=10,
    stop_words="english",
)

In [5]:
# Assemble the topic model from the components configured earlier (including
# the seeded c-TF-IDF model), then fit it on the tweets.
# NOTE(review): BERTopic documents `min_topic_size` as unused when a custom
# `hdbscan_model` is supplied; the effective cluster-size floor is the
# `min_cluster_size` given to HDBSCAN.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    min_topic_size=15,
)

# `fit` returns the fitted model itself, so rebinding keeps the same object
# and avoids echoing its repr as the cell output.
topic_model = topic_model.fit(tweets)


In [6]:
# Build one label per topic: the topic id as a prefix followed by its top
# three words, joined with ", ". `word_length` caps the length of each word
# so that labels stay comparably short.
topic_labels = topic_model.generate_topic_labels(
    nr_words=3,
    topic_prefix=True,
    word_length=10,
    separator=", ",
)

In [7]:
# Register the generated labels on the model as its custom topic labels.
topic_model.set_topic_labels(topic_labels)
# Bare last expression: the notebook displays the labels now stored on the model.
topic_model.custom_labels_


['-1, u201d, america, game',
 '0, انت, هذا, لا',
 '1, long, eyes, racist',
 '2, mental, vraiment, es',
 '3, mongoloid, absolute, gt',
 '4, retard, ok, lmao',
 '5, gros, quel, vrai',
 '6, spic, called, racist',
 '7, gauchiste, rt, ca',
 '8, vote, care, home',
 '9, twat, little, funny',
 '10, dick, faggot, yo',
 '11, feel, nigga, dumb',
 '12, renois, ils, trop',
 '13, country, shithole, leave',
 '14, faggot, mother, mom',
 '15, short, baby, dick',
 '16, ceux, pro, aux',
 '17, men, women, fake',
 '18, lt, different, went',
 '19, send, europe, truth',
 '20, sick, cunt, complete',
 '21, ude02, ud83d, ud83c',
 '22, type, die, send',
 '23, media, right, anti',
 '24, jesus, horrible, absolute',
 '25, fans, year, second',
 '26, sale, toi, ta',
 '27, بايرة, اللاجئون, ولا',
 '28, absolutely, went, account',
 '29, word, black, lost',
 '30, negro, black, watching',
 '31, peu, question, aux',
 '32, منحرفة, وسخ, هل',
 '33, countries, world, america',
 '34, told, thats, calling',
 '35, okay, white, ra

In [8]:
# Date stamp (YYYY-MM-DD) used to version the saved model. Re-running on the
# same day targets the same directory.
current_date = datetime.now().strftime("%Y-%m-%d")

# Directory the model is written to.
unique_model_path = f"./output/model_{current_date}/"

# Save the topic model in safetensors format, including the c-TF-IDF data.
# For safetensors/pytorch serialization BERTopic expects `save_embedding_model`
# to be a string pointing at a Hugging Face model, which is recorded in the
# saved config so the embedding model can be restored on load. Passing the
# SentenceTransformer object itself is not the documented usage.
# Keep this id in sync with the encoder configured for `embedding_model`.
topic_model.save(
    unique_model_path,
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
)

In [9]:
# Build the document-level visualisation for the tweets and export it as a
# standalone HTML file.
# NOTE(review): assumes the ./output directory already exists (presumably
# created by the model-saving step) -- verify when running this cell alone.
fig = topic_model.visualize_documents(tweets)
fig.write_html("./output/doc_viz.html")

In [None]:
# Build the topic heatmap figure and export it as a standalone HTML file.
# Rebinds `fig`, replacing the figure from the previous cell.
# NOTE(review): assumes the ./output directory already exists -- verify when
# running this cell alone.
fig = topic_model.visualize_heatmap()
fig.write_html("./output/heatmap_viz.html")

In [10]:
# Bare last expression: the topic overview figure is displayed as the cell's
# output and is not written to disk.
topic_model.visualize_topics()