*This notebook implements BERTopic for TikTok comments on child-featuring videos.*

---



Install relevant libraries

In [None]:
!pip install -q bertopic umap-learn hdbscan scikit-learn transformers sentence-transformers tqdm polars



Run everything up to this point, then restart the runtime so that the newly installed packages are picked up.

In [None]:
import os

import pandas as pd

# Input CSV; it must contain a 'text' column with one comment per row.
file_path = "/INSERT-DATA-PATH.csv"

# Output directory for everything the notebook saves (embeddings, cleaned
# texts, model, figures). Later cells read SAVE_PATH, so it has to be defined
# before any of them runs.
SAVE_PATH = "/INSERT-SAVE-PATH"

# Load dataset
df = pd.read_csv(file_path)
texts = df['text'].tolist()

# Create the output directory up front so the first save does not fail
os.makedirs(SAVE_PATH, exist_ok=True)


Remove comments that are not meaningful for topic modelling (TM), such as comments that consist only of a mention

In [None]:
# A comment is dropped only when its entire text is a single mention
# (e.g. "@name", "@user123"); mentions inside longer comments are kept.
mention_pattern = r"^@\w+$"

# Boolean mask of mention-only rows; missing texts count as non-matching (na=False)
is_mention_only = df['text'].str.match(mention_pattern, na=False)
df = df.loc[~is_mention_only]

# Rebuild the plain list of comment texts from the filtered frame
texts = df['text'].to_list()


Embed comments

In [None]:
import os

from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm  # not needed by the batched encode below; kept in case other cells use it

# Load a pre-trained sentence transformer model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Coerce every entry to a string while keeping the list aligned with the rows
# of df (entries are replaced, never dropped). Missing values arrive as float
# NaN; they must become "" explicitly, otherwise str(nan) would be embedded as
# the literal comment "nan".
texts = [
    str(text)
    if isinstance(text, (str, int, float))
    and not (isinstance(text, float) and np.isnan(text))
    else ""
    for text in texts
]

# Encode all comments in batches (much faster than one encode() call per
# comment); the model shows its own progress bar and returns a numpy array
# with one row per text.
embeddings = embedding_model.encode(
    texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
)

# Save embeddings for later reuse; make sure the target directory exists first
os.makedirs(SAVE_PATH, exist_ok=True)
np.save(f"{SAVE_PATH}/text_embeddings.npy", embeddings)


Save the cleaned texts next to the embeddings so that both can be reloaded at a later stage (optional)

In [None]:
import json

# Persist the cleaned comment list as a JSON array in the output directory
cleaned_texts_path = f"{SAVE_PATH}/cleaned_texts.json"
with open(cleaned_texts_path, "w") as json_file:
    json_file.write(json.dumps(texts))



Run TM

In [None]:
from bertopic import BERTopic
import umap

# texts, embeddings and df must describe the same comments in the same order,
# because topics are written back to df by position below.
assert len(texts) == len(embeddings) == len(df), "texts, embeddings and df are out of sync"

# Initialize UMAP for dimensionality reduction.
# random_state pins UMAP's stochastic layout so that repeated runs give the
# same topics; later cells refer to topics by hard-coded id.
umap_model = umap.UMAP(n_neighbors=15, n_components=5, metric='cosine', random_state=42)

# Fit BERTopic model on the precomputed embeddings
topic_model = BERTopic(umap_model=umap_model, embedding_model="all-MiniLM-L6-v2", verbose=True)
topics, probs = topic_model.fit_transform(texts, embeddings)

# Save topic assignments to DataFrame (one topic id per row, in row order)
df['topic'] = topics


Save model and topic assignments (optional)

In [None]:
import os
import joblib

# Create the output directory if it is missing
os.makedirs(SAVE_PATH, exist_ok=True)

# Destinations for the three artefacts written by this cell
model_path = f"{SAVE_PATH}/bertopic_model"
umap_path = f"{SAVE_PATH}/umap_model.pkl"
assignments_path = f"{SAVE_PATH}/comments_with_topics.csv"

# Fitted BERTopic model
topic_model.save(model_path)

# UMAP reducer that was passed to BERTopic
joblib.dump(umap_model, umap_path)

# Comments together with their topic assignments
df.to_csv(assignments_path, index=False)


Load more libraries for visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [None]:
# NOTE(review): no earlier cell in this notebook creates df["new_topics"] or
# `reduced_embeddings`; they presumably came from cells that were removed
# (topic post-processing and a 2D UMAP projection). The fallbacks below keep
# the notebook runnable from a fresh kernel. Confirm they match the original
# pipeline, since the hard-coded topic ids further down refer to "new_topics".
if "new_topics" not in df.columns:
    df["new_topics"] = df["topic"]
if "reduced_embeddings" not in globals():
    reduced_embeddings = umap.UMAP(
        n_neighbors=15, n_components=2, metric="cosine", random_state=42
    ).fit_transform(embeddings)

# Ensure topics are integers
df["new_topics"] = df["new_topics"].astype(int)

# Attach the 2D coordinates (first two columns of the reduced embeddings)
df["x"], df["y"] = reduced_embeddings[:, 0], reduced_embeddings[:, 1]


##### Exclude non-meaningful topics

# Topic ids to exclude from the visualization (chosen by hand for one specific model run)
excluded_topics = {-1, 0, 1, 3, 7, 8, 13, 17, 18, 19, 22}

# Work on an independent copy so the column assignments below modify
# df_filtered itself instead of a view of df (avoids SettingWithCopyWarning).
df_filtered = df[~df["new_topics"].isin(excluded_topics)].copy()
#####

# Blend each coordinate with its signed log to pull far-away points inwards
alpha = 0.6  # Adjust between 0 (no log effect) and 1 (full log effect)


def _soft_log(values, alpha):
    """Return alpha * sign(v) * log1p(|v|) + (1 - alpha) * v, element-wise."""
    return alpha * (np.sign(values) * np.log1p(np.abs(values))) + (1 - alpha) * values


df_filtered["x"] = _soft_log(df_filtered["x"], alpha)
df_filtered["y"] = _soft_log(df_filtered["y"], alpha)


# Add jitter; a seeded generator makes the figure reproducible across runs
rng = np.random.default_rng(42)
jitter_strength = 0.6  # Adjust this value for more or less jiggle
df_filtered["x"] += rng.normal(0, jitter_strength, len(df_filtered))
df_filtered["y"] += rng.normal(0, jitter_strength, len(df_filtered))

### Normalize and rescale the 2D coordinates
scaler = MinMaxScaler(feature_range=(-1, 1))  # Normalize x and y to fit between -1 and 1
df_filtered[["x", "y"]] = scaler.fit_transform(df_filtered[["x", "y"]])

scaling_factor = 0.5  # to condense space
df_filtered["x"] *= scaling_factor
df_filtered["y"] *= scaling_factor

### Re-center the image by excluding extreme outliers on the left
# np.percentile takes a value on a 0-100 scale, so 0.2 is the 0.2th percentile:
# only the lowest 0.2% of x-values are removed.
# NOTE(review): earlier comments here spoke of the 10th percentile; pass 10 if
# that was the intent.
x_lower = np.percentile(df_filtered["x"], 0.2)
df_filtered = df_filtered[df_filtered["x"] > x_lower].copy()

# Keep only the 25 most prevalent topics (after the exclusions above)
top_25_topics = df_filtered["new_topics"].value_counts().index[:25]
df_filtered = df_filtered[df_filtered["new_topics"].isin(top_25_topics)].copy()


# Number of remaining points per topic, as {topic id: count}
topic_counts = df_filtered["new_topics"].value_counts().to_dict()

# Combine two qualitative palettes to get 25 distinct colours (20 + 5)
palette = (
    sns.color_palette("tab20", 20) +
    sns.color_palette("Dark2", 5)
)

# Keep exactly one colour per displayed topic
palette = palette[:len(top_25_topics)]

# Colour lookup by topic id; colours are handed out in the order of
# top_25_topics (most frequent topic first).
topic_colors = dict(zip(top_25_topics, palette))

# Create a compact figure
fig, ax = plt.subplots(figsize=(10, 7), dpi=300)

# Size of the largest topic; used to scale opacity and bandwidth below
max_count = max(topic_counts.values())

# One filled KDE layer per topic
for topic in top_25_topics:
    topic_data = df_filtered[df_filtered["new_topics"] == topic]
    topic_size = topic_counts[topic]  # Number of points in this topic

    # Skip topics with too few unique points
    if len(topic_data["x"].unique()) < 5 or len(topic_data["y"].unique()) < 5:
        continue

    relative_size = topic_size / max_count

    # Bandwidth multiplier grows with topic size: 0.5 for tiny topics up to
    # 1.2 for the largest one (larger topics are smoothed more).
    bw_factor = 0.7 * relative_size + 0.5

    sns.kdeplot(
        x=topic_data["x"], y=topic_data["y"],
        ax=ax, fill=True, alpha=0.1 + 0.2 * relative_size,  # Larger topics get stronger alpha
        cmap=sns.light_palette(topic_colors[topic], as_cmap=True),
        levels=3,  # Same number of contour levels for every topic
        bw_adjust=bw_factor,  # Topic-dependent KDE smoothing
        thresh=0.05  # Avoids gaps
    )


# Scatter plot for topic clusters with point size based on topic frequency
df_filtered["point_size"] = df_filtered["new_topics"].map(topic_counts.get)
df_filtered["point_size"] = df_filtered["point_size"].fillna(df_filtered["point_size"].mean()).astype(int)

# Rescale counts linearly to the range 2 (rarest) .. 27 (largest topic).
# NOTE(review): seaborn maps the `size` variable onto its own marker-size range
# unless `sizes=` is given, so these numbers set relative, not absolute, sizes.
df_filtered["point_size"] = 2 + 25 * (df_filtered["point_size"] / max_count)

# Explicit topic -> colour mapping. A plain list would be assigned to the hue
# levels in sorted topic-id order, whereas the KDE layers take their colours in
# the order of top_25_topics; the dict keeps points and densities consistent.
topic_palette = dict(zip(top_25_topics, palette))

scatter = sns.scatterplot(
    x="x", y="y", hue="new_topics", palette=topic_palette, data=df_filtered,
    alpha=0.3, size="point_size", edgecolor="w", ax=ax,
    rasterized=True
)

# Hand-written display names for a subset of topics, keyed by topic id
topic_labels = {
    2: "cuteness\nexpressions",
    #5: "cuteness expressions (cuteeee)",
    9: "outfits",
    10: "babies and pregnancy",
    24: "family",
    43: "videos",
    12: "beauty",
    6: "tiktok",
    34: "pets",
    11: "music",
    21: "crying",
}

fixed_fontsize = 9  # Same font size for every topic label

# Place each label at the median position of its topic's points
for topic, label in topic_labels.items():
    topic_data = df_filtered[df_filtered["new_topics"] == topic]
    if topic_data.empty:
        # Topic is not part of the plotted data; nothing to annotate
        continue

    x_center = topic_data["x"].median()
    y_center = topic_data["y"].median()

    ax.text(
        x_center,
        y_center,
        label,
        fontsize=fixed_fontsize,
        fontfamily="serif",
        weight="semibold",
        ha="center",
        va="center",
        color="black",
        bbox=dict(facecolor="white", edgecolor="none", alpha=0.6, boxstyle="round,pad=0.3"),
    )


import matplotlib.ticker as ticker

# Plain canvas: no grid, and an empty legend replaces the one seaborn adds
ax.grid(False)
ax.legend([], [], frameon=False) # no legend

# Finer ticks: (which, labelsize, length, width)
for which, labelsize, length, width in (("major", 7, 3, 0.6), ("minor", 6, 2, 0.4)):
    ax.tick_params(axis="both", which=which, labelsize=labelsize, length=length, width=width)

# Thin frame around the axes
for side in ax.spines:
    ax.spines[side].set_linewidth(0.4)

# Only two tick positions on each axis
for set_ticks in (ax.set_xticks, ax.set_yticks):
    set_ticks([0.1, 0.5])

# Global font-size defaults.
# NOTE(review): rc settings are usually read when artists are created, so these
# may only affect figures drawn after this point. Confirm.
for group, size in (("axes", 8), ("xtick", 7), ("ytick", 7)):
    plt.rc(group, labelsize=size)

# Add padding around the axes
plt.subplots_adjust(left=0.12, right=0.88, top=0.92, bottom=0.12)

# Save visualization as PDF, then display it
figure_path = f"{SAVE_PATH}/prominent_topic_visualization_log.pdf"
plt.savefig(figure_path, dpi=300, bbox_inches="tight")
plt.show()



In [None]:
# Collect up to five representative words for each topic shown in the graph.
# NOTE(review): top_25_topics holds "new_topics" ids; this assumes they use the
# same numbering as topic_model. Confirm.
topic_words_dict = {}

# Fetch the model's topics once instead of on every loop iteration
model_topics = topic_model.get_topics()

for topic in top_25_topics:
    if topic in model_topics:
        words = topic_model.get_topic(topic)
        # Each entry is a (word, score) pair; skip empty words, which otherwise
        # show up as dangling ", " in the printed list.
        word_list = [w[0] for w in words if w[0]]
        topic_words_dict[topic] = word_list[:5]  # Top 5 words for each topic

# Print topic ids and corresponding words
print("Topics & Their Representative Words in the Graph:")
for topic, words in topic_words_dict.items():
    print(f"Topic {topic}: {', '.join(words)}")



🔹 **Topics & Their Representative Words in the Graph:**
Topic 0: awwwww, awwww, awwwwww, wow, awww
Topic 2: cute, so, byba, massaallah, ooow
Topic 4: cute, so, damn, yeah, 
Topic 5: cutee, cuteeee, cuteeeee, cuteee, cuteeeeee
Topic 6: tiktok, tiktoks, tiktoker, tiktokers, tictok
Topic 9: dressing, crop, clothes, dress, dressed
Topic 10: cute, usual, stuff, different, as
Topic 11: song, music, tune, songs, lyrics
Topic 12: pretties, hoe, coffeeformycream, neff, zoooo
Topic 15: food, eat, touched, hungry, eating
Topic 16: wow, wowl, ln, uhm, there
Topic 20: sis, sister, sisters, sissy, big
Topic 21: cry, cries, crys, crying, cried
Topic 23: gymnastics, gymnast, gymnastic, coach, gymnasts
Topic 24: babys, hanging, baby, dear, boinggg
Topic 27: trigg, triggs, triggie, nos, skincare
Topic 29: father, dad, great, respect, fatherhood
Topic 31: nice, niceeeee, nicedo, eeh, waoooooo
Topic 30: awww, bur, standards, emmy, raise
Topic 32: momma, mama, mamas, mommas, doing
Topic 34: pet, ducklings

In [None]:
# Get topic information for full BERTopic
topic_info = topic_model.get_topic_info()

# Maximum number of rows of topic_info to print
n_topics_to_print = 68

# head() caps the selection at the number of available rows, so a model with
# fewer topics no longer raises IndexError.
shown = topic_info.head(n_topics_to_print)

# The header reports the number of topics actually printed
print(f"First {len(shown)} topics with frequency and keywords:")
for topic_id, freq, words in zip(shown["Topic"], shown["Count"], shown["Representation"]):
    # Drop empty keywords so the joined list has no dangling separators
    keywords = [word for word in words if word]

    print(f"Topic {topic_id}: ({freq} occurrences)")
    print(f"   Keywords: {', '.join(keywords)}\n")