In [None]:
import json
import re

import pandas as pd
from bs4 import BeautifulSoup
from fast_langdetect import detect_language

# Mastodon: load the raw posts and build one cleaned row per non-empty post.
# The file is opened as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open(file="../../echo-chamber/data/mastodon.trump.json", encoding="utf-8") as f:
    data = json.load(f)

md = []
for post in data:
    in_reply_to_id = post.get("in_reply_to_id")
    # A post counts as a reply when it carries a non-empty parent id.
    reply = bool(in_reply_to_id)

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(post["content"], "html.parser")

    # Collapse every <a> element into its own text, joined WITHOUT separators,
    # so a link whose visible text is split across child elements becomes one
    # contiguous token. The anchor text itself is kept, not discarded.
    for a_tag in soup.find_all("a"):
        a_tag.replace_with(a_tag.get_text())

    # Get the plain text
    plain_text = soup.get_text(separator=" ")

    # Strip URLs. A URL ends at the first whitespace character, so the words
    # that follow a link are preserved.
    cleaned_content = re.sub(r"https?://\S+", "", plain_text)

    # Remove excess spaces left by removing the link
    cleaned_content = re.sub(r"\s+", " ", cleaned_content).strip()

    # Nothing left after cleaning: skip the post (and the language detection).
    if not cleaned_content:
        continue

    # Lower-case the language code so it is comparable with the "en" checks
    # used for the other platforms in this notebook.
    lang = detect_language(cleaned_content).lower()
    md.append([post["id"], cleaned_content, lang, reply, in_reply_to_id])

md_df = pd.DataFrame(md, columns=["id", "post", "lang", "reply", "in_reply_to_id"])

In [None]:
# BlueSky: load the raw posts and build one row per post that has text.
import pandas as pd
from fast_langdetect import detect_language

# Opened as UTF-8 explicitly so the result does not depend on the platform's
# default encoding.
with open(file="../../echo-chamber/data/bsky.trump.json", encoding="utf-8") as f:
    bsky = json.load(f)

bsky_data = []
for post in bsky:
    record = post["record"]
    text = record.get("text")
    # Posts without text are skipped entirely.
    if not text:
        continue

    # `reply` / `parent` may be absent or null; treat both as "not a reply"
    # instead of failing on a null value.
    reply_ref = record.get("reply") or {}
    parent_uri = (reply_ref.get("parent") or {}).get("uri")

    # Prefer the first language tag stored on the record; fall back to
    # detection (newlines replaced by spaces first) when none is stored.
    # NOTE(review): stored tags are used verbatim — a regional tag such as
    # "en-US" would not equal "en" in later filters; confirm whether the data
    # contains such tags.
    langs = record.get("langs")
    language = langs[0] if langs else None
    if not language:
        language = detect_language(text.replace("\n", " ")).lower()

    bsky_data.append([post["_id"], text, language, bool(reply_ref), parent_uri])

bsky_df = pd.DataFrame(bsky_data, columns=["id", "post", "lang", "reply", "in_reply_to_id"])



# Truth Social: load the raw posts and build one cleaned row per non-empty post.
# Opened as UTF-8 explicitly so the result does not depend on the platform's
# default encoding.
with open(file="../../echo-chamber/data/truthsocial.trump.json", encoding="utf-8") as f:
    ts = json.load(f)

ts_data = []
for post in ts:
    in_reply_to_id = post.get("in_reply_to_id")
    # Both direct replies and quote posts are flagged as replies.
    reply = bool(in_reply_to_id or post.get("quote_id"))

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(post["content"], "html.parser")

    # Collapse every <a> element into its own text, joined WITHOUT separators,
    # so a link whose visible text is split across child elements becomes one
    # contiguous token. The anchor text itself is kept, not discarded.
    for a_tag in soup.find_all("a"):
        a_tag.replace_with(a_tag.get_text())

    # Get the plain text
    plain_text = soup.get_text(separator=" ")

    # Strip URLs. A URL ends at the first whitespace character, so the words
    # that follow a link are preserved.
    cleaned_content = re.sub(r"https?://\S+", "", plain_text)

    # Remove excess spaces left by removing the link
    cleaned_content = re.sub(r"\s+", " ", cleaned_content).strip()

    # Nothing left after cleaning: skip the post (and the language detection).
    if not cleaned_content:
        continue

    # Whitespace is already collapsed to single spaces above, so no newline
    # handling is needed here. The text is lower-cased before detection, and
    # the returned language code is lower-cased as well.
    lang = detect_language(cleaned_content.lower()).lower()
    ts_data.append([post["id"], cleaned_content, lang, reply, in_reply_to_id])

ts_df = pd.DataFrame(ts_data, columns=["id", "post", 'lang', 'reply', 'in_reply_to_id'])

# Data Preprocessing

In [None]:
# BlueSky: keep only posts whose stored language code is English.
bsky_posts = list(bsky_df.loc[(bsky_df["lang"] == "en"), "post"].dropna())

# Drop posts that contain the `event":"initializing","ts"` marker, then
# re-check the language on the text itself. Newlines are replaced by a space
# (not removed) so that words on adjacent lines are not glued together before
# detection.
bsky_posts = [
    post
    for post in bsky_posts
    if 'event":"initializing","ts"' not in post
    and detect_language(post.replace("\n", " ")).lower() == "en"
]

# Truth Social: every non-null post is used as stored in `ts_df`.
# NOTE(review): unlike BlueSky, these posts are not filtered by `lang` here —
# confirm that this is intended.
ts_posts = list(ts_df["post"].dropna())

all_posts = bsky_posts + ts_posts

# Keep only posts that mention one of the two candidates (case-insensitive).
target_posts = []
for post in all_posts:
    lowered = post.lower()
    if "trump" in lowered or "biden" in lowered:
        target_posts.append(post)

In [None]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from nltk.corpus import stopwords
import nltk
import cudf
from sentence_transformers import SentenceTransformer
from cuml.manifold import UMAP
from cuml.cluster import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.cluster import KMeans

# NOTE(review): `cluster_model` is not passed to BERTopic in this cell; it is
# kept in case another cell uses it.
cluster_model = KMeans(n_clusters=20)
# Fit BERTopic without actually performing any dimensionality reduction
empty_dimensionality_model = BaseDimensionalityReduction()

# English stopwords from NLTK. On an environment where the corpus has not been
# downloaded yet, `stopwords.words` raises LookupError; fetch it once and retry.
# (An earlier multilingual stopword list was dropped in favour of English only.)
try:
    all_stopwords = stopwords.words("english")
except LookupError:
    nltk.download("stopwords")
    all_stopwords = stopwords.words("english")

custom_stopwords = [
    "com",
    "www",
    "2024",
    "http",
    "https",
    "bsky",
    "social",
]  # Add domain-specific stopwords
# Merge and deduplicate; sorted so the list is identical on every run.
all_stopwords = sorted(set(all_stopwords) | set(custom_stopwords))

# Initialize the CountVectorizer with the custom stopwords
vectorizer_model = CountVectorizer(
    stop_words=all_stopwords, min_df=2, ngram_range=(1, 2)
)

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# NOTE(review): `umap_model` is configured but BERTopic below receives
# `empty_dimensionality_model` instead, so it is not used in this cell.
umap_model = UMAP(n_components=15, n_neighbors=5, min_dist=0.1)
hdbscan_model = HDBSCAN(
    min_cluster_size=200,
    min_samples=10,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

# `target_posts` is built in the "Data Preprocessing" cell above (English
# BlueSky posts plus Truth Social posts that mention "trump" or "biden").
# It is reused here rather than rebuilt, which avoids a second
# language-detection pass over every post.

# Fit BERTopic model on precomputed sentence embeddings.
embeddings = embedding_model.encode(target_posts, show_progress_bar=True)
topic_model = BERTopic(
    umap_model=empty_dimensionality_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    verbose=True,
)
topics, probs = topic_model.fit_transform(target_posts, embeddings)


In [None]:
# Snapshot of the model's topic overview, taken right after the initial fit
# (i.e. before the outlier reduction performed in the following cells).
topic_df = topic_model.get_topic_info()

In [None]:
# Compute revised per-document topic assignments with BERTopic's outlier
# reduction. The result is stored under a new name, so the `topics` returned
# by the fit stay available for comparison.
new_topics = topic_model.reduce_outliers(target_posts, topics)

In [None]:
# Update the model with the revised assignments in `new_topics`, passing the
# same CountVectorizer that was given to BERTopic for the initial fit.
topic_model.update_topics(
    target_posts, topics=new_topics, vectorizer_model=vectorizer_model
)

In [None]:
# Then we randomly sampled 10 posts from each category to make sense of the topic labels
# label topics
# Topic -1: Criticism of Trump and Support for Democratic Policies
# Topic 0: MAGA and Pro-Trump Hashtags and Advocacy
# Topic 1: Trump’s Legal Convictions and Felony Charges
# Topic 2: Pro-Trump and MAGA Advocacy
# Topic 3: Celebrations of Trump (e.g., Birthdays and Tributes)
# Topic 4: Hunter Biden’s Legal Troubles (e.g., Gun Charges)
# Topic 5: U.S. Policy on Ukraine and Russia
# Topic 6: Israel-Hamas Conflict and Biden’s Ceasefire Proposal
# Topic 7: Trump’s Tax Promises and Election Campaign
# Topic 8: Trump’s Rallies and Live Events Coverage
# Topic 9: Biden’s Immigration Policies and Executive Orders
# Topic 10: Legal Proceedings in Georgia’s 2020 Election Case Against Trump
# Topic 11: Biden vs. Trump Presidential Debates
# Human-readable label for every topic id, from -1 through 11.
# The labels are listed in ascending topic-id order, so enumerating them
# from -1 pairs each label with its id (noted in the trailing comments).
topic_id_to_label = dict(
    enumerate(
        (
            "Criticism of Trump and Support for Democratic Policies",  # -1
            "MAGA and Pro-Trump Hashtags and Advocacy",  # 0
            "Trump’s Legal Convictions and Felony Charges",  # 1
            "Pro-Trump and MAGA Advocacy",  # 2
            "Celebrations of Trump (e.g., Birthdays and Tributes)",  # 3
            "Hunter Biden’s Legal Troubles (e.g., Gun Charges)",  # 4
            "U.S. Policy on Ukraine and Russia",  # 5
            "Israel-Hamas Conflict and Biden’s Ceasefire Proposal",  # 6
            "Trump’s Tax Promises and Election Campaign",  # 7
            "Trump’s Rallies and Live Events Coverage",  # 8
            "Biden’s Immigration Policies and Executive Orders",  # 9
            "Legal Proceedings in Georgia’s 2020 Election Case Against Trump",  # 10
            "Biden vs. Trump Presidential Debates",  # 11
        ),
        start=-1,
    )
)
# Register the human-readable labels on the model. The `topic_id_to_label`
# mapping defined just above is passed directly instead of repeating the same
# literal, so the labels exist in one place only and cannot drift apart.
topic_model.set_topic_labels(topic_id_to_label)