In [10]:
import re
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm
import numpy as np
import random
from collections import defaultdict
import requests
import time
from sklearn.decomposition import PCA
from umap import UMAP
import hdbscan


# Text cleaning #

In [11]:
def clean_tweet(text):
    text = text.lower()
    text = re.sub(r"http\S+|www.\S+", "", text)  # URLs
    text = re.sub(r"@\w+|#\w+", "", text)        # mentions/hashtags
    text = re.sub(r"[^\w\s]", "", text)          # punctuation
    text = re.sub(r"\s+", " ", text).strip()
    return text if len(text.split()) >= 5 else None


In [12]:
df = pd.read_csv("../data/bot_tweets_by_user.csv")
df["clean_text"] = df["text"].apply(clean_tweet)
df = df.dropna(subset=["clean_text"])

In [7]:
model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")

batch_size = 256
embeddings = []
texts = df["clean_text"].tolist()

for i in tqdm(range(0, len(texts), batch_size)):
    batch = texts[i:i + batch_size]
    with torch.no_grad():
        emb = model.encode(batch, show_progress_bar=False)
    embeddings.extend(emb)

# Optionally save
np.save("model_data/tweet_embeddings.npy", embeddings)

100%|██████████| 3173/3173 [14:20<00:00,  3.69it/s]


# Clustering Topics #

In [18]:
df = pd.read_csv("../data/bot_tweets_by_user.csv")
df["clean_text"] = df["text"].apply(clean_tweet)
df = df.dropna(subset=["clean_text"])
texts = df["clean_text"].tolist()

In [None]:
embeddings = np.load("model_data/tweet_embeddings.npy")

pca = PCA(n_components=50)
reduced_embeddings = pca.fit_transform(embeddings)

umap_model = UMAP(
    n_neighbors=5,
    n_components=5,
    metric="cosine",
    n_epochs=200,
    low_memory=True,
    n_jobs=-1,
    verbose=True
)

umap_embeddings = umap_model.fit_transform(reduced_embeddings)
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=30,
    metric="euclidean",
    prediction_data=True
)

cluster_labels = hdbscan_model.fit_predict(umap_embeddings)
np.save("model_data/umap_embeddings.npy", umap_embeddings)
np.save("model_data/cluster_labels.npy", cluster_labels)

# Test Clustering #

In [30]:
df_clusters = pd.DataFrame({
    "text": texts,
    "cluster": cluster_labels
})

# Example: Count of texts per cluster
cluster_counts = df_clusters["cluster"].value_counts()
print(cluster_counts)

# Example: Show sample texts from a specific cluster (e.g., cluster 0)
print(df_clusters[df_clusters["cluster"] == 5908].head())

cluster
-1       314670
 4154     12408
 772       3018
 5908      2315
 3436      2185
          ...  
 3994        30
 7179        30
 5051        30
 6107        30
 6516        30
Name: count, Length: 7229, dtype: int64
                                                   text  cluster
1617          war until the death of the last ukrainian     5908
1713  situation militaire en ukraine au 14 janvier 2...     5908
3242  2 so i was thinking about this whole war in uk...     5908
3261  2 so i was thinking about this whole war in uk...     5908
4078  heres an interesting analysis on by an author ...     5908


# Labeling Clusters with LLM #
* using Together.ai API to label clusters based on sampled tweets
* requires an API key from Together.ai as an user input
* LLM model: Mistral-7B-Instruct-v0.2

In [38]:
# Group tweets by cluster
cluster_to_texts = defaultdict(list)
for text, label in zip(texts, cluster_labels):
    if label != -1:
        cluster_to_texts[label].append(text)

# Sampling function
def sample_cluster(cluster_texts, max_samples=20):
    if len(cluster_texts) <= max_samples:
        return cluster_texts
    return random.sample(cluster_texts, max_samples)

# Together.ai config
TOGETHER_API_KEY = input("Enter your Together API key: ").strip()
TOGETHER_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
TOGETHER_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"

headers = {
    "Authorization": f"Bearer {TOGETHER_API_KEY}",
    "Content-Type": "application/json"
}

# LLM request function
def label_cluster(cluster_texts):
    sample = "\n".join(cluster_texts)
    prompt = (
        f"Here are some tweets from the same topic:\n\n{sample}\n\n"
        "Please summarize this topic in 3-5 keywords or short phrases that best describe it:"
    )

    data = {
        "model": TOGETHER_MODEL,
        "prompt": prompt,
        "max_tokens": 50,
        "temperature": 0.2,
        "stop": None
    }

    for attempt in range(5):
        response = requests.post("https://api.together.xyz/v1/completions", headers=headers, json=data)
        if response.status_code == 200:
            return response.json()["choices"][0]["text"].strip()
        else:
            print(f"Error {response.status_code}, retrying...")
            time.sleep(2)  # Retry delay

    return "[LLM_ERROR]"

# Full batch loop
cluster_labels_dict = {}

for cluster_id, texts_in_cluster in tqdm(cluster_to_texts.items(), desc="Labeling clusters"):
    sampled_texts = sample_cluster(texts_in_cluster, max_samples=20)
    topic_label = label_cluster(sampled_texts)
    cluster_labels_dict[cluster_id] = topic_label

# Save topics
labels_df = pd.DataFrame.from_dict(cluster_labels_dict, orient='index', columns=['topic_label'])
labels_df.index.name = 'cluster_id'
labels_df.to_csv("model_data/llm_topic_labels.csv")

# Merge with original dataset (optional)
df_full = pd.DataFrame({
    'text': texts,
    'cluster_id': cluster_labels
})
df_full = df_full.merge(labels_df, how='left', on='cluster_id')
df_full.to_csv("../data/labeled_tweets.csv", index=False)


Labeling clusters:  90%|█████████ | 6509/7228 [1:02:57<08:09,  1.47it/s]

Error 402, retrying...


Labeling clusters:  90%|█████████ | 6510/7228 [1:03:00<15:54,  1.33s/it]

Error 402, retrying...
Error 402, retrying...
Error 402, retrying...


Labeling clusters:  90%|█████████ | 6513/7228 [1:03:09<22:15,  1.87s/it]

Error 402, retrying...
Error 402, retrying...


Labeling clusters:  90%|█████████ | 6518/7228 [1:03:17<13:24,  1.13s/it]

Error 402, retrying...
Error 402, retrying...
Error 402, retrying...


Labeling clusters:  90%|█████████ | 6520/7228 [1:03:24<26:04,  2.21s/it]

Error 402, retrying...


Labeling clusters:  90%|█████████ | 6522/7228 [1:03:28<22:34,  1.92s/it]

Error 402, retrying...
Error 402, retrying...
Error 402, retrying...


Labeling clusters:  90%|█████████ | 6524/7228 [1:03:36<31:07,  2.65s/it]

Error 402, retrying...


Labeling clusters:  90%|█████████ | 6525/7228 [1:03:39<32:35,  2.78s/it]

Error 402, retrying...
Error 402, retrying...
Error 402, retrying...
Error 402, retrying...
Error 402, retrying...


Labeling clusters:  90%|█████████ | 6526/7228 [1:03:50<1:02:27,  5.34s/it]

Error 402, retrying...
Error 402, retrying...


Labeling clusters:  90%|█████████ | 6527/7228 [1:03:56<1:01:46,  5.29s/it]

Error 402, retrying...
Error 402, retrying...
Error 402, retrying...
Error 402, retrying...
Error 402, retrying...


Labeling clusters:  90%|█████████ | 6528/7228 [1:04:07<1:24:46,  7.27s/it]

Error 402, retrying...
Error 402, retrying...
Error 402, retrying...


Labeling clusters:  90%|█████████ | 6529/7228 [1:04:15<1:26:50,  7.45s/it]

Error 402, retrying...


Labeling clusters:  90%|█████████ | 6530/7228 [1:04:18<1:10:28,  6.06s/it]

Error 402, retrying...
Error 402, retrying...
Error 402, retrying...
Error 402, retrying...
Error 402, retrying...


Labeling clusters:  90%|█████████ | 6531/7228 [1:04:29<1:28:17,  7.60s/it]

Error 402, retrying...
Error 402, retrying...
Error 402, retrying...
Error 402, retrying...
Error 402, retrying...


Labeling clusters:  90%|█████████ | 6532/7228 [1:04:41<1:43:35,  8.93s/it]

Error 402, retrying...
Error 402, retrying...
Error 402, retrying...
Error 402, retrying...
Error 402, retrying...


Labeling clusters:  90%|█████████ | 6533/7228 [1:04:53<1:52:09,  9.68s/it]

Error 402, retrying...


Labeling clusters:  90%|█████████ | 6534/7228 [1:04:55<1:27:54,  7.60s/it]

Error 402, retrying...


Labeling clusters:  90%|█████████ | 6535/7228 [1:04:58<1:11:08,  6.16s/it]

Error 402, retrying...
Error 402, retrying...


Labeling clusters:  90%|█████████ | 6537/7228 [1:05:04<49:36,  4.31s/it]  

Error 402, retrying...


Labeling clusters:  90%|█████████ | 6541/7228 [1:05:09<19:47,  1.73s/it]

Error 402, retrying...


Labeling clusters:  91%|█████████ | 6542/7228 [1:05:12<23:31,  2.06s/it]

Error 402, retrying...


Labeling clusters:  91%|█████████ | 6559/7228 [1:05:23<05:58,  1.86it/s]

Error 402, retrying...


Labeling clusters:  91%|█████████ | 6561/7228 [1:05:27<12:02,  1.08s/it]

Error 402, retrying...


Labeling clusters: 100%|██████████| 7228/7228 [1:11:47<00:00,  1.68it/s]


In [41]:
print(df_full.head())

print("\n\n\n",labels_df.head)
print("\n",labels_df.shape)

                                                text  cluster_id  \
0  they traded her freedom for a real supervillai...          -1   
1  its not the weapons its avoiding the sanctions...        5673   
2  its not the weapons its avoiding the sanctions...        5673   
3  they traded her freedom for a real supervillai...          -1   
4  sacrificio estremo degli ucraini attacchi suic...        4154   

                                         topic_label  
0                                                NaN  
1  * Iran Sanctions\n* Human Rights Violations\n*...  
2  * Iran Sanctions\n* Human Rights Violations\n*...  
3                                                NaN  
4  1. Russia-Ukraine War\n2. Vladimir Putin\n3. X...  



 <bound method NDFrame.head of                                                   topic_label
cluster_id                                                   
5673        * Iran Sanctions\n* Human Rights Violations\n*...
4154        1. Russia-Ukraine War\n2. Vla