In [1]:
import pandas as pd

import nltk
from nltk.corpus import stopwords
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


In [2]:
posts_path = '4chan/output/posts.csv'
threads_path = '4chan/output/threads.csv'

chan4_posts = pd.read_csv(posts_path)
chan4_threads = pd.read_csv(threads_path)

posts_end_path  = 'endchan V2/output/posts.csv'
threads_end_path = 'endchan V2/output/threads.csv'

endchan_posts = pd.read_csv(posts_end_path)
endchan_threads = pd.read_csv(threads_end_path)

In [3]:
chan4_posts.head(5)
# 40459 x 4

Unnamed: 0,subpost_id,content,time,thread_id
0,p501099183,>>501098958 (OP)I used an anal egg to make me ...,03/21/25(Fri)13:43:02No.501099183,p501098958
1,p501099437,corporate farming was a mistakepicrel is the c...,03/21/25(Fri)13:46:36No.501099437,p501098958
2,p501100092,>>501098958 (OP)Almost like the old administra...,03/21/25(Fri)13:55:59No.501100092,p501098958
3,p501100224,">>501100092Trump started the fake pandemic, st...",03/21/25(Fri)13:57:44No.501100224,p501098958
4,p501101758,>>501098958 (OP)>what did americans mean by th...,03/21/25(Fri)14:17:04No.501101758,p501098958


In [4]:
chan4_threads.head(5)
# 470 x 6

Unnamed: 0,thread_id,thread_text,time
0,p501098958,>kill all of our domestic chickens because fak...,03/21/25(Fri)13:40:04No.501098958
1,p501104660,Denmark is the best country in Europe and I'm ...,03/21/25(Fri)14:53:50No.501104660
2,p501098958,>kill all of our domestic chickens because fak...,03/21/25(Fri)13:40:04No.501098958
3,p501101640,,03/21/25(Fri)14:15:39No.501101640
4,p501099426,You can't be racist and pagan.You can't be pro...,03/21/25(Fri)13:46:30No.501099426


Already noticing the time is off, so will have to clean this up

In [5]:
endchan_posts.head(5)
# 6014 x 4

Unnamed: 0,subpost_id,content,time,thread_id
0,91434,>>91417>killed off most of the savage Natives ...,10/28/2023 (Sat) 08:22,91417
1,91435,Netanyahu's not a white boy. Nor is any Yidd. ...,10/28/2023 (Sat) 11:34,91417
2,91448,,10/29/2023 (Sun) 14:19,91417
3,92023,Idk but it makes me want to pull my hair out. ...,12/26/2023 (Tue) 02:46,91417
4,92024,>>92023,12/26/2023 (Tue) 07:50,91417


In [6]:
endchan_threads.head(5)
# 135 x 3

Unnamed: 0,thread_id,thread_text,time
0,91417,"What's wrong with all of you poor, pathetic, s...",10/27/2023 (Fri) 15:50
1,95327,1) disable or heavily damage the kike propagan...,12/08/2024 (Sun) 23:36
2,93932,They like to call Jerry Seinfeld a “funnyman” ...,06/15/2024 (Sat) 11:36
3,90261,The Natashas 2003.https://www.scribd.com/docum...,04/14/2023 (Fri) 02:05
4,94806,Hello from soyjak.party.gemmy board,09/28/2024 (Sat) 02:57


# Topic modeling with 4chan data

Thread is the main post, posts is the comments underneath one thread <br>
Topics across one thread should be the same, as the thread decides the topic. Therefore, it is most logical to use threads to do topic modeling.

In [7]:
# Cleaning the time column of chan4_threads
chan4_threads['time'] = chan4_threads['time'].str.split('No.').str[0]

## Basic topic modeling with regular libraries

### NLTK and Scikit-Learn

In [8]:
my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

In [9]:
# cleaning master function
def clean_thread(thread, bigrams=False):
    if not isinstance(thread, str):  # Handle non-string values
        thread = ''
    thread = thread.lower()  # lower case
    thread = re.sub('[' + my_punctuation + ']+', ' ', thread)  # strip punctuation
    # thread = re.sub('\s+', ' ', thread)  # remove double spacing
    thread = re.sub('([0-9]+)', '', thread)  # remove numbers
    thread_token_list = [word for word in thread.split(' ')
                         if word not in my_stopwords]  # remove stopwords

    # thread_token_list = [word_rooter(word) if '#' not in word else word
    #                      for word in thread_token_list]  # apply word rooter
    if bigrams:
        thread_token_list = thread_token_list + [thread_token_list[i] + '_' + thread_token_list[i + 1]
                                                 for i in range(len(thread_token_list) - 1)]
    thread = ' '.join(thread_token_list)
    return thread


chan4_threads['cleaned'] = chan4_threads['thread_text'].apply(clean_thread)
chan4_threads['cleaned']

0       kill domestic chickens fake pcr test said egg...
1          denmark best country europe tired pretending 
2       kill domestic chickens fake pcr test said egg...
3                                                       
4      racist pagan pro white pagan pro christian pag...
                             ...                        
465     white east asian interbreeding le bad princip...
466    docsits relevantisraeli nuclear program spotte...
467    young white men across world turning fascism n...
468    libs slated get majority  accelerationism answ...
469     baltics   whitebaltics   white baltics   whit...
Name: cleaned, Length: 470, dtype: object

In [10]:

# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

# apply transformation
tf = vectorizer.fit_transform(chan4_threads['cleaned']).toarray()

# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names_out()

In [11]:
number_of_topics = 5
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
model.fit(tf)

In [12]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

In [13]:
no_top_words = 15
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights
0,https,128.2,us,42.1,make,70.4,even,39.5,white,78.5
1,trump,120.0,go,35.0,like,45.9,www,36.0,would,61.3
2,com,101.2,europe,30.2,get,23.1,uk,33.2,people,36.1
3,www,55.4,’s,26.4,year,18.4,new,26.2,world,29.7
4,year,14.0,like,18.5,would,7.3,’s,19.8,literally,27.2
5,every,10.4,uk,7.3,people,6.7,full,18.5,government,14.7
6,uk,9.1,world,5.7,white,0.9,go,15.4,even,9.9
7,make,5.3,www,0.2,even,0.2,people,12.9,every,9.5
8,full,4.9,year,0.2,go,0.2,make,11.9,get,8.3
9,government,4.8,would,0.2,government,0.2,government,8.1,full,4.2


### BERTopic

In [14]:
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
topic_model = BERTopic()
docs = chan4_threads['cleaned'].tolist()
topics, probs = topic_model.fit_transform(docs)
chan4_threads['topics'] = topics
chan4_threads['topic_probs'] = probs

In [16]:
BERTopics = topic_model.get_topic_info()
BERTopics

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,35,-1_india_china_manufacturing_make,"[india, china, manufacturing, make, chinese, p...",[india failed attract factories china went wr...
1,0,37,0_white_search_google_anti,"[white, search, google, anti, nobody, men, fam...",[objectively nasty world class rbf tds nut f...
2,1,36,1_recall_european_would_tesla,"[recall, european, would, tesla, glue, musk, e...",[morons using wrong glue tesla forced recall e...
3,2,34,2_notice_courts_something_libs,"[notice, courts, something, libs, slated, acce...",[libs slated get majority accelerationism ans...
4,3,29,3_ukraine_bbc_uk_catbox,"[ukraine, bbc, uk, catbox, moe, news, co, arti...",[previous →day — daily battlefield assessme...
5,4,29,4_poilievre_polls_policy_real,"[poilievre, polls, policy, real, never, pol, k...",[poilievre afraid policy conservatives policy...
6,5,29,5_white_baltics_tomatokikes_shitalians,"[white, baltics, tomatokikes, shitalians, conv...",[convinced majority shitalians white they’re ...
7,6,23,6_step_great_every_literal,"[step, great, every, literal, america, assassi...",[antifa violence endless new waves vandalism ...
8,7,21,7_prison_lockdown_inmates_staff,"[prison, lockdown, inmates, staff, thomson, gu...",[whoever cares hear massive defrauding taxpay...
9,8,21,8_go_coffee_br_woke,"[go, coffee, br, woke, drink, delicious, britb...",[sorry britbros coffee much delicious tea th...


In [17]:
topic_model.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

## DarkBERT

In [18]:
from transformers import pipeline

In [19]:
access_token = "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"

In [20]:
unmasker = pipeline('fill-mask', model="s2w-ai/DarkBERT", token=access_token)
unmasker("RagnarLocker, LockBit, and REvil are types of <mask>.")

Device set to use cpu


[{'score': 0.49523264169692993,
  'token': 25346,
  'token_str': ' ransomware',
  'sequence': 'RagnarLocker, LockBit, and REvil are types of ransomware.'},
 {'score': 0.04661604389548302,
  'token': 16886,
  'token_str': ' malware',
  'sequence': 'RagnarLocker, LockBit, and REvil are types of malware.'},
 {'score': 0.042176585644483566,
  'token': 28811,
  'token_str': ' wallets',
  'sequence': 'RagnarLocker, LockBit, and REvil are types of wallets.'},
 {'score': 0.02898237481713295,
  'token': 2196,
  'token_str': ' drugs',
  'sequence': 'RagnarLocker, LockBit, and REvil are types of drugs.'},
 {'score': 0.020001336932182312,
  'token': 11344,
  'token_str': ' hackers',
  'sequence': 'RagnarLocker, LockBit, and REvil are types of hackers.'}]

In [24]:
# Iterate through the 'Representation' column in BERTopics DataFrame
for representation in BERTopics['Representation']:
    # Take only the first 3 words from the representation list
    words = ', '.join(representation[:6])
    # Create the input sentence for the unmasker
    sentence = f"{words} are all types of <mask>"
    # Use the unmasker to predict the masked word
    result = unmasker(sentence)
    print(f"Input: {sentence}")
    print(f"Prediction: {result}\n")

Input: india, china, manufacturing, make, chinese, plan are all types of <mask>
Prediction: [{'score': 0.14465756714344025, 'token': 708, 'token_str': ' plans', 'sequence': 'india, china, manufacturing, make, chinese, plan are all types of plans'}, {'score': 0.09403733909130096, 'token': 563, 'token_str': ' plan', 'sequence': 'india, china, manufacturing, make, chinese, plan are all types of plan'}, {'score': 0.02282850816845894, 'token': 1437, 'token_str': ' ', 'sequence': 'india, china, manufacturing, make, chinese, plan are all types of '}, {'score': 0.018375804647803307, 'token': 418, 'token_str': ' money', 'sequence': 'india, china, manufacturing, make, chinese, plan are all types of money'}, {'score': 0.01382406335324049, 'token': 2799, 'token_str': ' books', 'sequence': 'india, china, manufacturing, make, chinese, plan are all types of books'}]

Input: white, search, google, anti, nobody, men are all types of <mask>
Prediction: [{'score': 0.4385855793952942, 'token': 82, 'token_