In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import networkx as nx
import nltk 
from nltk.corpus import stopwords
import re
from itertools import combinations



# Dataload

We are loading the graph where each node has the attribute "community". To work with text-elements, we are removing every row with empty posts and rows where posts are either deleted or removed. This is classic Reddit terminology where either the user has deleted (\[deleted\]) a post, or the post has been removed (\[removed\]) by an admin or moderator.

In [19]:
G = nx.read_gml('reddit_graph_with_communities.gml')

In [20]:
rows = []
rows_count = 0
allowed_rows = 0

for node, data in G.nodes(data=True):
    community = data.get("community")
    posts_dict = data.get("posts", {})

    # Ensure it's a dictionary
    if not isinstance(posts_dict, dict):
        posts_dict = {"default": posts_dict}

    # Loop through each list of posts in the dictionary
    for key, posts in posts_dict.items():
        if not isinstance(posts, list):
            posts = [posts]

        for post in posts:
            rows_count += 1
            # Skip empty, deleted/removed posts, or posts containing the bot line
            if post and post not in ['[deleted]', '[removed]'] and \
               "*I am a bot, and this action was performed automatically." not in post:
                allowed_rows += 1
                rows.append({
                    "id": node,
                    "text": post,
                    "community": community
                })

df = pd.DataFrame(rows)
print(df.head())

print(rows_count-allowed_rows)

  id                                               text  community
0  1  "Huh it's still not legalized yet. America is ...       5595
1  1  "Hey charisma helps. Everybody wants to sleep ...       5595
2  1  Aren't the jedis not really good guys though? ...       5595
3  1  Wait but ferb is the better looking one with a...       5595
4  1  Great now you live with a hole in your head fo...       5595
163205


In [21]:
communities = df['community'].unique().tolist()
communities_dict = dict.fromkeys(communities, 0)

for index, row in df.iterrows():
    communities_dict[row['community']] += 1

In [22]:
comms_list = list(sorted( ((v,k) for k,v in communities_dict.items()), reverse=True))

comms_list = comms_list[:6]
top_6_communities = []
for i in comms_list:
    top_6_communities.append(i[1])

#check compatibilty with 
print(top_6_communities)

[5595, 8821, 6051, 3475, 15981, 530]


In [6]:
df_filtered = df[df['community'].isin(top_6_communities)].copy()

In [7]:
print("Number of posts in top 6 communities:", len(df_filtered))

print("Number of posts removed based on non identity in top 6:", len(df)-len(df_filtered))

Number of posts in top 6 communities: 2503488
Number of posts removed based on non identity in top 6: 33226


In [8]:

# Download once (if not already done)
nltk.download('stopwords')

# Base English stopwords from NLTK
stop_words = set(stopwords.words('english'))

# Optionally extend with custom slang / internet terms
extra_stops = {
    'lol', 'xd', 'haha', 'hahaah', 'omg', 'u', 'ur', 'im', 'ive', 'idk', 
    'dont', 'cant', 'wont', 'aint', 'ya', 'tho', 'tho', 'nah', 'btw', 
    'like', 'yeah', 'yep', 'ok', 'okay', 'pls', 'please'
}
stop_words.update(extra_stops)

def preprocess_text(text):
    if not isinstance(text, str):
        return []
    # lowercase
    text = text.lower()
    # remove URLs
    text = re.sub(r"http\S+|www\S+", " ", text)
    # keep only letters and spaces
    text = re.sub(r"[^a-z\s]", " ", text)
    # tokenize by whitespace
    tokens = text.split()
    # remove stopwords and very short tokens
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    return tokens

# Apply preprocessing
df_filtered["tokens"] = df_filtered["text"].apply(preprocess_text)
df_filtered["n_tokens"] = df_filtered["tokens"].apply(len)

# Preview
df_filtered.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/livdreyerjohansen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,text,community,tokens,n_tokens
0,1,"""Huh it's still not legalized yet. America is ...",5595,"[huh, still, legalized, yet, america, weirdly,...",7
1,1,"""Hey charisma helps. Everybody wants to sleep ...",5595,"[hey, charisma, helps, everybody, wants, sleep...",11
2,1,Aren't the jedis not really good guys though? ...,5595,"[jedis, really, good, guys, though, protect, s...",24
3,1,Wait but ferb is the better looking one with a...,5595,"[wait, ferb, better, looking, one, actual, gam...",13
4,1,Great now you live with a hole in your head fo...,5595,"[great, live, hole, head, eternity]",5


In [9]:
df_filtered.sort_values(by="community")

Unnamed: 0,id,text,community,tokens,n_tokens
2169965,86019,"As a fellow lanky, awkward ginger, I approve!",530,"[fellow, lanky, awkward, ginger, approve]",5
1603684,62545,I'm going to throw this out there since I have...,530,"[going, throw, since, seen, anyone, else, sayi...",125
1259071,48258,r/exredpill,530,[exredpill],1
1561009,60354,"I guess, she's just crazy and seeks out random...",530,"[guess, crazy, seeks, random, incels, message,...",9
1561010,60354,"I don't ask for that, but one mentally ill gir...",530,"[ask, one, mentally, ill, girl, keeps, sending]",7
...,...,...,...,...,...
780946,28308,https://www.reddit.com/r/Braincels/comments/bk...,15981,"[course, continued, walking, nothing, even, ha...",6
780945,28308,https://www.reddit.com/r/Braincels/comments/af...,15981,"[roomates, earlier, week, one, ugly, unless, b...",16
510060,20200,41PercentIsNotEnough,15981,[percentisnotenough],1
1204068,46076,Because of the horrible photoshoppery?,15981,"[horrible, photoshoppery]",2


Part into different communities

In [10]:
# [5595, 8821, 6051, 3475, 15981, 530]
df_5595 = df_filtered[df_filtered["community"] == 5595].copy()
df_8821 = df_filtered[df_filtered["community"] == 8821].copy()
df_6051 = df_filtered[df_filtered["community"] == 6051].copy()
df_3475 = df_filtered[df_filtered["community"] == 3475].copy()
df_15981 = df_filtered[df_filtered["community"] == 15981].copy()
df_530 = df_filtered[df_filtered["community"] == 530].copy()

In [11]:
df_530

Unnamed: 0,id,text,community,tokens,n_tokens
2428,123,If YOUR desire Is to have a romantic or sexual...,530,"[desire, romantic, sexual, relationship, woman...",60
2429,123,This also helps with you not overinvesting int...,530,"[also, helps, overinvesting, women, know, save...",8
2430,123,"i replied to the wron person, my reply Is down...",530,"[replied, wron, person, reply, theere, vvvvv]",6
2431,123,"Yes, it is, But remember that r.seduction has ...",530,"[yes, remember, seduction, lots, experience, l...",40
3423,180,One of the articles references a study and ano...,530,"[one, articles, references, study, another, po...",17
...,...,...,...,...,...
2536437,100117,"I didn’t have sex with either, no.",530,"[sex, either]",2
2536438,100117,I could never not appreciate her. She’s litera...,530,"[could, never, appreciate, literally, missing,...",7
2536439,100117,Do what?,530,[],0
2536440,100117,Thank you,530,[thank],1


# A-priory algorithm

In [12]:
# construct table as described in section 6.2.5 

#start with 530 as it is the shortest

list_of_tokens = []

for index, row in df_530.iterrows():
    list_of_tokens.extend(row['tokens'])

list_of_unique_tokens = list(set(list_of_tokens))

print("No. of words in 530 community:", len(list_of_tokens))
print("No. of unique words in 530 community:", len(set(list_of_tokens)))


No. of words in 530 community: 1292952
No. of unique words in 530 community: 32134


In [13]:
df_530_apriori = pd.DataFrame({
    'word': list_of_unique_tokens,
    'integer': range(len(list_of_unique_tokens))
})

word_to_int = dict(zip(df_530_apriori['word'], df_530_apriori['integer']))

array_of_counts = np.zeros((len(list_of_unique_tokens)), dtype=int)

for index, row in df_530.iterrows():
    unique_tokens = set(row['tokens'])

    for token in unique_tokens:
        array_of_counts[word_to_int[token]] += 1


df_530_apriori['count'] = array_of_counts

In [14]:
df_530_apriori.sort_values(by="count", ascending=False)

Unnamed: 0,word,integer,count
4559,people,4559,10210
16887,think,16887,7329
966,women,966,7196
10880,get,10880,6904
30438,know,30438,6351
...,...,...,...
23401,gatekeepy,23401,1
18206,hast,18206,1
23403,despondence,23403,1
15561,conforms,15561,1


In [None]:
# df_530_apriori has: 'word', 'integer', 'count'
# array_of_counts also has counts
threshold = 0.01 * len(df_530)  # 1% of documents

# Initialize frequent-items map: old_id -> new_id (0 if not frequent)
frequent_map = np.zeros(len(df_530_apriori), dtype=int)

new_id = 1
for old_id, count in enumerate(df_530_apriori['count']):
    if count >= threshold:
        frequent_map[old_id] = new_id
        new_id += 1
    else:
        frequent_map[old_id] = 0

# Optionally add this to your df_530_apriori table
df_530_apriori['freq_integer'] = frequent_map

#print(df_530_apriori.head())

df_freq_530 = df_530_apriori[df_530_apriori["freq_integer"] != 0].copy()

print(df_freq_530)

                word  integer  count  freq_integer
18     relationships       18   1772             1
76             black       76    462             2
176           easier      176    697             3
257             case      257    873             4
307          fucking      307    619             5
...              ...      ...    ...           ...
31740          super    31740    532           548
31743          never    31743   3465           549
31895           live    31895   1026           550
32033           move    32033    623           551
32065           fine    32065    704           552

[552 rows x 4 columns]


In [16]:
from itertools import combinations
from collections import Counter

# Prepare set of frequent words
frequent_words_set = set(df_freq_530["word"])

# Counter to accumulate all pairs
pair_counter = Counter()

# Loop over all documents
for index, row in df_530.iterrows():
    # Keep only frequent words in this document
    frequent_tokens = [token for token in row["tokens"] if token in frequent_words_set]
    
    # Remove duplicates in the document
    unique_tokens = set(frequent_tokens)
    
    # Generate all 2-item combinations and increment counter
    for pair in combinations(unique_tokens, 2):
        # Sort pair so ('a','b') and ('b','a') are treated the same
        pair_counter[tuple(sorted(pair))] += 1

# Optional: convert counter to DataFrame to see pairs and counts
df_freq_itempairs_530 = pd.DataFrame(
    pair_counter.items(), columns=["item_set", "count"]
)

print(df_freq_itempairs_530.head())


                item_set  count
0            (stop, way)    394
1          (stop, woman)    141
2        (stop, whether)     55
3           (stop, time)    368
4  (stop, understanding)     23


In [17]:
df_freq_itempairs_530.sort_values(by="count", ascending=False)

Unnamed: 0,item_set,count
1083,"(people, think)",3544
2304,"(get, people)",3329
6935,"(know, people)",2971
1335,"(one, people)",2840
1057,"(people, want)",2784
...,...,...
151846,"(sad, shorter)",1
149233,"(assume, thank)",1
152006,"(socially, taller)",1
142531,"(perhaps, thats)",1


In [79]:
import numpy as np
from itertools import combinations

# Step 1: Filter frequent items based on a minimum support threshold
min_support = 39  # example, adjust as needed
frequent_items = df_530_apriori[df_530_apriori['count'] >= min_support].copy()

# Step 2: Re-index frequent items for triangular matrix
frequent_items = frequent_items.reset_index(drop=True)
frequent_items['new_integer'] = range(len(frequent_items))
word_to_new_int = dict(zip(frequent_items['word'], frequent_items['new_integer']))

# Step 3: Create the triangular matrix
n = len(frequent_items)
tri_matrix = np.zeros((n*(n-1)//2,), dtype=int)  # flattened upper-triangular

# Helper function to map a pair (i,j) with i < j to index in flattened array
def tri_index(i, j):
    return i * n - i*(i+1)//2 + (j - i - 1)

# Step 4: Count 2-itemsets
for row in df_530['tokens']:
    # Keep only frequent items in this transaction
    filtered = [word_to_new_int[word] for word in row if word in word_to_new_int]
    # Count all pairs
    for i, j in combinations(sorted(filtered), 2):
        tri_matrix[tri_index(i, j)] += 1

# Step 5: Convert back to readable 2-itemsets
two_itemsets = []
counts = []

for i in range(n):
    for j in range(i+1, n):
        idx = tri_index(i, j)
        two_itemsets.append((frequent_items['word'][i], frequent_items['word'][j]))
        counts.append(tri_matrix[idx])

df_2_itemsets = pd.DataFrame({
    'pair': two_itemsets,
    'count': counts
})

# Optionally filter by minimum support
df_2_itemsets = df_2_itemsets[df_2_itemsets['count'] >= min_support].reset_index(drop=True)

print(df_2_itemsets.head())


KeyboardInterrupt: 

In [None]:
df_2_itemsets.sort_values(by="count", ascending=False)

Unnamed: 0,pair,count
229544,"(men, women)",23401
29207,"(react, empty)",22158
242667,"(fun, empty)",19475
30450,"(people, think)",16019
30887,"(people, get)",15927
...,...,...
220615,"(dating, desired)",39
10859,"(big, boyfriend)",39
108713,"(really, swiping)",39
41567,"(lives, case)",39


In [None]:
# construct table as described in section 6.2.5 

#start with 3475 as it is the shortest

list_of_tokens = []

for index, row in df_3475.iterrows():
    list_of_tokens.extend(row['tokens'])

list_of_unique_tokens = list(set(list_of_tokens))

print("No. of words in 3475 community:", len(list_of_tokens))
print("No. of unique words in 3475 community:", len(set(list_of_tokens)))


No. of words in 3475 community: 4754871
No. of unique words in 3475 community: 89289


In [None]:
df_3475_apriori = pd.DataFrame({
    'word': list_of_unique_tokens,
    'integer': range(len(list_of_unique_tokens))
})

word_to_int = dict(zip(df_3475_apriori['word'], df_3475_apriori['integer']))

array_of_counts = np.zeros((len(list_of_unique_tokens)), dtype=int)

for token in list_of_tokens:
    array_of_counts[word_to_int[token]] += 1

df_3475_apriori['count'] = array_of_counts

In [None]:
df_3475_apriori.sort_values(by="count", ascending=False)

Unnamed: 0,word,integer,count
40596,women,40596,51907
24450,get,24450,44934
47379,people,47379,36990
52993,would,52993,34581
10796,even,10796,31161
...,...,...,...
50584,retibution,50584,1
50586,elow,50586,1
15354,negroe,15354,1
50590,ellipses,50590,1


In [None]:
df_3475_apriori[df_3475_apriori["word"] == "criteria"]


Unnamed: 0,word,integer,count
25,criteria,25,132


In [None]:
import numpy as np
from itertools import combinations

# Step 1: Filter frequent items based on a minimum support threshold
min_support = 39  # example, adjust as needed
frequent_items = df_3475_apriori[df_3475_apriori['count'] >= min_support].copy()

# Step 2: Re-index frequent items for triangular matrix
frequent_items = frequent_items.reset_index(drop=True)
frequent_items['new_integer'] = range(len(frequent_items))
word_to_new_int = dict(zip(frequent_items['word'], frequent_items['new_integer']))

# Step 3: Create the triangular matrix
n = len(frequent_items)
tri_matrix = np.zeros((n*(n-1)//2,), dtype=int)  # flattened upper-triangular

# Helper function to map a pair (i,j) with i < j to index in flattened array
def tri_index(i, j):
    return i * n - i*(i+1)//2 + (j - i - 1)

# Step 4: Count 2-itemsets
for row in df_3475['tokens']:
    # Keep only frequent items in this transaction
    filtered = [word_to_new_int[word] for word in row if word in word_to_new_int]
    # Count all pairs
    for i, j in combinations(sorted(filtered), 2):
        tri_matrix[tri_index(i, j)] += 1

# Step 5: Convert back to readable 2-itemsets
two_itemsets = []
counts = []

for i in range(n):
    for j in range(i+1, n):
        idx = tri_index(i, j)
        two_itemsets.append((frequent_items['word'][i], frequent_items['word'][j]))
        counts.append(tri_matrix[idx])

df_2_itemsets = pd.DataFrame({
    'pair': two_itemsets,
    'count': counts
})

# Optionally filter by minimum support
df_2_itemsets = df_2_itemsets[df_2_itemsets['count'] >= min_support].reset_index(drop=True)

print(df_2_itemsets.head())


                 pair  count
0   (criteria, women)    141
1  (criteria, people)     44
2   (criteria, would)     55
3     (criteria, men)     52
4     (criteria, one)     49


In [None]:
df_2_itemsets.sort_values(by="count", ascending=False)

Unnamed: 0,pair,count
191072,"(mods, retard)",527079
191058,"(jocks, borders)",263295
241032,"(normshits, borders)",263197
229807,"(women, men)",43722
317238,"(finances, borders)",43001
...,...,...
9077,"(road, girls)",39
107365,"(percentile, maybe)",39
246658,"(divorce, new)",39
317395,"(enter, chad)",39


In [None]:
#borders, criteria, nbsp, amp, find out how to clean