# Frequent items analysis of reddit communities

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import networkx as nx
import nltk 
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
import html
from itertools import combinations
from collections import Counter
import math
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/livdreyerjohansen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Dataload and Cleaning

We analysize what frequent items we may see in the reddit clusters found in 05_SemanticClustering.ipynb. The output of the analysis performed in aforementioned notebook is organized in a csv file with columns: id (user), text, and community.


In [2]:
df_raw = pd.read_csv('post_clusters_new.csv')

In [3]:
df_raw = df_raw.rename(columns={"label": "community"})
df_raw = df_raw.rename(columns={"user": "id"})

To ensure we do not include posts that are either deleted ("\[deleted\]") or removed ("\[removed\]"), both basic reddit features that happen independently of what forum you are in, we remove both. Furthermore, we remove each post that was removed by a bot, which is clear in the text which the bot uses to explain why a post or comment is deleted. We then construct a dataframe with all posts and their community (and original poster (OP in reddit linguistics)).

In [4]:
# Expected columns in CSV:
# id, text, community
# (tell me if they differ)
df_raw.drop(df_raw[df_raw["community"] == -1].index, inplace=True)

rows = []
rows_count = 0
allowed_rows = 0

for _, r in df_raw.iterrows():
    rows_count += 1
    post = r["text"]

    # Skip empty, deleted/removed posts, or posts containing the bot line
    if post and post not in ['[deleted]', '[removed]'] and \
       "*I am a bot, and this action was performed automatically." not in post:

        allowed_rows += 1

        rows.append({
            "id": r["id"],
            "text": post,
            "community": r["community"]
        })

# Build the dataframe exactly like before
df = pd.DataFrame(rows)

print(df.head())

print("\nNumber of original posts:")
print(f"{rows_count:,}")
print("Number of removed posts:")
print(f"{rows_count-allowed_rows:,}")
print("Number of posts in dataframe:")
print(f"{allowed_rows:,}")

             id                                               text  community
0  9.290107e+08  hey charisma helps. everybody wants to sleep w...        214
1  9.290107e+08  aren't the jedis not really good guys though? ...        156
2  9.290107e+08  wait but ferb is the better looking one with a...         95
3  9.290107e+08  what? i mean i get where you're comming from b...         24
4  9.290107e+08         hey, i guess time to try out something new        146

Number of original posts:
1,119,261
Number of removed posts:
0
Number of posts in dataframe:
1,119,261


We identify the top-6 largest communities in terms of posts to continue working with only them.

In [5]:
communities = df['community'].unique().tolist()
communities_dict = dict.fromkeys(communities, 0)

for index, row in df.iterrows():
    communities_dict[row['community']] += 1

Filter the dataframe to only contain posts from top-6 communities

In [6]:
comms_list = list(sorted( ((v,k) for k,v in communities_dict.items()), reverse=True))
comms_list = comms_list[:6]
top_6_communities = [item[1] for item in comms_list]

df_filtered = df[df['community'].isin(top_6_communities)].copy()

print("\nTop 6 communities by number of posts (posts, community):", comms_list)

print(f"Number of posts in top 6 communities: {len(df_filtered):,}")

print(f"Number of posts removed based on non identity in top 6: {(len(df) - len(df_filtered)):,}")



Top 6 communities by number of posts (posts, community): [(109445, 58), (88639, 192), (79247, 24), (43864, 214), (35248, 100), (30420, 210)]
Number of posts in top 6 communities: 386,863
Number of posts removed based on non identity in top 6: 732,398


## Stop words

We filter out parts of post that we deem have little semantic value. We aim to find frequent items and frequent itemsets (item pairs), and would assume that stop words regularly occur in more than 1% of baskets. As we are working with online fora, we chose to add certain slang-terms as stop words. We furthermore remove:

- html entities
- URL's
- non-text artifacts (such as "/", "?", "!" etc.)
- remaining "removed" and "deleted" artifacts that were not removed in the previous code block due to the way the post was loaded
- short words (length of 2 or less)

Additionally, we make all words lowercase to steamline and tokenize by word (meaning each word will be its own token)

In [7]:
# tokenize and clean text data

stop_words = set(stopwords.words('english'))

# extend basic english stopwords with slang terms
extra_stops = {
    'lol', 'xd', 'haha', 'hahaah', 'omg', 'u', 'ur', 'im', 'ive', 'idk', 
    'dont', 'cant', 'wont', 'aint', 'ya', 'tho', 'tho', 'nah', 'btw', 
    'like', 'yeah', 'yep', 'ok', 'okay', 'pls', 'please', 'get'
}
stop_words.update(extra_stops)

def preprocess_text(text):
    if not isinstance(text, str):
        return []
    # decode HTML entities: &amp; → &, &#x200B; → zero-width space, etc.
    text = html.unescape(text)
    # lowercase
    text = text.lower()
    # remove URLs
    text = re.sub(r"http\S+|www\S+", " ", text)
    # keep only letters and spaces
    text = re.sub(r"[^a-z\s]", " ", text)
    # tokenize by whitespace
    tokens = text.split()
    # remove stopwords and very short tokens
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    if len(tokens) == 1 and tokens[0] in {"removed", "deleted"}:
        return []
    return tokens

df_filtered["tokens"] = df_filtered["text"].apply(preprocess_text)
df_filtered["n_tokens"] = df_filtered["tokens"].apply(len)

df_filtered.head()

Unnamed: 0,id,text,community,tokens,n_tokens
0,929010700.0,hey charisma helps. everybody wants to sleep w...,214,"[hey, charisma, helps, everybody, wants, sleep...",11
3,929010700.0,what? i mean i get where you're comming from b...,24,"[mean, comming, really, inspire, respect, chads]",6
5,929010700.0,really? i did the same with a girl friend and ...,214,"[really, girl, friend, got, laughs, one, ugly,...",9
7,929010700.0,chaggot? more like that fucking weirdo who sta...,24,"[chaggot, fucking, weirdo, stalks, grindr, gay...",21
9,929010700.0,"if it makes you feel any better, a good body c...",214,"[makes, feel, better, good, body, sex, fine, r...",16


# Frequent items and the A-priori algorithm

We process the tokenized posts by identifying the amount of unique tokens for each community.

In [8]:
# build token statistics for each of the top 6 communities

community_token_stats = {}

for community_id in top_6_communities:
    df_comm = df_filtered[df_filtered["community"] == community_id]

    # flatten all tokens for this community
    all_tokens = []
    for tokens in df_comm["tokens"]:
        all_tokens.extend(tokens)

    unique_tokens = set(all_tokens)

    community_token_stats[community_id] = {
        "n_tokens": len(all_tokens),
        "n_unique_tokens": len(unique_tokens),       
        "unique_tokens": unique_tokens 
    }


for cid in top_6_communities:
    print(f"Community {cid}:")
    print(f"  Total tokens: {community_token_stats[cid]['n_tokens']:,}")
    print(f"  Unique tokens: {community_token_stats[cid]['n_unique_tokens']:,}")

Community 58:
  Total tokens: 1,606,779
  Unique tokens: 57,962
Community 192:
  Total tokens: 1,906,858
  Unique tokens: 52,195
Community 24:
  Total tokens: 1,038,517
  Unique tokens: 42,537
Community 214:
  Total tokens: 1,024,180
  Unique tokens: 33,030
Community 100:
  Total tokens: 533,604
  Unique tokens: 24,716
Community 210:
  Total tokens: 575,470
  Unique tokens: 30,720


## First pass of the A-priori algorithm

In the first pass of the A-priori algoritm, we initialize a dataframe for each of the communities. In this dataframe, we will store each of the unique tokens found previously, assign them each an integer from 0 to n-1 (number of unique tokens), and count how many baskets (posts) the item (token) appears in. It is important to note that we do not count the total occurrence of the token but only the amount of posts it appears in. In Mining of Massive Datasets, Section 6.2.2, the first pass is described as labeling integers 1 to n, but to keep it within the python framework, we label 0 to n-1 and mentioned.

In [9]:
apriori_tables = {}

for cid in top_6_communities:
    # get df for this community
    df_comm = df_filtered[df_filtered["community"] == cid]
    unique_tokens = list(community_token_stats[cid]["unique_tokens"])

    # apriori table
    df_apriori = pd.DataFrame({
        "word": unique_tokens,
        "integer": range(len(unique_tokens))
    })

    # give each word an integer from 0 to n-1
    word_to_int = dict(zip(df_apriori["word"], df_apriori["integer"]))

    # count posts that contain each token
    array_of_counts = np.zeros(len(unique_tokens), dtype=int)

    for tokens in df_comm["tokens"]:
        for token in set(tokens):             
            array_of_counts[word_to_int[token]] += 1

    df_apriori["count"] = array_of_counts

    # save in dict
    apriori_tables[cid] = df_apriori


In [10]:
# print top 5 tokens by count for each community
for cid, df_apriori in apriori_tables.items():
    print(f"\nCommunity {cid}")
    print(df_apriori.sort_values(by="count", ascending=False).head(5))


Community 58
         word  integer  count
41146   white    41146  27910
8584    women     8584  14889
7907    black     7907  13629
25602   asian    25602  11049
43683  people    43683  10564

Community 192
         word  integer  count
48676   incel    48676  49933
32458  incels    32458  44285
39234  people    39234  13780
7857    women     7857  13309
5752    would     5752  10499

Community 24
        word  integer  count
32574   chad    32574  62386
37441  chads    37441  16060
6295   women     6295  10281
4580   would     4580   7614
37351   even    37351   7053

Community 214
             word  integer  count
27151        ugly    27151  19711
21498       looks    21498  11849
4972        women     4972  10190
24939      people    24939   9785
22871  attractive    22871   7602

Community 100
         word  integer  count
3309   height     3309  10812
22157   short    22157   8991
3626    women     3626   6354
10227    tall    10227   6254
10383     men    10383   4896

Communit

## Between the passes of A-priori

We create frequency tables where we assign each word an integer from 1-m, where m = number of frequent singletons (words), if the support of the word => 1%. In other words, it must appear in 1% or more of the baskets. If the word is not frequent, we assign it 0.

In [11]:

freq_tables = {}

threshold_factor = 0.01  

for cid, df_apriori in apriori_tables.items():

    # threshold is 1% of posts in that community
    threshold = threshold_factor * len(df_filtered[df_filtered["community"] == cid])

    frequent_map = np.zeros(len(df_apriori), dtype=int)
    new_id = 1

    for old_id, count in enumerate(df_apriori['count']):
        if count >= threshold:
            frequent_map[old_id] = new_id
            new_id += 1
        else:
            frequent_map[old_id] = 0

    # add freq_integer column
    df_apriori['freq_integer'] = frequent_map

    # store only frequent items in new dictionary
    df_freq = df_apriori[df_apriori['freq_integer'] != 0].copy()
    freq_tables[cid] = df_freq

    print(f"\nCommunity {cid} frequent items:")
    print(df_freq)



Community 58 frequent items:
             word  integer  count  freq_integer
379          find      379   2312             1
428        reason      428   1498             2
859       problem      859   1122             3
903    especially      903   1236             4
1097        every     1097   3059             5
...           ...      ...    ...           ...
56031      person    56031   1488           209
57156     someone    57156   1665           210
57225         sex    57225   1433           211
57640     getting    57640   2079           212
57747       never    57747   4076           213

[213 rows x 4 columns]

Community 192 frequent items:
               word  integer  count  freq_integer
340            find      340   3617             1
386          reason      386   2547             2
465    relationship      465   2008             3
680         virgins      680   1247             4
776         problem      776   2078             5
...             ...      ...    ...    

## Second pass of the A-priori algorithm

For the second pass, we first find all pairs of frequent words from the previous dataframes. We then create pairs of those, making sure to remove duplicates. We apply the support threshold of 1% here as well.

In [12]:
pair_tables = {}   # store results for each community

for cid in top_6_communities:

    print(f"\nCommunity {cid}")

    # pull posts for this community
    df_comm = df_filtered[df_filtered["community"] == cid]
    N = len(df_comm)

    # fetch frequent 1-itemset for this community
    df_freq = freq_tables[cid]
    frequent_words_set = set(df_freq["word"])

    # counter for all frequent pairs
    pair_counter = Counter()

    # iterate over all posts
    for tokens in df_comm["tokens"]:
        # keep only frequent tokens
        frequent_tokens = [t for t in tokens if t in frequent_words_set]

        # deduplicate within a post
        unique_tokens = set(frequent_tokens)

        # count each 2-item combination in this post
        for pair in combinations(unique_tokens, 2):
            pair_counter[tuple(sorted(pair))] += 1

    # convert counter → dataframe
    df_pairs = pd.DataFrame(pair_counter.items(), columns=["item_set", "count"])

    # threshold for frequent 2-itemsets (1% of posts)
    threshold = math.ceil(0.01 * N)
    df_pairs = df_pairs[df_pairs["count"] >= threshold]

    # store
    pair_tables[cid] = df_pairs

    # print summary
    print("Top pairs:")
    print(df_pairs.sort_values(by="count", ascending=False).head())
    print("Total pairs where Support(I) => s of 0.01:", len(df_pairs))


Community 58
Top pairs:
            item_set  count
834   (white, women)   7297
1562  (black, white)   5140
840     (men, white)   4779
831   (asian, white)   4727
893     (men, women)   4696
Total pairs where Support(I) => s of 0.01: 116

Community 192
Top pairs:
              item_set  count
33    (incels, people)   8951
155    (incels, women)   8217
891    (incel, incels)   8057
599    (incel, people)   7365
1765    (incel, women)   7263
Total pairs where Support(I) => s of 0.01: 741

Community 24
Top pairs:
          item_set  count
87   (chad, women)   8106
498  (chad, would)   6506
56    (chad, even)   5715
984   (chad, fuck)   4664
778   (chad, want)   4502
Total pairs where Support(I) => s of 0.01: 188

Community 214
Top pairs:
            item_set  count
689   (people, ugly)   4619
870    (ugly, women)   4511
872     (men, women)   3991
1042  (looks, women)   3338
890      (men, ugly)   3310
Total pairs where Support(I) => s of 0.01: 1216

Community 100
Top pairs:
           

# A-priori using library

To further validate the above results, we also implemented the A-priori algorithm using mlxtend. We find that the results of using the mlxtend framwork are congruent with the results found by implementing the A-priori algorithm as described in Mining of Massive Datasets.

In [13]:
te = TransactionEncoder()
encoded_tables = {} 

for cid in top_6_communities:
    print(f"\nCommunity {cid}")
    
    freq_words = set(freq_tables[cid]["word"])
    
    df_comm = df_filtered[df_filtered["community"] == cid]
    transactions = [
        [t for t in tokens if t in freq_words]
        for tokens in df_comm["tokens"]
    ]

    te_array = te.fit(transactions).transform(transactions)
    df_encoded = pd.DataFrame(te_array, columns=te.columns_)
    
    encoded_tables[cid] = df_encoded
    
    print(f"Number of frequent items (singletons) with Support(I) => s of 0.01: {df_encoded.shape[1]}")



Community 58
Number of frequent items (singletons) with Support(I) => s of 0.01: 213

Community 192
Number of frequent items (singletons) with Support(I) => s of 0.01: 320

Community 24
Number of frequent items (singletons) with Support(I) => s of 0.01: 182

Community 214
Number of frequent items (singletons) with Support(I) => s of 0.01: 361

Community 100
Number of frequent items (singletons) with Support(I) => s of 0.01: 214

Community 210
Number of frequent items (singletons) with Support(I) => s of 0.01: 255


In [14]:

frequent_2_itemsets_by_community = {}

for cid in top_6_communities:
    print(f"\nCommunity {cid}")
    
    df_encoded = encoded_tables[cid] 
    
    frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)
    
    # filter to only 2-itemsets
    frequent_2_itemsets = frequent_itemsets[
        frequent_itemsets['itemsets'].apply(lambda x: len(x) == 2)
    ].copy()
    
    frequent_2_itemsets_by_community[cid] = frequent_2_itemsets
    
    print("Number of frequent 2-itemsets:", len(frequent_2_itemsets))
    print("Top 5 pairs:")
    print(frequent_2_itemsets.sort_values(by="support", ascending=False).head())



Community 58
Number of frequent 2-itemsets: 116
Top 5 pairs:
      support        itemsets
326  0.066673  (white, women)
241  0.046964  (white, black)
290  0.043666    (men, white)
224  0.043191  (white, asian)
291  0.042907    (men, women)

Community 192
Number of frequent 2-itemsets: 741
Top 5 pairs:
      support          itemsets
754  0.100983  (people, incels)
816  0.092702   (incels, women)
593  0.090897   (incel, incels)
638  0.083090   (incel, people)
701  0.081939    (incel, women)

Community 24
Number of frequent 2-itemsets: 188
Top 5 pairs:
      support       itemsets
319  0.102288  (chad, women)
322  0.082098  (would, chad)
213  0.072116   (chad, even)
226  0.058854   (chad, fuck)
313  0.056810   (want, chad)

Community 214
Number of frequent 2-itemsets: 1216
Top 5 pairs:
       support        itemsets
1374  0.105303  (people, ugly)
1554  0.102841   (ugly, women)
1267  0.090986    (men, women)
1163  0.076099  (women, looks)
1261  0.075461     (men, ugly)

Community 100
Nu