# Frequent items analysis of reddit communities

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import networkx as nx
import nltk 
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
import html
from itertools import combinations
from collections import Counter
import math
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/livdreyerjohansen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Dataload and cleaning

We analysize what frequent items we may see in the reddit clusters found in 05_SemanticClustering.ipynb. The output of the analysis performed in aforementioned notebook is organized in a csv file with columns: id (user), text, and community.


In [2]:
df_raw = pd.read_csv('post_clusters.csv')

In [3]:
df_raw = df_raw.rename(columns={"label": "community"})
df_raw = df_raw.rename(columns={"user": "id"})

To ensure we do not include posts that are either deleted ("\[deleted\]") or removed ("\[removed\]"), both basic reddit features that happen independently of what forum you are in, we remove both. Furthermore, we remove each post that was removed by a bot, which is clear in the text which the bot uses to explain why a post or comment is deleted. We then construct a dataframe with all posts and their community (and original poster (OP in reddit linguistics)).

In [4]:
# Expected columns in CSV:
# id, text, community
# (tell me if they differ)
df_raw.drop(df_raw[df_raw["community"] == -1].index, inplace=True)

rows = []
rows_count = 0
allowed_rows = 0

for _, r in df_raw.iterrows():
    rows_count += 1
    post = r["text"]

    # Skip empty, deleted/removed posts, or posts containing the bot line
    if post and post not in ['[deleted]', '[removed]'] and \
       "*I am a bot, and this action was performed automatically." not in post:

        allowed_rows += 1

        rows.append({
            "id": r["id"],
            "text": post,
            "community": r["community"]
        })

# Build the dataframe exactly like before
df = pd.DataFrame(rows)

print(df.head())

print("\nNumber of original posts:")
print(f"{rows_count:,}")
print("Number of removed posts:")
print(f"{rows_count-allowed_rows:,}")
print("Number of posts in dataframe:")
print(f"{allowed_rows:,}")

             id                                               text  community
0  9.290107e+08  great now you live with a hole in your head fo...         44
1  9.290107e+08  what? i mean i get where you're comming from b...          9
2  9.290107e+08  chaggot? more like that fucking weirdo who sta...          9
3  9.290107e+08  but then i'd have to leave too and you wouldnt...        179
4  9.290107e+08  wait what's the deal with that guy? he's prett...        114

Number of original posts:
1,102,030
Number of removed posts:
0
Number of posts in dataframe:
1,102,030


We identify the top-6 largest communities in terms of posts to continue working with only them.

In [5]:
communities = df['community'].unique().tolist()
communities_dict = dict.fromkeys(communities, 0)

for index, row in df.iterrows():
    communities_dict[row['community']] += 1

Filter the dataframe to only contain posts from top-6 communities

In [6]:
comms_list = list(sorted( ((v,k) for k,v in communities_dict.items()), reverse=True))
comms_list = comms_list[:6]
top_6_communities = [item[1] for item in comms_list]

df_filtered = df[df['community'].isin(top_6_communities)].copy()

print("\nTop 6 communities by number of posts (posts, community):", comms_list)

print(f"Number of posts in top 6 communities: {len(df_filtered):,}")

print(f"Number of posts removed based on non identity in top 6: {(len(df) - len(df_filtered)):,}")



Top 6 communities by number of posts (posts, community): [(95532, 9), (93385, 130), (54467, 81), (51607, 94), (32530, 204), (21818, 198)]
Number of posts in top 6 communities: 349,339
Number of posts removed based on non identity in top 6: 752,691


## Stop words

We filter out parts of post that we deem have little semantic value. We aim to find frequent items and frequent itemsets (item pairs), and would assume that stop words regularly occur in more than 1% of baskets. As we are working with online fora, we chose to add certain slang-terms as stop words. We furthermore remove:

- html entities
- URL's
- non-text artifacts (such as "/", "?", "!" etc.)
- remaining "removed" and "deleted" artifacts that were not removed in the previous code block due to the way the post was loaded
- short words (length of 2 or less)

Additionally, we make all words lowercase to steamline and tokenize by word (meaning each word will be its own token)

In [7]:
# tokenize and clean text data

stop_words = set(stopwords.words('english'))

# extend basic english stopwords with slang terms
extra_stops = {
    'lol', 'xd', 'haha', 'hahaah', 'omg', 'u', 'ur', 'im', 'ive', 'idk', 
    'dont', 'cant', 'wont', 'aint', 'ya', 'tho', 'tho', 'nah', 'btw', 
    'like', 'yeah', 'yep', 'ok', 'okay', 'pls', 'please', 'get'
}
stop_words.update(extra_stops)

def preprocess_text(text):
    if not isinstance(text, str):
        return []
    # decode HTML entities: &amp; → &, &#x200B; → zero-width space, etc.
    text = html.unescape(text)
    # lowercase
    text = text.lower()
    # remove URLs
    text = re.sub(r"http\S+|www\S+", " ", text)
    # keep only letters and spaces
    text = re.sub(r"[^a-z\s]", " ", text)
    # tokenize by whitespace
    tokens = text.split()
    # remove stopwords and very short tokens
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    if len(tokens) == 1 and tokens[0] in {"removed", "deleted"}:
        return []
    return tokens

df_filtered["tokens"] = df_filtered["text"].apply(preprocess_text)
df_filtered["n_tokens"] = df_filtered["tokens"].apply(len)

df_filtered.head()

Unnamed: 0,id,text,community,tokens,n_tokens
1,929010700.0,what? i mean i get where you're comming from b...,9,"[mean, comming, really, inspire, respect, chads]",6
2,929010700.0,chaggot? more like that fucking weirdo who sta...,9,"[chaggot, fucking, weirdo, stalks, grindr, gay...",21
5,929010700.0,isn't that the opposite of incels? she has so ...,130,"[opposite, incels, manny, options, settle, cre...",7
7,929010700.0,what's so bad about hiring incels anyways? wou...,130,"[bad, hiring, incels, anyways, productive, acc...",13
8,929010700.0,hey she deserved what's coming for her. can't ...,198,"[hey, deserved, coming, feel, bad]",5


# Frequent items and the A-priori algorithm

We process the tokenized posts by identifying the amount of unique tokens for each community.

In [8]:
# build token statistics for each of the top 6 communities

community_token_stats = {}

for community_id in top_6_communities:
    df_comm = df_filtered[df_filtered["community"] == community_id]

    # flatten all tokens for this community
    all_tokens = []
    for tokens in df_comm["tokens"]:
        all_tokens.extend(tokens)

    unique_tokens = set(all_tokens)

    community_token_stats[community_id] = {
        "n_tokens": len(all_tokens),
        "n_unique_tokens": len(unique_tokens),       
        "unique_tokens": unique_tokens 
    }


for cid in top_6_communities:
    print(f"Community {cid}:")
    print(f"  Total tokens: {community_token_stats[cid]['n_tokens']:,}")
    print(f"  Unique tokens: {community_token_stats[cid]['n_unique_tokens']:,}")

Community 9:
  Total tokens: 1,342,981
  Unique tokens: 49,183
Community 130:
  Total tokens: 1,868,461
  Unique tokens: 52,898
Community 81:
  Total tokens: 868,501
  Unique tokens: 38,103
Community 94:
  Total tokens: 762,842
  Unique tokens: 31,028
Community 204:
  Total tokens: 681,874
  Unique tokens: 27,089
Community 198:
  Total tokens: 253,499
  Unique tokens: 19,440


## First pass of the A-priori algorithm

In the first pass of the A-priori algoritm, we initialize a dataframe for each of the communities. In this dataframe, we will store each of the unique tokens found previously, assign them each an integer from 0 to n-1 (number of unique tokens), and count how many baskets (posts) the item (token) appears in. It is important to note that we do not count the total occurrence of the token but only the amount of posts it appears in. In Mining of Massive Datasets, Section 6.2.2, the first pass is described as labeling integers 1 to n, but to keep it within the python framework, we label 0 to n-1 and mentioned.

In [9]:
apriori_tables = {}

for cid in top_6_communities:
    # get df for this community
    df_comm = df_filtered[df_filtered["community"] == cid]
    unique_tokens = list(community_token_stats[cid]["unique_tokens"])

    # apriori table
    df_apriori = pd.DataFrame({
        "word": unique_tokens,
        "integer": range(len(unique_tokens))
    })

    # give each word an integer from 0 to n-1
    word_to_int = dict(zip(df_apriori["word"], df_apriori["integer"]))

    # count posts that contain each token
    array_of_counts = np.zeros(len(unique_tokens), dtype=int)

    for tokens in df_comm["tokens"]:
        for token in set(tokens):             
            array_of_counts[word_to_int[token]] += 1

    df_apriori["count"] = array_of_counts

    # save in dict
    apriori_tables[cid] = df_apriori


In [10]:
# print top 5 tokens by count for each community
for cid, df_apriori in apriori_tables.items():
    print(f"\nCommunity {cid}")
    print(df_apriori.sort_values(by="count", ascending=False).head(5))


Community 9
        word  integer  count
43652   chad    43652  70535
44159  chads    44159  18492
3403   women     3403  12601
28993  would    28993   9553
42656   even    42656   8937

Community 130
         word  integer  count
35014   incel    35014  52750
5465   incels     5465  45845
3707    women     3707  13515
9308   people     9308  13499
31114   would    31114  10744

Community 81
            word  integer  count
9257       white     9257  19078
2599       women     2599  10160
34732      black    34732   9942
29224      asian    29224   8863
10580  blackpill    10580   7791

Community 94
         word  integer  count
28412  height    28412  16448
25648   short    25648  11274
4627     tall     4627   8733
2125    women     2125   7709
29841     men    29841   5939

Community 204
             word  integer  count
19739        ugly    19739  16418
13225       looks    13225   8750
4778       people     4778   7296
1857        women     1857   6437
26975  attractive    26975 

## Between the passes of A-priori

We create frequency tables where we assign each word an integer from 1-m, where m = number of frequent singletons (words), if the support of the word => 1%. In other words, it must appear in 1% or more of the baskets. If the word is not frequent, we assign it 0.

In [11]:

freq_tables = {}

threshold_factor = 0.01  

for cid, df_apriori in apriori_tables.items():

    # threshold is 1% of posts in that community
    threshold = threshold_factor * len(df_filtered[df_filtered["community"] == cid])

    frequent_map = np.zeros(len(df_apriori), dtype=int)
    new_id = 1

    for old_id, count in enumerate(df_apriori['count']):
        if count >= threshold:
            frequent_map[old_id] = new_id
            new_id += 1
        else:
            frequent_map[old_id] = 0

    # add freq_integer column
    df_apriori['freq_integer'] = frequent_map

    # store only frequent items in new dictionary
    df_freq = df_apriori[df_apriori['freq_integer'] != 0].copy()
    freq_tables[cid] = df_freq

    print(f"\nCommunity {cid} frequent items:")
    print(df_freq)



Community 9 frequent items:
             word  integer  count  freq_integer
309         thing      309   2255             1
505          ever      505   1957             2
782           see      782   3338             3
1005         date     1005   1658             4
1037        wants     1037   1843             5
...           ...      ...    ...           ...
47840         way    47840   3022           195
47989         try    47989   1408           196
48074        come    48074   1391           197
48672      dating    48672   1342           198
48954  attractive    48954   2478           199

[199 rows x 4 columns]

Community 130 frequent items:
             word  integer  count  freq_integer
348         thing      348   3964             1
552          ever      552   2964             2
873           see      873   5461             3
1103         date     1103   1656             4
1141        wants     1141   1215             5
...           ...      ...    ...           ...
5196

## Second pass of the A-priori algorithm

For the second pass, we first find all pairs of frequent words from the previous dataframes. We then create pairs of those, making sure to remove duplicates. We apply the support threshold of 1% here as well.

In [12]:
pair_tables = {}   # store results for each community

for cid in top_6_communities:

    print(f"\nCommunity {cid}")

    # pull posts for this community
    df_comm = df_filtered[df_filtered["community"] == cid]
    N = len(df_comm)

    # fetch frequent 1-itemset for this community
    df_freq = freq_tables[cid]
    frequent_words_set = set(df_freq["word"])

    # counter for all frequent pairs
    pair_counter = Counter()

    # iterate over all posts
    for tokens in df_comm["tokens"]:
        # keep only frequent tokens
        frequent_tokens = [t for t in tokens if t in frequent_words_set]

        # deduplicate within a post
        unique_tokens = set(frequent_tokens)

        # count each 2-item combination in this post
        for pair in combinations(unique_tokens, 2):
            pair_counter[tuple(sorted(pair))] += 1

    # convert counter → dataframe
    df_pairs = pd.DataFrame(pair_counter.items(), columns=["item_set", "count"])

    # threshold for frequent 2-itemsets (1% of posts)
    threshold = math.ceil(0.01 * N)
    df_pairs = df_pairs[df_pairs["count"] >= threshold]

    # store
    pair_tables[cid] = df_pairs

    # print summary
    print("Top pairs:")
    print(df_pairs.sort_values(by="count", ascending=False).head())
    print("Total pairs where Support(I) => s of 0.01:", len(df_pairs))


Community 9
Top pairs:
          item_set  count
64   (chad, women)   9652
337  (chad, would)   7643
59    (chad, even)   6866
292   (chad, fuck)   5460
890   (chad, want)   5359
Total pairs where Support(I) => s of 0.01: 211

Community 130
Top pairs:
              item_set  count
696   (incels, people)   8725
132    (incels, women)   8261
1003   (incel, incels)   8194
1266    (incel, women)   7332
529    (incel, people)   7214
Total pairs where Support(I) => s of 0.01: 584

Community 81
Top pairs:
            item_set  count
718   (white, women)   5521
723   (asian, white)   3973
1284  (black, white)   3854
800     (men, white)   3565
782     (men, women)   3446
Total pairs where Support(I) => s of 0.01: 226

Community 94
Top pairs:
            item_set  count
11    (face, height)   3373
227  (height, women)   3338
164   (short, women)   3123
32      (men, women)   3052
182     (men, short)   3052
Total pairs where Support(I) => s of 0.01: 326

Community 204
Top pairs:
             i

# A-priori using library

To further validate the above results, we also implemented the A-priori algorithm using mlxtend. We find that the results of using the mlxtend framwork are congruent with the results found by implementing the A-priori algorithm as described in Mining of Massive Datasets.

In [13]:
te = TransactionEncoder()
encoded_tables = {} 

for cid in top_6_communities:
    print(f"\nCommunity {cid}")
    
    freq_words = set(freq_tables[cid]["word"])
    
    df_comm = df_filtered[df_filtered["community"] == cid]
    transactions = [
        [t for t in tokens if t in freq_words]
        for tokens in df_comm["tokens"]
    ]

    te_array = te.fit(transactions).transform(transactions)
    df_encoded = pd.DataFrame(te_array, columns=te.columns_)
    
    encoded_tables[cid] = df_encoded
    
    print(f"Number of frequent items (singletons) with Support(I) => s of 0.01: {df_encoded.shape[1]}")



Community 9
Number of frequent items (singletons) with Support(I) => s of 0.01: 199

Community 130
Number of frequent items (singletons) with Support(I) => s of 0.01: 298

Community 81
Number of frequent items (singletons) with Support(I) => s of 0.01: 224

Community 94
Number of frequent items (singletons) with Support(I) => s of 0.01: 205

Community 204
Number of frequent items (singletons) with Support(I) => s of 0.01: 318

Community 198
Number of frequent items (singletons) with Support(I) => s of 0.01: 190


In [14]:

frequent_2_itemsets_by_community = {}

for cid in top_6_communities:
    print(f"\nCommunity {cid}")
    
    df_encoded = encoded_tables[cid] 
    
    frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)
    
    # filter to only 2-itemsets
    frequent_2_itemsets = frequent_itemsets[
        frequent_itemsets['itemsets'].apply(lambda x: len(x) == 2)
    ].copy()
    
    frequent_2_itemsets_by_community[cid] = frequent_2_itemsets
    
    print("Number of frequent 2-itemsets:", len(frequent_2_itemsets))
    print("Top 5 pairs:")
    print(frequent_2_itemsets.sort_values(by="support", ascending=False).head())



Community 9
Number of frequent 2-itemsets: 211
Top 5 pairs:
      support       itemsets
344  0.101034  (women, chad)
347  0.080005  (chad, would)
231  0.071871   (even, chad)
247  0.057154   (fuck, chad)
338  0.056096   (want, chad)

Community 130
Number of frequent 2-itemsets: 584
Top 5 pairs:
      support          itemsets
647  0.093430  (people, incels)
708  0.088462   (women, incels)
498  0.087744   (incel, incels)
599  0.078514    (incel, women)
538  0.077250   (incel, people)

Community 81
Number of frequent 2-itemsets: 226
Top 5 pairs:
      support        itemsets
445  0.101364  (women, white)
251  0.072943  (asian, white)
286  0.070758  (black, white)
378  0.065452    (white, men)
379  0.063268    (women, men)

Community 94
Number of frequent 2-itemsets: 326
Top 5 pairs:
      support         itemsets
270  0.065359   (height, face)
395  0.064681  (height, women)
500  0.060515   (women, short)
445  0.059139     (women, men)
436  0.059139     (short, men)

Community 204
Numbe