# Frequent items analysis of reddit communities

In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import networkx as nx
import nltk 
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
import html
from itertools import combinations
from collections import Counter
import math
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/livdreyerjohansen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Dataload and cleaning

To analysize the what frequent items we may see in the reddit communities found in 03_NetworkAnalysis.ipynb, we must first load the graph with the added attributes, that tell what community each node belongs in and the posts created by each node. We have chosen to only look at the 6 largest communities, by number of nodes, as the distribution of nodes / community is very heavly right skewed.



In [41]:
#load graph

G = nx.read_gml('FINAL_reddit_graph_with_louvain_communities.gml')

To ensure we do not include posts that are either deleted ("\[deleted\]") or removed ("\[removed\]"), both basic reddit features that happen independently of what forum you are in, we remove both. Furthermore, we remove each post that was removed by a bot, which is clear in the text which the bot uses to explain why a post or comment is deleted. We then construct a dataframe with all posts and their community (and original poster (OP in reddit linguistics)).

In [42]:
rows = []
rows_count = 0
allowed_rows = 0

for node, data in G.nodes(data=True):
    community = data.get("community")
    posts_dict = data.get("posts", {})

    # Ensure it's a dictionary
    if not isinstance(posts_dict, dict):
        posts_dict = {"default": posts_dict}

    # Loop through each list of posts in the dictionary
    for key, posts in posts_dict.items():
        if not isinstance(posts, list):
            posts = [posts]

        for post in posts:
            rows_count += 1
            # Skip empty, deleted/removed posts, or posts containing the bot line
            if post and post not in ['[deleted]', '[removed]'] and \
               "*I am a bot, and this action was performed automatically." not in post:
                allowed_rows += 1
                rows.append({
                    "id": node,
                    "text": post,
                    "community": community
                })

df = pd.DataFrame(rows)

print(df.head())

print("\nNumber of original posts:")
print(f"{rows_count:,}")
print("Number of removed posts:")
print(f"{rows_count-allowed_rows:,}")
print("Number of posts in dataframe:")
print(f"{allowed_rows:,}")

  id                                               text  community
0  1  "Huh it's still not legalized yet. America is ...        130
1  1  "Hey charisma helps. Everybody wants to sleep ...        130
2  1  Aren't the jedis not really good guys though? ...        130
3  1  Wait but ferb is the better looking one with a...        130
4  1  Great now you live with a hole in your head fo...        130

Number of original posts:
2,664,156
Number of removed posts:
158,251
Number of posts in dataframe:
2,505,905


We identify the top-6 largest communities in terms of nodes to continue working with only them.

In [43]:
communities = df['community'].unique().tolist()
communities_dict = dict.fromkeys(communities, 0)

for index, row in df.iterrows():
    communities_dict[row['community']] += 1

Filter the dataframe to only contain posts from top-6 communities

In [44]:
comms_list = list(sorted( ((v,k) for k,v in communities_dict.items()), reverse=True))
comms_list = comms_list[:6]
top_6_communities = [item[1] for item in comms_list]

df_filtered = df[df['community'].isin(top_6_communities)].copy()

print("\nTop 6 communities by number of posts (posts, community):", comms_list)

print(f"Number of posts in top 6 communities: {len(df_filtered):,}")

print(f"Number of posts removed based on non identity in top 6: {(len(df) - len(df_filtered)):,}")



Top 6 communities by number of posts (posts, community): [(644383, 130), (635828, 89), (630172, 129), (399843, 191), (154109, 220), (38490, 188)]
Number of posts in top 6 communities: 2,502,825
Number of posts removed based on non identity in top 6: 3,080


## Stop words

We filter out parts of post that we deem have little semantic value. We aim to find frequent items and frequent itemsets (item pairs), and would assume that stop words regularly occur in more than 1% of baskets. As we are working with online fora, we chose to add certain slang-terms as stop words. We furthermore remove:

- html entities
- URL's
- non-text artifacts (such as "/", "?", "!" etc.)
- remaining "removed" and "deleted" artifacts that were not removed in the previous code block due to the way the post was loaded
- short words (length of 2 or less)

Additionally, we make all words lowercase to steamline and tokenize by word (meaning each word will be its own token)

In [45]:
# tokenize and clean text data

stop_words = set(stopwords.words('english'))

# extend basic english stopwords with slang terms
extra_stops = {
    'lol', 'xd', 'haha', 'hahaah', 'omg', 'u', 'ur', 'im', 'ive', 'idk', 
    'dont', 'cant', 'wont', 'aint', 'ya', 'tho', 'tho', 'nah', 'btw', 
    'like', 'yeah', 'yep', 'ok', 'okay', 'pls', 'please'
}
stop_words.update(extra_stops)

def preprocess_text(text):
    if not isinstance(text, str):
        return []
    # decode HTML entities: &amp; → &, &#x200B; → zero-width space, etc.
    text = html.unescape(text)
    # lowercase
    text = text.lower()
    # remove URLs
    text = re.sub(r"http\S+|www\S+", " ", text)
    # keep only letters and spaces
    text = re.sub(r"[^a-z\s]", " ", text)
    # tokenize by whitespace
    tokens = text.split()
    # remove stopwords and very short tokens
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    if len(tokens) == 1 and tokens[0] in {"removed", "deleted"}:
        return []
    return tokens

df_filtered["tokens"] = df_filtered["text"].apply(preprocess_text)
df_filtered["n_tokens"] = df_filtered["tokens"].apply(len)

df_filtered.head()

Unnamed: 0,id,text,community,tokens,n_tokens
0,1,"""Huh it's still not legalized yet. America is ...",130,"[huh, still, legalized, yet, america, weirdly,...",7
1,1,"""Hey charisma helps. Everybody wants to sleep ...",130,"[hey, charisma, helps, everybody, wants, sleep...",11
2,1,Aren't the jedis not really good guys though? ...,130,"[jedis, really, good, guys, though, protect, s...",24
3,1,Wait but ferb is the better looking one with a...,130,"[wait, ferb, better, looking, one, actual, gam...",13
4,1,Great now you live with a hole in your head fo...,130,"[great, live, hole, head, eternity]",5


# Frequent items and the A-priori algorithm

We process the tokenized posts by identifying the amount of unique tokens for each community.

In [46]:
# build token statistics for each of the top 6 communities

community_token_stats = {}

for community_id in top_6_communities:
    df_comm = df_filtered[df_filtered["community"] == community_id]

    # flatten all tokens for this community
    all_tokens = []
    for tokens in df_comm["tokens"]:
        all_tokens.extend(tokens)

    unique_tokens = set(all_tokens)

    community_token_stats[community_id] = {
        "n_tokens": len(all_tokens),
        "n_unique_tokens": len(unique_tokens),       
        "unique_tokens": unique_tokens 
    }


for cid in top_6_communities:
    print(f"Community {cid}:")
    print(f"  Total tokens: {community_token_stats[cid]['n_tokens']:,}")
    print(f"  Unique tokens: {community_token_stats[cid]['n_unique_tokens']:,}")

Community 130:
  Total tokens: 8,227,111
  Unique tokens: 140,320
Community 89:
  Total tokens: 7,790,340
  Unique tokens: 149,732
Community 129:
  Total tokens: 6,759,663
  Unique tokens: 146,731
Community 191:
  Total tokens: 4,754,898
  Unique tokens: 89,035
Community 220:
  Total tokens: 1,236,945
  Unique tokens: 59,694
Community 188:
  Total tokens: 1,315,555
  Unique tokens: 33,101


## First pass of the A-priori algorithm

In the first pass of the A-priori algoritm, we initialize a dataframe for each of the communities. In this dataframe, we will store each of the unique tokens found previously, assign them each an integer from 0 to n-1 (number of unique tokens), and count how many baskets (posts) the item (token) appears in. It is important to note that we do not count the total occurrence of the token but only the amount of posts it appears in. In Mining of Massive Datasets, Section 6.2.2, the first pass is described as labeling integers 1 to n, but to keep it within the python framework, we label 0 to n-1 and mentioned.

In [47]:
apriori_tables = {}

for cid in top_6_communities:
    # get df for this community
    df_comm = df_filtered[df_filtered["community"] == cid]
    unique_tokens = list(community_token_stats[cid]["unique_tokens"])

    # apriori table
    df_apriori = pd.DataFrame({
        "word": unique_tokens,
        "integer": range(len(unique_tokens))
    })

    # give each word an integer from 0 to n-1
    word_to_int = dict(zip(df_apriori["word"], df_apriori["integer"]))

    # count posts that contain each token
    array_of_counts = np.zeros(len(unique_tokens), dtype=int)

    for tokens in df_comm["tokens"]:
        for token in set(tokens):             
            array_of_counts[word_to_int[token]] += 1

    df_apriori["count"] = array_of_counts

    # save in dict
    apriori_tables[cid] = df_apriori


In [48]:
# print top 5 tokens by count for each community
for cid, df_apriori in apriori_tables.items():
    print(f"\nCommunity {cid}")
    print(df_apriori.sort_values(by="count", ascending=False).head(5))


Community 130
          word  integer  count
44224    women    44224  64054
30485      get    30485  54969
134754  people   134754  54841
137670   would   137670  45478
98667     even    98667  41610

Community 89
          word  integer  count
46905    women    46905  61740
32425      get    32425  55819
143859  people   143859  47917
146996   would   146996  41218
105328    even   105328  39135

Community 129
          word  integer  count
46115    women    46115  49985
31846      get    31846  49251
102973    even   102973  37953
140853  people   140853  36781
144013   would   144013  35755

Community 191
         word  integer  count
19336     get    19336  37192
28003   women    28003  36120
87399   would    87399  28225
85470  people    85470  27657
62613    even    62613  27309

Community 220
         word  integer  count
26143     get    26143   8165
37703   women    37703   7367
24451    even    24451   6899
57474   would    57474   5976
54904  people    54904   5766

Communi

## Between the passes of A-priori

We create frequency tables where we assign each word an integer from 1-m, where m = number of frequent singletons (words), if the support of the word => 1%. In other words, it must appear in 1% or more of the baskets. If the word is not frequent, we assign it 0.

In [49]:

freq_tables = {}

threshold_factor = 0.01  

for cid, df_apriori in apriori_tables.items():

    # threshold is 1% of posts in that community
    threshold = threshold_factor * len(df_filtered[df_filtered["community"] == cid])

    frequent_map = np.zeros(len(df_apriori), dtype=int)
    new_id = 1

    for old_id, count in enumerate(df_apriori['count']):
        if count >= threshold:
            frequent_map[old_id] = new_id
            new_id += 1
        else:
            frequent_map[old_id] = 0

    # add freq_integer column
    df_apriori['freq_integer'] = frequent_map

    # store only frequent items in new dictionary
    df_freq = df_apriori[df_apriori['freq_integer'] != 0].copy()
    freq_tables[cid] = df_freq

    print(f"\nCommunity {cid} frequent items:")
    print(df_freq)



Community 130 frequent items:
              word  integer  count  freq_integer
159          place      159   7391             1
600           used      600   6884             2
1597           end     1597   6460             3
3313    attractive     3313  13017             4
3731         looks     3731  18848             5
...            ...      ...    ...           ...
136959        also   136959  19237           173
137670       would   137670  45478           174
137781         sub   137781  16094           175
139255         guy   139255  21330           176
139807        shit   139807  24811           177

[177 rows x 4 columns]

Community 89 frequent items:
              word  integer  count  freq_integer
167          place      167   6568             1
626           used      626   6731             2
1762           end     1762   6531             3
3531    attractive     3531  11692             4
3980         looks     3980  18198             5
...            ...      ...    ..

## Second pass of the A-priori algorithm

For the second pass, we first find all pairs of frequent words from the previous dataframes. We then create pairs of those, making sure to remove duplicates. We apply the support threshold of 1% here as well.

In [50]:
pair_tables = {}   # store results for each community

for cid in top_6_communities:

    print(f"\nCommunity {cid}")

    # pull posts for this community
    df_comm = df_filtered[df_filtered["community"] == cid]
    N = len(df_comm)

    # fetch frequent 1-itemset for this community
    df_freq = freq_tables[cid]
    frequent_words_set = set(df_freq["word"])

    # counter for all frequent pairs
    pair_counter = Counter()

    # iterate over all posts
    for tokens in df_comm["tokens"]:
        # keep only frequent tokens
        frequent_tokens = [t for t in tokens if t in frequent_words_set]

        # deduplicate within a post
        unique_tokens = set(frequent_tokens)

        # count each 2-item combination in this post
        for pair in combinations(unique_tokens, 2):
            pair_counter[tuple(sorted(pair))] += 1

    # convert counter → dataframe
    df_pairs = pd.DataFrame(pair_counter.items(), columns=["item_set", "count"])

    # threshold for frequent 2-itemsets (1% of posts)
    threshold = math.ceil(0.01 * N)
    df_pairs = df_pairs[df_pairs["count"] >= threshold]

    # store
    pair_tables[cid] = df_pairs

    # print summary
    print("Top pairs:")
    print(df_pairs.sort_values(by="count", ascending=False).head())
    print("Total pairs where Support(I) => s of 0.01:", len(df_pairs))


Community 130
Top pairs:
             item_set  count
923      (men, women)  17992
409      (get, women)  12466
2374  (people, women)  11308
96      (get, people)  11257
963   (people, think)  10248
Total pairs where Support(I) => s of 0.01: 39

Community 89
Top pairs:
             item_set  count
35       (men, women)  17505
594      (get, women)  12642
557     (get, people)  10492
127   (people, women)   9955
1084    (even, women)   9487
Total pairs where Support(I) => s of 0.01: 30

Community 129
Top pairs:
           item_set  count
533    (men, women)  14147
1889   (get, women)   9308
146     (even, get)   8102
1187  (even, women)   7761
680   (get, people)   7473
Total pairs where Support(I) => s of 0.01: 11

Community 191
Top pairs:
           item_set  count
2893   (men, women)   9671
10     (get, women)   7181
2209    (even, get)   6164
543   (get, people)   5949
4144  (even, women)   5711
Total pairs where Support(I) => s of 0.01: 23

Community 220
Top pairs:
        item_se

# A-priori using library

To further validate the above results, we also implemented the A-priori algorithm using mlxtend. We find that the results of using the mlxtend framwork are congruent with the results found by implementing the A-priori algorithm as described in Mining of Massive Datasets.

In [51]:
te = TransactionEncoder()
encoded_tables = {} 

for cid in top_6_communities:
    print(f"\nCommunity {cid}")
    
    freq_words = set(freq_tables[cid]["word"])
    
    df_comm = df_filtered[df_filtered["community"] == cid]
    transactions = [
        [t for t in tokens if t in freq_words]
        for tokens in df_comm["tokens"]
    ]

    te_array = te.fit(transactions).transform(transactions)
    df_encoded = pd.DataFrame(te_array, columns=te.columns_)
    
    encoded_tables[cid] = df_encoded
    
    print(f"Number of frequent items (singletons) with Support(I) => s of 0.01: {df_encoded.shape[1]}")



Community 130
Number of frequent items (singletons) with Support(I) => s of 0.01: 177

Community 89
Number of frequent items (singletons) with Support(I) => s of 0.01: 171

Community 129
Number of frequent items (singletons) with Support(I) => s of 0.01: 145

Community 191
Number of frequent items (singletons) with Support(I) => s of 0.01: 168

Community 220
Number of frequent items (singletons) with Support(I) => s of 0.01: 94

Community 188
Number of frequent items (singletons) with Support(I) => s of 0.01: 543


In [52]:

frequent_2_itemsets_by_community = {}

for cid in top_6_communities:
    print(f"\nCommunity {cid}")
    
    df_encoded = encoded_tables[cid] 
    
    frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)
    
    # filter to only 2-itemsets
    frequent_2_itemsets = frequent_itemsets[
        frequent_itemsets['itemsets'].apply(lambda x: len(x) == 2)
    ].copy()
    
    frequent_2_itemsets_by_community[cid] = frequent_2_itemsets
    
    print("Number of frequent 2-itemsets:", len(frequent_2_itemsets))
    print("Top 5 pairs:")
    print(frequent_2_itemsets.sort_values(by="support", ascending=False).head())



Community 130
Number of frequent 2-itemsets: 39
Top 5 pairs:
      support         itemsets
200  0.027921     (women, men)
190  0.019346     (women, get)
207  0.017549  (people, women)
187  0.017469    (people, get)
205  0.015904  (people, think)

Community 89
Number of frequent 2-itemsets: 30
Top 5 pairs:
      support         itemsets
189  0.027531     (women, men)
183  0.019883     (women, get)
180  0.016501    (people, get)
194  0.015657  (people, women)
175  0.014921    (women, even)

Community 129
Number of frequent 2-itemsets: 11
Top 5 pairs:
      support       itemsets
151  0.022449   (women, men)
149  0.014771   (women, get)
145  0.012857    (even, get)
147  0.012316  (even, women)
148  0.011859  (people, get)

Community 191
Number of frequent 2-itemsets: 23
Top 5 pairs:
      support       itemsets
181  0.024187   (women, men)
178  0.017960   (women, get)
168  0.015416    (even, get)
175  0.014878  (people, get)
170  0.014283  (even, women)

Community 220
Number of frequent