# Frequent items analysis of reddit communities

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import networkx as nx
import nltk 
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
from itertools import combinations
import html

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/livdreyerjohansen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Dataload and cleaning

To analysize the what frequent items we may see in the reddit communities found in 03_NetworkAnalysis.ipynb, we must first load the graph with the added attributes, that tell what community each node belongs in and the posts created by each node. We have chosen to only look at the 6 largest communities, by number of nodes, as the distribution of nodes / community is very heavly right skewed.



In [3]:
#load graph

G = nx.read_gml('reddit_graph_with_communities.gml')

To ensure we do not include posts that are either deleted ("\[deleted\]") or removed ("\[removed\]"), both basic reddit features that happen independently of what forum you are in, we remove both. Furthermore, we remove each post that was removed by a bot, which is clear in the text which the bot uses to explain why a post or comment is deleted. We then construct a dataframe with all posts and their community (and original poster (OP in reddit linguistics)).

In [4]:
rows = []
rows_count = 0
allowed_rows = 0

for node, data in G.nodes(data=True):
    community = data.get("community")
    posts_dict = data.get("posts", {})

    # Ensure it's a dictionary
    if not isinstance(posts_dict, dict):
        posts_dict = {"default": posts_dict}

    # Loop through each list of posts in the dictionary
    for key, posts in posts_dict.items():
        if not isinstance(posts, list):
            posts = [posts]

        for post in posts:
            rows_count += 1
            # Skip empty, deleted/removed posts, or posts containing the bot line
            if post and post not in ['[deleted]', '[removed]'] and \
               "*I am a bot, and this action was performed automatically." not in post:
                allowed_rows += 1
                rows.append({
                    "id": node,
                    "text": post,
                    "community": community
                })

df = pd.DataFrame(rows)

print(df.head())


print("\nNumber of original posts:")
print(rows_count)
print("Number of removed posts:")
print(rows_count-allowed_rows)
print("Number of posts in dataframe:")
print(rows_count-allowed_rows)

  id                                               text  community
0  1  "Huh it's still not legalized yet. America is ...       5595
1  1  "Hey charisma helps. Everybody wants to sleep ...       5595
2  1  Aren't the jedis not really good guys though? ...       5595
3  1  Wait but ferb is the better looking one with a...       5595
4  1  Great now you live with a hole in your head fo...       5595

Number of original posts:
2699919
Number of removed posts:
163205
Number of posts in dataframe:
163205


We identify the top-6 largest communities in terms of nodes to continue working with only them.

In [5]:
communities = df['community'].unique().tolist()
communities_dict = dict.fromkeys(communities, 0)

for index, row in df.iterrows():
    communities_dict[row['community']] += 1

Filter the dataframe to only contain posts from top-6 communities

In [6]:
comms_list = list(sorted( ((v,k) for k,v in communities_dict.items()), reverse=True))
comms_list = comms_list[:6]
top_6_communities = [item[1] for item in comms_list]

df_filtered = df[df['community'].isin(top_6_communities)].copy()

print(f"Number of posts in top 6 communities: {len(df_filtered):,}")

print(f"Number of posts removed based on non identity in top 6: {(len(df) - len(df_filtered)):,}")


Number of posts in top 6 communities: 2,503,488
Number of posts removed based on non identity in top 6: 33,226


In [7]:
# tokenize and clean text data

stop_words = set(stopwords.words('english'))

# extend basic english stopwords with slang terms
extra_stops = {
    'lol', 'xd', 'haha', 'hahaah', 'omg', 'u', 'ur', 'im', 'ive', 'idk', 
    'dont', 'cant', 'wont', 'aint', 'ya', 'tho', 'tho', 'nah', 'btw', 
    'like', 'yeah', 'yep', 'ok', 'okay', 'pls', 'please'
}
stop_words.update(extra_stops)

def preprocess_text(text):
    if not isinstance(text, str):
        return []
    # decode HTML entities: &amp; → &, &#x200B; → zero-width space, etc.
    text = html.unescape(text)
    # lowercase
    text = text.lower()
    # remove URLs
    text = re.sub(r"http\S+|www\S+", " ", text)
    # keep only letters and spaces
    text = re.sub(r"[^a-z\s]", " ", text)
    # tokenize by whitespace
    tokens = text.split()
    # remove stopwords and very short tokens
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    if len(tokens) == 1 and tokens[0] in {"removed", "deleted"}:
        return []
    return tokens

df_filtered["tokens"] = df_filtered["text"].apply(preprocess_text)
df_filtered["n_tokens"] = df_filtered["tokens"].apply(len)

df_filtered.head()

Unnamed: 0,id,text,community,tokens,n_tokens
0,1,"""Huh it's still not legalized yet. America is ...",5595,"[huh, still, legalized, yet, america, weirdly,...",7
1,1,"""Hey charisma helps. Everybody wants to sleep ...",5595,"[hey, charisma, helps, everybody, wants, sleep...",11
2,1,Aren't the jedis not really good guys though? ...,5595,"[jedis, really, good, guys, though, protect, s...",24
3,1,Wait but ferb is the better looking one with a...,5595,"[wait, ferb, better, looking, one, actual, gam...",13
4,1,Great now you live with a hole in your head fo...,5595,"[great, live, hole, head, eternity]",5


# Apriori algorithm

Construct table as described in section 6.2.5 

In [8]:
# build token statistics for each of the top 6 communities

community_token_stats = {}

for community_id in top_6_communities:
    df_comm = df_filtered[df_filtered["community"] == community_id]

    # flatten all tokens for this community
    all_tokens = []
    for tokens in df_comm["tokens"]:
        all_tokens.extend(tokens)

    unique_tokens = set(all_tokens)

    community_token_stats[community_id] = {
        "n_tokens": len(all_tokens),
        "n_unique_tokens": len(unique_tokens),       
        "unique_tokens": unique_tokens 
    }

# print summary
for cid in top_6_communities:
    print(f"Community {cid}:")
    print(f"   Total tokens: {community_token_stats[cid]['n_tokens']:,}")
    print(f"   Unique tokens: {community_token_stats[cid]['n_unique_tokens']:,}")
    print()

Community 5595:
   Total tokens: 8,557,801
   Unique tokens: 143,800

Community 8821:
   Total tokens: 7,075,460
   Unique tokens: 151,489

Community 6051:
   Total tokens: 7,164,056
   Unique tokens: 141,186

Community 3475:
   Total tokens: 4,751,481
   Unique tokens: 89,289

Community 15981:
   Total tokens: 1,253,961
   Unique tokens: 60,521

Community 530:
   Total tokens: 1,292,929
   Unique tokens: 32,133



In [9]:
apriori_tables = {}

for cid in top_6_communities:
    # get df for this community
    df_comm = df_filtered[df_filtered["community"] == cid]

    # use the unique tokens you already computed
    unique_tokens = list(community_token_stats[cid]["unique_tokens"])

    # apriori base table
    df_apriori = pd.DataFrame({
        "word": unique_tokens,
        "integer": range(len(unique_tokens))
    })

    # map word → index
    word_to_int = dict(zip(df_apriori["word"], df_apriori["integer"]))

    # count posts that contain each token
    array_of_counts = np.zeros(len(unique_tokens), dtype=int)

    for tokens in df_comm["tokens"]:
        for token in set(tokens):              # once per post
            array_of_counts[word_to_int[token]] += 1

    df_apriori["count"] = array_of_counts

    # save in dictionary
    apriori_tables[cid] = df_apriori


In [23]:
for cid, df_apriori in apriori_tables.items():
    print(f"\n=== Community {cid} ===")
    print(df_apriori.sort_values(by="count", ascending=False).head(5))
    print(len(df_apriori))





=== Community 5595 ===
         word  integer  count  freq_integer
12691   women    12691  66748            12
82727     get    82727  57474           108
29108  people    29108  57189            36
30372   would    30372  47288            37
38159    even    38159  43302            52
143800

=== Community 8821 ===
         word  integer  count  freq_integer
13340   women    13340  52030             8
87105     get    87105  51672            88
40169    even    40169  39618            42
30611  people    30611  38358            28
31977   would    31977  37459            30
151489

=== Community 6051 ===
         word  integer  count  freq_integer
12293   women    12293  57228            12
81125     get    81125  50986           109
28442  people    28442  44169            36
29738   would    29738  37832            37
37419    even    37419  35930            51
141186

=== Community 3475 ===
         word  integer  count  freq_integer
51284     get    51284  37196           103
774

In [11]:
# dictionary to store frequent-item tables
freq_tables = {}

threshold_factor = 0.01  # 1% of documents (chapter 6, between passes of A-priori)

for cid, df_apriori in apriori_tables.items():

    # threshold is 1% of posts in that community
    threshold = threshold_factor * len(df_filtered[df_filtered["community"] == cid])

    # initialize frequent-item map: old_id -> new_id (0 if not frequent)
    frequent_map = np.zeros(len(df_apriori), dtype=int)
    new_id = 1

    for old_id, count in enumerate(df_apriori['count']):
        if count >= threshold:
            frequent_map[old_id] = new_id
            new_id += 1
        else:
            frequent_map[old_id] = 0

    # add freq_integer column
    df_apriori['freq_integer'] = frequent_map

    # store only frequent items in new dictionary
    df_freq = df_apriori[df_apriori['freq_integer'] != 0].copy()
    freq_tables[cid] = df_freq

    print(f"\nCommunity {cid} frequent items:")
    print(df_freq)



Community 5595 frequent items:
         word  integer  count  freq_integer
874      long      874   9045             1
1915      try     1915  12994             2
4129      bad     4129  13740             3
4431     read     4431   8070             4
7145     laid     7145   7010             5
...       ...      ...    ...           ...
138421   hard   138421   9365           171
139573   look   139573  21564           172
141676  makes   141676  11392           173
142634  chads   142634   7671           174
143537   talk   143537   9282           175

[175 rows x 4 columns]

Community 8821 frequent items:
         word  integer  count  freq_integer
936      long      936   7406             1
2022      try     2022   9843             2
4274      bad     4274  11720             3
8721      one     8721  33064             4
10348   since    10348   7375             5
...       ...      ...    ...           ...
145822   hard   145822   8530           141
147078   look   147078  21036   

In [13]:
from itertools import combinations
from collections import Counter

pair_tables = {}   # store results for each community

for cid in top_6_communities:

    # --- get relevant data for the community ---
    df_comm = df_filtered[df_filtered["community"] == cid]

    # table with only frequent tokens
    df_freq = freq_tables[cid]

    # set of frequent words
    frequent_words_set = set(df_freq["word"])

    # counter for accumulating all pairs
    pair_counter = Counter()

    # --- iterate over all posts in that community ---
    for tokens in df_comm["tokens"]:
        # keep only frequent words in this document
        frequent_tokens = [t for t in tokens if t in frequent_words_set]

        # remove duplicates
        unique_tokens = set(frequent_tokens)

        # generate all 2-item sets
        for pair in combinations(unique_tokens, 2):
            pair_counter[tuple(sorted(pair))] += 1

    # convert to dataframe
    df_pairs = pd.DataFrame(
        pair_counter.items(), columns=["item_set", "count"]
    )

    pair_tables[cid] = df_pairs

    print(f"\nCommunity {cid} – Frequent Item Pairs:")
    print(df_pairs.sort_values(by="count",ascending = False).head())
    print(len(df_pairs))



Community 5595 – Frequent Item Pairs:
             item_set  count
924      (men, women)  18756
398      (get, women)  13002
2347  (people, women)  11776
117     (get, people)  11767
964   (people, think)  10718
15225

Community 8821 – Frequent Item Pairs:
           item_set  count
456    (men, women)  14642
1664   (get, women)   9686
155     (even, get)   8414
1216  (even, women)   8052
651   (get, people)   7747
10440

Community 6051 – Frequent Item Pairs:
             item_set  count
23       (men, women)  16316
465      (get, women)  11768
462     (get, people)   9754
91    (people, women)   9333
1131    (even, women)   8868
15576

Community 3475 – Frequent Item Pairs:
           item_set  count
2920   (men, women)   9654
10     (get, women)   7168
2006    (even, get)   6147
333   (get, people)   5947
4152  (even, women)   5676
14028

Community 15981 – Frequent Item Pairs:
          item_set  count
6     (men, women)   1952
556    (even, get)   1542
2     (get, women)   1448
706 

In [20]:
from itertools import combinations
from collections import Counter
import math

pair_tables = {}   # store results for each community

for cid in top_6_communities:

    print(f"\n--- Community {cid} ---")

    # --- get relevant data for the community ---
    df_comm = df_filtered[df_filtered["community"] == cid]
    N = len(df_comm)

    # table with only frequent tokens
    df_freq = freq_tables[cid]

    # set of frequent words
    frequent_words_set = set(df_freq["word"])

    # counter for accumulating all pairs
    pair_counter = Counter()

    # --- iterate over all posts in that community ---
    for tokens in df_comm["tokens"]:
        # keep only frequent words in this document
        frequent_tokens = [t for t in tokens if t in frequent_words_set]

        # remove duplicates
        unique_tokens = set(frequent_tokens)

        # generate all 2-item sets
        for pair in combinations(unique_tokens, 2):
            pair_counter[tuple(sorted(pair))] += 1

    # convert to dataframe
    df_pairs = pd.DataFrame(pair_counter.items(), columns=["item_set", "count"])

    # --- apply the 1% threshold for pairs ---
    threshold = math.ceil(0.01 * N)
    df_pairs = df_pairs[df_pairs["count"] >= threshold]

    pair_tables[cid] = df_pairs

    print("Top pairs:")
    print(df_pairs.sort_values(by="count", ascending=False).head())
    print("Total surviving pairs:", len(df_pairs))



--- Community 5595 ---
Top pairs:
             item_set  count
924      (men, women)  18756
398      (get, women)  13002
2347  (people, women)  11776
117     (get, people)  11767
964   (people, think)  10718
Total surviving pairs: 38

--- Community 8821 ---
Top pairs:
           item_set  count
456    (men, women)  14642
1664   (get, women)   9686
155     (even, get)   8414
1216  (even, women)   8052
651   (get, people)   7747
Total surviving pairs: 10

--- Community 6051 ---
Top pairs:
             item_set  count
23       (men, women)  16316
465      (get, women)  11768
462     (get, people)   9754
91    (people, women)   9333
1131    (even, women)   8868
Total surviving pairs: 33

--- Community 3475 ---
Top pairs:
           item_set  count
2920   (men, women)   9654
10     (get, women)   7168
2006    (even, get)   6147
333   (get, people)   5947
4152  (even, women)   5676
Total surviving pairs: 22

--- Community 15981 ---
Top pairs:
       item_set  count
6  (men, women)   1952
To

from itertools import combinations
from collections import Counter

pair_tables = {}   # store results for each community

for cid in top_6_communities:

    # --- get relevant data for the community ---
    df_comm = df_filtered[df_filtered["community"] == cid]

    # table with only frequent tokens
    df_freq = freq_tables[cid]

    # set of frequent words
    frequent_words_set = set(df_freq["word"])

    # counter for accumulating all pairs
    pair_counter = Counter()

    # --- iterate over all posts in that community ---
    for tokens in df_comm["tokens"]:
        # keep only frequent words in this document
        frequent_tokens = [t for t in tokens if t in frequent_words_set]

        # remove duplicates
        unique_tokens = set(frequent_tokens)

        # generate all 2-item sets
        for pair in combinations(unique_tokens, 3):
            pair_counter[tuple(sorted(pair))] += 1

    # convert to dataframe
    df_pairs = pd.DataFrame(
        pair_counter.items(), columns=["item_set", "count"]
    )

    pair_tables[cid] = df_pairs

    print(f"\nCommunity {cid} – Frequent Item Pairs:")
    print(df_pairs.sort_values(by="count",ascending = False).head())


# Apriori using library 

In [14]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [17]:
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()

freq_words = set(freq_tables[530]["word"])

df_530_test = [
    [t for t in tokens if t in freq_words]
    for tokens in df_filtered[df_filtered["community"] == 530]["tokens"]
]

te_array = te.fit(df_530_test).transform(df_530_test)
df_encoded = pd.DataFrame(te_array, columns=te.columns_)

In [18]:
print("Documents:", len(df_530_test))
print("Unique tokens:", len(te.columns_))


Documents: 37371
Unique tokens: 552


In [19]:
from mlxtend.frequent_patterns import apriori
frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)
print("Total Frequent Itemsets:", frequent_itemsets.shape[0])

Total Frequent Itemsets: 12939


In [21]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.034385,(able)
1,0.018624,(absolutely)
2,0.017152,(accept)
3,0.012202,(act)
4,0.013219,(actual)
...,...,...
12934,0.010811,"(think, get, want, people, way)"
12935,0.010356,"(think, get, want, people, would)"
12936,0.010463,"(know, think, one, want, people)"
12937,0.010623,"(know, think, one, people, way)"


In [24]:
from mlxtend.frequent_patterns import apriori

frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)
frequent_2_itemsets = frequent_itemsets[
    frequent_itemsets['itemsets'].apply(lambda x: len(x) == 2)
]

print("Frequent 2-itemsets:", len(frequent_2_itemsets))


Frequent 2-itemsets: 4171
