# Frequent items analysis of reddit communities

In [101]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import networkx as nx
import nltk 
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
import html
from itertools import combinations
from collections import Counter
import math
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/livdreyerjohansen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Dataload and cleaning

To analysize the what frequent items we may see in the reddit communities found in 03_NetworkAnalysis.ipynb, we must first load the graph with the added attributes, that tell what community each node belongs in and the posts created by each node. We have chosen to only look at the 6 largest communities, by number of nodes, as the distribution of nodes / community is very heavly right skewed.



In [102]:
#load graph

G = nx.read_gml('FINAL_reddit_graph_with_louvain_communities.gml')

To ensure we do not include posts that are either deleted ("\[deleted\]") or removed ("\[removed\]"), both basic reddit features that happen independently of what forum you are in, we remove both. Furthermore, we remove each post that was removed by a bot, which is clear in the text which the bot uses to explain why a post or comment is deleted. We then construct a dataframe with all posts and their community (and original poster (OP in reddit linguistics)).

In [103]:
rows = []
rows_count = 0
allowed_rows = 0

for node, data in G.nodes(data=True):
    community = data.get("community")
    posts_dict = data.get("posts", {})

    # Ensure it's a dictionary
    if not isinstance(posts_dict, dict):
        posts_dict = {"default": posts_dict}

    # Loop through each list of posts in the dictionary
    for key, posts in posts_dict.items():
        if not isinstance(posts, list):
            posts = [posts]

        for post in posts:
            rows_count += 1
            # Skip empty, deleted/removed posts, or posts containing the bot line
            if post and post not in ['[deleted]', '[removed]'] and \
               "*I am a bot, and this action was performed automatically." not in post:
                allowed_rows += 1
                rows.append({
                    "id": node,
                    "text": post,
                    "community": community
                })

df = pd.DataFrame(rows)

print(df.head())

print("\nNumber of original posts:")
print(f"{rows_count:,}")
print("Number of removed posts:")
print(f"{rows_count-allowed_rows:,}")
print("Number of posts in dataframe:")
print(f"{allowed_rows:,}")

  id                                               text  community
0  1  "Huh it's still not legalized yet. America is ...        130
1  1  "Hey charisma helps. Everybody wants to sleep ...        130
2  1  Aren't the jedis not really good guys though? ...        130
3  1  Wait but ferb is the better looking one with a...        130
4  1  Great now you live with a hole in your head fo...        130

Number of original posts:
2,664,156
Number of removed posts:
158,251
Number of posts in dataframe:
2,505,905


We identify the top-6 largest communities in terms of nodes to continue working with only them.

In [104]:
communities = df['community'].unique().tolist()
communities_dict = dict.fromkeys(communities, 0)

for index, row in df.iterrows():
    communities_dict[row['community']] += 1

Filter the dataframe to only contain posts from top-6 communities

In [105]:
comms_list = list(sorted( ((v,k) for k,v in communities_dict.items()), reverse=True))
comms_list = comms_list[:6]
top_6_communities = [item[1] for item in comms_list]

df_filtered = df[df['community'].isin(top_6_communities)].copy()

print("\nTop 6 communities by number of posts (posts, community):", comms_list)

print(f"Number of posts in top 6 communities: {len(df_filtered):,}")

print(f"Number of posts removed based on non identity in top 6: {(len(df) - len(df_filtered)):,}")



Top 6 communities by number of posts (posts, community): [(644383, 130), (635828, 89), (630172, 129), (399843, 191), (154109, 220), (38490, 188)]
Number of posts in top 6 communities: 2,502,825
Number of posts removed based on non identity in top 6: 3,080


## Stop words

We filter out parts of post that we deem have little semantic value. We aim to find frequent items and frequent itemsets (item pairs), and would assume that stop words regularly occur in more than 1% of baskets. As we are working with online fora, we chose to add certain slang-terms as stop words. We furthermore remove:

- html entities
- URL's
- non-text artifacts (such as "/", "?", "!" etc.)
- remaining "removed" and "deleted" artifacts that were not removed in the previous code block due to the way the post was loaded
- short words (length of 2 or less)

Additionally, we make all words lowercase to steamline and tokenize by word (meaning each word will be its own token)

In [106]:
# tokenize and clean text data

stop_words = set(stopwords.words('english'))

# extend basic english stopwords with slang terms
extra_stops = {
    'lol', 'xd', 'haha', 'hahaah', 'omg', 'u', 'ur', 'im', 'ive', 'idk', 
    'dont', 'cant', 'wont', 'aint', 'ya', 'tho', 'tho', 'nah', 'btw', 
    'like', 'yeah', 'yep', 'ok', 'okay', 'pls', 'please', 'get'
}
stop_words.update(extra_stops)

def preprocess_text(text):
    if not isinstance(text, str):
        return []
    # decode HTML entities: &amp; → &, &#x200B; → zero-width space, etc.
    text = html.unescape(text)
    # lowercase
    text = text.lower()
    # remove URLs
    text = re.sub(r"http\S+|www\S+", " ", text)
    # keep only letters and spaces
    text = re.sub(r"[^a-z\s]", " ", text)
    # tokenize by whitespace
    tokens = text.split()
    # remove stopwords and very short tokens
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    if len(tokens) == 1 and tokens[0] in {"removed", "deleted"}:
        return []
    return tokens

df_filtered["tokens"] = df_filtered["text"].apply(preprocess_text)
df_filtered["n_tokens"] = df_filtered["tokens"].apply(len)

df_filtered.head()

Unnamed: 0,id,text,community,tokens,n_tokens
0,1,"""Huh it's still not legalized yet. America is ...",130,"[huh, still, legalized, yet, america, weirdly,...",7
1,1,"""Hey charisma helps. Everybody wants to sleep ...",130,"[hey, charisma, helps, everybody, wants, sleep...",11
2,1,Aren't the jedis not really good guys though? ...,130,"[jedis, really, good, guys, though, protect, s...",24
3,1,Wait but ferb is the better looking one with a...,130,"[wait, ferb, better, looking, one, actual, gam...",13
4,1,Great now you live with a hole in your head fo...,130,"[great, live, hole, head, eternity]",5


# Frequent items and the A-priori algorithm

We process the tokenized posts by identifying the amount of unique tokens for each community.

In [107]:
# build token statistics for each of the top 6 communities

community_token_stats = {}

for community_id in top_6_communities:
    df_comm = df_filtered[df_filtered["community"] == community_id]

    # flatten all tokens for this community
    all_tokens = []
    for tokens in df_comm["tokens"]:
        all_tokens.extend(tokens)

    unique_tokens = set(all_tokens)

    community_token_stats[community_id] = {
        "n_tokens": len(all_tokens),
        "n_unique_tokens": len(unique_tokens),       
        "unique_tokens": unique_tokens 
    }


for cid in top_6_communities:
    print(f"Community {cid}:")
    print(f"  Total tokens: {community_token_stats[cid]['n_tokens']:,}")
    print(f"  Unique tokens: {community_token_stats[cid]['n_unique_tokens']:,}")

Community 130:
  Total tokens: 8,159,749
  Unique tokens: 140,319
Community 89:
  Total tokens: 7,721,869
  Unique tokens: 149,731
Community 129:
  Total tokens: 6,700,663
  Unique tokens: 146,730
Community 191:
  Total tokens: 4,709,952
  Unique tokens: 89,034
Community 220:
  Total tokens: 1,226,959
  Unique tokens: 59,693
Community 188:
  Total tokens: 1,305,336
  Unique tokens: 33,100


## First pass of the A-priori algorithm

In the first pass of the A-priori algoritm, we initialize a dataframe for each of the communities. In this dataframe, we will store each of the unique tokens found previously, assign them each an integer from 0 to n-1 (number of unique tokens), and count how many baskets (posts) the item (token) appears in. It is important to note that we do not count the total occurrence of the token but only the amount of posts it appears in. In Mining of Massive Datasets, Section 6.2.2, the first pass is described as labeling integers 1 to n, but to keep it within the python framework, we label 0 to n-1 as mentioned.

In [108]:
apriori_tables = {}

for cid in top_6_communities:
    # get df for this community
    df_comm = df_filtered[df_filtered["community"] == cid]
    unique_tokens = list(community_token_stats[cid]["unique_tokens"])

    # apriori table
    df_apriori = pd.DataFrame({
        "word": unique_tokens,
        "integer": range(len(unique_tokens))
    })

    # give each word an integer from 0 to n-1
    word_to_int = dict(zip(df_apriori["word"], df_apriori["integer"]))

    # count posts that contain each token
    array_of_counts = np.zeros(len(unique_tokens), dtype=int)

    for tokens in df_comm["tokens"]:
        for token in set(tokens):             
            array_of_counts[word_to_int[token]] += 1

    df_apriori["count"] = array_of_counts

    # save in dict
    apriori_tables[cid] = df_apriori


In [109]:
# print top 5 tokens by count for each community
for cid, df_apriori in apriori_tables.items():
    print(f"\nCommunity {cid}")
    print(df_apriori.sort_values(by="count", ascending=False).head(5))


Community 130
          word  integer  count
44223    women    44223  64054
134753  people   134753  54841
137669   would   137669  45478
98666     even    98666  41610
108766     one   108766  40846

Community 89
          word  integer  count
46904    women    46904  61740
143858  people   143858  47917
146995   would   146995  41218
105327    even   105327  39135
116017     one   116017  38397

Community 129
          word  integer  count
46114    women    46114  49985
102972    even   102972  37953
140852  people   140852  36781
144012   would   144012  35755
113479     one   113479  31571

Community 191
         word  integer  count
28002   women    28002  36120
87398   would    87398  28225
85469  people    85469  27657
62612    even    62612  27309
68953     one    68953  23836

Community 220
         word  integer  count
37702   women    37702   7367
24451    even    24451   6899
57473   would    57473   5976
54903  people    54903   5766
32958     one    32958   5658

Communi

## Between the passes of A-priori

We create frequency tables where we assign each word an integer from 1-m, where m = number of frequent singletons (words), if the support of the word => 1%. In other words, it must appear in 1% or more of the baskets. If the word is not frequent, we assign it 0.

In [110]:

freq_tables = {}

threshold_factor = 0.01  

for cid, df_apriori in apriori_tables.items():

    # threshold is 1% of posts in that community
    threshold = threshold_factor * len(df_filtered[df_filtered["community"] == cid])

    frequent_map = np.zeros(len(df_apriori), dtype=int)
    new_id = 1

    for old_id, count in enumerate(df_apriori['count']):
        if count >= threshold:
            frequent_map[old_id] = new_id
            new_id += 1
        else:
            frequent_map[old_id] = 0

    # add freq_integer column
    df_apriori['freq_integer'] = frequent_map

    # store only frequent items in new dictionary
    df_freq = df_apriori[df_apriori['freq_integer'] != 0].copy()
    freq_tables[cid] = df_freq

    print(f"\nCommunity {cid} frequent items:")
    print(df_freq)



Community 130 frequent items:
              word  integer  count  freq_integer
159          place      159   7391             1
600           used      600   6884             2
1597           end     1597   6460             3
3313    attractive     3313  13017             4
3731         looks     3731  18848             5
...            ...      ...    ...           ...
136958        also   136958  19237           172
137669       would   137669  45478           173
137780         sub   137780  16094           174
139254         guy   139254  21330           175
139806        shit   139806  24811           176

[176 rows x 4 columns]

Community 89 frequent items:
              word  integer  count  freq_integer
167          place      167   6568             1
626           used      626   6731             2
1762           end     1762   6531             3
3531    attractive     3531  11692             4
3980         looks     3980  18198             5
...            ...      ...    ..

## Second pass of the A-priori algorithm

For the second pass, we first find all pairs of frequent words from the previous dataframes. We then create pairs of those, making sure to remove duplicates. We apply the support threshold of 1% here as well.

In [111]:
pair_tables = {}   # store results for each community

for cid in top_6_communities:

    print(f"\nCommunity {cid}")

    # pull posts for this community
    df_comm = df_filtered[df_filtered["community"] == cid]
    N = len(df_comm)

    # fetch frequent 1-itemset for this community
    df_freq = freq_tables[cid]
    frequent_words_set = set(df_freq["word"])

    # counter for all frequent pairs
    pair_counter = Counter()

    # iterate over all posts
    for tokens in df_comm["tokens"]:
        # keep only frequent tokens
        frequent_tokens = [t for t in tokens if t in frequent_words_set]

        # deduplicate within a post
        unique_tokens = set(frequent_tokens)

        # count each 2-item combination in this post
        for pair in combinations(unique_tokens, 2):
            pair_counter[tuple(sorted(pair))] += 1

    # convert counter → dataframe
    df_pairs = pd.DataFrame(pair_counter.items(), columns=["item_set", "count"])

    # threshold for frequent 2-itemsets (1% of posts)
    threshold = math.ceil(0.01 * N)
    df_pairs = df_pairs[df_pairs["count"] >= threshold]

    # store
    pair_tables[cid] = df_pairs

    # print summary
    print("Top pairs:")
    print(df_pairs.sort_values(by="count", ascending=False).head())
    print("Total pairs where Support(I) => s of 0.01:", len(df_pairs))


Community 130
Top pairs:
             item_set  count
881      (men, women)  17992
2287  (people, women)  11308
919   (people, think)  10248
713    (women, would)  10004
1307   (think, women)   9892
Total pairs where Support(I) => s of 0.01: 30

Community 89
Top pairs:
             item_set  count
35       (men, women)  17505
127   (people, women)   9955
1066    (even, women)   9487
223     (want, women)   9398
771    (women, would)   9290
Total pairs where Support(I) => s of 0.01: 21

Community 129
Top pairs:
             item_set  count
505      (men, women)  14147
1134    (even, women)   7761
2885   (women, would)   6814
385   (people, women)   6649
1208    (want, women)   6643
Total pairs where Support(I) => s of 0.01: 7

Community 191
Top pairs:
             item_set  count
2783     (men, women)   9671
4021    (even, women)   5711
6289   (women, would)   5197
2827   (think, women)   4878
30    (people, think)   4870
Total pairs where Support(I) => s of 0.01: 14

Community 220
Top

To gain knowledge into the context of the frequent item pairs, we count the occurance of words around our top frequent item pair (men, women). We go through each post that contain both items, identify the tokens close (window of 5 on each side if possible) to the items, count the occurance of a token within the window of our item and sort them. We find that there is a difference in words between the communities even though we did not see one when looking only at the frequency pairs. This exercise (or in future work, and extenstion of this exercise) is furthermore important to obtain a sense of the context of the words. It is not possible from the frequent items analysis done previously to find context clues. 

In [None]:
WINDOW = 5

context_results = {}

for cid in top_6_communities:

    print(f"\n=== Community {cid} ===")

    df_comm = df_filtered[df_filtered["community"] == cid]
    N = len(df_comm)

    # context counters
    context_women = Counter()
    context_men = Counter()

    valid_posts = 0

    for tokens in df_comm["tokens"]:
        if "women" in tokens and "men" in tokens:
            valid_posts += 1

            # find all positions of each word
            women_positions = [i for i, t in enumerate(tokens) if t == "women"]
            men_positions =    [i for i, t in enumerate(tokens) if t == "men"]

            # collect window contexts
            for pos in women_positions:
                start = max(0, pos - WINDOW)
                end   = pos + WINDOW + 1
                local = set(tokens[start:end]) - {"women", "men"}
                for w in local:
                    context_women[w] += 1

            for pos in men_positions:
                start = max(0, pos - WINDOW)
                end   = pos + WINDOW + 1
                local = set(tokens[start:end]) - {"women", "men"}
                for w in local:
                    context_men[w] += 1

    # to DataFrames
    df_w = (
        pd.DataFrame.from_dict(context_women, orient="index", columns=["count"])
        .sort_values("count", ascending=False)
    )
    df_m = (
        pd.DataFrame.from_dict(context_men, orient="index", columns=["count"])
        .sort_values("count", ascending=False)
    )

    print(f"Posts containing both: {valid_posts}")
    print("\nTop window-context words for 'women':")
    print(df_w.head())

    print("\nTop window-context words for 'men':")
    print(df_m.head())

    context_results[cid] = {"women": df_w, "men": df_m}



=== Community 130 ===
Posts containing both: 17992

Top window-context words for 'women':
            count
want         2614
sex          2192
think        2088
would        1961
attractive   1859

Top window-context words for 'men':
            count
want         1871
sex          1825
attractive   1735
think        1518
ugly         1518

=== Community 89 ===
Posts containing both: 17505

Top window-context words for 'women':
       count
want    2448
sex     1953
think   1942
even    1869
would   1819

Top window-context words for 'men':
            count
want         1752
sex          1606
would        1464
even         1448
attractive   1417

=== Community 129 ===
Posts containing both: 14147

Top window-context words for 'women':
       count
want    1740
even    1456
white   1443
think   1366
would   1346

Top window-context words for 'men':
       count
want    1305
white   1293
even    1185
would   1137
sex     1102

=== Community 191 ===
Posts containing both: 9671

Top win

# A-priori using library

To further validate the above results, we also implemented the A-priori algorithm using mlxtend. We find that the results of using the mlxtend framwork are congruent with the results found by implementing the A-priori algorithm as described in Mining of Massive Datasets.

In [113]:
te = TransactionEncoder()
encoded_tables = {} 

for cid in top_6_communities:
    print(f"\nCommunity {cid}")
    
    freq_words = set(freq_tables[cid]["word"])
    
    df_comm = df_filtered[df_filtered["community"] == cid]
    transactions = [
        [t for t in tokens if t in freq_words]
        for tokens in df_comm["tokens"]
    ]

    te_array = te.fit(transactions).transform(transactions)
    df_encoded = pd.DataFrame(te_array, columns=te.columns_)
    
    encoded_tables[cid] = df_encoded
    
    print(f"Number of frequent items (singletons) with Support(I) => s of 0.01: {df_encoded.shape[1]}")



Community 130
Number of frequent items (singletons) with Support(I) => s of 0.01: 176

Community 89
Number of frequent items (singletons) with Support(I) => s of 0.01: 170

Community 129
Number of frequent items (singletons) with Support(I) => s of 0.01: 144

Community 191
Number of frequent items (singletons) with Support(I) => s of 0.01: 167

Community 220
Number of frequent items (singletons) with Support(I) => s of 0.01: 93

Community 188
Number of frequent items (singletons) with Support(I) => s of 0.01: 542


In [114]:

frequent_2_itemsets_by_community = {}

for cid in top_6_communities:
    print(f"\nCommunity {cid}")
    
    df_encoded = encoded_tables[cid] 
    
    frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)
    
    # filter to only 2-itemsets
    frequent_2_itemsets = frequent_itemsets[
        frequent_itemsets['itemsets'].apply(lambda x: len(x) == 2)
    ].copy()
    
    frequent_2_itemsets_by_community[cid] = frequent_2_itemsets
    
    print("Number of frequent 2-itemsets:", len(frequent_2_itemsets))
    print("Top 5 pairs:")
    print(frequent_2_itemsets.sort_values(by="support", ascending=False).head())



Community 130
Number of frequent 2-itemsets: 30
Top 5 pairs:
      support         itemsets
190  0.027921     (women, men)
197  0.017549  (people, women)
195  0.015904  (people, think)
205  0.015525   (would, women)
200  0.015351   (women, think)

Community 89
Number of frequent 2-itemsets: 21
Top 5 pairs:
      support         itemsets
179  0.027531     (women, men)
184  0.015657  (people, women)
173  0.014921    (women, even)
189  0.014781    (women, want)
190  0.014611   (would, women)

Community 129
Number of frequent 2-itemsets: 7
Top 5 pairs:
      support         itemsets
146  0.022449     (women, men)
145  0.012316    (even, women)
150  0.010813   (would, women)
147  0.010551  (people, women)
149  0.010542    (women, want)

Community 191
Number of frequent 2-itemsets: 14
Top 5 pairs:
      support         itemsets
171  0.024187     (women, men)
168  0.014283    (even, women)
180  0.012998   (would, women)
177  0.012200   (women, think)
174  0.012180  (people, think)

Community