# Investigating community language using wordclouds
We move on to analyse the communities based on the words that are used in their posts.



In [None]:
#Import relevant libraries
import json
import networkx as nx
from netwulf import visualize, draw_netwulf
import netwulf as nw
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ast
from itertools import product
from collections import Counter
import networkx as nx
import random
from wordcloud import WordCloud
from tqdm import tqdm
from joblib import Parallel, delayed
import warnings  
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from itertools import product
from itertools import product
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore", category=FutureWarning)

## Textual Cleaning

In [None]:
# Location of the community-annotated Reddit graph (GML format)
GRAPH_PATH = "data/graphs/reddit_graph_with_communities.gml"

# label=None: per the networkx read_gml documentation, nodes are then not renamed
# by their "label" attribute, so "label" remains available as a node attribute.
G_All_communities = nx.read_gml(GRAPH_PATH, label=None)

In [None]:
# Tabulate the node attributes: one row per node, with the node key exposed as "node_id"
node_attributes = dict(G_All_communities.nodes(data=True))
G_text_df = (
    pd.DataFrame.from_dict(node_attributes, orient='index')
    .reset_index()
    .rename(columns={'index': 'node_id'})
)
G_text_df.head()

We apply the following cleaning steps and consistency checks:
- Empty or placeholder entries – Many posts contain only markers like "[removed]", "[deleted]", or empty strings, which are excluded to avoid meaningless text in the analysis.
- Non-text content – Posts that originally included only media, links, or formatting tags are stripped out during cleaning.
- Data consistency issues – The n_posts field in the graph sometimes includes deleted or missing posts that were never stored under the posts attribute, so we compare the declared count with the number of posts that are actually stored.

In [None]:

DROP_TOKENS = {"", "[removed]", "[deleted]"}  # extend if needed


def flatten_posts(posts_dict):
    """Merge all posts of one user into a single cleaned text.

    Parameters
    ----------
    posts_dict : dict
        Mapping of subreddit name -> list of posts, e.g.
        ``{"Incel": [p1, p2, ...], "Braincels": [p1, ...]}``.
        Anything that is not a dict is treated as "no posts"; a value that
        is not a list contributes zero posts for that subreddit.

    Returns
    -------
    dict
        ``text``           – kept posts, stripped and joined by single spaces
        ``raw_count``      – number of posts before cleaning
        ``kept_count``     – number of posts that survived cleaning
        ``per_sub_counts`` – raw post count per subreddit
    """
    raw = []
    per_sub_counts = {}
    if isinstance(posts_dict, dict):
        for sub, lst in posts_dict.items():
            lst = lst if isinstance(lst, list) else []
            per_sub_counts[sub] = len(lst)
            raw.extend(lst)

    cleaned = []
    for post in raw:
        # Missing values would otherwise be stringified to "None" / "nan"
        # and leak into the corpus as if they were real words.
        if post is None or (isinstance(post, float) and post != post):
            continue
        body = str(post).strip()
        if body not in DROP_TOKENS:
            cleaned.append(body)

    return {
        "text": " ".join(cleaned),
        "raw_count": len(raw),
        "kept_count": len(cleaned),
        "per_sub_counts": per_sub_counts,
    }

# Pair every node with the flattened view of its posts (lazy: evaluated while building rows)
node_infos = (
    (node, attrs, flatten_posts(attrs.get("posts", {})))
    for node, attrs in G_All_communities.nodes(data=True)
)

# One record per node: graph metadata, the cleaned text and the post counts
rows = [
    {
        "node_id": node,
        "label": attrs.get("label"),
        "community": attrs.get("community"),
        "subreddit_origin": attrs.get("subreddit_origin"),
        "n_posts_declared": attrs.get("n_posts"),
        "text": info["text"],
        "raw_count": info["raw_count"],
        "kept_count": info["kept_count"],
        "per_sub_counts": info["per_sub_counts"],
        "text_charlen": len(info["text"]),
    }
    for node, attrs, info in node_infos
]

df_nodes = pd.DataFrame(rows)
df_nodes.head()

In [None]:
# Mismatch diagnostics: compare the declared post count with what is actually stored
declared = df_nodes["n_posts_declared"]
df_nodes["raw_vs_declared_diff"] = df_nodes["raw_count"] - declared
df_nodes["kept_vs_declared_ratio"] = df_nodes["kept_count"] / declared

# A node is suspicious when it declares a post count and either the stored count
# differs from it or less than 90% of the declared posts survived cleaning.
has_declared = declared.notna()
count_mismatch = df_nodes["raw_vs_declared_diff"].ne(0)
heavy_loss = df_nodes["kept_vs_declared_ratio"].lt(0.90)  # tune threshold

suspect_columns = [
    "node_id",
    "n_posts_declared",
    "raw_count",
    "kept_count",
    "raw_vs_declared_diff",
    "kept_vs_declared_ratio",
]
suspect = df_nodes.loc[has_declared & (count_mismatch | heavy_loss), suspect_columns]

suspect.head(20)


In [None]:
# Size of each user's concatenated text, in characters and whitespace-separated words
df_nodes["n_chars"] = df_nodes["text"].str.len()
df_nodes["n_words"] = df_nodes["text"].apply(lambda x: len(x.split()))

min_words = int(df_nodes["n_words"].min())
max_words = int(df_nodes["n_words"].max())

summary = {
    "Total users": int(len(df_nodes)),
    "Total words": int(df_nodes["n_words"].sum()),
    "Average words per user": float(df_nodes["n_words"].mean()),
    "Median words per user": float(df_nodes["n_words"].median()),
    "Average characters per user": float(df_nodes["n_chars"].mean()),
    "Median characters per user": float(df_nodes["n_chars"].median()),
    # Each row holds all posts of one user joined together, so these extremes
    # are per user, not per individual post.
    "Min/Max words per user": (min_words, max_words),
}

pd.Series(summary)



In [None]:

# Word counts as floats, restricted to finite values
vals = df_nodes["n_words"].to_numpy(dtype=float)
vals = vals[np.isfinite(vals)]

# A log axis cannot show zero, so users without any words are left out
positive = vals[vals > 0]
# 60 logarithmically spaced bin edges from 1 up to the largest word count
bins = np.logspace(0, np.log10(positive.max()), 60)

fig, ax = plt.subplots(figsize=(5, 3))  # smaller figure
ax.hist(positive, bins=bins, color="#4c72b0", alpha=0.8, edgecolor="none")
ax.set_xscale("log")
ax.set_xlabel("Words per user (log scale)", fontsize=9)
ax.set_ylabel("User count", fontsize=9)
ax.set_title("Distribution of words per user", fontsize=10)
fig.tight_layout(pad=0.5)
plt.show()

The dataset shows realistic Reddit-like activity: around 100k users and 65M words.
Most users post very little (median = 31 words), while a few contribute heavily (avg ≈ 650 words, max ≈ 710k).
The histogram confirms a strong right-skew, typical of online forums where a small group generates most content.

In [None]:


# Make sure the NLTK stopword corpus is available (download once, if not already done)
nltk.download('stopwords')

# Custom slang / internet terms to drop on top of the standard list
extra_stops = {
    'lol', 'xd', 'haha', 'hahaah', 'omg', 'u', 'ur', 'im', 'ive', 'idk',
    'dont', 'cant', 'wont', 'aint', 'ya', 'tho', 'nah', 'btw',
    'like', 'yeah', 'yep', 'ok', 'okay', 'pls', 'please',
}

# Final stopword set: base English stopwords from NLTK plus the custom extension
stop_words = set(stopwords.words('english')) | extra_stops

def preprocess_text(text):
    """Turn raw text into a list of cleaned, lowercase word tokens.

    The text is lowercased, URL-like chunks (starting with ``http`` or
    ``www``) are blanked out, every character other than ``a-z`` and
    whitespace is replaced by a space, and the result is split on
    whitespace. Tokens found in the module-level ``stop_words`` set and
    tokens of two characters or fewer are discarded.

    Returns an empty list when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return []

    lowered = text.lower()
    without_urls = re.sub(r"http\S+|www\S+", " ", lowered)
    letters_only = re.sub(r"[^a-z\s]", " ", without_urls)

    kept = []
    for word in letters_only.split():
        if len(word) > 2 and word not in stop_words:
            kept.append(word)
    return kept

# Tokenise every user's text and record how many tokens survive the cleaning
df_nodes["tokens"] = df_nodes["text"].map(preprocess_text)
df_nodes["n_tokens"] = df_nodes["tokens"].map(len)

# Preview
df_nodes.head()


## Frequency-Rank-Plot

We start by checking whether the token frequencies in our corpus of Reddit posts follow Zipf's law. This linguistic law states that, when tokens are sorted by decreasing frequency, the frequency of the n-th token is inversely proportional to its rank n. In other words, the most common token in the corpus should occur about twice as often as the second most common one, three times as often as the third most common one, and so on. We check whether this holds by plotting the observed frequency of each token against its rank, together with the ideal Zipf curve for comparison:

In [None]:

# Pool the tokens of every user into one corpus-wide list
all_tokens = []
for user_tokens in df_nodes["tokens"]:
    all_tokens.extend(user_tokens)

# Corpus-wide frequency of each token
word_counts = Counter(all_tokens)

# Frequencies in descending order, paired with their 1-based rank
sorted_counts = np.array([count for _, count in word_counts.most_common()])
ranks = np.arange(1, sorted_counts.size + 1)

print(f"Total unique tokens: {len(word_counts):,}")
print(f"Most common words:\n{word_counts.most_common(10)}")


In [None]:

def plot_frequency_rank(tokens, top_n=5000):
    """
    Plot a Frequency–Rank (Zipf's Law) curve for a list of tokens.

    The observed frequencies of the ``top_n`` most frequent tokens are drawn
    on log–log axes together with a reference line ``f(1) / rank``, where
    ``f(1)`` is the frequency of the most frequent token.

    Parameters
    ----------
    tokens : list of str
        All tokens from your corpus.
    top_n : int
        Number of most frequent words to include in the plot.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. If there is nothing to plot
        (no tokens, or ``top_n`` selects no words) a warning is issued and no
        figure is created.
    """
    # Count words
    most_common = Counter(tokens).most_common(top_n)
    if not most_common:
        # Without this guard freqs[0] below fails with an opaque IndexError.
        warnings.warn("plot_frequency_rank: no tokens to plot; skipping figure.")
        return

    freqs = np.array([f for _, f in most_common])
    ranks = np.arange(1, len(freqs) + 1)

    # Plot
    plt.figure(figsize=(9, 6))
    plt.plot(ranks, freqs, marker='o', markersize=3, linestyle='-', label='Observed frequencies')

    # Add Zipf's Law reference line (theoretical expectation)
    constant = freqs[0]  # most frequent word
    zipf_line = constant / ranks
    plt.plot(ranks, zipf_line, linestyle='--', color='red', label="Zipf's Law (1/r)")

    # Log–log axes
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel('Rank (log scale)')
    plt.ylabel('Frequency (log scale)')
    # Report the number of words actually plotted; the vocabulary can be smaller than top_n
    plt.title(f'Frequency–Rank Plot (Top {len(freqs):,} words)')
    plt.grid(True, which='both', alpha=0.4)
    plt.legend()
    plt.tight_layout()
    plt.show()

# --- Run it on your corpus ---
# Rebuild the corpus-wide token list so this cell does not depend on the previous one
all_tokens = []
for user_tokens in df_nodes["tokens"]:
    all_tokens.extend(user_tokens)

plot_frequency_rank(all_tokens, top_n=5000)

The curve mostly follows Zipf’s law, but it bends away from the ideal red line, which is normal for real online text.

This happens because:
- People on Reddit might often repeat slang, memes, or usernames, making some words more frequent.
- The text might focus on a few main topics, so certain words appear much more often than in general language.
- Cleaning and removing stopwords change how rare words appear at the tail.

## TF-IDF scores and Wordclouds

Term Frequency–Inverse Document Frequency (TF-IDF) is a measure that highlights how important a word is within one document (or community) compared to the entire corpus.
- Term Frequency (TF): how often a word appears in a document.
- Inverse Document Frequency (IDF): how rare that word is across all documents.
- TF-IDF = TF × IDF: high when a word is frequent in one document but rare overall.

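In this context:
- In the manual computation below, each user's cleaned text is one "document": term frequencies are pooled over a community and document frequencies count how many users in that community use a term.
- In the scikit-learn comparison further down, each community is treated as a "document."
- With communities as documents, TF-IDF identifies keywords that are characteristic for each community, i.e. terms that distinguish that community's language use from others.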

In [None]:
# One text per community: every user's tokens are joined with spaces,
# then the per-user strings are joined with spaces as well.
community_texts = (
    df_nodes.groupby("community")["tokens"]
    .apply(lambda user_token_lists: " ".join(map(" ".join, user_token_lists)))
    .reset_index(name="clean_text")
)
community_texts.head()


In [None]:
# Quick overview: number of communities, number of users, and the ten largest communities
community_sizes = df_nodes['community'].value_counts()
n_communities = df_nodes['community'].nunique()
n_users = len(df_nodes)

print("Unique communities:", n_communities)
print("Total users:", n_users)
print(community_sizes.head(10))

In [None]:
# Work on one consistent grouping so token lists and user counts describe the same rows
valid_nodes = df_nodes.dropna(subset=['tokens', 'community'])
nodes_by_community = valid_nodes.groupby('community', observed=True)

# All tokens of a community, pooled over its users
community_tokens = (
    nodes_by_community['tokens']
    .apply(lambda groups: [tok for sublist in groups for tok in sublist])
    .reset_index()
)

community_tokens['n_tokens'] = community_tokens['tokens'].apply(len)

# Number of users per community, looked up by community key rather than by row position,
# so the counts cannot be attached to the wrong community.
docs_per_community = nodes_by_community['node_id'].count()
community_tokens['n_docs'] = (
    docs_per_community.reindex(community_tokens['community']).to_numpy()
)

community_tokens.head()

In [None]:
# Report token and user counts for the 10 communities with the most tokens
largest_by_tokens = community_tokens.sort_values(by='n_tokens', ascending=False).head(10)
for community, n_tokens, n_docs in zip(
    largest_by_tokens['community'],
    largest_by_tokens['n_tokens'],
    largest_by_tokens['n_docs'],
):
    print(f"Community {community} has {n_tokens} tokens across {n_docs} users")



In [None]:


# Restrict the analysis to the nine communities with the most users
Top_9_communities = list(df_nodes['community'].value_counts().nlargest(9).index)

TF_IDF = {}      # community -> {token: TF-IDF score}
TF = {}          # community -> {token: share of the community's tokens}
top_users = {}   # top 3 users by token count

for c in Top_9_communities:
    # Documents in this community (each doc = one user's token list)
    docs_c = df_nodes.loc[df_nodes["community"] == c, ["node_id", "tokens"]].dropna(subset=["tokens"])
    N_docs = len(docs_c)
    if N_docs == 0:
        continue

    # --- 1) Compute per-community term and document frequencies ---
    token_doc_frequency = Counter()   # number of users that use the token at least once
    token_term_frequency = Counter()  # total occurrences of the token in the community

    for toks in docs_c["tokens"]:
        token_term_frequency.update(toks)
        token_doc_frequency.update(set(toks))

    # --- 2) Compute TF, IDF, and TF-IDF ---
    # sum(...values()) instead of Counter.total(), which only exists on Python 3.10+
    total_tokens = sum(token_term_frequency.values())
    TF[c] = {tok: freq / total_tokens for tok, freq in token_term_frequency.items()}

    TF_IDF[c] = {}
    for tok, doc_freq in token_doc_frequency.items():
        # Smoothed IDF over the users of this community: log((1 + N) / (1 + df)) + 1
        idf = np.log((1 + N_docs) / (1 + doc_freq)) + 1.0
        TF_IDF[c][tok] = TF[c].get(tok, 0.0) * idf

    # --- 3) Print top TF-IDF terms ---
    top_TF_IDF = sorted(TF_IDF[c].items(), key=lambda x: x[1], reverse=True)[:20]
    print("\n-----------------------------------------------------------------------")
    print(f"Community {c} — top 20 TF-IDF terms")
    print("-----------------------------------------------------------------------")
    for token, score in top_TF_IDF:
        print(f"{token}\t{score:.4f}")

    # --- 4) Top 3 users by token count ---
    users_sorted = (
        docs_c.assign(n_tokens=lambda d: d["tokens"].apply(len))
              .sort_values("n_tokens", ascending=False)
              .loc[:, ["node_id", "n_tokens"]]
              .head(3)
    )

    # Store node ids as strings for clean printing and plot titles
    top_users[c] = [str(uid) for uid in users_sorted["node_id"].values]

    print("-----------------------------------------------------------------------")
    print(f"Top 3 high-volume users (by token count) in community {c}:")
    print("-----------------------------------------------------------------------")
    for uid, tokcount in zip(top_users[c], users_sorted["n_tokens"]):
        print(f"User ID: {uid} — tokens: {tokcount}")


In [None]:


# Build top TF-IDF word lists and corresponding top users for display
top_tfidf_words = {}
Top_com_users = {}

for c in Top_9_communities:
    # A community may have no scores if it was skipped during the TF-IDF computation
    scores = TF_IDF.get(c, {})
    # all words of this community, ordered from highest to lowest TF-IDF score
    top_tfidf_words[c] = [
        word for word, score in sorted(scores.items(), key=lambda x: x[1], reverse=True)
    ]
    # top 3 users (already computed earlier)
    Top_com_users[c] = top_users.get(c, [])

# --- 3x3 grid of subplots ---
fig, axs = plt.subplots(3, 3, figsize=(16, 16))
axs = axs.flatten()

for ax, c in zip(axs, Top_9_communities):
    frequencies = TF_IDF.get(c, {})
    if not frequencies:
        # WordCloud cannot be built from an empty frequency table; leave the panel blank
        ax.axis('off')
        continue

    # generate word cloud from TF-IDF frequencies
    wordcloud = WordCloud(
        width=400,
        height=400,
        background_color='white',
        colormap='viridis'
    ).generate_from_frequencies(frequencies)

    ax.imshow(wordcloud, interpolation='bilinear')
    # format top 3 users nicely for the title
    user_lines = "\n".join([f"User {uid}" for uid in Top_com_users[c]])
    ax.set_title(f"Community {c}\nTop 3 users:\n{user_lines}", fontsize=12)
    ax.axis('off')

# Remove empty subplots if fewer than 9 communities
for i in range(len(Top_9_communities), len(axs)):
    axs[i].axis('off')

plt.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05, wspace=0.2, hspace=0.3)
plt.show()



## Compare to sklearn prebuilt

In [None]:
from itertools import chain

# The top 9 most populated communities
Top_9_communities = list(
    df_nodes["community"].value_counts().nlargest(9).index
)

# One long space-separated string per community, built from all of its users' tokens
tokens_by_community = df_nodes.groupby("community")["tokens"]
community_texts = tokens_by_community.apply(
    lambda token_lists: " ".join(chain.from_iterable(token_lists))
).reset_index(name="clean_text")

# Keep only the rows belonging to the top 9 communities
in_top_9 = community_texts["community"].isin(Top_9_communities)
community_texts = community_texts.loc[in_top_9]

community_texts.head()

In [None]:


# Vectorizer settings; here every community's pooled text is one document
tfidf_params = dict(
    stop_words=None,      # already cleaned
    max_features=3000,    # optional cap on the vocabulary size
    sublinear_tf=True,
    min_df=4,             # ignore words appearing in fewer than 4 communities
    max_df=0.8,           # ignore words appearing in more than 80% of the communities
)
vectorizer = TfidfVectorizer(**tfidf_params)

X = vectorizer.fit_transform(community_texts["clean_text"])
feature_names = vectorizer.get_feature_names_out()
print(f"TF-IDF matrix shape: {X.shape}")


In [None]:

# For every community keep its 100 highest-scoring terms: {community: {term: score}}
top_terms_sklearn = {}

for i, comm in enumerate(community_texts["community"]):
    row = X[i].toarray().ravel()
    # indices of the 100 largest scores, highest first
    top_idx = np.argsort(row)[::-1][:100]
    top_terms_sklearn[comm] = dict(zip(feature_names[top_idx], row[top_idx]))

# --- Pretty print top words per community ---
rule = "-"*70
header = f"{'Rank':<5}{'Word':<20}{'TF-IDF Score':>15}"

for comm in community_texts["community"]:
    ranked = sorted(top_terms_sklearn[comm].items(), key=lambda x: x[1], reverse=True)
    top_items = ranked[:20]

    print("\n" + rule)
    print(f"Community {comm} — Top 20 TF-IDF terms (scikit-learn)")
    print(rule)
    print(header)
    print(rule)

    for rank, (word, score) in enumerate(top_items, start=1):
        print(f"{rank:<5}{word:<20}{score:>15.4f}")



In [None]:


# 3x3 grid: one word cloud per top community, built from the scikit-learn TF-IDF scores
fig, axs = plt.subplots(3, 3, figsize=(16, 16))
axs = axs.flatten()

for ax, c in zip(axs, Top_9_communities):
    words = top_terms_sklearn.get(c, {})

    # Keep only finite, strictly positive scores
    clean_words = {}
    for term, value in words.items():
        if np.isfinite(value) and value > 0:
            clean_words[term] = float(value)

    # Nothing to draw for this community: leave the panel blank
    if len(clean_words) == 0:
        ax.axis("off")
        continue

    # Rescale so the highest score in each community equals 1
    max_val = max(clean_words.values())
    clean_words = {term: value / max_val for term, value in clean_words.items()}

    cloud_builder = WordCloud(
        width=400,
        height=400,
        background_color="white",
        colormap="plasma"
    )
    wc = cloud_builder.generate_from_frequencies(clean_words)

    ax.imshow(wc, interpolation="bilinear")
    ax.set_title(f"Community {c} — sklearn TF-IDF", fontsize=12)
    ax.axis("off")

# Turn off unused subplots
for unused_ax in axs[len(Top_9_communities):]:
    unused_ax.axis("off")

plt.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05, wspace=0.2, hspace=0.3)
plt.show()
