In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the Reddit comments dataset via kagglehub and keep the value it returns
dataset_handle = 'smagnan/1-million-reddit-comments-from-40-subreddits'
smagnan_1_million_reddit_comments_from_40_subreddits_path = kagglehub.dataset_download(dataset_handle)

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install libraries
!pip install transformers datasets torch tqdm seaborn matplotlib plotly networkx wordcloud

import pandas as pd
import numpy as np
import re
import torch
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import pipeline


In [None]:
# Build the CSV path from the directory that kagglehub.dataset_download returned
# in the first cell, instead of hard-coding one machine's cache location.
csv_path = os.path.join(
    smagnan_1_million_reddit_comments_from_40_subreddits_path,
    'kaggle_RC_2019-05.csv',
)
data = pd.read_csv(csv_path)
data.head()


In [None]:
# Discard rows whose comment text is missing or is a "[deleted]" / "[removed]" placeholder
placeholder_bodies = ['[deleted]', '[removed]']
data = data.dropna(subset=['body'])
is_placeholder = data['body'].str.lower().isin(placeholder_bodies)
data = data[~is_placeholder]

# Text cleaning
def clean_text(text):
    """Normalize a raw comment body into lowercase alphanumeric words.

    HTML entities are decoded first so that escaped characters such as
    ``&gt;`` or ``&amp;`` are handled as punctuation instead of leaving the
    stray words "gt" / "amp" behind once punctuation is stripped. URLs and
    @-mentions are then removed, every character other than ASCII letters,
    digits and whitespace is dropped, runs of whitespace are collapsed to a
    single space, and the result is stripped and lowercased.

    Args:
        text: Comment body as a string.

    Returns:
        The cleaned string (possibly empty).
    """
    import html  # local import so this cell does not rely on another cell for it

    text = html.unescape(text)                  # decode &gt; &amp; &lt; ...
    text = re.sub(r"http\S+", "", text)         # remove URLs
    text = re.sub(r"@\S+", "", text)            # remove mentions
    text = re.sub(r"[^A-Za-z0-9\s]", "", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text)            # normalize spaces
    return text.strip().lower()

# Store the normalized text next to the raw body
data['clean_body'] = data['body'].map(clean_text)

remaining_rows = data.shape[0]
print(f"Dataset size after cleaning: {remaining_rows} rows")

In [None]:
from transformers import pipeline

# Use pretrained toxicity model from Hugging Face.
# Run on the first GPU when CUDA is available; otherwise fall back to CPU (-1)
# so the cell does not fail on a machine without a GPU.
inference_device = 0 if torch.cuda.is_available() else -1
toxic_detector = pipeline(
    "text-classification",
    model="unitary/toxic-bert",
    truncation=True,
    device=inference_device,
)

# Sample a manageable subset for GPU memory; never ask for more rows than exist
sample_size = min(2000, len(data))
sample_data = data.sample(sample_size, random_state=42).reset_index(drop=True)

# Predict toxicity: keep the first result dict the pipeline returns for each comment
tqdm.pandas()
sample_data['toxicity_output'] = sample_data['clean_body'].progress_apply(lambda x: toxic_detector(x)[0])
sample_data['toxicity_label'] = sample_data['toxicity_output'].apply(lambda x: x['label'])
sample_data['toxicity_score'] = sample_data['toxicity_output'].apply(lambda x: x['score'])


In [None]:
# Histogram (with KDE overlay) of the toxicity scores in the sample
ax = sns.histplot(sample_data['toxicity_score'], bins=30, kde=True)
ax.set_title("Distribution of Toxicity Scores (sample)")
plt.show()


In [None]:
# Mean toxicity score per subreddit, highest first, limited to ten entries
mean_toxicity = sample_data.groupby('subreddit')['toxicity_score'].mean()
toxic_by_subreddit = mean_toxicity.sort_values(ascending=False).head(10)

fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(
    x=toxic_by_subreddit.values,
    y=toxic_by_subreddit.index,
    palette="Reds_r",
    ax=ax,
)
ax.set_title("Top 10 Subreddits by Average Toxicity")
ax.set_xlabel("Average Toxicity Score")
plt.show()


In [None]:
# One point per sampled comment: controversiality flag against toxicity score
ax = sns.scatterplot(
    x='controversiality',
    y='toxicity_score',
    data=sample_data,
    alpha=0.6,
)
ax.set_title("Toxicity vs Controversiality")
plt.show()


In [None]:
# Comment score against toxicity; symlog x-axis so negative and large scores both fit
ax = sns.scatterplot(
    x='score',
    y='toxicity_score',
    data=sample_data,
    alpha=0.5,
)
ax.set_xscale('symlog')
ax.set_title("Community Score vs Toxicity")
ax.set_xlabel("Upvotes (score)")
ax.set_ylabel("Toxicity")
plt.show()


In [None]:
from wordcloud import WordCloud

# Minimum score for a comment to count as toxic in the word cloud.
# NOTE(review): unitary/toxic-bert appears to be a multi-label model, so the top
# label can be 'toxic' even when its score is very low — confirm. Filtering on the
# label alone would then include non-toxic comments, hence the score check.
TOXIC_SCORE_THRESHOLD = 0.5

is_toxic = (sample_data['toxicity_label'] == 'toxic') & (
    sample_data['toxicity_score'] >= TOXIC_SCORE_THRESHOLD
)
toxic_comments = ' '.join(sample_data.loc[is_toxic, 'clean_body'])

# WordCloud.generate raises ValueError when it is given no words, so skip the
# plot (with a message) when the sample contains no qualifying comments.
if toxic_comments.strip():
    wordcloud = WordCloud(width=1000, height=500, background_color='black').generate(toxic_comments)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title("Most Frequent Words in Toxic Comments")
    plt.show()
else:
    print("No toxic comments in the sample; skipping word cloud.")


In [None]:
# Per-subreddit aggregates, written as named aggregations so every output
# column is labelled where it is computed
subreddit_stats = (
    sample_data
    .groupby('subreddit')
    .agg(
        avg_toxicity=('toxicity_score', 'mean'),
        std_toxicity=('toxicity_score', 'std'),
        avg_score=('score', 'mean'),
        avg_controversiality=('controversiality', 'mean'),
        comment_count=('body', 'count'),
    )
    .reset_index()
)

# Pairwise correlations between the subreddit-level metrics
corr_columns = ['avg_toxicity', 'avg_score', 'avg_controversiality', 'comment_count']
corr = subreddit_stats[corr_columns].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix: Toxicity and Community Attributes")
plt.show()


In [None]:
# One point per subreddit: marker size shows comment count, colour shows
# average controversiality
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x='avg_score',
    y='avg_toxicity',
    hue='avg_controversiality',
    size='comment_count',
    data=subreddit_stats,
    palette='coolwarm',
    alpha=0.7,
    ax=ax,
)
ax.set_title("Subreddit Toxicity vs Community Engagement")
ax.set_xlabel("Average Community Score (Upvotes)")
ax.set_ylabel("Average Toxicity")
plt.show()


In [None]:
# Ten subreddits with the highest average toxicity, restricted to the reporting columns
summary_columns = ['subreddit', 'avg_toxicity', 'avg_score', 'avg_controversiality', 'comment_count']
ranked_by_toxicity = subreddit_stats.sort_values('avg_toxicity', ascending=False)
summary = ranked_by_toxicity.head(10)[summary_columns]
print(summary)


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Standardize the three subreddit-level features before comparing subreddits
feature_columns = ['avg_toxicity', 'avg_score', 'avg_controversiality']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(subreddit_stats[feature_columns])

# Cosine similarity between every pair of subreddits, labelled by subreddit name
subreddit_labels = subreddit_stats['subreddit']
similarity_matrix = cosine_similarity(X_scaled)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=subreddit_labels,
    columns=subreddit_labels,
)


In [None]:
import networkx as nx

# Build an undirected graph with one node per subreddit
subreddit_names = list(subreddit_stats['subreddit'])
G = nx.Graph()
G.add_nodes_from(subreddit_names)

# Connect each unordered pair whose similarity exceeds the threshold;
# the similarity value is stored as the edge weight
threshold = 0.7
n_subreddits = len(subreddit_names)
for i in range(n_subreddits):
    for j in range(i + 1, n_subreddits):
        pair_similarity = similarity_df.iloc[i, j]
        if pair_similarity > threshold:
            G.add_edge(subreddit_names[i], subreddit_names[j], weight=pair_similarity)


In [None]:
# Size and connectivity summary of the similarity graph
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()
density = nx.density(G)
total_degree = sum(degree for _node, degree in G.degree())
avg_degree = total_degree / num_nodes

print(f"Nodes: {num_nodes}, Edges: {num_edges}, Density: {density:.3f}, Avg Degree: {avg_degree:.2f}")

# Per-node centrality scores (degree, betweenness, closeness)
centrality = nx.degree_centrality(G)
betweenness = nx.betweenness_centrality(G)
closeness = nx.closeness_centrality(G)


In [None]:
import matplotlib.pyplot as plt

# Draw the similarity graph with a seeded spring layout so the picture is repeatable
fig, ax = plt.subplots(figsize=(10, 8))
pos = nx.spring_layout(G, k=0.5, seed=42)
nx.draw_networkx_nodes(G, pos, node_size=800, node_color='skyblue', alpha=0.8, ax=ax)
nx.draw_networkx_edges(G, pos, width=1, alpha=0.5, ax=ax)
nx.draw_networkx_labels(G, pos, font_size=9, font_weight='bold', ax=ax)

ax.set_title("Subreddit Co-Toxicity Network", fontsize=14)
ax.axis("off")
plt.show()


In [None]:
import community.community_louvain as community_louvain

# Perform Louvain clustering.
# A fixed random_state makes the partition repeatable between runs, in line with
# the seeded sampling and layout used elsewhere in this notebook.
partition = community_louvain.best_partition(G, weight='weight', random_state=42)

# Add cluster labels to your data (maps each subreddit name to its community id)
subreddit_stats['cluster'] = subreddit_stats['subreddit'].map(partition)

# Visualize clusters: nodes are coloured by community id
plt.figure(figsize=(10, 8))
pos = nx.spring_layout(G, k=0.5, seed=42)
# Colours are listed in G.nodes() order, the same order draw_networkx iterates nodes in
colors = [partition[node] for node in G.nodes()]
nx.draw_networkx(G, pos, node_color=colors, with_labels=True, cmap=plt.cm.Set3, node_size=800, font_size=8)
plt.title("Community Clusters Based on Co-Toxicity")
plt.axis("off")
plt.show()
