### Clustering Public Perception of Artificial Intelligence
### Author: Joshua Kwan
### Goal: Collect, clean, and cluster public discussions about AI

In [11]:
# 1. LIBRARIES
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re, nltk

from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_distances
from wordcloud import WordCloud

nltk.download('vader_lexicon')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Thanos\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Thanos\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# 2 Prep Folders 
# Make sure the working directories used later in the notebook exist;
# exist_ok=True keeps this cell safe to re-run.
for folder in ("outputs", "data"):
    os.makedirs(folder, exist_ok=True)

In [13]:
# 2. LOAD OR COLLECT DATA 

# Option A: Load pre-downloaded Kaggle / JSON data
# df = pd.read_csv("data/ai_discussions.csv")

# Option B: Placeholder dataframe if scraping manually
# Three hand-written sample posts, one tuple per post: (post_id, text, upvotes).
_placeholder_rows = [
    (1, "AI is changing everything, this is revolutionary!", 120),
    (2, "ChatGPT scares me... it's like Ultron becoming real.", 45),
    (3, "AI is useful but not magic, people exaggerate its power.", 66),
]

# Pivot the rows into the column-oriented dict that pd.DataFrame expects.
data = {
    "post_id": [row[0] for row in _placeholder_rows],
    "text": [row[1] for row in _placeholder_rows],
    "upvotes": [row[2] for row in _placeholder_rows],
}
df = pd.DataFrame(data)

In [None]:
# 3. Load Data 

# 3. Collect Reddit posts (public JSON API, no login required)

import requests
import pandas as pd
from tqdm import tqdm

def get_posts(subreddit, limit=200):
    """Fetch up to ``limit`` "hot" posts from a subreddit's public JSON listing.

    Reddit serves at most 100 items per listing request, so a larger
    ``limit`` is satisfied by following the listing's ``after`` cursor
    page by page.

    Args:
        subreddit: Subreddit name without the ``r/`` prefix.
        limit: Maximum number of posts to return.

    Returns:
        A list of dicts with the keys ``subreddit``, ``title``, ``selftext``,
        ``score``, ``num_comments`` and ``url``. If a request fails, a
        warning is printed and the posts gathered so far are returned
        (an empty list when the very first request fails).
    """
    url = f"https://www.reddit.com/r/{subreddit}/hot.json"
    headers = {"User-Agent": "ai-clustering-demo/0.1"}
    page_cap = 100  # per-request maximum of Reddit's listing endpoints
    posts = []
    after = None

    while len(posts) < limit:
        params = {"limit": min(page_cap, limit - len(posts))}
        if after:
            params["after"] = after

        try:
            # A timeout keeps a stalled connection from hanging the notebook.
            res = requests.get(url, headers=headers, params=params, timeout=10)
        except requests.RequestException as exc:
            print(f"⚠️  Error fetching {subreddit}: {exc}")
            break
        if res.status_code != 200:
            print(f"⚠️  Error fetching {subreddit}: {res.status_code}")
            break

        try:
            listing = res.json()["data"]
            children = listing["children"]
        except (ValueError, KeyError, TypeError) as exc:
            # Non-JSON body or a payload without the expected listing shape.
            print(f"⚠️  Error fetching {subreddit}: {exc}")
            break

        posts.extend(
            {
                "subreddit": subreddit,
                "title": p["data"]["title"],
                "selftext": p["data"].get("selftext", ""),
                "score": p["data"]["score"],
                "num_comments": p["data"]["num_comments"],
                "url": p["data"]["url"]
            }
            for p in children
        )

        after = listing.get("after")
        if not after or not children:
            break  # no further pages

    return posts[:limit]

subreddits = ["ChatGPT", "ArtificialIntelligence", "MachineLearning", "Futurology", "Technology"]
posts = []
for s in tqdm(subreddits):
    posts.extend(get_posts(s, limit=200))

# Name the columns explicitly. If every request failed, `posts` is empty and a
# bare pd.DataFrame(posts) would have no "title"/"selftext" columns, so
# building "text" below would raise KeyError instead of yielding an empty frame.
post_columns = ["subreddit", "title", "selftext", "score", "num_comments", "url"]
df = pd.DataFrame(posts, columns=post_columns)

# Title and body together form the document that is analysed downstream.
df["text"] = df["title"].fillna("") + " " + df["selftext"].fillna("")

# Keep posts whose text is longer than 40 characters and drop exact duplicates.
df = df[df["text"].str.len() > 40].drop_duplicates(subset="text")
df.to_csv("data/raw_reddit_ai.csv", index=False)
print(f"✅ Collected {len(df)} posts from {len(subreddits)} subreddits.")
df.head()

In [None]:
# 4. Cleaning 

def clean_text(t):
    """Normalize a post to lowercase a-z words separated by single spaces.

    Steps: lowercase, drop URL-like tokens, drop apostrophes (so "it's"
    stays one token, "its"), replace every other non-letter with a space
    so that words joined by punctuation ("ai-driven", "tools,really") are
    not fused together, then collapse all whitespace runs.

    Args:
        t: Raw post text. Non-string values (e.g. None or NaN) are treated
            as empty text.

    Returns:
        The cleaned string; "" when nothing usable remains.
    """
    if not isinstance(t, str):
        return ""
    t = t.lower()
    # "http\S+" already covers https links, so no separate alternative is needed.
    t = re.sub(r"http\S+|www\S+", " ", t)
    t = re.sub(r"['’]", "", t)
    t = re.sub(r"[^a-z\s]", " ", t)
    return " ".join(t.split())
# Run every post's text through clean_text and keep the result in a new column.
df["clean_text"] = df["text"].map(clean_text)

In [None]:
# 5. Feature Engineering 

sia = SentimentIntensityAnalyzer()
# Score the original text rather than clean_text: clean_text is lowercased and
# stripped of every non-letter, which removes the capitalization and
# punctuation (e.g. "!!!") that VADER uses as intensity cues.
df["sentiment_score"] = df["text"].apply(lambda x: sia.polarity_scores(x)["compound"])

# Keywords are anchored at the start of a word so inflections still match
# ("helps", "destroyed") while words that merely contain a keyword do not
# ("unhelpful"). Two-word phrases tolerate any whitespace between the words
# and end on a word boundary ("replace us" but not "replace users").
_NEGATIVE_IMPACT_RE = re.compile(
    r"\b(?:destroy|take\s+over\b|dangerous|replace\s+us\b|ultron)"
)
_POSITIVE_IMPACT_RE = re.compile(
    r"\b(?:help|assist|innovate|transform|improve|enhance)"
)


def classify_impact(x):
    """Label the expected future impact of AI expressed in a text.

    Args:
        x: Text to inspect. Matching is case-sensitive against lowercase
            keywords, so the text is expected to be lowercased already.

    Returns:
        "Negative" if a negative keyword is found, otherwise "Positive" if a
        positive keyword is found, otherwise "Neutral". Negative keywords
        take precedence when both kinds occur.
    """
    if _NEGATIVE_IMPACT_RE.search(x):
        return "Negative"
    if _POSITIVE_IMPACT_RE.search(x):
        return "Positive"
    return "Neutral"

# Keywords are anchored at the start of a word so plurals and inflections
# still match ("tools", "prompts", "robots") while words that merely contain
# a keyword do not ("rapid"/"capital" for "api", "identity" for "entity").
# "api" additionally ends on a word boundary because it is so short.
_TOOL_UNDERSTANDING_RE = re.compile(
    r"\b(?:chatgpt|prompt|apis?\b|tool|assistant)"
)
_ENTITY_UNDERSTANDING_RE = re.compile(
    r"\b(?:ultron|alive|entity|robot|sentient|conscious)"
)


def classify_understanding(x):
    """Label how a text frames AI: as a tool, as an entity, or neither.

    Args:
        x: Text to inspect. Matching is case-sensitive against lowercase
            keywords, so the text is expected to be lowercased already.

    Returns:
        "Tool" if a tool keyword is found, otherwise "Entity" if an entity
        keyword is found, otherwise "Mystery". Tool keywords take precedence
        when both kinds occur.
    """
    if _TOOL_UNDERSTANDING_RE.search(x):
        return "Tool"
    if _ENTITY_UNDERSTANDING_RE.search(x):
        return "Entity"
    return "Mystery"

# Substrings that flag a post as touching on mood / mental state.
mental_words = ["anxiety","fear","depress","excited","addicted","overwhelmed","productive"]


def _mentions_mental_word(text):
    """Return True when any entry of mental_words occurs as a substring of text."""
    for w in mental_words:
        if w in text:
            return True
    return False


# Derive the three label columns from the cleaned text, one row at a time.
cleaned = df["clean_text"]
df["ai_future_impact"] = cleaned.map(classify_impact)
df["ai_understanding_type"] = cleaned.map(classify_understanding)
df["mental_health_keywords"] = cleaned.map(_mentions_mental_word)

print(df.head())

In [None]:
# 6. TF-IDF
# English stop words from NLTK are excluded from the vocabulary, and the
# vocabulary is capped at 3000 features.
stop_words = stopwords.words("english")
tfidf_options = {"max_features": 3000, "stop_words": stop_words}
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary from the cleaned posts and build the document-term matrix.
X_text = vectorizer.fit_transform(df["clean_text"])
print("TF-IDF shape:", X_text.shape)

In [None]:
# 7. Clustering 

# Never ask for more clusters than there are posts.
n_clusters = min(len(df), 4)

# Pairwise cosine distances between TF-IDF rows, passed in as a precomputed metric.
dist_matrix = cosine_distances(X_text)
model = AgglomerativeClustering(
    linkage="average",
    metric="precomputed",
    n_clusters=n_clusters,
)
model.fit(dist_matrix)
df["cluster_id"] = model.labels_
print("Clustering done with", n_clusters, "clusters")

In [None]:
# 8. SAFE SILHOUETTE ANALYSIS (KMeans, k = 2-7; no elbow/inertia curve is computed)
# Silhouette scores for KMeans with k = 2..7, only attempted when there are
# more than 10 posts.
if len(df) <= 10:
    print("Too few samples for silhouette analysis. Add more posts first.")
else:
    sil_scores = []
    ks = range(2,8)
    for k in ks:
        km = KMeans(n_clusters=k, random_state=42)
        labels = km.fit_predict(X_text)
        # A silhouette score needs at least two distinct labels.
        if len(set(labels)) >= 2:
            try:
                sil = silhouette_score(X_text, labels)
            except ValueError as e:
                print(f"⚠️ Skipped k={k}: {e}")
            else:
                sil_scores.append((k, sil))
    if sil_scores:
        # Split the (k, score) pairs into the two plot axes.
        k_values, score_values = zip(*sil_scores)
        plt.plot(k_values, score_values, marker="o")
        plt.title("Silhouette Scores vs k")
        plt.xlabel("k")
        plt.ylabel("Score")
        plt.show()

In [None]:
# 9. Cluster Summary 

def _most_frequent(values):
    """Return the first entry of value_counts(), i.e. the most frequent value."""
    return values.value_counts().index[0]


# One row per cluster: mean sentiment, dominant impact / understanding labels,
# and the number of posts flagged by mental_health_keywords.
cluster_aggregations = {
    "sentiment_score": "mean",
    "ai_future_impact": _most_frequent,
    "ai_understanding_type": _most_frequent,
    "mental_health_keywords": "sum",
}
summary = df.groupby("cluster_id").agg(cluster_aggregations).reset_index()
print(summary)

# Bar chart of the mean sentiment score for each cluster.
plt.figure(figsize=(7,4))
sns.barplot(x="cluster_id", y="sentiment_score", data=summary, palette="coolwarm")
plt.title("Average Sentiment per Cluster")
plt.show()

In [None]:
# 10. WORDCLOUDS

terms = vectorizer.get_feature_names_out()

# Draw one word cloud per cluster, in ascending cluster-id order.
cluster_ids = sorted(df["cluster_id"].unique())
for i in cluster_ids:
    in_cluster = df["cluster_id"] == i
    # All cleaned posts of this cluster joined into a single document.
    text_blob = " ".join(df.loc[in_cluster, "clean_text"])
    wc = WordCloud(background_color="white", width=600, height=400)
    wc = wc.generate(text_blob)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Cluster {i} Word Cloud")
    plt.show()

In [None]:
# 11. SAVE 
# Write the labelled posts and the per-cluster summary as CSV files without
# the index column.
for frame, path in (
    (df, "outputs/clustered_ai_perceptions.csv"),
    (summary, "outputs/cluster_summary.csv"),
):
    frame.to_csv(path, index=False)
print("Saved results to outputs/")