# ai_perception_clustering.ipynb
### Clustering Public Perception of Artificial Intelligence
### Author: Joshua Kwan
### Goal: Collect, clean, and cluster public discussions about AI

In [3]:
# 1. LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Text processing
import re
import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer

# Clustering and metrics
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_distances

# Optional: Reddit scraping
# import praw
# Optional: Twitter scraping
# import snscrape.modules.twitter as sntwitter

nltk.download('vader_lexicon')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Thanos\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Thanos\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# 2. LOAD OR COLLECT DATA 

# Option A: Load pre-downloaded Kaggle / JSON data
# df = pd.read_csv("data/ai_discussions.csv")

# Option B: Placeholder dataframe if scraping manually
data = {
    "post_id": [1, 2, 3],
    "text": [
        "AI is changing everything, this is revolutionary!",
        "ChatGPT scares me... it's like Ultron becoming real.",
        "AI is useful but not magic, people exaggerate its power."
    ],
    "upvotes": [120, 45, 66]
}
df = pd.DataFrame(data)

In [8]:
# 3. CLEANING AND PREPROCESSING 

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'[^a-z\s]', '', text)
    return text

df['clean_text'] = df['text'].apply(clean_text)

In [9]:
# 4. FEATURE ENGINEERING 

## Sentiment
sia = SentimentIntensityAnalyzer()
df['sentiment_score'] = df['clean_text'].apply(lambda x: sia.polarity_scores(x)['compound'])

## AI Future Impact
def classify_impact(text):
    if any(word in text for word in ["destroy", "take over", "dangerous", "replace us", "ultron"]):
        return "Negative"
    elif any(word in text for word in ["help", "assist", "innovate", "transform", "improve", "enhance"]):
        return "Positive"
    else:
        return "Neutral"

## AI Understanding Type
def classify_understanding(text):
    if any(word in text for word in ["chatgpt", "prompt", "api", "tool", "assistant"]):
        return "Tool"
    elif any(word in text for word in ["ultron", "alive", "entity", "robot", "sentient", "conscious"]):
        return "Entity"
    else:
        return "Mystery"

df['ai_future_impact'] = df['clean_text'].apply(classify_impact)
df['ai_understanding_type'] = df['clean_text'].apply(classify_understanding)

## Mental Health Keywords
mental_words = ["anxiety", "fear", "depress", "excited", "addicted", "overwhelmed", "productive"]
df['mental_health_keywords'] = df['clean_text'].apply(lambda x: any(word in x for word in mental_words))

print(df.head())

   post_id                                               text  upvotes  \
0        1  AI is changing everything, this is revolutionary!      120   
1        2  ChatGPT scares me... it's like Ultron becoming...       45   
2        3  AI is useful but not magic, people exaggerate ...       66   

                                          clean_text  sentiment_score  \
0    ai is changing everything this is revolutionary           0.0000   
1    chatgpt scares me its like ultron becoming real           0.0258   
2  ai is useful but not magic people exaggerate i...           0.3851   

  ai_future_impact ai_understanding_type  mental_health_keywords  
0          Neutral               Mystery                   False  
1         Negative                  Tool                   False  
2          Neutral               Mystery                   False  


In [10]:
# 5. TF-IDF VECTORIZATION 

stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=3000)
X_text = vectorizer.fit_transform(df['clean_text'])

print(f"TF-IDF shape: {X_text.shape}")

TF-IDF shape: (3, 15)


In [14]:
# 6. CLUSTERING 

# Option 1: Agglomerative clustering with cosine distance (best for text)
dist_matrix = cosine_distances(X_text)

model = AgglomerativeClustering(n_clusters=4, metric='precomputed', linkage='average')
df['cluster_id'] = model.fit_predict(dist_matrix)

ValueError: Cannot extract more clusters than samples: 4 clusters were given for a tree with 3 leaves.

In [15]:
# 7. EVALUATION: CHOOSE K (Elbow/Silhouette) 

sil_scores = []
for k in range(2, 8):
    km = KMeans(n_clusters=k, random_state=42)
    labels = km.fit_predict(X_text)
    sil = silhouette_score(X_text, labels)
    sil_scores.append(sil)

plt.figure(figsize=(6,4))
plt.plot(range(2,8), sil_scores, marker='o')
plt.title("Silhouette Scores for Different k")
plt.xlabel("k (number of clusters)")
plt.ylabel("Silhouette Score")
plt.show()

ValueError: Number of labels is 3. Valid values are 2 to n_samples - 1 (inclusive)

In [16]:
# 8. CLUSTER INTERPRETATION 

# Aggregate sentiment and understanding type by cluster
summary = df.groupby('cluster_id').agg({
    'sentiment_score': 'mean',
    'ai_future_impact': lambda x: x.value_counts().index[0],
    'ai_understanding_type': lambda x: x.value_counts().index[0],
    'mental_health_keywords': 'sum'
}).reset_index()

print(summary)

# Visualize cluster sentiment
plt.figure(figsize=(7,4))
sns.barplot(data=summary, x='cluster_id', y='sentiment_score', palette='coolwarm')
plt.title("Average Sentiment per Cluster")
plt.show()

KeyError: 'cluster_id'

In [17]:
# 9. WORDCLOUDS / TOP TERMS 

from wordcloud import WordCloud

terms = vectorizer.get_feature_names_out()
for i in sorted(df['cluster_id'].unique()):
    cluster_text = ' '.join(df[df['cluster_id']==i]['clean_text'])
    wc = WordCloud(width=600, height=400, background_color='white').generate(cluster_text)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"Cluster {i} Word Cloud")
    plt.show()

ModuleNotFoundError: No module named 'wordcloud'

In [18]:
# 10. SAVE RESULTS 

df.to_csv("outputs/clustered_ai_perceptions.csv", index=False)
summary.to_csv("outputs/cluster_summary.csv", index=False)

print("✅ Clustering complete. Files saved in outputs/ folder.")

OSError: Cannot save file into a non-existent directory: 'outputs'