# Unsupervised Learning: Clustering and Topic Modeling

This notebook explores unsupervised learning techniques for sentiment analysis, including K-Means clustering, topic modeling with LDA, and dimensionality reduction with t-SNE visualization.

## 1. Import Libraries

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Clustering and dimensionality reduction
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Plotting
from mpl_toolkits.mplot3d import Axes3D

# Expose the sibling ../src directory to the import system
src_dir = os.path.join('..', 'src')
sys.path.append(src_dir)

# Seed NumPy's global random generator for any code that draws from it
np.random.seed(42)

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette (the order matters because both touch the default colour cycle)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("Setup complete!")

## 2. Load Data

In [None]:
# Load cleaned data
data_path = os.path.join('..', 'data', 'processed', 'cleaned_tweets.csv')
df = pd.read_csv(data_path)

# pandas reads empty CSV fields back as NaN, and the scikit-learn text
# vectorizers applied to 'cleaned_text' later raise on NaN documents.
# Sample only from rows that still have text; `df` itself is left untouched.
df_valid = df.dropna(subset=['cleaned_text'])

# Use a sample for faster computation
SAMPLE_SIZE = 20000
df_sample = df_valid.sample(n=min(SAMPLE_SIZE, len(df_valid)), random_state=42)

print(f"Dataset shape: {df_sample.shape}")
print(f"\nClass distribution:")
print(df_sample['sentiment'].value_counts())
df_sample.head()

## 3. Feature Extraction with TF-IDF

In [None]:
# Build the TF-IDF representation of the sampled tweets
tfidf_settings = dict(
    max_features=1000,   # cap the vocabulary for computational efficiency
    max_df=0.8,          # ignore terms present in more than 80% of documents
    min_df=5,            # ignore terms present in fewer than 5 documents
    ngram_range=(1, 2),  # unigrams and bigrams
)
tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)

X_tfidf = tfidf_vectorizer.fit_transform(df_sample['cleaned_text'])

# Share of matrix cells that hold no stored value, as a percentage
n_cells = X_tfidf.shape[0] * X_tfidf.shape[1]
sparsity_pct = (1 - X_tfidf.nnz / n_cells) * 100
vocab_size = len(tfidf_vectorizer.get_feature_names_out())

print(f"TF-IDF feature matrix shape: {X_tfidf.shape}")
print(f"Number of features: {vocab_size}")
print(f"Matrix sparsity: {sparsity_pct:.2f}%")

## 4. K-Means Clustering

In [None]:
# Find optimal number of clusters using elbow method
# For every candidate K, record the K-Means inertia (elbow curve) and the
# silhouette score so the two criteria can be compared side by side.
inertias = []
silhouette_scores = []
K_range = range(2, 11)

print("Finding optimal number of clusters...")
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_tfidf)
    inertias.append(kmeans.inertia_)
    # NOTE(review): the silhouette score is evaluated on every sample for each
    # K; if this loop is too slow, silhouette_score's `sample_size` argument
    # is an option, at the cost of slightly different values.
    silhouette_scores.append(silhouette_score(X_tfidf, kmeans.labels_))
    print(f"K={k}: Inertia={kmeans.inertia_:.2f}, Silhouette={silhouette_scores[-1]:.4f}")

# Plot elbow curve and silhouette scores
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].plot(K_range, inertias, marker='o', linewidth=2)
axes[0].set_title('Elbow Method', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Number of Clusters (K)', fontsize=12)
axes[0].set_ylabel('Inertia', fontsize=12)
axes[0].grid(True, alpha=0.3)

axes[1].plot(K_range, silhouette_scores, marker='s', color='green', linewidth=2)
axes[1].set_title('Silhouette Score', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Number of Clusters (K)', fontsize=12)
axes[1].set_ylabel('Silhouette Score', fontsize=12)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not create missing directories, so make sure the output
# folder exists before writing (a fresh checkout may not have it).
chart_path = '../visuals/charts/kmeans_optimization.png'
os.makedirs(os.path.dirname(chart_path), exist_ok=True)
plt.savefig(chart_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Apply K-Means with optimal K (trying K=2 since we have binary sentiment)
optimal_k = 2
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_tfidf)

# Add cluster labels to dataframe
df_sample['cluster'] = clusters

# The Davies-Bouldin and Calinski-Harabasz scores are computed on a dense
# array here; densify the TF-IDF matrix once and share it between both calls
# instead of building two full dense copies.
X_tfidf_dense = X_tfidf.toarray()

print(f"\nK-Means Clustering with K={optimal_k}")
print("="*50)
print(f"Silhouette Score: {silhouette_score(X_tfidf, clusters):.4f}")
print(f"Davies-Bouldin Index: {davies_bouldin_score(X_tfidf_dense, clusters):.4f}")
print(f"Calinski-Harabasz Index: {calinski_harabasz_score(X_tfidf_dense, clusters):.4f}")

# Release the dense copy; it is only needed for the two metrics above
del X_tfidf_dense

# Analyze clusters vs actual sentiment
# normalize='index' turns each cluster's row into sentiment proportions
print(f"\nCluster distribution:")
print(df_sample['cluster'].value_counts())
print(f"\nCluster vs Sentiment crosstab:")
print(pd.crosstab(df_sample['cluster'], df_sample['sentiment'], normalize='index'))

In [None]:
# Get top terms per cluster
def get_top_terms_per_cluster(kmeans_model, vectorizer, n_terms=10):
    """
    Print the highest-weighted vocabulary terms of every cluster centroid.

    Parameters
    ----------
    kmeans_model : object exposing ``cluster_centers_`` and ``n_clusters``
        Fitted clustering model whose centroids are ranked.
    vectorizer : object exposing ``get_feature_names_out()``
        Vectorizer that maps centroid column indices back to terms.
    n_terms : int, default 10
        Number of terms printed for each cluster.

    Returns
    -------
    None
        The terms are written to standard output only.
    """
    vocabulary = vectorizer.get_feature_names_out()
    # Column indices of each centroid, ordered from largest to smallest weight
    ranked_features = kmeans_model.cluster_centers_.argsort()[:, ::-1]

    for cluster_id in range(kmeans_model.n_clusters):
        leading = ranked_features[cluster_id, :n_terms]
        print(f"\nCluster {cluster_id} top terms:")
        print(", ".join([vocabulary[idx] for idx in leading]))

# Print the 15 highest-weighted TF-IDF terms of each cluster centroid
get_top_terms_per_cluster(kmeans, tfidf_vectorizer, n_terms=15)

## 5. Topic Modeling with LDA (Latent Dirichlet Allocation)

In [None]:
# LDA works with count data, not TF-IDF, so build a separate
# bag-of-words matrix for the topic model
count_vectorizer = CountVectorizer(
    stop_words='english',
    min_df=5,
    max_df=0.8,
    max_features=1000,
)
X_counts = count_vectorizer.fit_transform(df_sample['cleaned_text'])

print(f"Count matrix shape: {X_counts.shape}")

In [None]:
# Fit the LDA topic model on the word-count matrix
n_topics = 5
lda_params = {
    'n_components': n_topics,
    'random_state': 42,
    'max_iter': 20,
    'learning_method': 'online',
    'n_jobs': -1,
}
lda = LatentDirichletAllocation(**lda_params)

print(f"Training LDA with {n_topics} topics...")
lda.fit(X_counts)
print("LDA training complete!")

In [None]:
# Display top words for each topic
def display_topics(model, feature_names, n_top_words=10):
    """
    Print the most heavily weighted words of each topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Topic model; each row of ``components_`` is treated as one topic's
        per-word weights.
    feature_names : sequence of str
        Vocabulary indexed by the column positions of ``components_``.
    n_top_words : int, default 10
        Number of words printed for each topic.

    Returns
    -------
    None
        The words are written to standard output only.
    """
    for topic_idx, word_weights in enumerate(model.components_):
        # Reverse the ascending argsort, then keep the leading positions
        strongest = word_weights.argsort()[::-1][:n_top_words]
        print(f"\nTopic {topic_idx}:")
        print(", ".join([feature_names[pos] for pos in strongest]))

# Vocabulary of the count vectorizer, used to label the LDA topic words
feature_names = count_vectorizer.get_feature_names_out()

banner = "=" * 70
print(banner, "Topics discovered by LDA:", banner, sep="\n")
display_topics(lda, feature_names, n_top_words=15)

In [None]:
# Topic weights for every document; the dominant topic is the per-row argmax
topic_distributions = lda.transform(X_counts)
df_sample['dominant_topic'] = np.argmax(topic_distributions, axis=1)

# How many documents fall under each dominant topic, ordered by topic id
topic_counts = df_sample['dominant_topic'].value_counts().sort_index()
print(f"\nTopic distribution:")
print(topic_counts)

# Sentiment proportions within each dominant topic (rows are normalised)
topic_sentiment_share = pd.crosstab(
    df_sample['dominant_topic'],
    df_sample['sentiment'],
    normalize='index',
)
print(f"\nTopic vs Sentiment crosstab:")
print(topic_sentiment_share)

## 6. Dimensionality Reduction and Visualization with t-SNE

In [None]:
# First reduce dimensions with PCA for faster t-SNE
print("Reducing dimensions with PCA...")
pca = PCA(n_components=50, random_state=42)
X_pca = pca.fit_transform(X_tfidf.toarray())
print(f"PCA explained variance: {pca.explained_variance_ratio_.sum():.4f}")

# Apply t-SNE for 2D visualization
# The iteration count is left at the library default (1000) rather than being
# passed explicitly: scikit-learn 1.5 renamed the keyword from `n_iter` to
# `max_iter` and later removed `n_iter`, so naming either one fails on some
# installed versions. Omitting it gives the same 1000 iterations everywhere.
print("\nApplying t-SNE for 2D visualization (this may take a few minutes)...")
tsne_2d = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne_2d = tsne_2d.fit_transform(X_pca)

print("t-SNE 2D complete!")

In [None]:
# Visualize t-SNE with actual sentiment labels
# Left panel: points coloured by the 'sentiment' column.
# Right panel: the same embedding coloured by the K-Means cluster label.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Plot by sentiment
scatter1 = axes[0].scatter(
    X_tsne_2d[:, 0], X_tsne_2d[:, 1],
    c=df_sample['sentiment'],
    cmap='coolwarm',
    alpha=0.6,
    s=10
)
axes[0].set_title('t-SNE Visualization by Sentiment', fontsize=14, fontweight='bold')
axes[0].set_xlabel('t-SNE Component 1')
axes[0].set_ylabel('t-SNE Component 2')
cbar1 = plt.colorbar(scatter1, ax=axes[0])
# NOTE(review): this label assumes 'sentiment' is encoded as 0/1 - confirm
cbar1.set_label('Sentiment (0=Neg, 1=Pos)')

# Plot by cluster
scatter2 = axes[1].scatter(
    X_tsne_2d[:, 0], X_tsne_2d[:, 1],
    c=df_sample['cluster'],
    cmap='viridis',
    alpha=0.6,
    s=10
)
axes[1].set_title('t-SNE Visualization by K-Means Cluster', fontsize=14, fontweight='bold')
axes[1].set_xlabel('t-SNE Component 1')
axes[1].set_ylabel('t-SNE Component 2')
cbar2 = plt.colorbar(scatter2, ax=axes[1])
cbar2.set_label('Cluster')

plt.tight_layout()
# savefig does not create missing directories, so make sure the output
# folder exists before writing (a fresh checkout may not have it).
chart_path = '../visuals/charts/tsne_visualization_2d.png'
os.makedirs(os.path.dirname(chart_path), exist_ok=True)
plt.savefig(chart_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# 3D t-SNE visualization
# The iteration count is left at the library default (1000) rather than being
# passed explicitly: scikit-learn 1.5 renamed the keyword from `n_iter` to
# `max_iter` and later removed `n_iter`, so naming either one fails on some
# installed versions. Omitting it gives the same 1000 iterations everywhere.
print("Applying t-SNE for 3D visualization...")
tsne_3d = TSNE(n_components=3, random_state=42, perplexity=30)
X_tsne_3d = tsne_3d.fit_transform(X_pca)

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

scatter = ax.scatter(
    X_tsne_3d[:, 0], X_tsne_3d[:, 1], X_tsne_3d[:, 2],
    c=df_sample['sentiment'],
    cmap='coolwarm',
    alpha=0.6,
    s=10
)

ax.set_title('3D t-SNE Visualization by Sentiment', fontsize=14, fontweight='bold')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.set_zlabel('t-SNE Component 3')

cbar = plt.colorbar(scatter, ax=ax, pad=0.1)
# NOTE(review): this label assumes 'sentiment' is encoded as 0/1 - confirm
cbar.set_label('Sentiment (0=Neg, 1=Pos)')

plt.tight_layout()
# savefig does not create missing directories, so make sure the output
# folder exists before writing (a fresh checkout may not have it).
chart_path = '../visuals/charts/tsne_visualization_3d.png'
os.makedirs(os.path.dirname(chart_path), exist_ok=True)
plt.savefig(chart_path, dpi=300, bbox_inches='tight')
plt.show()

print("3D t-SNE complete!")

## 7. Summary and Insights

### Key Findings:

1. **K-Means Clustering:**
   - Discovered natural groupings in the data
   - Analyzed how clusters align with true sentiment labels
   - Examined characteristic terms for each cluster

2. **Topic Modeling (LDA):**
   - Identified latent topics in tweets
   - Each topic represents a theme or subject area
   - Topics may correlate with sentiment patterns

3. **t-SNE Visualization:**
   - Reduced high-dimensional TF-IDF features to 2D/3D (after an initial PCA reduction to 50 components)
   - Visualized data distribution and separation
   - Observed clustering patterns visually
   - 2D and 3D plots show how sentiments are distributed

4. **Insights:**
   - Unsupervised methods can discover patterns without labels
   - Clusters may or may not align perfectly with sentiment
   - Topic modeling reveals what people talk about
   - Visualization helps understand data structure

### Observations:
- If clusters align well with sentiments → good separability in feature space
- If topics show distinct sentiment patterns → topics carry sentiment information
- t-SNE plots reveal if positive/negative tweets form distinct groups
- These insights can help explain supervised model performance, but they do not validate it on their own

### Applications:
- Discover emerging topics in social media
- Identify customer pain points or satisfaction drivers
- Explore data before labeling (semi-supervised learning)
- Validate that sentiment classes are distinguishable