In [None]:
# =============================================================================
# TOPIC CLUSTERING ANALYSIS - GYM MANAGEMENT SYSTEM
# Phase 3 - Task 9: Group user queries/feedback into thematic clusters
# =============================================================================

# =============================================================================
# 1. SETUP AND IMPORTS
# =============================================================================

# Install required packages (run only once in Colab)
!pip install wordcloud scikit-learn plotly

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Text processing and NLP
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Clustering algorithms
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import LatentDirichletAllocation
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

# Dimensionality reduction and visualization
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap

# Word cloud and text visualization
from wordcloud import WordCloud
from collections import Counter

# Metrics and evaluation
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK resources used further down (tokenizer models, stop-word
# list, WordNet data). 'punkt_tab' is requested in addition to 'punkt'.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Global look-and-feel for every matplotlib/seaborn figure in this notebook.
# The matplotlib style is applied first, then the seaborn palette on top of it.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ All libraries imported successfully!")

# =============================================================================
# 2. DATA LOADING AND INITIAL EXPLORATION
# =============================================================================

# Load the feedback data
# Replace 'feedback_data.csv' with your actual file path
df = pd.read_csv('feedback_data.csv')

# --- Shape / schema overview -------------------------------------------------
print("🔍 DATASET OVERVIEW")
print("="*50)
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
print(df.head())

# --- Categorical / rating distributions --------------------------------------
print("\n📊 FEEDBACK DISTRIBUTION")
print("="*50)
print("Feedback types:")
print(df['feedback_type'].value_counts())
print("\nRating distribution:")
print(df['rating'].value_counts().sort_index())

# --- Basic text statistics ---------------------------------------------------
print("\n📝 TEXT DATA ANALYSIS")
print("="*50)
# Character and whitespace-token counts per feedback entry; both columns are
# added to df and stay available to later cells.
df['text_length'] = df['feedback_text'].str.len()
df['word_count'] = df['feedback_text'].str.split().str.len()

print(f"Average text length: {df['text_length'].mean():.1f} characters")
print(f"Average word count: {df['word_count'].mean():.1f} words")
print(f"Minimum words: {df['word_count'].min()}")
print(f"Maximum words: {df['word_count'].max()}")

# Sample feedback texts
print("\n📋 SAMPLE FEEDBACK TEXTS")
print("="*50)
# head(3) yields at most three rows, so a dataset with fewer than three
# entries is shown in full instead of raising IndexError from .iloc[i].
for i, sample_text in enumerate(df['feedback_text'].head(3)):
    print(f"Sample {i+1}: {sample_text}")
    print("-"*40)

# =============================================================================
# 3. TEXT PREPROCESSING PIPELINE
# =============================================================================

class TextPreprocessor:
    """Normalize free-text feedback into a space-joined string of lemmas.

    The pipeline is: lowercase -> strip non-letters -> tokenize -> drop stop
    words and tokens of length <= 2 -> lemmatize -> drop lemmas that are stop
    words.
    """

    # Everything that is neither an ASCII letter nor whitespace is removed.
    _NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        # Add gym-specific stop words
        self.stop_words.update(['gym', 'class', 'workout', 'exercise', 'session', 'coach', 'instructor'])

    def clean_text(self, text):
        """Return *text* lowercased, letters-only, with single spaces.

        Non-string input (e.g. the None / float NaN that pandas uses for a
        missing cell) is treated as empty text and yields ''.
        """
        if not isinstance(text, str):
            return ''
        # Convert to lowercase
        text = text.lower()
        # Remove special characters and digits
        text = self._NON_LETTER_RE.sub('', text)
        # Remove extra whitespace
        return ' '.join(text.split())

    def tokenize_and_lemmatize(self, text):
        """Tokenize *text*, remove stop words / short tokens, and lemmatize.

        Returns the surviving lemmas joined by single spaces.
        """
        try:
            tokens = word_tokenize(text)
        except LookupError:
            # Fallback to simple split if NLTK resources are missing
            tokens = text.split()

        lemmas = []
        for token in tokens:
            if len(token) <= 2 or token in self.stop_words:
                continue
            lemma = self.lemmatizer.lemmatize(token)
            # Re-check after lemmatization: an inflected form such as
            # "classes" is not in the stop list itself but lemmatizes to the
            # stop word "class".
            if lemma not in self.stop_words:
                lemmas.append(lemma)
        return ' '.join(lemmas)

    def preprocess(self, text):
        """Run clean_text followed by tokenize_and_lemmatize on *text*."""
        return self.tokenize_and_lemmatize(self.clean_text(text))

# Initialize preprocessor and clean the data
print("🧹 PREPROCESSING TEXT DATA")
print("="*50)

preprocessor = TextPreprocessor()
# One cleaned string per feedback entry, stored next to the raw text
df['cleaned_text'] = df['feedback_text'].apply(preprocessor.preprocess)

# Show before and after examples
print("BEFORE AND AFTER PREPROCESSING:")
# head(3) caps the preview at the rows that actually exist, so datasets with
# fewer than three entries no longer raise IndexError.
preview = df[['feedback_text', 'cleaned_text']].head(3)
for raw_text, cleaned in zip(preview['feedback_text'], preview['cleaned_text']):
    print(f"\nOriginal: {raw_text}")
    print(f"Cleaned:  {cleaned}")
    print("-"*40)

# Remove empty or very short cleaned texts
# (keeps only rows whose cleaned text is longer than 10 characters; the
# original index labels of df are retained on df_clean)
df_clean = df[df['cleaned_text'].str.len() > 10].copy()
print(f"\n✅ Preprocessing complete! {len(df_clean)} valid feedback entries ready for clustering.")

# =============================================================================
# 4. TEXT VECTORIZATION
# =============================================================================

print("🔤 TEXT VECTORIZATION")
print("="*50)

# TF-IDF Vectorization
# Settings: top 100 features, terms must appear in at least 2 documents and in
# at most 80% of them, unigrams and bigrams.
tfidf_settings = dict(max_features=100, min_df=2, max_df=0.8, ngram_range=(1, 2))
tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit and transform the cleaned text
X_tfidf = tfidf_vectorizer.fit_transform(df_clean['cleaned_text'])
feature_names = tfidf_vectorizer.get_feature_names_out()

print(f"TF-IDF matrix shape: {X_tfidf.shape}")
print(f"Number of features: {len(feature_names)}")

# Show top features: column sums of the TF-IDF matrix, highest 20 first
feature_scores = X_tfidf.sum(axis=0).A1
top_features_idx = feature_scores.argsort()[-20:][::-1]
print("\nTop 20 features by TF-IDF score:")
for idx in top_features_idx:
    term, score = feature_names[idx], feature_scores[idx]
    print(f"  {term}: {score:.2f}")

# Count Vectorization (for LDA)
# Raw term counts, unigrams only, limited to the 50 most frequent terms.
count_settings = dict(max_features=50, min_df=2, max_df=0.8, ngram_range=(1, 1))
count_vectorizer = CountVectorizer(**count_settings)

X_count = count_vectorizer.fit_transform(df_clean['cleaned_text'])
count_feature_names = count_vectorizer.get_feature_names_out()

print(f"\nCount matrix shape: {X_count.shape}")

# =============================================================================
# 5. OPTIMAL NUMBER OF CLUSTERS ANALYSIS
# =============================================================================

print("📊 DETERMINING OPTIMAL NUMBER OF CLUSTERS")
print("="*50)

# Elbow method for K-means
def find_optimal_clusters(X, max_k=10):
    """Fit K-Means for k = 2..max_k and collect three quality metrics per k.

    Args:
        X: Feature matrix with one row per document; sparse (anything with
            ``toarray``) or dense array-like.
        max_k: Largest number of clusters to try. It is capped at
            ``n_samples - 1`` because the silhouette score is only defined
            for 2 <= k <= n_samples - 1.

    Returns:
        Tuple ``(K_range, sse, silhouette_scores, calinski_scores)`` where
        ``K_range`` is the ``range`` of k values tried and the three lists
        hold, per k, the K-Means inertia, the silhouette score and the
        Calinski-Harabasz score.

    Raises:
        ValueError: If X has fewer than 3 rows, so that no valid k exists.
    """
    # Densify once up front; the metric functions below are fed a dense array
    # and converting inside the loop repeated the same work for every k.
    X_dense = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)
    n_samples = X_dense.shape[0]
    if n_samples < 3:
        raise ValueError(
            f"Need at least 3 samples to evaluate cluster counts, got {n_samples}"
        )

    sse = []
    silhouette_scores = []
    calinski_scores = []

    K_range = range(2, min(max_k, n_samples - 1) + 1)

    for k in K_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(X)

        sse.append(kmeans.inertia_)

        # Calculate silhouette score
        silhouette_scores.append(silhouette_score(X_dense, kmeans.labels_))

        # Calculate Calinski-Harabasz score
        calinski_scores.append(calinski_harabasz_score(X_dense, kmeans.labels_))

    return K_range, sse, silhouette_scores, calinski_scores

K_range, sse, silhouette_scores, calinski_scores = find_optimal_clusters(X_tfidf)

# Plot the results: one panel per metric, all sharing the k axis
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

metric_panels = [
    # (values, line format, y-axis label, panel title)
    (sse, 'bo-', 'Sum of Squared Errors (SSE)', 'Elbow Method for Optimal k'),
    (silhouette_scores, 'ro-', 'Silhouette Score', 'Silhouette Analysis'),
    (calinski_scores, 'go-', 'Calinski-Harabasz Score', 'Calinski-Harabasz Analysis'),
]
for panel_ax, (metric_values, line_format, y_label, panel_title) in zip(axes, metric_panels):
    panel_ax.plot(K_range, metric_values, line_format)
    panel_ax.set_xlabel('Number of Clusters (k)')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.grid(True)

plt.tight_layout()
plt.show()

# Find optimal k based on silhouette score
# (argmax returns the first position of the maximum, i.e. the smallest k on ties)
best_position = np.argmax(silhouette_scores)
optimal_k = K_range[best_position]
print(f"\n🎯 Recommended number of clusters based on silhouette score: {optimal_k}")
print(f"Best silhouette score: {max(silhouette_scores):.3f}")

# =============================================================================
# 6. CLUSTERING ALGORITHMS
# =============================================================================

print("🔍 APPLYING CLUSTERING ALGORITHMS")
print("="*50)

# Dense copy of the TF-IDF matrix, shared by the estimators below that are
# fed a dense array (DBSCAN and agglomerative clustering).
dense_tfidf = X_tfidf.toarray()

# 6.1 K-Means Clustering
print("1. K-Means Clustering")
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df_clean['kmeans_cluster'] = kmeans.fit_predict(X_tfidf)

print(f"K-Means cluster distribution:")
kmeans_distribution = df_clean['kmeans_cluster'].value_counts().sort_index()
print(kmeans_distribution)

# 6.2 DBSCAN Clustering
print("\n2. DBSCAN Clustering")
# Try different epsilon values and keep the one that yields the most clusters
# while staying below 8; eps=0.5 remains the fallback if no candidate qualifies.
eps_values = [0.3, 0.5, 0.7, 0.9]
best_eps = 0.5
best_n_clusters = 0

for eps in eps_values:
    dbscan = DBSCAN(eps=eps, min_samples=3)
    labels = dbscan.fit_predict(dense_tfidf)
    # Label -1 marks noise points and is not counted as a cluster
    noise_offset = 1 if -1 in labels else 0
    n_clusters = len(set(labels)) - noise_offset
    if best_n_clusters < n_clusters < 8:
        best_eps = eps
        best_n_clusters = n_clusters

# Refit with the selected epsilon and store the final assignment
dbscan = DBSCAN(eps=best_eps, min_samples=3)
df_clean['dbscan_cluster'] = dbscan.fit_predict(dense_tfidf)

print(f"DBSCAN cluster distribution (eps={best_eps}):")
dbscan_distribution = df_clean['dbscan_cluster'].value_counts().sort_index()
print(dbscan_distribution)

# 6.3 Hierarchical Clustering
print("\n3. Hierarchical Clustering")
hierarchical = AgglomerativeClustering(n_clusters=optimal_k)
df_clean['hierarchical_cluster'] = hierarchical.fit_predict(dense_tfidf)

print(f"Hierarchical cluster distribution:")
hierarchical_distribution = df_clean['hierarchical_cluster'].value_counts().sort_index()
print(hierarchical_distribution)

# 6.4 Topic Modeling with LDA
print("\n4. Latent Dirichlet Allocation (LDA)")
lda_settings = dict(n_components=optimal_k, random_state=42, max_iter=100)
lda = LatentDirichletAllocation(**lda_settings)
lda.fit(X_count)

# Get topic assignments: each document is assigned its highest-probability topic
topic_probs = lda.transform(X_count)
df_clean['lda_topic'] = topic_probs.argmax(axis=1)

print(f"LDA topic distribution:")
lda_distribution = df_clean['lda_topic'].value_counts().sort_index()
print(lda_distribution)

# =============================================================================
# 7. CLUSTER ANALYSIS AND INTERPRETATION
# =============================================================================

print("🔬 CLUSTER ANALYSIS AND INTERPRETATION")
print("="*50)

def analyze_clusters(df, cluster_col, vectorizer, X, method_name):
    """Print a per-cluster report: size, rating, feedback types, top terms.

    Args:
        df: DataFrame with one row per document, in the same row order as X.
            Must contain ``cluster_col``, 'rating', 'feedback_type' and
            'feedback_text'.
        cluster_col: Name of the column holding the cluster label per row.
            Label -1 is treated as noise and skipped.
        vectorizer: Object exposing ``get_feature_names_out()`` for the
            columns of X.
        X: Document-term matrix (sparse with ``toarray`` or dense), row i
            corresponding to the i-th row of df.
        method_name: Label used in the report header.

    Returns:
        None. The report is written to stdout.
    """
    print(f"\n{method_name.upper()} CLUSTER ANALYSIS")
    print("="*60)

    feature_names = vectorizer.get_feature_names_out()

    for cluster_id in sorted(df[cluster_col].unique()):
        if cluster_id == -1:  # Skip noise points in DBSCAN
            continue

        cluster_mask = df[cluster_col] == cluster_id
        cluster_docs = df[cluster_mask]

        print(f"\n📁 CLUSTER {cluster_id} ({len(cluster_docs)} documents)")
        print("-" * 40)

        # Average rating in this cluster
        avg_rating = cluster_docs['rating'].mean()
        print(f"Average Rating: {avg_rating:.2f}")

        # Feedback types in this cluster
        feedback_types = cluster_docs['feedback_type'].value_counts()
        print(f"Feedback Types: {dict(feedback_types)}")

        # Top terms for this cluster.
        # X is addressed by row *position*, whereas df may carry arbitrary
        # index labels (e.g. after filtering), so the boolean mask is
        # converted to positions instead of using df's index.
        row_positions = np.flatnonzero(cluster_mask.to_numpy())
        cluster_rows = X[row_positions]
        if hasattr(cluster_rows, 'toarray'):
            cluster_rows = cluster_rows.toarray()
        cluster_tfidf = np.asarray(cluster_rows).mean(axis=0)

        top_indices = cluster_tfidf.argsort()[-10:][::-1]

        print("Top Terms:")
        for term_idx in top_indices:
            print(f"  • {feature_names[term_idx]}: {cluster_tfidf[term_idx]:.3f}")

        # Sample feedback from this cluster (first two entries, truncated)
        print("\nSample Feedback:")
        sample_feedback = cluster_docs['feedback_text'].head(2)
        for i, feedback in enumerate(sample_feedback, 1):
            print(f"  {i}. {feedback[:100]}...")

# Print the per-cluster report for the K-Means assignment, using the TF-IDF
# vocabulary and matrix the clusters were fitted on.
analyze_clusters(
    df=df_clean,
    cluster_col='kmeans_cluster',
    vectorizer=tfidf_vectorizer,
    X=X_tfidf,
    method_name="K-Means",
)

# =============================================================================
# 8. LDA TOPIC INTERPRETATION
# =============================================================================

print("\n🎯 LDA TOPIC INTERPRETATION")
print("="*50)

def display_topics(model, feature_names, no_top_words=10):
    """Print and return the highest-weighted words of every topic.

    Args:
        model: Fitted topic model exposing ``components_`` (one weight
            vector per topic).
        feature_names: Sequence mapping a column position to its term.
        no_top_words: How many of the heaviest terms to report per topic.

    Returns:
        List of dicts, one per topic, with keys 'topic_id', 'words' and
        'weights' (words and weights ordered from heaviest to lightest).
    """
    collected = []
    for topic_idx, topic in enumerate(model.components_):
        # Positions of the heaviest terms, heaviest first
        ranked_positions = topic.argsort()[-no_top_words:][::-1]
        ranked_terms = [(feature_names[pos], topic[pos]) for pos in ranked_positions]

        print(f"\n📋 TOPIC {topic_idx}")
        print("-" * 30)
        for word, weight in ranked_terms:
            print(f"  {word}: {weight:.3f}")

        collected.append({
            'topic_id': topic_idx,
            'words': [word for word, _ in ranked_terms],
            'weights': [weight for _, weight in ranked_terms],
        })

    return collected

topics = display_topics(lda, count_feature_names, no_top_words=8)

# Interpret topics based on top words
# Keyword rules are tried in order; the first rule sharing a word with the
# topic's top five words supplies the label.
label_rules = [
    (['equipment', 'machine', 'facility'], "Equipment/Facility Issues"),
    (['coach', 'instructor', 'teaching'], "Coach/Instruction Quality"),
    (['challenging', 'hard', 'intense', 'difficult'], "Workout Intensity"),
    (['music', 'energy', 'fun', 'amazing'], "Class Atmosphere"),
    (['beginner', 'advanced', 'modification'], "Difficulty Level"),
]

topic_labels = {}
print("\n🏷️ TOPIC LABELS (Interpretations)")
print("="*40)

for i, topic in enumerate(topics):
    words = topic['words'][:5]  # Top 5 words

    # Simple heuristic labeling based on common words; topics matching no
    # rule get a numbered generic label.
    label = next(
        (rule_label for keywords, rule_label in label_rules
         if any(word in keywords for word in words)),
        f"General Feedback {i+1}",
    )

    topic_labels[i] = label
    print(f"Topic {i}: {label}")

# Attach the human-readable label of each document's dominant topic
df_clean['topic_label'] = df_clean['lda_topic'].map(topic_labels)

# =============================================================================
# 9. VISUALIZATION
# =============================================================================

print("📊 CREATING VISUALIZATIONS")
print("="*50)

# 9.1 Cluster Distribution
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Three of the four panels are plain "documents per label" bar charts
count_panels = [
    # (axis, label column, panel title, x-axis label)
    (axes[0, 0], 'kmeans_cluster', 'K-Means Cluster Distribution', 'Cluster'),
    (axes[0, 1], 'dbscan_cluster', 'DBSCAN Cluster Distribution', 'Cluster'),
    (axes[1, 0], 'lda_topic', 'LDA Topic Distribution', 'Topic'),
]
for panel_ax, label_column, panel_title, x_label in count_panels:
    label_counts = df_clean[label_column].value_counts().sort_index()
    label_counts.plot(kind='bar', ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Count')

# Rating distribution by cluster (mean rating per K-Means cluster)
rating_ax = axes[1, 1]
mean_rating_per_cluster = df_clean.groupby('kmeans_cluster')['rating'].mean()
mean_rating_per_cluster.plot(kind='bar', ax=rating_ax)
rating_ax.set_title('Average Rating by K-Means Cluster')
rating_ax.set_xlabel('Cluster')
rating_ax.set_ylabel('Average Rating')

plt.tight_layout()
plt.show()

# 9.2 Word Clouds for each cluster
print("\n☁️ Generating Word Clouds")

def create_wordcloud(text_list, title):
    """Render a word cloud figure for the given texts.

    Args:
        text_list: Iterable of strings; they are joined with single spaces.
        title: Figure title.

    Returns:
        None. Nothing is drawn when the joined text is empty or whitespace.
    """
    combined_text = ' '.join(text_list)
    if not combined_text.strip():
        # Nothing to visualize
        return

    cloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap='viridis',
    ).generate(combined_text)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=16)
    plt.tight_layout()
    plt.show()

# Create word clouds for top K-means clusters (the three largest by size)
top_clusters = df_clean['kmeans_cluster'].value_counts().head(3).index

for cluster_id in top_clusters:
    in_cluster = df_clean['kmeans_cluster'] == cluster_id
    cluster_texts = df_clean.loc[in_cluster, 'cleaned_text'].tolist()
    create_wordcloud(cluster_texts, f'Word Cloud - K-Means Cluster {cluster_id}')

# 9.3 2D Visualization of Clusters using t-SNE
print("\n🗺️ Creating 2D Cluster Visualization")

# Reduce dimensions for visualization
# Perplexity is 30, lowered to (number of documents - 1) on smaller datasets.
tsne_perplexity = min(30, len(df_clean)-1)
tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity)
X_tsne = tsne.fit_transform(X_tfidf.toarray())

# Create interactive plot with Plotly, colored by K-Means cluster
cluster_as_text = df_clean['kmeans_cluster'].astype(str)
fig = px.scatter(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    color=cluster_as_text,
    title='t-SNE Visualization of Feedback Clusters',
    labels={'x': 't-SNE 1', 'y': 't-SNE 2', 'color': 'Cluster'},
    hover_data=[df_clean['rating'], df_clean['feedback_type']],
)
fig.show()

# 9.4 Topic Distribution Heatmap
print("\n🔥 Creating Topic-Feedback Type Heatmap")

# Create crosstab for heatmap: rows are feedback types, columns are LDA topics
crosstab = pd.crosstab(df_clean['feedback_type'], df_clean['lda_topic'])

plt.figure(figsize=(10, 6))
sns.heatmap(crosstab, annot=True, fmt='d', cmap='Blues')
plt.title('Feedback Type vs LDA Topic Distribution')
plt.xlabel('LDA Topic')
plt.ylabel('Feedback Type')
plt.tight_layout()
plt.show()

# =============================================================================
# 10. RESULTS SUMMARY AND EXPORT
# =============================================================================

print("📋 GENERATING RESULTS SUMMARY")
print("="*50)

# Create comprehensive results summary
results_summary = {
    'total_feedback': len(df_clean),
    'optimal_clusters': optimal_k,
    'clustering_methods': ['K-Means', 'DBSCAN', 'Hierarchical', 'LDA'],
    'best_silhouette_score': max(silhouette_scores),
    'topic_labels': topic_labels
}

print("🎯 CLUSTERING RESULTS SUMMARY")
print("="*40)
print(f"Total feedback analyzed: {results_summary['total_feedback']}")
print(f"Optimal number of clusters: {results_summary['optimal_clusters']}")
print(f"Best silhouette score: {results_summary['best_silhouette_score']:.3f}")

print(f"\n📊 CLUSTER DISTRIBUTION:")
for method in ['kmeans_cluster', 'dbscan_cluster', 'lda_topic']:
    print(f"\n{method.upper()}:")
    distribution = df_clean[method].value_counts().sort_index()
    for cluster, count in distribution.items():
        print(f"  Cluster {cluster}: {count} feedback entries")

print(f"\n🏷️ IDENTIFIED TOPICS:")
for topic_id, label in topic_labels.items():
    topic_rows = df_clean[df_clean['lda_topic'] == topic_id]
    count = len(topic_rows)
    avg_rating = topic_rows['rating'].mean()
    print(f"  Topic {topic_id} - {label}: {count} entries (avg rating: {avg_rating:.2f})")

# Export results to CSV
# NOTE(review): 'member_id' and 'sentiment_score' are not created anywhere in
# this script; presumably they are columns of feedback_data.csv. Confirm,
# otherwise this selection raises KeyError.
export_df = df_clean[[
    'member_id', 'feedback_type', 'rating', 'feedback_text',
    'cleaned_text', 'kmeans_cluster', 'dbscan_cluster',
    'lda_topic', 'topic_label', 'sentiment_score'
]].copy()

export_df.to_csv('feedback_clustering_results.csv', index=False)

# Export cluster summaries
cluster_summary = []
kmeans_labels = df_clean['kmeans_cluster'].to_numpy()
for cluster_id in sorted(df_clean['kmeans_cluster'].unique()):
    # Rows of X_tfidf are positional, while df_clean keeps the index labels of
    # the unfiltered frame. Select members by position so that the TF-IDF rows
    # and the DataFrame rows refer to the same documents.
    member_positions = np.flatnonzero(kmeans_labels == cluster_id)
    cluster_data = df_clean.iloc[member_positions]

    # Get top terms (five highest mean TF-IDF weights within the cluster)
    cluster_tfidf = X_tfidf[member_positions].toarray().mean(axis=0)
    top_indices = cluster_tfidf.argsort()[-5:][::-1]
    top_terms = [feature_names[i] for i in top_indices]

    cluster_summary.append({
        'cluster_id': cluster_id,
        'size': len(cluster_data),
        'avg_rating': cluster_data['rating'].mean(),
        'top_terms': ', '.join(top_terms),
        'dominant_feedback_type': cluster_data['feedback_type'].mode().iloc[0],
        'sample_feedback': cluster_data['feedback_text'].iloc[0][:200] + "..."
    })

cluster_summary_df = pd.DataFrame(cluster_summary)
cluster_summary_df.to_csv('cluster_summary.csv', index=False)

print(f"\n✅ EXPORT COMPLETE!")
print(f"📁 Files created:")
print(f"  • feedback_clustering_results.csv - Detailed results with cluster assignments")
print(f"  • cluster_summary.csv - Cluster summaries and characteristics")

# =============================================================================
# 11. BUSINESS INSIGHTS AND RECOMMENDATIONS
# =============================================================================

print("\n💡 BUSINESS INSIGHTS AND RECOMMENDATIONS")
print("="*60)

print("🔍 KEY FINDINGS:")

# Analyze ratings by cluster
print("\n1. RATING ANALYSIS BY CLUSTER:")
rating_by_cluster = df_clean.groupby('kmeans_cluster')['rating'].agg(['mean', 'count']).round(2)
for cluster_id, row in rating_by_cluster.iterrows():
    status = "HIGH" if row['mean'] >= 4 else "MEDIUM" if row['mean'] >= 3 else "LOW"
    # iterrows() yields each row as a single-dtype Series, which turns the
    # integer count into a float; cast back so the report reads "12", not "12.0".
    feedback_count = int(row['count'])
    print(f"   Cluster {cluster_id}: {row['mean']}/5 ({status} satisfaction) - {feedback_count} feedback")

# Analyze feedback types distribution
print("\n2. FEEDBACK TYPE PATTERNS:")
# Rows are clusters, columns are feedback types, cells are document counts
feedback_patterns = df_clean.groupby(['kmeans_cluster', 'feedback_type']).size().unstack(fill_value=0)
for cluster_id in feedback_patterns.index:
    dominant_type = feedback_patterns.loc[cluster_id].idxmax()
    print(f"   Cluster {cluster_id}: Primarily {dominant_type}")

# Topic insights
print("\n3. TOPIC INSIGHTS:")
for topic_id, label in topic_labels.items():
    topic_data = df_clean[df_clean['lda_topic'] == topic_id]
    avg_rating = topic_data['rating'].mean()
    # NOTE(review): 'sentiment_score' is not computed in this script;
    # presumably it is a column of the input CSV. Confirm.
    sentiment = topic_data['sentiment_score'].mean()

    status = "Positive" if avg_rating >= 4 else "Neutral" if avg_rating >= 3 else "Concerning"
    print(f"   {label}: {avg_rating:.2f}/5 rating, {sentiment:.2f} sentiment ({status})")

print(f"\n🎯 RECOMMENDATIONS:")
print("1. Focus improvement efforts on low-rating clusters")
print("2. Replicate success factors from high-rating clusters") 
print("3. Address specific issues identified in topic analysis")
print("4. Monitor sentiment trends for early warning signs")
print("5. Use cluster insights for personalized member communication")

print(f"\n✅ TOPIC CLUSTERING ANALYSIS COMPLETE!")
print(f"🚀 Ready for Phase 4: Visualization & Reporting")