# Unsupervised Learning for Student Feedback Analysis

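This notebook applies unsupervised learning techniques to student feedback text: topic modeling (LDA and NMF), dimensionality reduction (SVD), and clustering (K-means and hierarchical). The resulting topics and clusters are then compared against the dataset's sentiment and category labels.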

## 1. Setup and Data Loading

In [None]:
# Import necessary libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Add the src directory to the path so we can import our modules
sys.path.append('..')

# Set plot style
plt.style.use('ggplot')
%matplotlib inline

In [None]:
# Import project modules
from src.utils.data_generator import generate_sample_feedback
from src.preprocessing.text_processor import TextProcessor, extract_features
from src.models.unsupervised_models import TopicModeler, FeedbackClusterer, DimensionalityReducer
from src.evaluation.metrics import evaluate_clustering, evaluate_topic_model
from src.visualization.visualizer import plot_topic_wordcloud, plot_cluster_visualization

In [None]:
# Load the feedback dataset; on the first run the CSV does not exist yet,
# so the sample generator is asked to create it at the same path.
data_path = '../data/sample_feedback.csv'
data_already_on_disk = os.path.exists(data_path)

if data_already_on_disk:
    print("Loading existing feedback data...")
    df = pd.read_csv(data_path)
else:
    print("Generating sample feedback data...")
    df = generate_sample_feedback(n_samples=1000, output_path=data_path)

n_loaded = len(df)
print(f"Loaded {n_loaded} feedback samples")
df.head()

## 2. Text Preprocessing and Feature Extraction

In [None]:
# Run the text processor only when the frame has no 'processed_text' column.
# NOTE(review): assumes preprocess_dataframe adds that column, since the
# feature extraction below reads it - confirm in TextProcessor.
needs_preprocessing = 'processed_text' not in df.columns
if needs_preprocessing:
    processor_options = {
        'remove_stopwords': True,
        'remove_punctuation': True,
        'lemmatize': True,
        'stem': False,
        'lowercase': True,
    }
    text_processor = TextProcessor(**processor_options)
    df = text_processor.preprocess_dataframe(df, 'feedback_text')

# Vectorise the processed text, requesting the 'tfidf' method with
# ngram_range (1, 2) and max_features 5000.
tfidf_options = dict(method='tfidf', max_features=5000, ngram_range=(1, 2))
feature_matrix, vectorizer = extract_features(df['processed_text'], **tfidf_options)

# Report how many feature names the fitted vectorizer exposes
feature_names = vectorizer.get_feature_names_out()
n_features = len(feature_names)
print(f"Extracted {n_features} features")

## 3. Topic Modeling

In [None]:
# Fit a TopicModeler configured for LDA with 5 topics on the processed text
print("Performing LDA topic modeling...")
lda_topic_modeler = TopicModeler(n_topics=5, method='lda')
lda_topic_modeler.fit(df['processed_text'])

# Ask for 10 top words per topic and list them, numbering topics from 1
lda_top_words = lda_topic_modeler.get_top_words_per_topic(n_words=10)
print("\nTop words for each LDA topic:")
for topic_number, topic_words in enumerate(lda_top_words, start=1):
    joined_words = ', '.join(topic_words)
    print(f"Topic {topic_number}: {joined_words}")

In [None]:
# Draw the LDA topics as word clouds, titled "Topic 1" ... "Topic N"
lda_topic_titles = [f'Topic {number}' for number in range(1, len(lda_top_words) + 1)]
plot_topic_wordcloud(lda_top_words, topic_names=lda_topic_titles)

In [None]:
# Ask the fitted LDA model for the per-document topic distribution
topic_distribution = lda_topic_modeler.get_topic_distribution(df['processed_text'])

# Build a copy of the feedback frame carrying each document's dominant topic.
# NOTE(review): if 'Dominant_Topic' is a pandas Series it is aligned on index
# with df - presumably both use a default RangeIndex; verify.
dominant_topic_values = topic_distribution['Dominant_Topic']
df_with_topics = df.assign(dominant_topic=dominant_topic_values)

# Preview the first ten feedback texts next to their dominant topic
print("Sample feedback with dominant topics:")
preview_columns = ['feedback_text', 'dominant_topic']
sample_with_topics = df_with_topics.loc[:, preview_columns].head(10)
sample_with_topics

In [None]:
# Fit a second TopicModeler, this time configured for NMF with 5 topics,
# so its topics can be compared with the LDA ones.
print("Performing NMF topic modeling...")
nmf_topic_modeler = TopicModeler(n_topics=5, method='nmf')
nmf_topic_modeler.fit(df['processed_text'])

# Ask for 10 top words per topic and list them, numbering topics from 1
nmf_top_words = nmf_topic_modeler.get_top_words_per_topic(n_words=10)
print("\nTop words for each NMF topic:")
for topic_number, topic_words in enumerate(nmf_top_words, start=1):
    joined_words = ', '.join(topic_words)
    print(f"Topic {topic_number}: {joined_words}")

# Draw the NMF topics as word clouds, titled "Topic 1" ... "Topic N"
nmf_topic_titles = [f'Topic {number}' for number in range(1, len(nmf_top_words) + 1)]
plot_topic_wordcloud(nmf_top_words, topic_names=nmf_topic_titles)

## 4. Clustering Analysis

In [None]:
# Project the feature matrix down to 50 components with the 'svd' reducer
# before handing it to the clustering cells.
print("Reducing dimensionality for clustering...")
reducer = DimensionalityReducer(n_components=50, method='svd')
reduced_features = reducer.fit_transform(feature_matrix)

reduced_shape = reduced_features.shape
print(f"Reduced features shape: {reduced_shape}")

In [None]:
# Fit a FeedbackClusterer configured for K-means with 5 clusters
print("Performing K-means clustering...")
kmeans_clusterer = FeedbackClusterer(n_clusters=5, method='kmeans')
kmeans_clusterer.fit(reduced_features)

# Report the silhouette score and cluster sizes returned by evaluate()
kmeans_results = kmeans_clusterer.evaluate(reduced_features)
kmeans_silhouette = kmeans_results['silhouette_score']
print(f"Silhouette Score: {kmeans_silhouette:.4f}")
kmeans_cluster_sizes = kmeans_results['cluster_sizes']
print("Cluster sizes:", kmeans_cluster_sizes)

In [None]:
# Read the per-sample labels off the fitted K-means model and plot the
# reduced features coloured by those labels, using the 'tsne' method.
fitted_kmeans_model = kmeans_clusterer.model
kmeans_labels = fitted_kmeans_model.labels_
plot_cluster_visualization(reduced_features, kmeans_labels, method='tsne')

In [None]:
# Attach each sample's K-means label to a copy of the feedback frame
df_with_clusters = df.copy()
df_with_clusters['cluster'] = kmeans_labels

# Print up to three example comments per cluster. The loop iterates over the
# labels actually present (groupby yields them in sorted order) instead of a
# hard-coded range(5), so this cell stays correct if the number of clusters
# is changed in the clustering cell.
print("Analyzing clusters...")
feedback_by_cluster = df_with_clusters.groupby('cluster')['feedback_text']
for cluster_id, cluster_feedback in feedback_by_cluster:
    print(f"\nCluster {cluster_id} ({len(cluster_feedback)} samples):")
    print("Sample feedback:")
    for feedback in cluster_feedback.head(3):
        print(f"- {feedback}")

In [None]:
# Fit a second FeedbackClusterer, configured for hierarchical clustering with
# 5 clusters, on the same reduced features for comparison with K-means.
print("Performing hierarchical clustering...")
hierarchical_clusterer = FeedbackClusterer(n_clusters=5, method='hierarchical')
hierarchical_clusterer.fit(reduced_features)

# Report the silhouette score and cluster sizes returned by evaluate()
hierarchical_results = hierarchical_clusterer.evaluate(reduced_features)
hierarchical_silhouette = hierarchical_results['silhouette_score']
print(f"Silhouette Score: {hierarchical_silhouette:.4f}")
hierarchical_cluster_sizes = hierarchical_results['cluster_sizes']
print("Cluster sizes:", hierarchical_cluster_sizes)

# Plot the reduced features coloured by the hierarchical labels ('tsne' method)
fitted_hierarchical_model = hierarchical_clusterer.model
hierarchical_labels = fitted_hierarchical_model.labels_
plot_cluster_visualization(reduced_features, hierarchical_labels, method='tsne')

## 5. Combining Supervised and Unsupervised Insights

In [None]:
# Combine cluster labels and dominant topics with the sentiment and category
# columns already present in df.
# NOTE(review): if 'Dominant_Topic' is a pandas Series it is aligned on index
# with df - presumably both use a default RangeIndex; verify.
combined_df = df.copy()
combined_df['cluster'] = kmeans_labels
combined_df['dominant_topic'] = topic_distribution['Dominant_Topic']

# Cross-tabulate clusters against sentiment, then normalise each row and
# scale to 0-100 so the values really are percentages, as the axis label says.
cluster_sentiment = pd.crosstab(combined_df['cluster'], combined_df['true_sentiment'])
cluster_sentiment_pct = cluster_sentiment.div(cluster_sentiment.sum(axis=1), axis=0).mul(100)

# Create the axes explicitly and hand them to pandas. Without `ax=`,
# DataFrame.plot opens its own figure, so a preceding plt.figure(figsize=...)
# would only produce an empty extra figure and the requested size is ignored.
fig, ax = plt.subplots(figsize=(12, 8))
cluster_sentiment_pct.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Sentiment Distribution by Cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('Percentage')
ax.legend(title='Sentiment')
plt.show()

In [None]:
# Cross-tabulate dominant topics against sentiment, then normalise each row
# and scale to 0-100 so the values really are percentages, as the axis label
# says.
topic_sentiment = pd.crosstab(combined_df['dominant_topic'], combined_df['true_sentiment'])
topic_sentiment_pct = topic_sentiment.div(topic_sentiment.sum(axis=1), axis=0).mul(100)

# Create the axes explicitly and hand them to pandas. Without `ax=`,
# DataFrame.plot opens its own figure, so a preceding plt.figure(figsize=...)
# would only produce an empty extra figure and the requested size is ignored.
fig, ax = plt.subplots(figsize=(12, 8))
topic_sentiment_pct.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Sentiment Distribution by Topic')
ax.set_xlabel('Topic')
ax.set_ylabel('Percentage')
ax.legend(title='Sentiment')
plt.show()

In [None]:
# Cross-tabulate dominant topics against categories, then normalise each row
# and scale to 0-100 so the values really are percentages, as the axis label
# says.
topic_category = pd.crosstab(combined_df['dominant_topic'], combined_df['true_category'])
topic_category_pct = topic_category.div(topic_category.sum(axis=1), axis=0).mul(100)

# Create the axes explicitly and hand them to pandas. Without `ax=`,
# DataFrame.plot opens its own figure, so a preceding plt.figure(figsize=...)
# would only produce an empty extra figure and the requested size is ignored.
fig, ax = plt.subplots(figsize=(14, 10))
topic_category_pct.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Category Distribution by Topic')
ax.set_xlabel('Topic')
ax.set_ylabel('Percentage')
# Anchor the legend outside the axes so it does not cover the bars
ax.legend(title='Category', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

## 6. Conclusion and Insights

### Key Findings:

1. **Topic Modeling**: We extracted 5 topics from the student feedback using both LDA and NMF methods (the number of topics was fixed in advance via `n_topics=5`, not discovered from the data). These topics represent different aspects of the educational experience that students comment on.

2. **Clustering**: K-means and hierarchical clustering grouped similar feedback together, revealing patterns that might not be immediately obvious from manual inspection.

3. **Combined Analysis**: By combining the dataset's ground-truth labels (sentiment, categories) with unsupervised results (clusters, topics), we gained deeper insights into the feedback data:
   - Some topics are more associated with positive sentiment than others
   - Certain clusters contain feedback predominantly from specific categories
   - The relationship between topics and categories helps validate our topic modeling results

### Applications:

1. **Automated Feedback Categorization**: The models can be used to automatically categorize new feedback
2. **Sentiment Tracking**: Track sentiment trends over time for different courses or subjects
3. **Topic Discovery**: Identify emerging topics or issues in student feedback
4. **Targeted Improvements**: Focus improvement efforts on areas receiving negative feedback

### Next Steps:

1. **Model Refinement**: Fine-tune models with more data and parameter optimization
2. **Interactive Dashboard**: Develop a dashboard for real-time feedback analysis
3. **Temporal Analysis**: Analyze how feedback changes over academic terms
4. **Integration**: Integrate with existing educational systems for automated feedback processing