# Student Feedback Analysis

This notebook demonstrates the analysis of student feedback using both supervised and unsupervised machine learning techniques.

## 1. Setup and Data Loading

In [None]:
# Import necessary libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add the src directory to the path so we can import our modules
sys.path.append('..')

# Set plot style
plt.style.use('ggplot')
%matplotlib inline

In [None]:
# Import project modules
from src.utils.data_generator import generate_sample_feedback
from src.preprocessing.text_processor import TextProcessor, extract_features
from src.models.supervised_models import SentimentClassifier, FeedbackCategorizer
from src.models.unsupervised_models import TopicModeler, FeedbackClusterer, DimensionalityReducer
from src.evaluation.metrics import evaluate_classification, evaluate_clustering, evaluate_topic_model
from src.visualization.visualizer import (
    plot_sentiment_distribution, plot_category_distribution,
    plot_wordcloud, plot_topic_wordcloud, plot_cluster_visualization,
    plot_confusion_matrix, plot_rating_distribution
)

In [None]:
# Load the sample feedback CSV if it already exists; otherwise generate it
# (the generator is given the same path as its output_path).
data_path = '../data/sample_feedback.csv'
if os.path.exists(data_path):
    print("Loading existing feedback data...")
    df = pd.read_csv(data_path)
else:
    print("Generating sample feedback data...")
    df = generate_sample_feedback(n_samples=1000, output_path=data_path)

print(f"Loaded {len(df)} feedback samples")
df.head()

## 2. Exploratory Data Analysis

In [None]:
# Basic statistics: shape, column names, and per-column missing-value counts
print("Dataset shape:", df.shape)

print("\nColumns:")
for col in df.columns.tolist():
    print(f"- {col}")

missing_per_column = df.isna().sum()
print("\nMissing values:")
print(missing_per_column)

In [None]:
# Plot how the ground-truth sentiment labels are distributed
sentiment_labels = df['true_sentiment']
plot_sentiment_distribution(sentiment_labels)

In [None]:
# Plot how the ground-truth category labels are distributed
category_labels = df['true_category']
plot_category_distribution(category_labels)

In [None]:
# Plot how the ratings are distributed
ratings = df['rating']
plot_rating_distribution(ratings)

In [None]:
# Box plot of rating for each sentiment label, drawn on an explicit axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='true_sentiment', y='rating', ax=ax)
ax.set_title('Rating Distribution by Sentiment')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Rating')
plt.show()

In [None]:
# Analyze relationship between subject and sentiment
from matplotlib.ticker import PercentFormatter

# Count feedback per (subject, sentiment), then normalise each subject row so
# its sentiment shares sum to 1.0.
sentiment_by_subject = pd.crosstab(df['subject'], df['true_sentiment'])
sentiment_by_subject_pct = sentiment_by_subject.div(sentiment_by_subject.sum(axis=1), axis=0)

# Draw on an explicit axes: DataFrame.plot opens its own figure unless `ax` is
# passed, which previously left the 12x8 figure blank and its size unused.
fig, ax = plt.subplots(figsize=(12, 8))
sentiment_by_subject_pct.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Sentiment Distribution by Subject')
ax.set_xlabel('Subject')
ax.set_ylabel('Percentage')
# The plotted values are fractions in [0, 1]; show ticks as percentages so the
# axis matches its label.
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))
ax.tick_params(axis='x', rotation=45)
ax.legend(title='Sentiment')
plt.show()

## 3. Text Preprocessing

In [None]:
# Configure the text processor: lowercase, strip punctuation and stopwords,
# lemmatize, and leave stemming off.
processor_options = dict(
    lowercase=True,
    remove_punctuation=True,
    remove_stopwords=True,
    lemmatize=True,
    stem=False,
)
text_processor = TextProcessor(**processor_options)

# Run preprocessing on the 'feedback_text' column and rebind df to the result.
# NOTE(review): later cells read df['processed_text'], so this call presumably
# adds that column — confirm against TextProcessor.preprocess_dataframe.
df = text_processor.preprocess_dataframe(df, 'feedback_text')

In [None]:
# Print the first five feedback texts next to their processed versions
preview = df[['feedback_text', 'processed_text']].head(5)
for i, (original, processed) in enumerate(preview.itertuples(index=False, name=None)):
    print(f"\nOriginal [{i+1}]: {original}")
    print(f"Processed [{i+1}]: {processed}")

In [None]:
# Join all processed feedback into one space-separated string and plot its word cloud
all_feedback_text = df['processed_text'].str.cat(sep=' ')
plot_wordcloud(all_feedback_text)

In [None]:
# Generate word clouds by sentiment
# WordCloud is not imported by the setup cells at the top of this notebook, so
# import it here; otherwise this cell raises NameError on a fresh kernel.
from wordcloud import WordCloud

sentiments = ['Positive', 'Neutral', 'Negative']
fig, axes = plt.subplots(1, len(sentiments), figsize=(15, 12))

for ax, sentiment in zip(axes, sentiments):
    # Concatenate the processed text of every row carrying this sentiment label
    sentiment_text = df.loc[df['true_sentiment'] == sentiment, 'processed_text'].str.cat(sep=' ')
    ax.axis('off')
    ax.set_title(f'{sentiment} Feedback')

    if not sentiment_text.strip():
        # WordCloud.generate raises when given no words, so show a placeholder
        # panel instead of failing the whole cell.
        ax.text(0.5, 0.5, 'No feedback', ha='center', va='center', transform=ax.transAxes)
        continue

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(sentiment_text)
    ax.imshow(wordcloud, interpolation='bilinear')

plt.tight_layout()
plt.show()