# Phase 3 — Advanced Insights (Sentiment, Topics, Segmentation)

In [None]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

feedback = pd.read_csv('../data/feedback.csv')  # columns: feedback_id, member_id, text, created_at

# Normalise the free-text column once so that both analyses below see the same
# input: missing values become empty strings (astype(str) on its own would turn
# NaN into the literal "nan"), and any non-string cells are coerced to str,
# which the vectorizer requires because it lowercases every document.
texts = feedback['text'].fillna('').astype(str)

# Sentiment: keep VADER's 'compound' score for each feedback text.
sia = SentimentIntensityAnalyzer()
feedback['sentiment'] = texts.apply(lambda t: sia.polarity_scores(t)['compound'])

# Topic clustering (simple baseline): TF-IDF features grouped by k-means.
# random_state is fixed so the cluster labels are reproducible between runs.
vec = TfidfVectorizer(max_features=2000, stop_words='english')
X = vec.fit_transform(texts)
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
feedback['topic'] = kmeans.fit_predict(X)

# Last expression of the cell: preview the enriched columns.
feedback[['text','sentiment','topic']].head()


In [None]:

# Member segmentation example (frequency-based using attendance)
attendance = pd.read_csv('../data/attendance.csv')

# Unparseable timestamps become NaT instead of raising.
# NOTE(review): rows whose checkin_time is NaT are still counted as visits
# below — confirm whether they should be dropped first.
attendance['checkin_time'] = pd.to_datetime(attendance['checkin_time'], errors='coerce')

# Visits per member = number of attendance rows recorded for that member_id.
freq = attendance.groupby('member_id').size().rename('visits')

# Right-closed bins: 1 visit -> New, 2-4 -> Occasional, 5-8 -> Regular,
# 9 or more -> Power. The top edge is open-ended so that very heavy users are
# not left without a segment (NaN), which a finite cap such as 9999 would do.
bins = [0, 1, 4, 8, float('inf')]
labels = ['New', 'Occasional', 'Regular', 'Power']
segments = pd.cut(freq, bins=bins, labels=labels, right=True, include_lowest=True)

# Last expression of the cell: number of members in each segment.
segments.value_counts()
