In [None]:
# Fitness Club Feedback Sentiment Analysis
# Complete analysis for Google Colab

# ============================================================================
# SETUP AND IMPORTS
# ============================================================================

# Install required packages
!pip install textblob vaderSentiment wordcloud seaborn plotly

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Text processing and sentiment analysis
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
from collections import Counter
from wordcloud import WordCloud

# Interactive plotting
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Set style
# The matplotlib style sheet is applied first and the seaborn palette second;
# the two calls are kept in this order.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release — confirm against the runtime in use.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirmation message printed once the statements above have run.
print("✅ All packages imported successfully!")

# ============================================================================
# DATA LOADING AND PREPROCESSING
# ============================================================================

# Load the data
df = pd.read_csv('feedback_data.csv')

# Convert timestamp to datetime
# Parsing happens before any reporting so the date range printed below is a
# chronological min/max rather than a lexicographic comparison of raw strings.
df['timestamp'] = pd.to_datetime(df['timestamp'])

print("📊 Dataset Overview:")
print(f"Shape: {df.shape}")
print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
print(f"\nFeedback types: {df['feedback_type'].unique()}")
print(f"\nRating distribution:\n{df['rating'].value_counts().sort_index()}")

# Calendar features derived from the parsed timestamp
df['date'] = df['timestamp'].dt.date
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.day_name()

# Create sentiment categories based on existing sentiment_score
def categorize_sentiment(score):
    """Map a sentiment score to a coarse label.

    Args:
        score: Numeric sentiment score (presumably on the 0-1 scale of the
            ``sentiment_score`` column — confirm against the data source).

    Returns:
        'Positive' for scores >= 0.7, 'Neutral' for scores >= 0.3 and below
        0.7, 'Negative' for anything lower, or ``None`` when the score is
        missing (``None``/NaN) so unknown scores are not counted as negative.
    """
    # NaN fails every >= comparison, so without this guard a missing score
    # would fall through to the last branch and be labelled 'Negative'.
    if pd.isna(score):
        return None
    if score >= 0.7:
        return 'Positive'
    if score >= 0.3:
        return 'Neutral'
    return 'Negative'

# Label every row from its stored sentiment score
df['sentiment_category'] = df['sentiment_score'].map(categorize_sentiment)

# Heading and per-label counts, one per line
print("\n🎯 Sentiment Distribution:", df['sentiment_category'].value_counts(), sep="\n")

# ============================================================================
# SENTIMENT VALIDATION WITH MULTIPLE METHODS
# ============================================================================

# Initialize sentiment analyzers
# Single module-level VADER analyzer instance; get_vader_sentiment reads it.
analyzer = SentimentIntensityAnalyzer()

def get_textblob_sentiment(text):
    """Get sentiment using TextBlob.

    Args:
        text: Feedback text to score.

    Returns:
        TextBlob's polarity for ``text``, or NaN when ``text`` is not a
        string (for example a missing CSV cell, which pandas loads as NaN).
    """
    # A non-string value would raise inside TextBlob and abort the whole
    # column-wise apply; report it as a missing score instead.
    if not isinstance(text, str):
        return float('nan')
    return TextBlob(text).sentiment.polarity

def get_vader_sentiment(text):
    """Get sentiment using VADER.

    Args:
        text: Feedback text to score.

    Returns:
        The ``'compound'`` entry of the analyzer's polarity scores, or NaN
        when ``text`` is not a string (for example a missing CSV cell).
    """
    # Missing feedback is reported as a missing score rather than being
    # handed to the analyzer, which expects text.
    if not isinstance(text, str):
        return float('nan')
    return analyzer.polarity_scores(text)['compound']

def normalize_sentiment(score, method='linear'):
    """Normalize sentiment scores to 0-1 range.

    Args:
        score: Raw sentiment score.
        method: 'linear' computes ``(score + 1) / 2``; 'sigmoid' computes
            the logistic function ``1 / (1 + exp(-score))``.

    Returns:
        The normalized score.

    Raises:
        ValueError: If ``method`` is neither 'linear' nor 'sigmoid'.
    """
    if method == 'linear':
        return (score + 1) / 2
    if method == 'sigmoid':
        return 1 / (1 + np.exp(-score))
    # Fail loudly: falling through would return None and silently produce
    # an all-missing normalized column.
    raise ValueError(f"Unknown normalization method: {method!r}")

# Score every feedback text with both libraries
print("🔍 Applying multiple sentiment analysis methods...")

df['textblob_sentiment'] = df['feedback_text'].map(get_textblob_sentiment)
df['vader_sentiment'] = df['feedback_text'].map(get_vader_sentiment)

# Rescale both raw scores using normalize_sentiment's default method
df['textblob_normalized'] = df['textblob_sentiment'].map(normalize_sentiment)
df['vader_normalized'] = df['vader_sentiment'].map(normalize_sentiment)

# Pairwise correlation of the stored score against the two computed ones
sentiment_comparison = df.loc[
    :, ['sentiment_score', 'textblob_normalized', 'vader_normalized']
].corr()
print("\n📈 Correlation between sentiment methods:", sentiment_comparison.round(3), sep="\n")

# ============================================================================
# COMPREHENSIVE SENTIMENT ANALYSIS
# ============================================================================

def create_sentiment_dashboard():
    """Create comprehensive sentiment analysis dashboard.

    Reads the module-level ``df`` and builds a 3x2 Plotly subplot grid whose
    middle row is a single full-width panel:

    1. bar chart of sentiment category counts,
    2. scatter of rating against sentiment score,
    3. line chart of mean sentiment score per date,
    4. horizontal bar chart of mean sentiment score per feedback type,
    5. bar chart of mean sentiment score per coach (left empty when ``df``
       has no ``coach_id`` column or no coach-tagged rows).

    Returns:
        The assembled Plotly figure.
    """
    # One title per populated grid cell, in row-major order. The None cell
    # in specs does not take a title, so five panels get exactly five titles.
    fig = make_subplots(
        rows=3, cols=2,
        subplot_titles=[
            'Sentiment Distribution by Category',
            'Rating vs Sentiment Score',
            'Sentiment Trends Over Time',
            'Sentiment by Feedback Type',
            'Sentiment by Coach',
        ],
        specs=[[{"type": "bar"}, {"type": "scatter"}],
               [{"colspan": 2}, None],
               [{"type": "bar"}, {"type": "bar"}]]
    )

    # 1. Sentiment distribution
    # Colours are looked up by category name: value_counts() orders by
    # frequency, so positional colours would attach to whichever label
    # happens to be most common.
    category_colors = {
        'Negative': '#ff6b6b',
        'Neutral': '#ffd93d',
        'Positive': '#6bcf7f',
    }
    sentiment_counts = (
        df['sentiment_category']
        .value_counts()
        .reindex(list(category_colors), fill_value=0)
    )
    fig.add_trace(
        go.Bar(x=sentiment_counts.index, y=sentiment_counts.values,
               marker_color=[category_colors[label] for label in sentiment_counts.index]),
        row=1, col=1
    )

    # 2. Rating vs Sentiment
    fig.add_trace(
        go.Scatter(x=df['rating'], y=df['sentiment_score'],
                   mode='markers', opacity=0.6,
                   marker=dict(size=8, color=df['sentiment_score'],
                               colorscale='RdYlGn', showscale=True)),
        row=1, col=2
    )

    # 3. Time trends
    daily_sentiment = df.groupby('date')['sentiment_score'].mean().reset_index()
    fig.add_trace(
        go.Scatter(x=daily_sentiment['date'], y=daily_sentiment['sentiment_score'],
                   mode='lines+markers', name='Daily Avg Sentiment'),
        row=2, col=1
    )

    # 4. Sentiment by feedback type
    type_sentiment = df.groupby('feedback_type')['sentiment_score'].mean().sort_values(ascending=True)
    fig.add_trace(
        go.Bar(x=type_sentiment.values, y=type_sentiment.index, orientation='h',
               marker_color='lightblue'),
        row=3, col=1
    )

    # 5. Sentiment by coach
    # Only drawn when coach-tagged feedback exists.
    if 'coach_id' in df.columns and df['coach_id'].notna().any():
        coach_sentiment = (
            df[df['coach_id'].notna()]
            .groupby('coach_id')['sentiment_score']
            .mean()
            .sort_values(ascending=True)
        )
        fig.add_trace(
            # IDs are cast to str so they are plotted as discrete labels.
            go.Bar(x=coach_sentiment.index.astype(str), y=coach_sentiment.values,
                   marker_color='lightblue'),
            row=3, col=2
        )

    # Update layout
    fig.update_layout(height=1200, showlegend=False,
                      title_text="🎯 Comprehensive Sentiment Analysis Dashboard")

    return fig

# Create and display dashboard
# The figure is kept in `dashboard` and displayed through its show() method.
dashboard = create_sentiment_dashboard()
dashboard.show()

# ============================================================================
# DETAILED ANALYSIS BY CATEGORIES
# ============================================================================

def _summarize_sentiment(frame, key, with_rating=True):
    """Return rounded mean/std/count of sentiment_score (plus mean rating) per ``key``."""
    spec = {'sentiment_score': ['mean', 'std', 'count']}
    if with_rating:
        spec['rating'] = 'mean'
    return frame.groupby(key).agg(spec).round(3)


def _print_optional_summary(column):
    """Print the per-``column`` summary of ``df``, or a placeholder when there is nothing to group."""
    if column not in df.columns:
        print(f"  (no '{column}' column in the data)")
        return
    tagged = df[df[column].notna()]
    if tagged.empty:
        print(f"  (no feedback rows have a {column})")
        return
    print(_summarize_sentiment(tagged, column))


def analyze_sentiment_by_category():
    """Detailed sentiment analysis by different categories.

    Prints five breakdowns of the module-level ``df``: by feedback type, by
    rating, by coach, by day of week and by package. The coach and package
    sections are optional: if the identifier column is absent or entirely
    empty, a placeholder line is printed instead of raising or leaving a
    bare heading. Returns nothing.
    """
    print("=" * 60)
    print("🔍 DETAILED SENTIMENT ANALYSIS")
    print("=" * 60)

    # 1. By Feedback Type
    print("\n1️⃣ SENTIMENT BY FEEDBACK TYPE:")
    print(_summarize_sentiment(df, 'feedback_type'))

    # 2. By Rating (rating is the grouping key, so it is not aggregated)
    print("\n2️⃣ SENTIMENT BY RATING:")
    print(_summarize_sentiment(df, 'rating', with_rating=False))

    # 3. By Coach (where applicable)
    print("\n3️⃣ SENTIMENT BY COACH:")
    _print_optional_summary('coach_id')

    # 4. Temporal Analysis
    print("\n4️⃣ TEMPORAL SENTIMENT PATTERNS:")
    temporal_analysis = df.groupby('day_of_week')['sentiment_score'].mean().sort_values(ascending=False)
    print("Average sentiment by day of week:")
    print(temporal_analysis.round(3))

    # 5. Package Analysis
    print("\n5️⃣ SENTIMENT BY PACKAGE:")
    _print_optional_summary('package_id')

# Run the category breakdown; the report is written to stdout.
analyze_sentiment_by_category()

# ============================================================================
# TEXT ANALYSIS AND INSIGHTS
# ============================================================================

def extract_keywords_by_sentiment():
    """Report the most frequent words in positive and in negative feedback.

    For each of the 'Positive' and 'Negative' sentiment categories in the
    module-level ``df``, the feedback texts are joined, lower-cased, stripped
    of every character that is not a letter or whitespace, and split into
    words. Stop words and words of two characters or fewer are discarded,
    and the 15 most common remaining words are printed.

    Returns:
        A ``(positive_words, negative_words)`` tuple; each element is a list
        of ``(word, count)`` pairs.
    """
    stop_words = {
        'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of',
        'with', 'by', 'a', 'an', 'is', 'was', 'are', 'were', 'be', 'been',
        'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
        'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these',
        'those', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
        'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
        'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
        'they', 'them', 'their', 'theirs', 'themselves',
    }
    non_letters = re.compile(r'[^a-zA-Z\s]')

    def top_terms(label, limit=15):
        # Join all texts of one category, normalise, then count surviving words
        corpus = df.loc[df['sentiment_category'] == label, 'feedback_text'].str.cat(sep=' ')
        tokens = non_letters.sub('', corpus.lower()).split()
        kept = (tok for tok in tokens if len(tok) > 2 and tok not in stop_words)
        return Counter(kept).most_common(limit)

    positive_words = top_terms('Positive')
    negative_words = top_terms('Negative')

    print("\n🎯 KEYWORD ANALYSIS:")
    sections = (
        ("\n✅ Most common words in POSITIVE feedback:", positive_words),
        ("\n❌ Most common words in NEGATIVE feedback:", negative_words),
    )
    for heading, ranked in sections:
        print(heading)
        for word, count in ranked:
            print(f"  • {word}: {count}")

    return positive_words, negative_words

# Keep both ranked (word, count) lists at module level for later inspection.
positive_keywords, negative_keywords = extract_keywords_by_sentiment()

# ============================================================================
# SENTIMENT TREND ANALYSIS
# ============================================================================

def analyze_sentiment_trends():
    """Summarise and plot how sentiment varies over time.

    Aggregates the module-level ``df`` per date, per weekday and per hour,
    then draws a 2x2 matplotlib figure: daily line, weekday bars, sentiment
    category pie and hourly line.

    Returns:
        A ``(daily_trends, weekly_patterns, hourly_patterns)`` tuple.
        ``daily_trends`` is a DataFrame of mean sentiment, mean rating and
        feedback count per date; the other two are Series of mean sentiment,
        with ``weekly_patterns`` ordered Monday through Sunday.
    """
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    # Per-date averages plus the number of feedback entries on that date
    daily_trends = df.groupby('date').agg(
        sentiment_score=('sentiment_score', 'mean'),
        rating=('rating', 'mean'),
        feedback_count=('feedback_text', 'count'),
    )

    # Weekday means, put into calendar order
    by_weekday = df.groupby('day_of_week')['sentiment_score'].mean()
    weekly_patterns = by_weekday.reindex(day_order)

    # Mean sentiment for each hour of the day
    hourly_patterns = df.groupby('hour')['sentiment_score'].mean()

    _, axes = plt.subplots(2, 2, figsize=(15, 12))
    (ax_daily, ax_weekly), (ax_share, ax_hourly) = axes

    # Top-left: daily line
    ax_daily.plot(daily_trends.index, daily_trends['sentiment_score'], marker='o')
    ax_daily.set(title='Daily Sentiment Trends', xlabel='Date', ylabel='Average Sentiment Score')
    ax_daily.tick_params(axis='x', rotation=45)

    # Top-right: weekday bars (labels are set after pandas has drawn the plot)
    weekly_patterns.plot.bar(ax=ax_weekly)
    ax_weekly.set(title='Weekly Sentiment Patterns', xlabel='Day of Week',
                  ylabel='Average Sentiment Score')
    ax_weekly.tick_params(axis='x', rotation=45)

    # Bottom-left: share of each sentiment category
    df['sentiment_category'].value_counts().plot.pie(ax=ax_share, autopct='%1.1f%%')
    ax_share.set_title('Sentiment Distribution')

    # Bottom-right: hourly line
    hourly_patterns.plot.line(ax=ax_hourly, marker='o')
    ax_hourly.set(title='Hourly Sentiment Patterns', xlabel='Hour of Day',
                  ylabel='Average Sentiment Score')

    plt.tight_layout()
    plt.show()

    return daily_trends, weekly_patterns, hourly_patterns

# Module-level results of the trend analysis; generate_insights reads
# `weekly_patterns` from here.
daily_trends, weekly_patterns, hourly_patterns = analyze_sentiment_trends()

# ============================================================================
# ACTIONABLE INSIGHTS AND RECOMMENDATIONS
# ============================================================================

def generate_insights():
    """Generate actionable insights from sentiment analysis.

    Prints four sections based on the module-level ``df``: overall sentiment
    health, areas needing attention, recommendations and success highlights.
    Every group average is computed once inside this function, so the report
    does not depend on globals produced by other analysis steps. Groups with
    no usable data are skipped. Returns nothing.
    """
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday']

    print("\n" + "=" * 60)
    print("🎯 ACTIONABLE INSIGHTS & RECOMMENDATIONS")
    print("=" * 60)

    # Overall sentiment health
    overall_sentiment = df['sentiment_score'].mean()
    positive_rate = (df['sentiment_category'] == 'Positive').mean() * 100
    negative_rate = (df['sentiment_category'] == 'Negative').mean() * 100

    # Group means shared by the "attention" and "success" sections.
    # dropna() removes groups without any score so idxmin/idxmax stay valid.
    type_sentiment = df.groupby('feedback_type')['sentiment_score'].mean().dropna()
    if 'coach_id' in df.columns:
        coach_sentiment = (
            df[df['coach_id'].notna()]
            .groupby('coach_id')['sentiment_score']
            .mean()
            .dropna()
        )
    else:
        coach_sentiment = pd.Series(dtype=float)
    # Calendar order decides which day wins when two days tie.
    day_sentiment = (
        df.groupby('day_of_week')['sentiment_score']
        .mean()
        .reindex(weekday_order)
        .dropna()
    )

    print("\n📊 OVERALL SENTIMENT HEALTH:")
    print(f"  • Average sentiment score: {overall_sentiment:.3f}")
    print(f"  • Positive feedback rate: {positive_rate:.1f}%")
    print(f"  • Negative feedback rate: {negative_rate:.1f}%")

    # Identify problem areas
    print("\n🚨 AREAS NEEDING ATTENTION:")

    # The two feedback types with the lowest average sentiment
    for feedback_type, score in type_sentiment.sort_values().head(2).items():
        print(f"  • {feedback_type}: {score:.3f} average sentiment")

    # Lowest-scoring coach
    if not coach_sentiment.empty:
        worst_coach = coach_sentiment.idxmin()
        print(f"  • Coach {worst_coach}: {coach_sentiment[worst_coach]:.3f} average sentiment")

    # Lowest-scoring weekday
    if not day_sentiment.empty:
        worst_day = day_sentiment.idxmin()
        print(f"  • {worst_day}: {day_sentiment[worst_day]:.3f} (lowest sentiment day)")

    # Recommendations
    print("\n💡 RECOMMENDATIONS:")

    if overall_sentiment < 0.6:
        print("  • Overall sentiment is concerning - conduct comprehensive review")

    if negative_rate > 20:
        print("  • High negative feedback rate - implement immediate action plan")

    print("  • Focus on addressing recurring issues mentioned in negative feedback")
    print("  • Enhance training for underperforming coaches")
    print("  • Consider adjusting class schedules based on sentiment patterns")
    print("  • Implement follow-up system for negative feedback")

    # Success stories
    print("\n🎉 SUCCESS HIGHLIGHTS:")

    if not type_sentiment.empty:
        best_sentiment_type = type_sentiment.idxmax()
        print(f"  • Best performing area: {best_sentiment_type} "
              f"({type_sentiment[best_sentiment_type]:.3f})")

    if not coach_sentiment.empty:
        best_coach = coach_sentiment.idxmax()
        print(f"  • Top performing coach: Coach {best_coach} "
              f"({coach_sentiment[best_coach]:.3f})")

    if not day_sentiment.empty:
        best_day = day_sentiment.idxmax()
        print(f"  • Best day for satisfaction: {best_day} ({day_sentiment[best_day]:.3f})")

# Print the insights report to stdout.
generate_insights()

# ============================================================================
# PREDICTIVE MODELING (BONUS)
# ============================================================================

def create_sentiment_predictor():
    """Fit and evaluate a random-forest regressor for ``sentiment_score``.

    Features are one-hot encodings of ``feedback_type`` and ``day_of_week``
    plus the ``rating`` and ``hour`` columns of the module-level ``df``.
    Rows with a missing feature or target are dropped, 20% of the remaining
    rows are held out for evaluation, and the mean absolute error, R² score
    and five most important features are printed.

    Returns:
        A ``(model, feature_importance)`` tuple: the fitted regressor and a
        DataFrame of features sorted by descending importance.

    Raises:
        ImportError: If scikit-learn is not installed.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_absolute_error, r2_score

    banner = "=" * 60
    print("\n" + banner)
    print("🤖 SENTIMENT PREDICTION MODEL")
    print(banner)

    # One-hot encode the categorical columns on a copy of the data
    feature_df = pd.get_dummies(df.copy(), columns=['feedback_type', 'day_of_week'])

    # Dummy columns first (in frame order), then the two numeric columns
    dummy_prefixes = ('feedback_type_', 'day_of_week_')
    feature_cols = [name for name in feature_df.columns if name.startswith(dummy_prefixes)]
    feature_cols += ['rating', 'hour']

    # Keep only complete rows
    model_df = feature_df[feature_cols + ['sentiment_score']].dropna()
    X, y = model_df[feature_cols], model_df['sentiment_score']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # fit() returns the estimator itself, so it can be chained
    rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    print("Model Performance:")
    print(f"  • Mean Absolute Error: {mean_absolute_error(y_test, y_pred):.4f}")
    print(f"  • R² Score: {r2_score(y_test, y_pred):.4f}")

    feature_importance = pd.DataFrame(
        {'feature': feature_cols, 'importance': rf.feature_importances_}
    ).sort_values('importance', ascending=False)

    print("\nTop 5 Most Important Features:")
    for record in feature_importance.head().itertuples(index=False):
        print(f"  • {record.feature}: {record.importance:.4f}")

    return rf, feature_importance

try:
    !pip install scikit-learn
    model, feature_importance = create_sentiment_predictor()
except ImportError:
    print("Scikit-learn not available. Install with: !pip install scikit-learn")

# ============================================================================
# EXPORT RESULTS
# ============================================================================

def export_results():
    """Write the enriched dataset and a one-row summary to CSV.

    The module-level ``df`` is saved to
    ``feedback_with_sentiment_analysis.csv`` and the summary statistics to
    ``sentiment_analysis_summary.csv``, both in the working directory and
    without the index column.

    Returns:
        The summary dict: mean sentiment score, positive and negative
        feedback rates (as fractions), row count and the timestamp range.
    """
    enriched_path = 'feedback_with_sentiment_analysis.csv'
    summary_path = 'sentiment_analysis_summary.csv'

    categories = df['sentiment_category']
    stamps = df['timestamp']

    # Headline numbers for the one-row summary file
    summary_stats = {
        'overall_sentiment_score': df['sentiment_score'].mean(),
        'positive_feedback_rate': (categories == 'Positive').mean(),
        'negative_feedback_rate': (categories == 'Negative').mean(),
        'total_feedback_count': len(df),
        'date_range': f"{stamps.min()} to {stamps.max()}",
    }

    # Full dataset first, then the summary
    df.to_csv(enriched_path, index=False)
    pd.DataFrame([summary_stats]).to_csv(summary_path, index=False)

    print("\n✅ Results exported:")
    for path in (enriched_path, summary_path):
        print(f"  • {path}")

    return summary_stats

# Write both CSV files and keep the returned summary dict at module level.
summary = export_results()

# Closing status messages.
print("\n🎉 Sentiment Analysis Complete!")
print("📊 Use the generated insights to improve customer satisfaction!")

IndentationError: unexpected indent (2752482578.py, line 9)