In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Set styles
plt.style.use('default')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)

# Load data
def load_data():
    """Load all available data files"""
    try:
        spotify_df = pd.read_csv('spotify_global_top_50.csv')
        billboard_df = pd.read_csv('billboard_hot_100.csv')
        common_df = pd.read_csv('common_tracks.csv') if 'common_tracks.csv' in os.listdir() else None
        
        print("📁 Data loaded successfully!")
        print(f"Spotify data: {spotify_df.shape}")
        print(f"Billboard data: {billboard_df.shape}")
        if common_df is not None:
            print(f"Common tracks: {common_df.shape}")
        
        return spotify_df, billboard_df, common_df
        
    except FileNotFoundError:
        print("Data files not found. Please run the data collection script first.")
        return None, None, None

# --- PHASE 1: COMPREHENSIVE EDA ---
def perform_complete_eda(spotify_df, billboard_df, common_df):
    """Perform comprehensive exploratory data analysis"""
    
    print("="*60)
    print("📊 EXPLORATORY DATA ANALYSIS")
    print("="*60)
    
    # 1. Basic Dataset Overview
    print("\n1. 📋 DATASET OVERVIEW")
    print("-" * 40)
    
    print("Spotify Global Top 50:")
    print(f"• Shape: {spotify_df.shape}")
    print(f"• Columns: {list(spotify_df.columns)}")
    print(f"• Date range: {spotify_df.get('date', 'Not available')}")
    
    print("\nBillboard Hot 100:")
    print(f"• Shape: {billboard_df.shape}")
    print(f"• Columns: {list(billboard_df.columns)}")
    
    # 2. Data Quality Check
    print("\n2. 🔍 DATA QUALITY CHECK")
    print("-" * 40)
    
    print("Spotify Data Info:")
    print(spotify_df.info())
    print("\nMissing values in Spotify data:")
    print(spotify_df.isnull().sum())
    
    print("\nBillboard Data Info:")
    print(billboard_df.info())
    print("\nMissing values in Billboard data:")
    print(billboard_df.isnull().sum())
    
    # 3. Descriptive Statistics
    print("\n3. 📈 DESCRIPTIVE STATISTICS")
    print("-" * 40)
    
    if 'danceability' in spotify_df.columns:
        audio_features = ['danceability', 'energy', 'valence', 'acousticness', 
                         'instrumentalness', 'liveness', 'speechiness', 'tempo']
        
        available_features = [feat for feat in audio_features if feat in spotify_df.columns]
        
        print("Audio Features Statistics:")
        print(spotify_df[available_features].describe())
    
    # 4. Track Duration Analysis
    print("\n4. ⏰ DURATION ANALYSIS")
    print("-" * 40)
    
    if 'duration_ms' in spotify_df.columns:
        spotify_df['duration_min'] = spotify_df['duration_ms'] / 60000
        avg_duration = spotify_df['duration_min'].mean()
        print(f"Average track duration: {avg_duration:.2f} minutes")
        print(f"Shortest track: {spotify_df['duration_min'].min():.2f} minutes")
        print(f"Longest track: {spotify_df['duration_min'].max():.2f} minutes")
    
    # 5. Popularity Analysis
    print("\n5. 🎯 POPULARITY ANALYSIS")
    print("-" * 40)
    
    if 'popularity' in spotify_df.columns:
        print(f"Average popularity score: {spotify_df['popularity'].mean():.1f}/100")
        print(f"Most popular track: {spotify_df['popularity'].max()}/100")
        print(f"Least popular track: {spotify_df['popularity'].min()}/100")
    
    # 6. Artist Analysis
    print("\n6. 🎤 ARTIST ANALYSIS")
    print("-" * 40)
    
    print("Top 10 Artists on Spotify Global:")
    top_artists_spotify = spotify_df['artist'].value_counts().head(10)
    print(top_artists_spotify)
    
    print("\nTop 10 Artists on Billboard:")
    top_artists_billboard = billboard_df['artist'].value_counts().head(10)
    print(top_artists_billboard)
    
    # 7. Chart Overlap Analysis
    if common_df is not None and not common_df.empty:
        print("\n7. 🔄 CHART OVERLAP ANALYSIS")
        print("-" * 40)
        
        overlap_percentage = (len(common_df) / min(len(spotify_df), len(billboard_df))) * 100
        print(f"Tracks on both charts: {len(common_df)}")
        print(f"Overlap percentage: {overlap_percentage:.1f}%")
        
        # Correlation between rankings
        if 'spotify_rank' in common_df.columns and 'billboard_rank' in common_df.columns:
            correlation = common_df['spotify_rank'].corr(common_df['billboard_rank'])
            print(f"Rank correlation: {correlation:.3f}")

# --- PHASE 2: ADVANCED VISUALIZATIONS ---
def create_advanced_visualizations(spotify_df, billboard_df, common_df):
    """Create comprehensive visualizations"""
    
    print("\n" + "="*60)
    print("🎨 CREATING ADVANCED VISUALIZATIONS")
    print("="*60)
    
    # 1. Audio Features Radar Chart
    def create_radar_comparison():
        """Compare audio features between charts"""
        if not all(col in spotify_df.columns for col in ['danceability', 'energy', 'valence']):
            return
            
        features = ['danceability', 'energy', 'valence', 'acousticness', 'instrumentalness']
        features = [f for f in features if f in spotify_df.columns]
        
        # Calculate averages
        spotify_avg = spotify_df[features].mean().values
        billboard_avg = common_df[features].mean().values if common_df is not None else None
        
        # Create radar chart
        categories = features
        N = len(categories)
        
        angles = [n / float(N) * 2 * np.pi for n in range(N)]
        angles += angles[:1]
        
        fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))
        
        # Plot Spotify data
        spotify_values = np.append(spotify_avg, spotify_avg[0])
        ax.plot(angles, spotify_values, linewidth=2, linestyle='solid', 
                label='Spotify Global', color='blue')
        ax.fill(angles, spotify_values, alpha=0.25, color='blue')
        
        # Plot Billboard common tracks data if available
        if billboard_avg is not None:
            billboard_values = np.append(billboard_avg, billboard_avg[0])
            ax.plot(angles, billboard_values, linewidth=2, linestyle='solid', 
                    label='Billboard Common Tracks', color='red')
            ax.fill(angles, billboard_values, alpha=0.25, color='red')
        
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(categories, size=12)
        ax.set_yticklabels([])
        ax.set_ylim(0, 1)
        plt.legend(loc='upper right')
        plt.title('Audio Features Comparison\n', size=16, fontweight='bold')
        plt.tight_layout()
        plt.savefig('audio_features_radar.png', dpi=300, bbox_inches='tight')
        plt.show()

    # 2. Correlation Heatmap
    def create_correlation_analysis():
        """Create correlation heatmap for audio features"""
        audio_features = ['danceability', 'energy', 'valence', 'acousticness', 
                         'instrumentalness', 'liveness', 'speechiness', 'tempo', 'popularity']
        
        available_features = [feat for feat in audio_features if feat in spotify_df.columns]
        
        if len(available_features) >= 3:
            corr_matrix = spotify_df[available_features].corr()
            
            plt.figure(figsize=(12, 10))
            mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
            
            sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm', center=0,
                       square=True, linewidths=0.5, cbar_kws={"shrink": .8}, fmt='.2f')
            
            plt.title('Audio Features Correlation Heatmap\n', fontsize=16, fontweight='bold')
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()
            plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
            plt.show()

    # 3. Duration Distribution
    def create_duration_analysis():
        """Analyze track duration distribution"""
        if 'duration_ms' in spotify_df.columns:
            spotify_df['duration_min'] = spotify_df['duration_ms'] / 60000
            
            plt.figure(figsize=(12, 6))
            sns.histplot(data=spotify_df, x='duration_min', bins=20, kde=True, alpha=0.7)
            
            mean_duration = spotify_df['duration_min'].mean()
            median_duration = spotify_df['duration_min'].median()
            
            plt.axvline(mean_duration, color='red', linestyle='--', 
                       label=f'Mean: {mean_duration:.2f} min')
            plt.axvline(median_duration, color='green', linestyle='--', 
                       label=f'Median: {median_duration:.2f} min')
            
            plt.title('Distribution of Track Durations\n', fontsize=16, fontweight='bold')
            plt.xlabel('Duration (minutes)')
            plt.ylabel('Frequency')
            plt.legend()
            plt.tight_layout()
            plt.savefig('duration_distribution.png', dpi=300, bbox_inches='tight')
            plt.show()

    # 4. Popularity vs Features Scatter Matrix
    def create_scatter_matrix():
        """Create scatter matrix of popularity vs audio features"""
        if 'popularity' in spotify_df.columns:
            features = ['popularity', 'danceability', 'energy', 'valence']
            available_features = [f for f in features if f in spotify_df.columns]
            
            if len(available_features) >= 3:
                sns.pairplot(spotify_df[available_features], diag_kind='kde', 
                            plot_kws={'alpha': 0.6}, height=3)
                plt.suptitle('Popularity vs Audio Features Relationship', 
                           y=1.02, fontsize=16, fontweight='bold')
                plt.tight_layout()
                plt.savefig('scatter_matrix.png', dpi=300, bbox_inches='tight')
                plt.show()

    # 5. Artist Dominance Chart
    def create_artist_dominance():
        """Show artist dominance across charts"""
        top_artists_spotify = spotify_df['artist'].value_counts().head(10)
        top_artists_billboard = billboard_df['artist'].value_counts().head(10)
        
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
        
        # Spotify artists
        colors1 = plt.cm.Blues(np.linspace(0.6, 1, len(top_artists_spotify)))
        ax1.barh(top_artists_spotify.index, top_artists_spotify.values, color=colors1)
        ax1.set_title('Top Artists - Spotify Global', fontweight='bold')
        ax1.set_xlabel('Number of Tracks')
        ax1.invert_yaxis()
        
        # Billboard artists
        colors2 = plt.cm.Reds(np.linspace(0.6, 1, len(top_artists_billboard)))
        ax2.barh(top_artists_billboard.index, top_artists_billboard.values, color=colors2)
        ax2.set_title('Top Artists - Billboard Hot 100', fontweight='bold')
        ax2.set_xlabel('Number of Tracks')
        ax2.invert_yaxis()
        
        plt.suptitle('Artist Dominance Comparison', fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.savefig('artist_dominance.png', dpi=300, bbox_inches='tight')
        plt.show()

    # 6. Explicit Content Analysis
    def create_explicit_analysis():
        """Analyze explicit content distribution"""
        if 'explicit' in spotify_df.columns:
            explicit_counts = spotify_df['explicit'].value_counts()
            labels = ['Clean', 'Explicit'] if len(explicit_counts) == 2 else ['All Clean']
            
            plt.figure(figsize=(10, 8))
            colors = ['#66b3ff', '#ff6666']
            plt.pie(explicit_counts.values, labels=labels, autopct='%1.1f%%', 
                   colors=colors[:len(explicit_counts)], startangle=90)
            plt.title('Explicit Content Distribution\n', fontsize=16, fontweight='bold')
            plt.axis('equal')
            plt.tight_layout()
            plt.savefig('explicit_content.png', dpi=300, bbox_inches='tight')
            plt.show()

    # 7. Ranking Comparison (if common tracks exist)
    def create_ranking_comparison():
        """Compare rankings between charts"""
        if common_df is not None and not common_df.empty:
            plt.figure(figsize=(12, 8))
            
            # Create a colormap based on popularity
            colors = common_df['popularity'] if 'popularity' in common_df.columns else 'blue'
            
            scatter = plt.scatter(common_df['spotify_rank'], common_df['billboard_rank'], 
                                c=colors, cmap='viridis', alpha=0.7, s=100)
            
            plt.colorbar(scatter, label='Popularity Score')
            plt.xlabel('Spotify Global Rank (Lower = Better)')
            plt.ylabel('Billboard Hot 100 Rank (Lower = Better)')
            plt.title('Ranking Comparison: Common Tracks\n(Color indicates popularity)', 
                     fontweight='bold')
            plt.gca().invert_xaxis()
            plt.gca().invert_yaxis()
            plt.grid(True, alpha=0.3)
            
            # Add trend line
            z = np.polyfit(common_df['spotify_rank'], common_df['billboard_rank'], 1)
            p = np.poly1d(z)
            plt.plot(common_df['spotify_rank'], p(common_df['spotify_rank']), "r--", alpha=0.8)
            
            plt.tight_layout()
            plt.savefig('ranking_comparison.png', dpi=300, bbox_inches='tight')
            plt.show()

    # Execute all visualization functions
    visualization_functions = [
        create_radar_comparison,
        create_correlation_analysis,
        create_duration_analysis,
        create_scatter_matrix,
        create_artist_dominance,
        create_explicit_analysis,
        create_ranking_comparison
    ]
    
    for func in visualization_functions:
        try:
            func()
            print(f"✓ Created {func._name_}")
        except Exception as e:
            print(f"⚠ Could not create {func._name_}: {e}")

# --- PHASE 3: INSIGHTS AND SUMMARY ---
def generate_insights(spotify_df, billboard_df, common_df):
    """Generate actionable insights from the analysis"""
    
    print("\n" + "="*60)
    print("💡 ACTIONABLE INSIGHTS")
    print("="*60)
    
    insights = []
    
    # Insight 1: Audio Features
    if 'danceability' in spotify_df.columns and 'energy' in spotify_df.columns:
        avg_danceability = spotify_df['danceability'].mean()
        avg_energy = spotify_df['energy'].mean()
        
        if avg_danceability > 0.7:
            insights.append("🎵 High danceability suggests party/club music dominates global charts")
        if avg_energy > 0.7:
            insights.append("⚡ High energy levels indicate upbeat, energetic music is trending")
    
    # Insight 2: Duration
    if 'duration_ms' in spotify_df.columns:
        avg_duration = spotify_df['duration_ms'].mean() / 60000
        if avg_duration < 3.5:
            insights.append("⏱ Short track durations suggest streaming-optimized music (attention economy)")
    
    # Insight 3: Chart Overlap
    if common_df is not None:
        overlap_percentage = (len(common_df) / min(len(spotify_df), len(billboard_df))) * 100
        insights.append(f"🌍 {overlap_percentage:.1f}% overlap shows global vs US music taste differences")
    
    # Insight 4: Popularity Distribution
    if 'popularity' in spotify_df.columns:
        pop_std = spotify_df['popularity'].std()
        if pop_std < 10:
            insights.append("📊 Low popularity variance suggests homogeneous chart composition")
    
    # Print insights
    for i, insight in enumerate(insights, 1):
        print(f"{i}. {insight}")
    
    # Additional statistics
    print(f"\n📈 Key Statistics:")
    if 'duration_ms' in spotify_df.columns:
        print(f"• Average track length: {spotify_df['duration_ms'].mean()/60000:.2f} minutes")
    if 'popularity' in spotify_df.columns:
        print(f"• Average popularity: {spotify_df['popularity'].mean():.1f}/100")
    if 'danceability' in spotify_df.columns:
        print(f"• Average danceability: {spotify_df['danceability'].mean():.3f}")
    
    return insights

# --- MAIN EXECUTION ---
def main():
    """Run complete EDA and visualization pipeline"""
    
    # Load data
    spotify_df, billboard_df, common_df = load_data()
    
    if spotify_df is None:
        return
    
    # Perform EDA
    perform_complete_eda(spotify_df, billboard_df, common_df)
    
    # Create visualizations
    create_advanced_visualizations(spotify_df, billboard_df, common_df)
    
    # Generate insights
    insights = generate_insights(spotify_df, billboard_df, common_df)
    
    print("\n" + "="*60)
    print("✅ ANALYSIS COMPLETE!")
    print("="*60)
    print("Generated files:")
    print("• audio_features_radar.png")
    print("• correlation_heatmap.png")
    print("• duration_distribution.png")
    print("• scatter_matrix.png")
    print("• artist_dominance.png")
    print("• explicit_content.png")
    print("• ranking_comparison.png")
    print("\nCheck these files for comprehensive visual insights!")

if __name__ == "__main__":
    main()

EmptyDataError: No columns to parse from file