# Spotify Songs Analysis - Feature Analysis and Visualization

## 1. Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Add src to path for custom modules
sys.path.append('../src')
from visualization import save_hist, save_corr_heatmap

# Set up paths
DATA_DIR = Path('../data')
CLEAN_DATA_PATH = DATA_DIR / 'clean_spotify.csv'

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline

# Set larger font sizes for better readability
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10


In [None]:
# Read the cleaned CSV at CLEAN_DATA_PATH into a DataFrame and preview it
df = pd.read_csv(CLEAN_DATA_PATH)

dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")

print("\nFirst 3 rows:")
preview_rows = df.head(3)
display(preview_rows)


## 2. Popularity Distribution Analysis

In [None]:
# Bucket popularity into three ordinal classes for visualization.
# Integer codes [0, 1, 2] are used for compatibility with notebook 3.
# pd.cut builds right-closed intervals by default, so with the edges below:
#   0 = Low    (popularity <= 40)
#   1 = Medium (40 < popularity <= 60)
#   2 = High   (popularity > 60)
popularity = df['popularity']
binned_popularity = pd.cut(popularity, bins=[-1, 40, 60, 101], labels=[0, 1, 2])
df['popularity_category'] = binned_popularity.astype(int)

# Human-readable version of the category codes, for display only
category_labels = {0: 'Low', 1: 'Medium', 2: 'High'}
df['popularity_category_label'] = df['popularity_category'].map(category_labels)

print("Popularity category distribution:")
pop_counts = df['popularity_category_label'].value_counts()
display(pop_counts)

# Side-by-side view of the raw score distribution: histogram (left), box plot (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

ax1.hist(df['popularity'], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
ax1.set(
    xlabel='Popularity Score',
    ylabel='Frequency',
    title='Distribution of Popularity Scores',
)
ax1.grid(True, alpha=0.3)

df.boxplot(column='popularity', ax=ax2)
ax2.set(title='Box Plot of Popularity Scores', ylabel='Popularity')

plt.tight_layout()
plt.savefig('../reports/figures/popularity_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Summary statistics of the raw popularity score
print("\nPopularity Statistics:")
print(f"Mean popularity: {popularity.mean():.2f}")
print(f"Median popularity: {popularity.median():.2f}")
print(f"Std deviation: {popularity.std():.2f}")
print(f"Min popularity: {popularity.min()}")
print(f"Max popularity: {popularity.max()}")


## 3. Correlation Analysis

In [None]:
# Candidate numeric columns for the correlation analysis
numerical_features = [
    'danceability', 'energy', 'loudness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'popularity',
]

# Restrict to the candidates that are actually present in the dataframe
available_features = [name for name in numerical_features if name in df.columns]

# Pairwise correlation matrix of the available columns
corr_matrix = df[available_features].corr()

# Hide the upper triangle (diagonal included) so each pair is drawn only once
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

heatmap_options = dict(
    mask=mask,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    fmt='.2f',
    cbar_kws={'shrink': 0.8},
)

fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, ax=heatmap_ax, **heatmap_options)
heatmap_ax.set_title('Correlation Matrix of Audio Features and Popularity', fontsize=16, pad=20)
plt.tight_layout()
plt.savefig('../reports/figures/correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Correlation of every feature with popularity, largest (most positive) first
print("Top correlations with popularity:")
popularity_corrs = corr_matrix['popularity'].drop('popularity').sort_values(ascending=False)
for feature, corr in popularity_corrs.head(10).items():
    print(f"  {feature}: {corr:.3f}")

# Strongest pairwise correlations overall, ranked by absolute value
print("\nStrongest correlations overall (absolute value):")
# The correlation matrix is symmetric, so every pair appears twice (A-B and B-A)
# and the diagonal holds the self-correlations. Keep only the strict lower
# triangle so each unordered pair is listed exactly once; masked cells become
# NaN and are dropped before ranking.
strict_lower = np.tril(np.ones(corr_matrix.shape, dtype=bool), k=-1)
corr_pairs = (
    corr_matrix.where(strict_lower)
    .unstack()
    .dropna()
    .sort_values(key=abs, ascending=False)
)
for (feat1, feat2), value in corr_pairs.head(5).items():
    print(f"  {feat1} - {feat2}: {value:.3f}")


## 4. Feature Pair Analysis

In [None]:
# 2x2 grid of feature-vs-feature scatter plots, every point colored by popularity
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# One entry per panel: (axis, x column, y column, x label, y label, title)
panel_specs = [
    (axes[0, 0], 'energy', 'loudness', 'Energy', 'Loudness',
     'Energy vs Loudness (colored by popularity)'),
    (axes[0, 1], 'danceability', 'tempo', 'Danceability', 'Tempo (BPM)',
     'Danceability vs Tempo (colored by popularity)'),
    (axes[1, 0], 'valence', 'acousticness', 'Valence (Positivity)', 'Acousticness',
     'Valence vs Acousticness (colored by popularity)'),
    (axes[1, 1], 'energy', 'valence', 'Energy', 'Valence',
     'Energy vs Valence (colored by popularity)'),
]

for panel, x_col, y_col, x_label, y_label, panel_title in panel_specs:
    # After the loop, `scatter` refers to the last panel (Energy vs Valence),
    # which is the mappable handed to the colorbar below.
    scatter = panel.scatter(df[x_col], df[y_col], alpha=0.5, c=df['popularity'], cmap='viridis')
    panel.set_xlabel(x_label)
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)

# Degree-1 polynomial fit drawn as a dashed red trend line on Energy vs Loudness
z = np.polyfit(df['energy'], df['loudness'], 1)
p = np.poly1d(z)
axes[0, 0].plot(df['energy'], p(df['energy']), "r--", alpha=0.8)

# Colorbar is attached to the bottom-right panel
plt.colorbar(scatter, ax=axes[1, 1], label='Popularity')

plt.tight_layout()
plt.savefig('../reports/figures/feature_scatter_plots.png', dpi=300, bbox_inches='tight')
plt.show()


## 5. Genre-Level Analysis

In [None]:
# Pick the first genre-like column that exists in the dataframe (None if neither does)
genre_col = next(
    (candidate for candidate in ['playlist_genre', 'genre'] if candidate in df.columns),
    None,
)

if not genre_col:
    print("No genre column found in the dataset")
    # Fall back to a placeholder genre column so the later genre cells still run
    df['playlist_genre'] = 'Unknown'
    genre_col = 'playlist_genre'
else:
    print(f"Using genre column: {genre_col}")

    # Number of songs per genre
    genre_counts = df[genre_col].value_counts()

    plt.figure(figsize=(12, 6))
    genre_counts.plot.bar(color='lightcoral', edgecolor='black')
    plt.title('Distribution of Songs by Genre', fontsize=16)
    plt.xlabel('Genre')
    plt.ylabel('Number of Songs')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('../reports/figures/genre_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("Genre distribution:")
    display(genre_counts)


In [None]:
# Per-genre mean of popularity and selected audio features, rounded to 3 decimals
mean_columns = [
    'popularity',
    'danceability',
    'energy',
    'valence',
    'acousticness',
    'instrumentalness',
]
per_genre = df.groupby(genre_col)
genre_stats = per_genre.agg(dict.fromkeys(mean_columns, 'mean')).round(3)

print("Average audio features by genre:")
# Show the most popular genres first
genre_stats_by_popularity = genre_stats.sort_values('popularity', ascending=False)
display(genre_stats_by_popularity)


In [None]:
# One bar chart per feature, comparing the per-genre averages held in genre_stats
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
features_to_plot = ['popularity', 'danceability', 'energy', 'valence', 'acousticness', 'instrumentalness']

# axes.flat walks the grid row by row, so the k-th feature lands on row k//3, column k%3
for ax, feature in zip(axes.flat, features_to_plot):
    if feature not in df.columns:
        continue

    # Genres ordered from highest to lowest average for this feature
    genre_stats_sorted = genre_stats.sort_values(feature, ascending=False)
    bar_positions = range(len(genre_stats_sorted))
    feature_label = feature.title()

    bars = ax.bar(bar_positions, genre_stats_sorted[feature],
                  color=plt.cm.Set3(bar_positions))
    ax.set_title(f'Average {feature_label} by Genre')
    ax.set_xlabel('Genre')
    ax.set_ylabel(feature_label)
    ax.set_xticks(bar_positions)
    ax.set_xticklabels(genre_stats_sorted.index, rotation=45)

    # Print each bar's value just above it
    for bar in bars:
        bar_height = bar.get_height()
        label_x = bar.get_x() + bar.get_width()/2.
        ax.text(label_x, bar_height + 0.01,
                f'{bar_height:.2f}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.savefig('../reports/figures/genre_feature_comparison.png', dpi=300, bbox_inches='tight')
plt.show()


## 6. Feature Distributions

In [None]:
# Histogram of each key audio feature on a 2x3 grid
key_features = ['danceability', 'energy', 'valence', 'acousticness', 'instrumentalness', 'liveness']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

# Pair each feature with its own axis; a feature missing from df leaves its axis empty
for hist_ax, feature in zip(axes, key_features):
    if feature not in df.columns:
        continue
    feature_label = feature.title()
    hist_ax.hist(df[feature], bins=30, alpha=0.7, color='lightseagreen', edgecolor='black')
    hist_ax.set(
        xlabel=feature_label,
        ylabel='Frequency',
        title=f'Distribution of {feature_label}',
    )
    hist_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../reports/figures/feature_distributions.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Feature distributions split by popularity category (one box plot per feature)
if 'popularity_category' in df.columns:
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    features_to_plot = ['danceability', 'energy', 'valence', 'acousticness', 'loudness', 'tempo']

    # axes.flat iterates row by row, matching the row = k//3, column = k%3 layout
    for box_ax, feature in zip(axes.flat, features_to_plot):
        if feature not in df.columns:
            continue
        feature_label = feature.title()
        df.boxplot(column=feature, by='popularity_category', ax=box_ax)
        # Replace the title/labels pandas generated with our own
        box_ax.set_title(f'{feature_label} by Popularity Category')
        box_ax.set_ylabel(feature_label)
        box_ax.set_xlabel('Popularity Category')

    # Blank out the figure-level title that the grouped boxplot call adds
    plt.suptitle('')
    plt.tight_layout()
    plt.savefig('../reports/figures/feature_boxplots_by_popularity.png', dpi=300, bbox_inches='tight')
    plt.show()


## 7. Advanced Visualizations

In [None]:
# Pairplot of a few features; datasets above 1000 rows are down-sampled
# (fixed seed) to keep plotting time manageable
if len(df) > 1000:
    sample_df = df.sample(n=1000, random_state=42)
else:
    sample_df = df

# Numeric features plus, when one is set, the genre column used for coloring
requested_features = ['danceability', 'energy', 'valence', 'popularity']
if genre_col:
    requested_features.append(genre_col)
pairplot_features = [name for name in requested_features if name in sample_df.columns]

# Color by genre only if the genre column made it into the plotted subset
hue_column = genre_col if genre_col and genre_col in pairplot_features else None

print("Creating pairplot (this may take a moment for larger datasets)...")
g = sns.pairplot(
    sample_df[pairplot_features],
    hue=hue_column,
    diag_kind='hist',
    palette='Set2',
    plot_kws={'alpha': 0.6},
)
g.fig.suptitle('Pairplot of Key Audio Features', y=1.02)
plt.savefig('../reports/figures/feature_pairplot.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Radar chart comparing the audio-feature profile of the 3 genres with the
# highest average popularity
if genre_col and len(genre_stats) >= 3:
    top_genres = genre_stats.nlargest(3, 'popularity').index

    # Features shown on the radar axes (only those present in genre_stats)
    radar_features = ['danceability', 'energy', 'valence', 'acousticness', 'instrumentalness']
    radar_features = [f for f in radar_features if f in genre_stats.columns]

    # Min-max scale every feature across the selected genres so all axes share [0, 1]
    selected_stats = genre_stats.loc[top_genres, radar_features]
    feature_min = selected_stats.min()
    feature_range = selected_stats.max() - feature_min
    # If the selected genres tie exactly on a feature, the range is 0 and the
    # division would produce NaN (0/0), breaking the polygon at that vertex.
    # Substituting 1 maps such a feature to 0 for every genre instead.
    feature_range = feature_range.where(feature_range != 0, 1)
    genre_stats_normalized = (selected_stats - feature_min) / feature_range

    # Evenly spaced angles, one per feature; repeat the first to close the polygon
    angles = np.linspace(0, 2*np.pi, len(radar_features), endpoint=False).tolist()
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))

    colors = ['red', 'blue', 'green']
    for color, genre in zip(colors, top_genres):
        values = genre_stats_normalized.loc[genre].tolist()
        values += values[:1]  # Close the polygon
        ax.plot(angles, values, 'o-', linewidth=2, label=genre, color=color)
        ax.fill(angles, values, alpha=0.1, color=color)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(radar_features)
    ax.set_ylim(0, 1)
    ax.set_title('Audio Feature Profile by Genre (Normalized)', size=16, y=1.08)
    ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))

    plt.tight_layout()
    plt.savefig('../reports/figures/genre_radar_chart.png', dpi=300, bbox_inches='tight')
    plt.show()


## 8. Summary and Insights

In [None]:
# Text summary of the findings computed in the cells above
print("=" * 60)
print("FEATURE ANALYSIS SUMMARY")
print("=" * 60)

# The emoji and bullet characters below were stored as mojibake (UTF-8 bytes
# decoded with a single-byte codec); they are restored to the intended glyphs.
print("\n📊 KEY INSIGHTS:")
print("-" * 40)

# Popularity insights
print("1. Popularity Distribution:")
print(f"   • Mean popularity: {df['popularity'].mean():.1f}")
print(f"   • Most songs fall in category: {df['popularity_category_label'].mode().iloc[0]}")

# Correlation insights: rank by absolute strength so that strong negative
# correlations are reported too (a plain descending sort would only ever
# surface the most positive values).
top_pop_corr = popularity_corrs.sort_values(key=abs, ascending=False).head(3)
print("\n2. Top Correlations with Popularity:")
for feature, corr in top_pop_corr.items():
    direction = "positively" if corr > 0 else "negatively"
    print(f"   • {feature}: {corr:.3f} ({direction} correlated)")

# Genre insights
if genre_col and genre_col in df.columns:
    top_pop_genre = genre_stats['popularity'].idxmax()
    lowest_pop_genre = genre_stats['popularity'].idxmin()
    print("\n3. Genre Analysis:")
    print(f"   • Highest average popularity: {top_pop_genre} ({genre_stats.loc[top_pop_genre, 'popularity']:.1f})")
    print(f"   • Lowest average popularity: {lowest_pop_genre} ({genre_stats.loc[lowest_pop_genre, 'popularity']:.1f})")

# Feature ranges (min - max) for a few headline features
print("\n4. Audio Feature Ranges:")
for feature in ['danceability', 'energy', 'valence']:
    if feature in df.columns:
        print(f"   • {feature}: {df[feature].min():.2f} - {df[feature].max():.2f}")

print("\n" + "=" * 60)
print("Feature analysis completed!")
print("Next step: Run 03_modeling.ipynb for predictive modeling")
print("=" * 60)
