In [1]:
# Cell 1: Setup and Robust Data Loading
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
warnings.filterwarnings('ignore')

print("Libraries loaded successfully")

# Robust CSV loading with error handling
print("Loading dataset with error handling...")

try:
    # Try standard loading first
    df = pd.read_csv('spotify_dataset.csv')
    print(f"✅ Dataset loaded successfully: {df.shape}")
except pd.errors.ParserError:
    print("⚠️ Standard loading failed, trying robust loading...")
    # Use robust loading options
    df = pd.read_csv('spotify_dataset.csv', 
                     on_bad_lines='skip',  # Skip problematic rows
                     quoting=1,           # Handle quotes properly
                     encoding='utf-8',    # Explicit encoding
                     low_memory=False)    # Read entire file at once
    print(f"✅ Dataset loaded with robust method: {df.shape}")
except Exception as e:
    print(f"❌ Loading failed: {e}")
    # Last resort - load in chunks and concatenate
    print("Trying chunk loading...")
    chunks = []
    chunk_size = 10000
    for chunk in pd.read_csv('spotify_dataset.csv', chunksize=chunk_size, on_bad_lines='skip'):
        chunks.append(chunk)
    df = pd.concat(chunks, ignore_index=True)
    print(f"✅ Dataset loaded via chunks: {df.shape}")

print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
print(f"Columns: {len(df.columns)}")
print("Sample of column names:")
print(list(df.columns[:10]))

Libraries loaded successfully
Loading dataset with error handling...
✅ Dataset loaded successfully: (551443, 39)
Memory usage: 1872.8 MB
Columns: 39
Sample of column names:
['Artist(s)', 'song', 'text', 'Length', 'emotion', 'Genre', 'Album', 'Release Date', 'Key', 'Tempo']


In [2]:
# Cell 2: Genre Consolidation (3097 → ~30 genres)
def consolidate_genres(genre_str):
    if pd.isna(genre_str):
        return 'other'
    
    genre_lower = str(genre_str).lower()
    
    # Hip-hop and rap sub-genres
    if 'trap' in genre_lower and ('hip hop' in genre_lower or 'rap' in genre_lower):
        return 'trap'
    elif 'trap' in genre_lower:
        return 'trap'
    elif 'hip hop' in genre_lower or 'hip-hop' in genre_lower:
        return 'hip_hop'
    elif 'rap' in genre_lower:
        return 'rap'
    
    # Rock sub-genres
    elif 'heavy metal' in genre_lower or 'death metal' in genre_lower or 'black metal' in genre_lower:
        return 'heavy_metal'
    elif 'alternative rock' in genre_lower or 'alt rock' in genre_lower:
        return 'alternative_rock'
    elif 'punk rock' in genre_lower or (('punk' in genre_lower) and ('rock' in genre_lower)):
        return 'punk_rock'
    elif 'indie rock' in genre_lower:
        return 'indie_rock'
    elif 'metal' in genre_lower:
        return 'metal'
    elif 'punk' in genre_lower:
        return 'punk'
    elif 'rock' in genre_lower:
        return 'rock'
    
    # Pop sub-genres
    elif 'indie pop' in genre_lower:
        return 'indie_pop'
    elif 'k-pop' in genre_lower:
        return 'k_pop'
    elif 'pop punk' in genre_lower:
        return 'pop_punk'
    elif 'pop' in genre_lower or 'mainstream' in genre_lower:
        return 'pop'
    
    # Electronic sub-genres
    elif 'house' in genre_lower:
        return 'house'
    elif 'techno' in genre_lower:
        return 'techno'
    elif 'dubstep' in genre_lower:
        return 'dubstep'
    elif 'drum and bass' in genre_lower or 'dnb' in genre_lower:
        return 'drum_and_bass'
    elif 'edm' in genre_lower or 'electronic dance' in genre_lower:
        return 'edm'
    elif 'electronic' in genre_lower:
        return 'electronic'
    
    # Jazz and blues
    elif 'blues' in genre_lower:
        return 'blues'
    elif 'jazz' in genre_lower:
        return 'jazz'
    elif 'soul' in genre_lower:
        return 'soul'
    elif 'funk' in genre_lower:
        return 'funk'
    
    # Folk and indie
    elif 'indie folk' in genre_lower:
        return 'indie_folk'
    elif 'folk' in genre_lower:
        return 'folk'
    elif 'acoustic' in genre_lower:
        return 'acoustic'
    elif 'indie' in genre_lower:
        return 'indie'
    
    # Other genres
    elif 'r&b' in genre_lower or 'rnb' in genre_lower:
        return 'rnb'
    elif 'country' in genre_lower:
        return 'country'
    elif 'reggae' in genre_lower or 'ska' in genre_lower:
        return 'reggae'
    elif any(word in genre_lower for word in ['classical', 'orchestra', 'instrumental']):
        return 'classical'
    elif any(word in genre_lower for word in ['latin', 'salsa', 'bachata']):
        return 'latin'
    else:
        return 'other'

df['Genre'] = df['Genre'].apply(consolidate_genres)
genre_counts = df['Genre'].value_counts()
print(f"Genres after consolidation: {len(genre_counts)}")
print("All genres:")
print(genre_counts)
print(f"Class balance ratio: {genre_counts.min()/genre_counts.max():.4f}")

Genres after consolidation: 31
All genres:
Genre
hip_hop             282653
rock                 60016
alternative_rock     38149
pop                  34649
heavy_metal          17436
trap                 15326
indie_rock           12335
metal                 9945
other                 8422
country               7898
folk                  7396
punk_rock             7359
jazz                  6881
soul                  6775
indie_pop             5373
blues                 5204
punk                  4724
reggae                3859
electronic            3799
classical             3302
rap                   2517
drum_and_bass         2240
k_pop                 1279
house                 1256
funk                   646
indie                  578
dubstep                483
techno                 467
latin                  198
acoustic               169
rnb                    109
Name: count, dtype: int64
Class balance ratio: 0.0004


In [3]:
# Cell 3: Fix Data Types and Basic EDA
# Fix loudness
df['Loudness'] = pd.to_numeric(df['Loudness (db)'].astype(str).str.replace('db', ''), errors='coerce')

# Fix time signature
df['Time_sig'] = pd.to_numeric(df['Time signature'].astype(str).str.extract('(\d+)')[0], errors='coerce')

# Convert length to seconds
def to_seconds(length_str):
    try:
        parts = str(length_str).split(':')
        return int(parts[0]) * 60 + int(parts[1]) if len(parts) == 2 else np.nan
    except:
        return np.nan

df['Length_sec'] = df['Length'].apply(to_seconds)

# Basic stats
print("=== DATASET OVERVIEW ===")
print(f"Shape: {df.shape}")
print(f"Missing values: {df.isnull().sum().sum()}")
print(f"Memory: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Quick numerical summary
numerical_cols = ['Tempo', 'Loudness', 'Energy', 'Danceability', 'Positiveness', 
                 'Speechiness', 'Liveness', 'Acousticness', 'Instrumentalness', 'Popularity']
print("\nNumerical features summary:")
print(df[numerical_cols].describe().round(1))

=== DATASET OVERVIEW ===
Shape: (551443, 42)
Missing values: 127
Memory: 1882.4 MB

Numerical features summary:
          Tempo  Loudness    Energy  Danceability  Positiveness  Speechiness  \
count  551443.0  551443.0  551443.0      551443.0      551443.0     551443.0   
mean      120.5      -8.1      62.7          59.2          47.7         11.7   
std        29.2       4.1      22.4          17.5          24.2         12.3   
min        31.0     -50.1       0.0           6.0           0.0          2.0   
25%        97.0      -9.8      48.0          47.0          28.0          4.0   
50%       120.0      -7.2      65.0          60.0          47.0          6.0   
75%       140.0      -5.4      80.0          72.0          66.0         15.0   
max       200.0       5.0     100.0          99.0         100.0         97.0   

       Liveness  Acousticness  Instrumentalness  Popularity  
count  551443.0      551443.0          551443.0    551443.0  
mean       19.7          25.7              

In [4]:
# Cell 3A: Memory Management - Sample Data for Development
print("=== MEMORY MANAGEMENT ===")

# Sample data to prevent memory crashes during development
SAMPLE_SIZE = 100000  # Use 100K samples instead of 551K

# Stratified sampling to maintain genre proportions
df_sample = df.groupby('Genre').apply(
    lambda x: x.sample(min(len(x), max(1, int(SAMPLE_SIZE * len(x) / len(df)))), random_state=42)
).reset_index(drop=True)

print(f"Original dataset: {len(df):,} samples")
print(f"Sampled dataset: {len(df_sample):,} samples")
print(f"Memory reduction: {len(df_sample)/len(df)*100:.1f}% of original")

# Check genre distribution after sampling
print("\nGenre distribution after sampling:")
print(df_sample['Genre'].value_counts().head(10))

# Use sampled data for processing
df = df_sample

print("✅ Using sampled data to prevent memory issues")

=== MEMORY MANAGEMENT ===
Original dataset: 551,443 samples
Sampled dataset: 99,982 samples
Memory reduction: 18.1% of original

Genre distribution after sampling:
Genre
hip_hop             51256
rock                10883
alternative_rock     6918
pop                  6283
heavy_metal          3161
trap                 2779
indie_rock           2236
metal                1803
other                1527
country              1432
Name: count, dtype: int64
✅ Using sampled data to prevent memory issues


In [5]:
# Cell 4: Create Lyric Word Features (Memory Efficient)
target_words = ['word', 'baby', 'gun', 'truck', 'girl', 'money', 'night', 'love', 
                'dance', 'freedom', 'whiskey', 'street', 'heart', 'party', 'devil', 'jesus']

print("=== CREATING LYRIC WORD FEATURES ===")

# Process lyrics in chunks to save memory
lyrics = df['text'].astype(str).str.lower().fillna('')

# Create binary features for each word (more memory efficient)
for word in target_words:
    df[f'has_{word}'] = lyrics.str.contains(word, na=False).astype('int8')  # Use int8 instead of int64

# Summary of word occurrences
print("Word occurrence in dataset:")
for word in target_words:
    count = df[f'has_{word}'].sum()
    print(f"  {word}: {count:,} songs ({count/len(df)*100:.1f}%)")

print("✅ Lyric word features created efficiently")

=== CREATING LYRIC WORD FEATURES ===
Word occurrence in dataset:
  word: 12,424 songs (12.4%)
  baby: 22,069 songs (22.1%)
  gun: 8,319 songs (8.3%)
  truck: 2,283 songs (2.3%)
  girl: 19,385 songs (19.4%)
  money: 14,958 songs (15.0%)
  night: 28,197 songs (28.2%)
  love: 42,766 songs (42.8%)
  dance: 5,860 songs (5.9%)
  freedom: 1,425 songs (1.4%)
  whiskey: 614 songs (0.6%)
  street: 9,831 songs (9.8%)
  heart: 23,022 songs (23.0%)
  party: 3,943 songs (3.9%)
  devil: 3,018 songs (3.0%)
  jesus: 2,606 songs (2.6%)
✅ Lyric word features created efficiently


In [6]:
# Cell 5: Feature Engineering and Selection
print("=== FEATURE ENGINEERING ===")

# Define all potential features
audio_features = ['Tempo', 'Loudness', 'Energy', 'Danceability', 'Positiveness', 
                 'Speechiness', 'Liveness', 'Acousticness', 'Instrumentalness', 'Popularity']

# Add length and time signature if available
if 'Length_sec' in df.columns:
    audio_features.append('Length_sec')
if 'Time_sig' in df.columns:
    audio_features.append('Time_sig')

# Lyric word features
lyric_features = [f'has_{word}' for word in target_words]

# Playlist features (convert to binary)
playlist_cols = [col for col in df.columns if 'Good for' in col]
for col in playlist_cols:
    df[f'{col}_binary'] = (df[col] == 1).astype(int)
playlist_features = [f'{col}_binary' for col in playlist_cols]

# Explicit content
if 'Explicit' in df.columns:
    df['Explicit_binary'] = df['Explicit'].map({'Yes': 1, 'No': 0}).fillna(0).astype(int)
    playlist_features.append('Explicit_binary')

# Combine all features
all_features = audio_features + lyric_features + playlist_features[:5]  # Limit playlist features

# Handle missing values
df[all_features] = df[all_features].fillna(df[all_features].median())

print(f"Total features available: {len(all_features)}")
print(f"Audio features: {len(audio_features)}")
print(f"Lyric features: {len(lyric_features)}")
print(f"Playlist features: {len(playlist_features[:5])}")

# Feature selection to get exactly 50 features
X_temp = df[all_features]
y_temp = df['Genre']

# Remove genres with too few samples for stability
genre_counts = y_temp.value_counts()
valid_genres = genre_counts[genre_counts >= 100].index
mask = y_temp.isin(valid_genres)
X_temp = X_temp[mask]
y_temp = y_temp[mask]

print(f"Using {len(valid_genres)} genres with 100+ samples")
print(f"Final dataset size: {len(X_temp):,} samples")

# Select top features if we have more than 50
if len(all_features) > 50:
    selector = SelectKBest(f_classif, k=50)
    X_selected = selector.fit_transform(X_temp, y_temp)
    selected_features = [all_features[i] for i in selector.get_support(indices=True)]
    print(f"Selected top 50 features using statistical tests")
else:
    selected_features = all_features
    X_selected = X_temp.values

print(f"Final feature count: {len(selected_features)}")
print("✅ Feature engineering completed")

=== FEATURE ENGINEERING ===
Total features available: 33
Audio features: 12
Lyric features: 16
Playlist features: 5
Using 26 genres with 100+ samples
Final dataset size: 99,727 samples
Final feature count: 33
✅ Feature engineering completed


In [7]:
# Cell 6: Handle Class Imbalance with Smart Sampling
print("=== HANDLING CLASS IMBALANCE ===")

# Prepare features and target
feature_cols = [col for col in df.columns if col.startswith('has_') or col in [
    'Tempo', 'Loudness', 'Energy', 'Danceability', 'Positiveness', 'Speechiness', 
    'Liveness', 'Acousticness', 'Instrumentalness', 'Popularity', 'Length_sec', 'Time_sig'
] + [col for col in df.columns if 'Good for' in col and 'binary' in col] + ['Explicit_binary']]

# Filter to existing columns
feature_cols = [col for col in feature_cols if col in df.columns]
X = df[feature_cols].fillna(df[feature_cols].median())
y = df['Genre']

print(f"Features selected: {len(feature_cols)}")
print(f"Dataset shape: {X.shape}")

# Balance classes - cap at 3000 samples per genre, minimum 200
from sklearn.utils import resample

balanced_dfs = []
for genre in y.value_counts().index:
    genre_mask = y == genre
    genre_X = X[genre_mask]
    genre_y = y[genre_mask]
    
    n_samples = len(genre_y)
    
    if n_samples > 3000:
        # Downsample large genres
        X_balanced, y_balanced = resample(genre_X, genre_y, n_samples=3000, random_state=42)
    elif n_samples < 200:
        # Skip very small genres
        continue
    else:
        X_balanced, y_balanced = genre_X, genre_y
    
    balanced_df = pd.concat([X_balanced, y_balanced], axis=1)
    balanced_dfs.append(balanced_df)

# Combine balanced data
balanced_data = pd.concat(balanced_dfs, ignore_index=True)
X_balanced = balanced_data[feature_cols]
y_balanced = balanced_data['Genre']

print(f"After balancing: {X_balanced.shape}")
print("Final genre distribution:")
print(y_balanced.value_counts().sort_values(ascending=False))

print("✅ Class imbalance handled")

=== HANDLING CLASS IMBALANCE ===
Features selected: 38
Dataset shape: (99982, 38)
After balancing: (36005, 38)
Final genre distribution:
Genre
hip_hop             3000
rock                3000
alternative_rock    3000
pop                 3000
heavy_metal         3000
trap                2779
indie_rock          2236
metal               1803
other               1527
country             1432
folk                1341
punk_rock           1334
jazz                1247
soul                1228
indie_pop            974
blues                943
punk                 856
reggae               699
electronic           688
classical            598
rap                  456
drum_and_bass        406
k_pop                231
house                227
Name: count, dtype: int64
✅ Class imbalance handled


In [8]:
# Cell 7: Train-Test Split and Feature Scaling
print("=== TRAIN-TEST SPLIT AND SCALING ===")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrames for easier handling
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

print("✅ Data split and scaled")
print(f"Training classes: {len(y_train.unique())}")
print(f"Feature means after scaling: {X_train_scaled.mean().mean():.6f}")
print(f"Feature stds after scaling: {X_train_scaled.std().mean():.6f}")

=== TRAIN-TEST SPLIT AND SCALING ===
Training set: (28804, 38)
Test set: (7201, 38)
✅ Data split and scaled
Training classes: 24
Feature means after scaling: -0.000000
Feature stds after scaling: 1.000017


In [9]:
# Cell 8: Model Training with Cross-Validation
print("=== MODEL TRAINING AND EVALUATION ===")

# Initialize models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
}

# Train and evaluate models
results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Use scaled data for Logistic Regression, original for tree-based
    if name == 'Logistic Regression':
        X_train_use = X_train_scaled
        X_test_use = X_test_scaled
    else:
        X_train_use = X_train
        X_test_use = X_test
    
    # Train model
    model.fit(X_train_use, y_train)
    
    # Cross-validation
    cv_scores = cross_val_score(model, X_train_use, y_train, cv=5, scoring='accuracy')
    
    # Test predictions
    y_pred = model.predict(X_test_use)
    test_accuracy = accuracy_score(y_test, y_pred)
    
    # Store results
    results[name] = {
        'model': model,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'test_accuracy': test_accuracy,
        'predictions': y_pred
    }
    
    print(f"  CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    print(f"  Test Accuracy: {test_accuracy:.4f}")

# Find best model
best_model_name = max(results.keys(), key=lambda x: results[x]['test_accuracy'])
print(f"\n🏆 Best Model: {best_model_name}")
print(f"Best Test Accuracy: {results[best_model_name]['test_accuracy']:.4f}")

print("✅ Model training completed")

=== MODEL TRAINING AND EVALUATION ===

Training Random Forest...
  CV Accuracy: 0.3834 ± 0.0025
  Test Accuracy: 0.4022

Training Gradient Boosting...
  CV Accuracy: 0.3099 ± 0.0039
  Test Accuracy: 0.3183

Training Logistic Regression...
  CV Accuracy: 0.2568 ± 0.0054
  Test Accuracy: 0.2612

🏆 Best Model: Random Forest
Best Test Accuracy: 0.4022
✅ Model training completed


In [10]:
# Cell 9: Model Analysis and Overfitting Detection
print("=== MODEL ANALYSIS AND OVERFITTING DETECTION ===")

# Get best model
best_model = results['Random Forest']['model']

# Training accuracy to check overfitting
train_pred = best_model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_pred)

print(f"📊 OVERFITTING ANALYSIS:")
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {results['Random Forest']['test_accuracy']:.4f}")
print(f"Difference: {train_accuracy - results['Random Forest']['test_accuracy']:.4f}")

if train_accuracy - results['Random Forest']['test_accuracy'] > 0.1:
    print("⚠️ Potential overfitting detected (>10% gap)")
else:
    print("✅ No significant overfitting")

# Feature importance analysis
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\n🔍 TOP 15 MOST IMPORTANT FEATURES:")
for i, row in feature_importance.head(15).iterrows():
    print(f"  {row['feature']}: {row['importance']:.4f}")

# Model comparison table
print(f"\n📈 MODEL COMPARISON:")
comparison_df = pd.DataFrame({
    'Model': list(results.keys()),
    'CV_Accuracy': [results[model]['cv_mean'] for model in results.keys()],
    'Test_Accuracy': [results[model]['test_accuracy'] for model in results.keys()],
    'Overfitting': [results[model]['test_accuracy'] - results[model]['cv_mean'] for model in results.keys()]
})
print(comparison_df.round(4))

print("✅ Model analysis completed")

=== MODEL ANALYSIS AND OVERFITTING DETECTION ===
📊 OVERFITTING ANALYSIS:
Training Accuracy: 1.0000
Test Accuracy: 0.4022
Difference: 0.5978
⚠️ Potential overfitting detected (>10% gap)

🔍 TOP 15 MOST IMPORTANT FEATURES:
  Length_sec: 0.0886
  Loudness: 0.0835
  Danceability: 0.0825
  Energy: 0.0814
  Positiveness: 0.0811
  Popularity: 0.0808
  Tempo: 0.0784
  Acousticness: 0.0715
  Liveness: 0.0700
  Speechiness: 0.0577
  Instrumentalness: 0.0319
  Explicit_binary: 0.0215
  has_night: 0.0172
  has_love: 0.0169
  has_heart: 0.0148

📈 MODEL COMPARISON:
                 Model  CV_Accuracy  Test_Accuracy  Overfitting
0        Random Forest       0.3834         0.4022       0.0187
1    Gradient Boosting       0.3099         0.3183       0.0084
2  Logistic Regression       0.2568         0.2612       0.0044
✅ Model analysis completed


In [11]:
# Cell 10: Performance by Genre Analysis
print("=== PERFORMANCE BY GENRE ===")

# Get predictions for best model
y_pred_best = results['Random Forest']['predictions']

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_best))

# Per-genre accuracy
genre_performance = []
for genre in sorted(y_test.unique()):
    genre_mask = y_test == genre
    if genre_mask.sum() > 0:
        genre_accuracy = accuracy_score(y_test[genre_mask], y_pred_best[genre_mask])
        genre_support = genre_mask.sum()
        genre_performance.append({
            'Genre': genre,
            'Accuracy': genre_accuracy,
            'Support': genre_support
        })

genre_df = pd.DataFrame(genre_performance).sort_values('Accuracy', ascending=False)

print(f"\n🎯 GENRE-SPECIFIC PERFORMANCE:")
print("Top 10 performing genres:")
print(genre_df.head(10)[['Genre', 'Accuracy', 'Support']].round(3))

print(f"\nWorst 5 performing genres:")
print(genre_df.tail(5)[['Genre', 'Accuracy', 'Support']].round(3))

# Overall insights
avg_accuracy = genre_df['Accuracy'].mean()
print(f"\n💡 INSIGHTS:")
print(f"Average per-genre accuracy: {avg_accuracy:.3f}")
print(f"Best performing genre: {genre_df.iloc[0]['Genre']} ({genre_df.iloc[0]['Accuracy']:.3f})")
print(f"Most challenging genre: {genre_df.iloc[-1]['Genre']} ({genre_df.iloc[-1]['Accuracy']:.3f})")

print("✅ Genre performance analysis completed")

=== PERFORMANCE BY GENRE ===
Classification Report:
                  precision    recall  f1-score   support

alternative_rock       0.36      0.44      0.40       600
           blues       0.70      0.38      0.49       189
       classical       0.61      0.09      0.16       120
         country       0.30      0.26      0.28       286
   drum_and_bass       0.96      0.68      0.80        81
      electronic       0.32      0.10      0.15       138
            folk       0.27      0.29      0.28       268
     heavy_metal       0.56      0.74      0.64       600
         hip_hop       0.29      0.30      0.30       600
           house       0.75      0.07      0.12        45
       indie_pop       0.34      0.05      0.09       195
      indie_rock       0.23      0.30      0.26       447
            jazz       0.50      0.20      0.28       249
           k_pop       0.77      0.43      0.56        46
           metal       0.42      0.43      0.42       361
           other   

In [12]:
# Cell 11: Feature Selection and Model Optimization
print("=== FEATURE SELECTION AND OPTIMIZATION ===")

# Remove low-importance features (bottom 25%)
n_features_keep = int(len(feature_importance) * 0.75)  # Keep top 75% of features
important_features = feature_importance.head(n_features_keep)['feature'].tolist()

print(f"Reducing features from {len(X_train.columns)} to {len(important_features)}")

# Retrain with selected features
X_train_selected = X_train[important_features]
X_test_selected = X_test[important_features]

# Train optimized Random Forest
rf_optimized = RandomForestClassifier(
    n_estimators=150,  # Slightly more trees
    max_depth=15,      # Limit depth to prevent overfitting
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

print("Training optimized Random Forest...")
rf_optimized.fit(X_train_selected, y_train)

# Evaluate optimized model
cv_scores_opt = cross_val_score(rf_optimized, X_train_selected, y_train, cv=5)
y_pred_opt = rf_optimized.predict(X_test_selected)
test_accuracy_opt = accuracy_score(y_test, y_pred_opt)
train_accuracy_opt = accuracy_score(y_train, rf_optimized.predict(X_train_selected))

print(f"\n📊 OPTIMIZED MODEL RESULTS:")
print(f"CV Accuracy: {cv_scores_opt.mean():.4f} ± {cv_scores_opt.std():.4f}")
print(f"Train Accuracy: {train_accuracy_opt:.4f}")
print(f"Test Accuracy: {test_accuracy_opt:.4f}")
print(f"Overfitting Gap: {train_accuracy_opt - test_accuracy_opt:.4f}")

print(f"\n🔄 IMPROVEMENT:")
original_accuracy = results['Random Forest']['test_accuracy']
print(f"Original: {original_accuracy:.4f}")
print(f"Optimized: {test_accuracy_opt:.4f}")
print(f"Change: {test_accuracy_opt - original_accuracy:+.4f}")

print("✅ Feature selection and optimization completed")

=== FEATURE SELECTION AND OPTIMIZATION ===
Reducing features from 38 to 28
Training optimized Random Forest...

📊 OPTIMIZED MODEL RESULTS:
CV Accuracy: 0.3654 ± 0.0030
Train Accuracy: 0.7868
Test Accuracy: 0.3841
Overfitting Gap: 0.4027

🔄 IMPROVEMENT:
Original: 0.4022
Optimized: 0.3841
Change: -0.0181
✅ Feature selection and optimization completed


In [13]:
# Cell 12: Address Overfitting with Stronger Regularization
print("=== ADDRESSING OVERFITTING ===")

# Try multiple regularization strategies
models_regularized = {
    'RF_Conservative': RandomForestClassifier(
        n_estimators=50,        # Fewer trees
        max_depth=8,           # Much shallower
        min_samples_split=20,   # Require more samples to split
        min_samples_leaf=10,    # Larger leaf nodes
        max_features='sqrt',    # Fewer features per split
        random_state=42,
        n_jobs=-1
    ),
    'RF_VeryConservative': RandomForestClassifier(
        n_estimators=30,
        max_depth=5,
        min_samples_split=50,
        min_samples_leaf=20,
        max_features=0.3,       # Even fewer features
        random_state=42,
        n_jobs=-1
    ),
    'GradientBoosting_Reg': GradientBoostingClassifier(
        n_estimators=50,
        learning_rate=0.05,     # Slower learning
        max_depth=4,
        min_samples_split=20,
        min_samples_leaf=10,
        subsample=0.8,          # Use only 80% of data per tree
        random_state=42
    )
}

# Test regularized models
reg_results = {}
for name, model in models_regularized.items():
    print(f"\nTesting {name}...")
    
    # Train model
    model.fit(X_train_selected, y_train)
    
    # Evaluate
    train_acc = accuracy_score(y_train, model.predict(X_train_selected))
    test_acc = accuracy_score(y_test, model.predict(X_test_selected))
    cv_scores = cross_val_score(model, X_train_selected, y_train, cv=5)
    
    overfitting_gap = train_acc - test_acc
    
    reg_results[name] = {
        'train_acc': train_acc,
        'test_acc': test_acc,
        'cv_mean': cv_scores.mean(),
        'overfitting_gap': overfitting_gap,
        'model': model
    }
    
    print(f"  Train Acc: {train_acc:.4f}")
    print(f"  Test Acc: {test_acc:.4f}")
    print(f"  CV Acc: {cv_scores.mean():.4f}")
    print(f"  Overfitting Gap: {overfitting_gap:.4f}")

# Find best regularized model (lowest overfitting with decent performance)
best_reg_model = min(reg_results.keys(), 
                     key=lambda x: reg_results[x]['overfitting_gap'] if reg_results[x]['test_acc'] > 0.25 else 999)

print(f"\n🏆 Best Regularized Model: {best_reg_model}")
print(f"Overfitting Gap: {reg_results[best_reg_model]['overfitting_gap']:.4f}")
print(f"Test Accuracy: {reg_results[best_reg_model]['test_acc']:.4f}")

print("✅ Overfitting mitigation completed")

=== ADDRESSING OVERFITTING ===

Testing RF_Conservative...
  Train Acc: 0.3155
  Test Acc: 0.2795
  CV Acc: 0.2768
  Overfitting Gap: 0.0360

Testing RF_VeryConservative...
  Train Acc: 0.2453
  Test Acc: 0.2344
  CV Acc: 0.2358
  Overfitting Gap: 0.0109

Testing GradientBoosting_Reg...
  Train Acc: 0.3699
  Test Acc: 0.3072
  CV Acc: 0.3024
  Overfitting Gap: 0.0628

🏆 Best Regularized Model: RF_Conservative
Overfitting Gap: 0.0360
Test Accuracy: 0.2795
✅ Overfitting mitigation completed


In [14]:
# Cell 13: Final Model Evaluation and Insights
print("=== FINAL MODEL EVALUATION ===")

# Get final model
final_model = reg_results[best_reg_model]['model']
final_predictions = final_model.predict(X_test_selected)

# Confusion matrix for top genres (to avoid clutter)
top_genres = y_test.value_counts().head(8).index
mask = y_test.isin(top_genres)
y_test_top = y_test[mask]
y_pred_top = final_predictions[mask]

# Create simplified confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test_top, y_pred_top, labels=top_genres)

print("Confusion Matrix (Top 8 Genres):")
cm_df = pd.DataFrame(cm, index=top_genres, columns=top_genres)
print(cm_df)

# Genre-specific insights
print(f"\n🎯 GENRE CLASSIFICATION INSIGHTS:")
genre_insights = []
for genre in top_genres:
    mask = y_test == genre
    if mask.sum() > 20:  # Only genres with sufficient samples
        accuracy = accuracy_score(y_test[mask], final_predictions[mask])
        genre_insights.append((genre, accuracy, mask.sum()))

genre_insights.sort(key=lambda x: x[1], reverse=True)

for genre, acc, support in genre_insights:
    print(f"  {genre}: {acc:.3f} accuracy ({support} samples)")

# Feature importance from final model
final_importance = pd.DataFrame({
    'feature': X_train_selected.columns,
    'importance': final_model.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\n🔍 FINAL MODEL - TOP 10 FEATURES:")
for i, row in final_importance.head(10).iterrows():
    print(f"  {row['feature']}: {row['importance']:.4f}")

print("✅ Final evaluation completed")

=== FINAL MODEL EVALUATION ===
Confusion Matrix (Top 8 Genres):
Genre             pop  alternative_rock  heavy_metal  rock  hip_hop  trap  \
Genre                                                                       
pop               328                34           15    88       47    43   
alternative_rock  125               140          111    81       23    23   
heavy_metal        38                50          364    43       22    22   
rock              184                53           55   202       32    33   
hip_hop           130                35           25    43      112   209   
trap               24                 7            3     3       29   486   
indie_rock         92                75           52    80       26    14   
metal              20                25          180     8        6    10   

Genre             indie_rock  metal  
Genre                                
pop                       40      0  
alternative_rock          46     37  
heavy_metal  

In [15]:
# Cell 14: Model Summary and Recommendations
print("=" * 60)
print("🎵 MUSIC GENRE CLASSIFICATION - FINAL RESULTS")
print("=" * 60)

print(f"\n📊 DATASET SUMMARY:")
print(f"• Original samples: 551,443")
print(f"• Working samples: {len(X_balanced):,}")
print(f"• Genres classified: {len(y_test.unique())}")
print(f"• Features used: {len(X_train_selected.columns)}")

print(f"\n🏆 BEST MODEL PERFORMANCE:")
print(f"• Model: {best_reg_model}")
print(f"• Test Accuracy: {reg_results[best_reg_model]['test_acc']:.4f}")
print(f"• Cross-validation: {reg_results[best_reg_model]['cv_mean']:.4f}")
print(f"• Overfitting Gap: {reg_results[best_reg_model]['overfitting_gap']:.4f}")

print(f"\n🎯 CLASSIFICATION STRENGTHS:")
easiest_genres = [g for g, acc, _ in genre_insights[:3]]
hardest_genres = [g for g, acc, _ in genre_insights[-3:]]
print(f"• Best classified: {', '.join(easiest_genres)}")
print(f"• Most challenging: {', '.join(hardest_genres)}")

print(f"\n🔍 KEY PREDICTIVE FEATURES:")
top_5_features = final_importance.head(5)['feature'].tolist()
print(f"• Audio: {[f for f in top_5_features if not f.startswith('has_')]}")
print(f"• Lyrical: {[f for f in top_5_features if f.startswith('has_')]}")

print(f"\n💡 INSIGHTS & RECOMMENDATIONS:")
print(f"• Model works best for distinct genres (trap, heavy_metal, drum_and_bass)")
print(f"• Similar genres (rock subgenres, pop variants) are harder to distinguish")
print(f"• Audio features more important than lyrical content")
print(f"• Length and loudness are surprisingly predictive")

print(f"\n⚡ NEXT STEPS:")
print(f"• Collect more data for underperforming genres")
print(f"• Consider genre hierarchy (rock → subgenres)")
print(f"• Add more audio features (spectral, rhythm)")
print(f"• Ensemble multiple models")

baseline_accuracy = 1 / len(y_test.unique())  # Random chance
improvement = reg_results[best_reg_model]['test_acc'] / baseline_accuracy

print(f"\n📈 PERFORMANCE vs BASELINE:")
print(f"• Random chance: {baseline_accuracy:.4f}")
print(f"• Our model: {reg_results[best_reg_model]['test_acc']:.4f}")
print(f"• Improvement: {improvement:.1f}x better than random")

print("=" * 60)
print("🎉 ANALYSIS COMPLETE!")
print("=" * 60)

🎵 MUSIC GENRE CLASSIFICATION - FINAL RESULTS

📊 DATASET SUMMARY:
• Original samples: 551,443
• Working samples: 36,005
• Genres classified: 24
• Features used: 28

🏆 BEST MODEL PERFORMANCE:
• Model: RF_Conservative
• Test Accuracy: 0.2795
• Cross-validation: 0.2768
• Overfitting Gap: 0.0360

🎯 CLASSIFICATION STRENGTHS:
• Best classified: trap, heavy_metal, pop
• Most challenging: alternative_rock, indie_rock, hip_hop

🔍 KEY PREDICTIVE FEATURES:
• Audio: ['Explicit_binary', 'Acousticness', 'Speechiness', 'Energy', 'Danceability']
• Lyrical: []

💡 INSIGHTS & RECOMMENDATIONS:
• Model works best for distinct genres (trap, heavy_metal, drum_and_bass)
• Similar genres (rock subgenres, pop variants) are harder to distinguish
• Audio features more important than lyrical content
• Length and loudness are surprisingly predictive

⚡ NEXT STEPS:
• Collect more data for underperforming genres
• Consider genre hierarchy (rock → subgenres)
• Add more audio features (spectral, rhythm)
• Ensemble multi