# Spotify Songs Analysis - Predictive Modeling

## 1. Setup and Data Loading


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Add src to path for custom modules
sys.path.append('../src')
from modeling import make_baseline_model, train_and_evaluate, save_model

# Set up paths
DATA_DIR = Path('../data')
CLEAN_DATA_PATH = DATA_DIR / 'clean_spotify.csv'

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline

# Set larger font sizes for better readability
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.titlesize'] = 14


In [None]:
# Read the cleaned dataset from CLEAN_DATA_PATH and show a quick preview
df = pd.read_csv(CLEAN_DATA_PATH)

dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")

print("\nFirst 3 rows:")
preview = df.head(3)
display(preview)


## 2. Create Target Variable


In [None]:
# Build the integer target column `popularity_category`.
#
# Encoding: 0 = Low, 1 = Medium, 2 = High.
# When the column has to be derived here, pd.cut is used with its default
# right-closed bins (-1, 40], (40, 60], (60, 101], i.e. a popularity of exactly
# 40 is Low and exactly 60 is Medium.
# NOTE(review): an earlier comment described the classes as "<40 / 40-60 / >60";
# confirm which boundary convention notebook 2 uses.
label_map = {'Low': 0, 'Medium': 1, 'High': 2}

if 'popularity_category' not in df.columns:
    # labels=False returns the bin index (0, 1, 2) and keeps NaN for missing or
    # out-of-range popularity instead of raising during an integer cast.
    encoded = pd.cut(df['popularity'], bins=[-1, 40, 60, 101], labels=False)
    print("Created popularity_category column")
else:
    raw = df['popularity_category']
    as_objects = raw.astype(object)
    # Accept both spellings of the target: the text labels ('Low', ...) and
    # numeric codes (0, 1, 2 - possibly stored as strings). Anything that
    # matches neither becomes NaN and is dropped below.
    from_labels = as_objects.map(label_map)
    from_numbers = pd.to_numeric(as_objects, errors='coerce')
    encoded = from_labels.fillna(from_numbers)
    if pd.api.types.is_numeric_dtype(raw):
        print("Using existing popularity_category column")
    else:
        print("Converted popularity_category to integer format")

# Drop rows without a usable target BEFORE casting to int: an integer column
# cannot hold NaN, so the cast would raise if this were done afterwards.
missing_target = encoded.isna()
if missing_target.any():
    print("\nWarning: Found NaN values in popularity_category. Dropping these rows.")
    df = df.loc[~missing_target].copy()
    encoded = encoded.loc[~missing_target]

df['popularity_category'] = encoded.astype(int)

print("\nPopularity category distribution:")
print(df['popularity_category'].value_counts().sort_index())


## 3. Feature Selection


In [None]:
# Audio features we would like to feed to the models
feature_columns = ['danceability', 'energy', 'loudness', 'acousticness',
                   'instrumentalness', 'liveness', 'valence', 'tempo']

# Keep only the requested features that are actually present in the dataframe,
# preserving the order in which they were requested
columns_present = set(df.columns)
available_features = [name for name in feature_columns if name in columns_present]
missing_features = set(feature_columns).difference(available_features)

print(f"Features to use: {available_features}")
print(f"\nFeatures missing: {missing_features}")

# Feature matrix and target vector (copies, so df itself stays untouched)
X = df.loc[:, available_features].copy()
y = df['popularity_category'].copy()

target_counts = y.value_counts().sort_index()
print(f"\nFeature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target distribution:\n{target_counts}")


## 4. Train/Test Split


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split, stratified on the target so both parts keep the same
# class proportions; the fixed seed makes the split reproducible
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

train_counts = y_train.value_counts().sort_index()
test_counts = y_test.value_counts().sort_index()

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"\nTraining set target distribution:\n{train_counts}")
print(f"\nTest set target distribution:\n{test_counts}")


## 5. Model Training and Evaluation


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Model 1: Logistic Regression
# Features are standardised inside the pipeline, so the scaler is fitted on the
# training data only and re-applied automatically at prediction time.
lr_banner = "=" * 60
print(lr_banner)
print("MODEL 1: Logistic Regression with StandardScaler")
print(lr_banner)

lr_steps = [
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000, random_state=42)),
]
lr_pipeline = Pipeline(lr_steps)
lr_pipeline.fit(X_train, y_train)
y_pred_lr = lr_pipeline.predict(X_test)

# Hold-out metrics
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy: {lr_accuracy:.4f}")
lr_f1_macro = f1_score(y_test, y_pred_lr, average='macro')
print(f"F1 Score (macro): {lr_f1_macro:.4f}")
print("\nClassification Report:")
lr_report = classification_report(y_test, y_pred_lr, target_names=['Low', 'Medium', 'High'])
print(lr_report)


In [None]:
# Model 2: Random Forest
# Trained on the unscaled features with 100 trees, using all CPU cores.
rf_banner = "=" * 60
print(rf_banner)
print("MODEL 2: Random Forest Classifier")
print(rf_banner)

rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Hold-out metrics
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy: {rf_accuracy:.4f}")
rf_f1_macro = f1_score(y_test, y_pred_rf, average='macro')
print(f"F1 Score (macro): {rf_f1_macro:.4f}")
print("\nClassification Report:")
rf_report = classification_report(y_test, y_pred_rf, target_names=['Low', 'Medium', 'High'])
print(rf_report)


In [None]:
# Optional Model 3: XGBoost
# Best-effort cell: the notebook must keep working when xgboost is not installed
# or fails for any reason, so every failure is reported and the model is skipped.
# `xgb_available` stays False unless the whole cell body succeeds.
xgb_available = False
try:
    import xgboost as xgb

    xgb_banner = "=" * 60
    print(xgb_banner)
    print("MODEL 3: XGBoost Classifier")
    print(xgb_banner)

    xgb_model = xgb.XGBClassifier(random_state=42, n_jobs=-1, eval_metric='mlogloss')
    xgb_model.fit(X_train, y_train)
    y_pred_xgb = xgb_model.predict(X_test)

    # Hold-out metrics
    xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
    print(f"Accuracy: {xgb_accuracy:.4f}")
    xgb_f1_macro = f1_score(y_test, y_pred_xgb, average='macro')
    print(f"F1 Score (macro): {xgb_f1_macro:.4f}")
    print("\nClassification Report:")
    xgb_report = classification_report(y_test, y_pred_xgb, target_names=['Low', 'Medium', 'High'])
    print(xgb_report)

    # Only reached when training, prediction and reporting all succeeded
    xgb_available = True
except ImportError:
    print("XGBoost not available. Skipping XGBoost model.")
except Exception as e:
    print(f"Error with XGBoost: {e}")


## 6. Confusion Matrices Visualization


In [None]:
# Visualize the confusion matrices of Logistic Regression and Random Forest
# side by side and save the figure to ../reports/figures.
class_codes = [0, 1, 2]                  # integer encoding of the target
class_names = ['Low', 'Medium', 'High']  # tick labels, same order as class_codes

# savefig does not create missing directories, so make sure the target exists
figures_dir = Path('../reports/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

# Pin `labels` so each matrix is always 3x3 and lines up with the tick labels,
# even if a class happens to be absent from y_test and the predictions.
cm_lr = confusion_matrix(y_test, y_pred_lr, labels=class_codes)
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=class_codes)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
panels = [
    ('Logistic Regression Confusion Matrix', cm_lr, 'Blues'),
    ('Random Forest Confusion Matrix', cm_rf, 'Greens'),
]
for ax, (title, matrix, cmap) in zip(axes, panels):
    sns.heatmap(matrix, annot=True, fmt='d', cmap=cmap, ax=ax,
                xticklabels=class_names,
                yticklabels=class_names)
    ax.set_title(title)
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')

plt.tight_layout()
plt.savefig(figures_dir / 'confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()


## 7. Feature Importance


In [None]:
# Rank the features by the importances of the fitted Random Forest, then show
# them as a table and as a horizontal bar chart saved to ../reports/figures.
feature_importance = pd.DataFrame({
    'feature': available_features,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("Feature Importance (Random Forest):")
display(feature_importance)

# savefig does not create missing directories, so make sure the target exists
figures_dir = Path('../reports/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

# One bar per feature; positions are explicit because the frame keeps its
# original (now shuffled) index after sorting
bar_positions = range(len(feature_importance))
plt.figure(figsize=(10, 6))
plt.barh(bar_positions, feature_importance['importance'], color='steelblue')
plt.yticks(bar_positions, feature_importance['feature'])
plt.xlabel('Importance')
plt.title('Feature Importance (Random Forest)')
plt.gca().invert_yaxis()  # most important feature at the top
plt.tight_layout()
plt.savefig(figures_dir / 'feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()


## 8. Model Comparison and Selection


In [None]:
# Collect the hold-out predictions of every model that was trained; XGBoost is
# included only when its cell completed successfully
scored_predictions = [
    ('Logistic Regression', y_pred_lr),
    ('Random Forest', y_pred_rf),
]
if xgb_available:
    scored_predictions.append(('XGBoost', y_pred_xgb))

# One row per model with its accuracy and macro-averaged F1 on the test set
results = pd.DataFrame(
    [
        {
            'Model': model_label,
            'Accuracy': accuracy_score(y_test, predictions),
            'F1 Score (macro)': f1_score(y_test, predictions, average='macro'),
        }
        for model_label, predictions in scored_predictions
    ],
    columns=['Model', 'Accuracy', 'F1 Score (macro)'],
)

print("Model Comparison:")
ranked_by_accuracy = results.sort_values('Accuracy', ascending=False)
display(ranked_by_accuracy)

# The winner is the model with the highest macro F1
best_row = results['F1 Score (macro)'].idxmax()
best_model_name = results.at[best_row, 'Model']
print(f"\nBest model (by F1 Score): {best_model_name}")


## 9. Save Best Model


In [None]:
# Save the model that won the comparison (highest macro F1, see `best_model_name`)
# together with the list of feature names it was trained on.
from pathlib import Path
import joblib

models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# Map the display names used in the `results` table to the fitted estimators.
# XGBoost is only registered when its cell ran successfully.
trained_models = {
    'Logistic Regression': lr_pipeline,
    'Random Forest': rf_model,
}
if xgb_available:
    trained_models['XGBoost'] = xgb_model

# Fall back to Random Forest (the previously hard-coded choice) if the selected
# name is not one of the models above.
best_model = trained_models.get(best_model_name, rf_model)
print(f"Selected model: {best_model_name}")

model_path = models_dir / 'best_model.joblib'
joblib.dump(best_model, model_path)
print(f"Best model saved to: {model_path}")

# Also save the feature names for later use
feature_names_path = models_dir / 'feature_names.joblib'
joblib.dump(available_features, feature_names_path)
print(f"Feature names saved to: {feature_names_path}")


## 10. Summary and Insights


In [None]:
# Final recap: per-model metrics, the five most important Random Forest
# features, and where the model artefacts were written.
# The emoji and bullet characters below were previously stored as mis-decoded
# UTF-8 (mojibake) and are restored here.
print("=" * 60)
print("MODELING SUMMARY")
print("=" * 60)

print("\n📊 MODEL PERFORMANCE:")
print("-" * 40)
for _, row in results.iterrows():
    print(f"{row['Model']}:")
    print(f"  • Accuracy: {row['Accuracy']:.4f}")
    print(f"  • F1 Score (macro): {row['F1 Score (macro)']:.4f}")

print("\n🔍 TOP FEATURES (Random Forest):")
print("-" * 40)
for _, row in feature_importance.head(5).iterrows():
    print(f"  • {row['feature']}: {row['importance']:.4f}")

print("\n💾 SAVED MODEL:")
print("-" * 40)
print(f"  • Model: {model_path}")
print(f"  • Features: {feature_names_path}")

print("\n" + "=" * 60)
print("Modeling completed successfully!")
print("=" * 60)
