In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

%matplotlib inline
# Single source of truth for the random seed used in this notebook.
seed = 42

# Seed NumPy's legacy global RNG from the same constant so the two values
# cannot silently drift apart if `seed` is edited later.
np.random.seed(seed)


# Part 1 - MLP for Regression (Spotify Song Popularity)

In this part, we will build a **4-layer** MLP to predict song popularity.

## 1.1 Data Loading and Preprocessing

This section prepares the dataset for a regression task on Spotify song popularity.

It includes feature selection, data cleaning, normalization, and train–test splitting, with the resulting arrays formatted to match the expected input structure of a fully connected neural network.


In [3]:
# Load the dataset
def extract_release_year(release_dates):
    """Return the leading four-digit year of each release-date value.

    Parameters
    ----------
    release_dates : pandas.Series
        Release dates; each value is converted to ``str`` before matching.

    Returns
    -------
    pandas.Series
        Numeric years aligned with the input index. Values that do not start
        with four digits (including missing values) become NaN.

    Notes
    -----
    Only the leading digits are read, so values of differing precision
    (e.g. ``'1922'``, ``'1922-06'``, ``'1922-02-22'``) all yield a year.
    Parsing such a mixed column with a single inferred datetime format and
    ``errors='coerce'`` can turn the non-matching values into NaT instead.
    NOTE(review): the exact release_date formats in tracks.csv are assumed,
    not visible from this cell — confirm against the file.
    """
    leading_year = release_dates.astype(str).str.extract(r'^(\d{4})', expand=False)
    return pd.to_numeric(leading_year, errors='coerce')


try:
    spotify_df = pd.read_csv('tracks.csv')

    # Derive 'year' from 'release_date' only when the file does not already
    # provide a 'year' column.
    if 'year' not in spotify_df.columns and 'release_date' in spotify_df.columns:
        spotify_df['year'] = extract_release_year(spotify_df['release_date'])

except FileNotFoundError:
    # No dataset on disk: build a small random frame with the same column
    # names so the rest of the pipeline can still be exercised end to end.
    print("Dataset file 'tracks.csv' not found.")
    print("Falling back to a synthetic dataset for pipeline validation.")
    spotify_df = pd.DataFrame(np.random.rand(100, 14), 
                              columns=['acousticness', 'danceability', 'energy', 'instrumentalness', 
                                       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'popularity', 
                                       'key', 'mode', 'explicit', 'year'])

# Select features (X) and target (Y)
features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 
            'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 
            'key', 'mode', 'explicit', 'year']
target = 'popularity'
model_columns = features + [target]

# Fail early with a readable message if the loaded frame lacks a needed column.
missing_columns = [name for name in model_columns if name not in spotify_df.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {missing_columns}")

# Ensure numeric types: anything that cannot be parsed becomes NaN.
# Work on an explicit copy so the assignment never targets a view of another frame.
spotify_df = spotify_df.copy()
spotify_df[model_columns] = spotify_df[model_columns].apply(pd.to_numeric, errors='coerce')

# Drop NAs, but only where a model column is missing. Restricting to
# `model_columns` keeps rows whose only gaps are in columns the model never reads.
spotify_df = spotify_df.dropna(subset=model_columns)

# For performance, we'll sample at most 30,000 tracks
max_tracks = 30000
if len(spotify_df) > max_tracks:
    spotify_df = spotify_df.sample(n=max_tracks, random_state=seed)

# X keeps all feature columns; Y is selected with a list so it stays a
# one-column DataFrame (2-D).
X_spotify = spotify_df[features]
Y_spotify = spotify_df[[target]]

# Split data first, so that the held-out rows play no part in fitting the scalers.
X_train_raw, X_test_raw, Y_train_raw, Y_test_raw = train_test_split(
    X_spotify, Y_spotify, test_size=0.2, random_state=seed
)

# Scale the data: learn min/max from the TRAINING rows only, then apply the
# same transform everywhere. Fitting on the full dataset would leak test-set
# statistics into training. As a consequence, scaled test values may fall
# slightly outside [0, 1].
scaler_X = MinMaxScaler().fit(X_train_raw)
scaler_Y = MinMaxScaler().fit(Y_train_raw)

# Row-major arrays: (num_examples, features) and (num_examples, 1).
X_train_reg = scaler_X.transform(X_train_raw)
X_test_reg = scaler_X.transform(X_test_raw)
Y_train_reg = scaler_Y.transform(Y_train_raw)
Y_test_reg = scaler_Y.transform(Y_test_raw)

# Full-dataset scaled arrays, kept under their existing names for any later
# cell that refers to them; they use the train-fitted scalers as well.
X_spotify_scaled = scaler_X.transform(X_spotify)
Y_spotify_scaled = scaler_Y.transform(Y_spotify)

# Transpose for our NN architecture: (features, num_examples)
x_train_reg = X_train_reg.T
y_train_reg = Y_train_reg.T
x_test_reg = X_test_reg.T
y_test_reg = Y_test_reg.T

# Summarise the training arrays after transposition. The first label reads
# "X_train_reg", but the value shown is the transposed `x_train_reg`.
train_shape_report = [
    f"X_train_reg shape: {x_train_reg.shape}",
    f"y_train_reg shape: {y_train_reg.shape}",
    f"(Input features, 'I' = {x_train_reg.shape[0]})",
]
print("\n".join(train_shape_report))

X_train_reg shape: (13, 24000)
y_train_reg shape: (1, 24000)
(Input features, 'I' = 13)
