In [1]:
import pandas as pd
import numpy as np

# Feature Selection

### Load in the data

In [2]:
# Read the cleaned Spotify songs table from disk and preview its first rows.
songs_csv = 'dataset/spotify_songs_restructured_cleaned.csv'
data = pd.read_csv(songs_csv)
data.head()

Unnamed: 0,track_name,track_artist,track_popularity,track_album_name,track_album_release_date,playlist_genre,danceability,energy,key,loudness,...,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,popularity_category,tempo_category,loudness_category,release_month
0,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66.0,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,pop,0.748,0.916,6.0,-6.713226,...,0.102,0.0,0.0653,0.518,122.036,194754.0,Medium,Fast,Moderate,June
1,All the Time - Don Diablo Remix,Zara Larsson,70.0,All the Time (Don Diablo Remix),2019-07-05,pop,0.675,0.931,1.0,-6.713226,...,0.0794,2.3e-05,0.11,0.613,124.008,176616.0,Medium,Fast,Moderate,July
2,Call You Mine - Keanu Silva Remix,The Chainsmokers,60.0,Call You Mine - The Remixes,2019-07-19,pop,0.718,0.93,7.0,-3.778,...,0.0287,9e-06,0.204,0.277,121.956,169093.0,Medium,Fast,Loud,July
3,Beautiful People (feat. Khalid) - Jack Wins Remix,Ed Sheeran,67.0,Beautiful People (feat. Khalid) [Jack Wins Remix],2019-07-11,pop,0.675,0.919,8.0,-5.385,...,0.0799,0.0,0.191163,0.585,124.982,163049.0,Medium,Fast,Moderate,July
4,Never Really Over - R3HAB Remix,Katy Perry,62.0,Never Really Over (R3HAB Remix),2019-07-26,pop,0.449,0.856,5.0,-4.788,...,0.176028,0.0,0.176,0.152,112.648,187675.0,Medium,Moderate,Loud,July


In [3]:
# NOTE(review): everything in this cell is commented out. It looks like an earlier
# draft of the F-test feature selection that the next code cell performs on an
# extended feature set -- confirm it can be removed.
#
# from sklearn.feature_selection import f_classif, SelectKBest, f_regression
# 
# # Create a copy of your DataFrame
# # data_copy = data.copy()
# # 
# # # Replace all categorical columns with categorical codes
# # for col in data_copy.select_dtypes(include='category').columns:
# #     data_copy[col] = data_copy[col].cat.codes
# 
# X = data.drop(columns=['track_popularity']).select_dtypes(include=np.number)  # Feature set
# y = data['track_popularity']  # Target
# # y = data['popularity_category']  # Target
# 
# # Apply the F-test
# selector = SelectKBest(f_regression, k='all') # f_regression for continuous target, f_classif for categorical target
# X_new = selector.fit_transform(X, y)
# 
# # new_indices = selector.get_support(indices=True)
# # best_features_df = data.iloc[:, new_indices]
# 
# # Get the scores
# f_scores = selector.scores_
# 
# # list the f-score of each feature and sort them from high to low
# feature_scores = pd.DataFrame({'Feature': X.columns, 'F-score': f_scores})
# feature_scores = feature_scores.sort_values(by='F-score', ascending=False)
# top_features = feature_scores.head(9)['Feature'].tolist()
# 
# # Step 4: Drop columns that are not in the top 9 from data_copy
# data_best_features = data[['track_name', 'track_artist', 'track_album_name', 'track_popularity', 'popularity_category', 'track_album_release_date', 'playlist_genre'] + top_features]
# 
# print(feature_scores)
# 
# data_best_features.head(20)

### Feature Selection using the F-test

In [4]:
from sklearn.feature_selection import f_regression, mutual_info_regression, SelectKBest
import pandas as pd

# Purpose: derive numeric features from the date/text columns, rank every feature
# against `track_popularity` with a univariate F-test, and keep the best-ranked ones.
# Produces: data_copy, X, y, feature_scores, top_features, data_best_features.

# Number of best-ranked features kept in `data_best_features`.
N_TOP_FEATURES = 15

# Work on a copy so the loaded `data` frame stays untouched
data_copy = data.copy()

# --- Step 1: Transform date, genre, and text features ---

# Parse the release date once and derive the year and month from the same result
release_dates = pd.to_datetime(data_copy['track_album_release_date'])
data_copy['album_year'] = release_dates.dt.year
data_copy['album_month'] = release_dates.dt.month

# Encode text columns as integer category codes.
# NOTE(review): the codes follow the sorted order of the category labels (the output
# below shows June -> 6 and July -> 5), so they are arbitrary ordinals. The F-test
# measures linear dependence, so scores for these *_code columns should be read with
# care; `release_month_code` also duplicates the information in `album_month`.
data_copy['genre_code'] = data_copy['playlist_genre'].astype('category').cat.codes
data_copy['artist_code'] = data_copy['track_artist'].astype('category').cat.codes
# data_copy['loudness_category_code'] = data_copy['loudness_category'].astype('category').cat.codes
data_copy['release_month_code'] = data_copy['release_month'].astype('category').cat.codes

# Drop the original text fields that are not scored directly
data_copy = data_copy.drop(columns=['track_name', 'track_album_name', 'track_album_release_date', 'playlist_genre',
                                    'track_artist', 'popularity_category', 'loudness_category', 'tempo_category', 'release_month'])

# Numeric columns remaining after the transformations (kept for inspection)
numerical_cols = data_copy.select_dtypes(include=np.number).columns.tolist()
# categorical_cols = data_copy.select_dtypes(include='object').columns.tolist()

# Feature matrix and regression target
X = data_copy.drop(columns=['track_popularity'])
y = data_copy['track_popularity']

# --- Step 2: Feature Selection for Numerical and Categorical Features ---

# k='all' keeps every column; the selector is only used to obtain the F-scores
selector = SelectKBest(f_regression, k='all')
selector.fit(X, y)
f_scores = selector.scores_


# Table of feature names and F-scores, highest score first
feature_scores = pd.DataFrame({'Feature': X.columns, 'F-score': f_scores})
feature_scores = feature_scores.sort_values(by='F-score', ascending=False)

print(feature_scores)

# Select the top N_TOP_FEATURES features based on F-score
top_features = feature_scores.head(N_TOP_FEATURES)['Feature'].tolist()

# Keep only the target and the top-ranked features
data_best_features = data_copy[['track_popularity'] + top_features]
data_best_features.head(5)
# # --- Step 3: Convert Back to Categorical Codes ---
# 
# # Convert categorical codes back to categorical type in the selected data
# data_best_features['genre'] = data_best_features['genre_code'].astype('category')
# data_best_features['artist'] = data_best_features['artist_code'].astype('category')
# 
# # Display the top features and their scores

               Feature     F-score
7     instrumentalness  422.674851
11         duration_ms  353.577639
1               energy  173.013292
13         album_month  128.597861
12          album_year   94.953415
3             loudness   88.528857
6         acousticness   88.124296
0         danceability   68.268863
8             liveness   45.575241
16  release_month_code   41.906534
14          genre_code   27.733014
9              valence   21.195258
5          speechiness    1.593983
10               tempo    1.051871
2                  key    0.499438
4                 mode    0.481657
15         artist_code    0.303345


Unnamed: 0,track_popularity,instrumentalness,duration_ms,energy,album_month,album_year,loudness,acousticness,danceability,liveness,release_month_code,genre_code,valence,speechiness,tempo,key
0,66.0,0.0,194754.0,0.916,6,2019,-6.713226,0.102,0.748,0.0653,6,2,0.518,0.0583,122.036,6.0
1,70.0,2.3e-05,176616.0,0.931,7,2019,-6.713226,0.0794,0.675,0.11,5,2,0.613,0.0742,124.008,1.0
2,60.0,9e-06,169093.0,0.93,7,2019,-3.778,0.0287,0.718,0.204,5,2,0.277,0.106659,121.956,7.0
3,67.0,0.0,163049.0,0.919,7,2019,-5.385,0.0799,0.675,0.191163,5,2,0.585,0.127,124.982,8.0
4,62.0,0.0,187675.0,0.856,7,2019,-4.788,0.176028,0.449,0.176,5,2,0.152,0.0623,112.648,5.0


### Feature Selection using Correlation

In [5]:
# Using correlation matrix
# Pairwise correlations between all columns of the transformed frame
correlation_matrix = data_copy.corr()

# Keep the column for the target, remove its self-correlation entry,
# and order the remaining features from most positive to most negative.
target_correlations = (
    correlation_matrix.loc[:, 'track_popularity']
    .drop(index='track_popularity')
    .sort_values(ascending=False)
)

target_correlations

album_month           0.080628
album_year            0.069342
loudness              0.066966
acousticness          0.066813
danceability          0.058836
release_month_code    0.046128
genre_code            0.037539
valence               0.032822
speechiness           0.009006
mode                  0.004951
artist_code           0.003929
key                  -0.005041
tempo                -0.007316
liveness             -0.048100
energy               -0.093416
duration_ms          -0.132940
instrumentalness     -0.145100
Name: track_popularity, dtype: float64

### Saving the selected features

In [6]:
# Write the target plus the selected features to disk, omitting the row index.
data_best_features.to_csv(path_or_buf='dataset/spotify_features.csv', index=False)

### Model Training

In [7]:
from sklearn.model_selection import train_test_split

# Seed for both splits so the partitions (and every score computed from them)
# are the same on each Restart & Run All.
SPLIT_SEED = 42

# Apply train_test_split twice to create a train (60%), validation (20%) and test set (20%):
# first hold out 20% for testing, then take 25% of the remaining 80% (= 20% overall)
# for validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=SPLIT_SEED
)

print(f'Training set: {X_train.shape}, {y_train.shape}')
print(f'Validation set: {X_val.shape}, {y_val.shape}')
print(f'Test set: {X_test.shape}, {y_test.shape}')

Training set: (11793, 17), (11793,)
Validation set: (3931, 17), (3931,)
Test set: (3931, 17), (3931,)


In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Standardise the features because there are values on very different scales.
# The scaler is fitted on the training split only and then applied to all three splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# A random forest is used rather than a linear regression so that
# non-linear relationships in the data can be captured.
model = RandomForestRegressor(random_state=1).fit(X_train_scaled, y_train)

# R^2 of the fitted model on each split
train_score = model.score(X_train_scaled, y_train)
val_score = model.score(X_val_scaled, y_val)
test_score = model.score(X_test_scaled, y_test)

print(f"Train R^2: {train_score:.2f}")
print(f"Validation R^2: {val_score:.2f}")
print(f"Test R^2: {test_score:.2f}")

Train R^2: 0.89
Validation R^2: 0.22
Test R^2: 0.22


In [9]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
import numpy as np

# --- Scaling and model combined in one pipeline ---
# Inside cross-validation the scaler is re-fitted on each fold's training part
# before the random forest is trained on it.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
pipeline = make_pipeline(StandardScaler(), forest)

# Six shuffled folds with a fixed seed
kf = KFold(n_splits=6, shuffle=True, random_state=42)

# One R^2 value per fold, computed on the full feature matrix and target
scores = cross_val_score(estimator=pipeline, X=X, y=y, cv=kf, scoring='r2')

# Report the mean and spread across folds, then the individual fold scores
print(f'6-Fold Cross-Validation R^2: {scores.mean():.2f} (+/- {scores.std():.2f})')
print("Scores for each fold:", scores)

6-Fold Cross-Validation R^2: 0.24 (+/- 0.02)
Scores for each fold: [0.23362704 0.21828754 0.22945915 0.27213184 0.24461197 0.24201835]
