In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import f_regression, SelectKBest

# Feature Selection

### Load in the data

In [14]:
# Read the cleaned Spotify song table from disk and preview its first rows.
source_csv = 'dataset/spotify_songs_restructured_cleaned.csv'
data = pd.read_csv(source_csv)
data.head(n=5)

Unnamed: 0,track_name,track_artist,track_popularity,track_album_name,track_album_release_date,playlist_genre,danceability,energy,key,loudness,...,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,popularity_category,tempo_category,loudness_category,release_month
0,Memories - Dillon Francis Remix,Maroon 5,67.0,Memories (Dillon Francis Remix),2019-12-13,pop,0.726,0.815,11.0,-4.969,...,0.0724,0.00421,0.357,0.693,99.972,162600.0,Medium Popularity,Slow,Loud,December
1,All the Time - Don Diablo Remix,Zara Larsson,70.0,All the Time (Don Diablo Remix),2019-07-05,pop,0.675,0.931,1.0,-3.432,...,0.0794,2.3e-05,0.19031,0.613,124.008,176616.0,Medium Popularity,Fast,Loud,July
2,Call You Mine - Keanu Silva Remix,The Chainsmokers,60.0,Call You Mine - The Remixes,2019-07-19,pop,0.718,0.93,7.0,-3.778,...,0.0287,9e-06,0.204,0.509838,121.956,169093.0,Medium Popularity,Fast,Loud,July
3,Never Really Over - R3HAB Remix,Katy Perry,62.0,Never Really Over (R3HAB Remix),2019-07-26,pop,0.449,0.856,5.387222,-4.788,...,0.187,0.0,0.176,0.152,112.648,187675.0,Medium Popularity,Moderate,Loud,July
4,Cross Me (feat. Chance the Rapper & PnB Rock) ...,Ed Sheeran,58.0,Cross Me (feat. Chance the Rapper & PnB Rock) ...,2019-06-21,pop,0.679,0.923,5.387222,-6.5,...,0.146,5e-06,0.124,0.752,121.984,207894.0,Medium Popularity,Fast,Moderate,June


### Feature Selection using F-test

In [15]:
# Rank every candidate feature against `track_popularity` with a univariate
# F-test and keep the highest-scoring ones in `data_best_features`.

# Number of top-ranked features (by F-score) kept in the output table
N_TOP_FEATURES = 12

# Copy of data to avoid modifying the original
data_copy = data.copy()

# --- Step 1: Transform date, genre, and text features ---

# Parse the release date once, then derive the year and month from it
release_date = pd.to_datetime(data_copy['track_album_release_date'])
data_copy['album_year'] = release_date.dt.year
data_copy['album_month'] = release_date.dt.month

# Encode genre and artist as integer category codes.
# NOTE(review): these codes are arbitrary labels for nominal values and carry
# no numeric meaning, so a linear F-test on them (artist_code in particular)
# is of doubtful value -- consider one-hot or target encoding instead.
data_copy['genre_code'] = data_copy['playlist_genre'].astype('category').cat.codes
data_copy['artist_code'] = data_copy['track_artist'].astype('category').cat.codes

# Drop the original text fields we don't want to process further for feature selection
data_copy = data_copy.drop(columns=['track_name', 'track_album_name', 'track_album_release_date', 'playlist_genre',
                                    'track_artist', 'popularity_category', 'loudness_category', 'tempo_category', 'release_month'])

# Names of the numeric columns that remain after the transformations
numerical_cols = data_copy.select_dtypes(include=np.number).columns.tolist()

# Split into predictors and the regression target
X = data_copy.drop(columns=['track_popularity'])
y = data_copy['track_popularity']

# --- Step 2: Score every feature against the target ---

# k='all' keeps every column; the selector is used only to obtain the F-scores
selector = SelectKBest(f_regression, k='all')
selector.fit(X, y)
f_scores = selector.scores_

# Create a DataFrame with feature names and F-scores, sorted by score
feature_scores = pd.DataFrame({'Feature': X.columns, 'F-score': f_scores})
feature_scores = feature_scores.sort_values(by='F-score', ascending=False)

print(feature_scores)

# Select the top N_TOP_FEATURES features based on F-score
top_features = feature_scores.head(N_TOP_FEATURES)['Feature'].tolist()

# Keep only the target column and the selected features
data_best_features = data_copy[['track_popularity'] + top_features]
data_best_features.head(5)

             Feature     F-score
7   instrumentalness  418.448037
11       duration_ms  395.967208
1             energy  228.625375
6       acousticness  136.102681
13       album_month  125.755961
12        album_year   88.595386
0       danceability   74.680534
3           loudness   61.551568
8           liveness   49.608652
14        genre_code   32.764354
9            valence   18.166556
15       artist_code    2.324400
5        speechiness    0.945192
10             tempo    0.503744
4               mode    0.271951
2                key    0.090617


Unnamed: 0,track_popularity,instrumentalness,duration_ms,energy,acousticness,album_month,album_year,danceability,loudness,liveness,genre_code,valence,artist_code
0,67.0,0.00421,162600.0,0.815,0.0724,12,2019,0.726,-4.969,0.357,2,0.693,4535
1,70.0,2.3e-05,176616.0,0.931,0.0794,7,2019,0.675,-3.432,0.19031,2,0.613,7724
2,60.0,9e-06,169093.0,0.93,0.0287,7,2019,0.718,-3.778,0.204,2,0.509838,6862
3,62.0,0.0,187675.0,0.856,0.187,7,2019,0.449,-4.788,0.176,2,0.152,3635
4,58.0,5e-06,207894.0,0.923,0.146,6,2019,0.679,-6.5,0.124,2,0.752,2056


### Feature Selection using Correlation

In [16]:
# Pairwise correlations between all columns of the transformed table
correlation_matrix = data_copy.corr()

# Correlation of each feature with the target, strongest positive first;
# the target's own entry is removed.
target_correlations = (
    correlation_matrix['track_popularity']
    .drop(labels='track_popularity')
    .sort_values(ascending=False)
)
target_correlations

acousticness        0.082530
album_month         0.079352
album_year          0.066666
danceability        0.061228
loudness            0.055605
genre_code          0.040598
valence             0.030241
artist_code         0.010822
speechiness         0.006901
mode                0.003702
key                -0.002137
tempo              -0.005038
liveness           -0.049934
energy             -0.106718
duration_ms        -0.139863
instrumentalness   -0.143699
Name: track_popularity, dtype: float64

### Saving the selected features

In [6]:
# Write the selected-feature table to disk without the DataFrame index column.
data_best_features.to_csv(path_or_buf='dataset/spotify_features.csv', index=False)