# Analisi Spotify

## Preparazione dei dati per ML

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

In [34]:
# Load the cleaned dataset. pandas is already imported as `pd` in the
# imports cell at the top of the notebook, so it is not re-imported here.
# NOTE(review): the saved output shows an 'Unnamed: 0' column, which looks like
# an index written by a previous to_csv call - consider `index_col=0` if no
# later cell relies on that column.
df = pd.read_csv('spotify_clean.csv')

# Quick sanity check: print (rows, columns), then display the first rows.
print(df.shape)
df.head()

(84997, 21)


Unnamed: 0,track_id,track_name,artist_name,album_name,release_date,genre,popularity,danceability,energy,key,...,mode,instrumentalness,tempo,stream_count,country,explicit,label,release_year,release_month,duration_s
0,Trk-Bebd53Da84E1,Agent Every (0),Noah Rhodes,Beautiful Instead,2016-04-01,Pop,55.0,0.15,0.74,9,...,0,0.436,73.12,13000,Brazil,0,Universal Music,2016,4,234.194
1,Trk-6A32496762D7,Night Respond,Jennifer Cole,Table,2022-04-15,Metal,45.0,0.44,0.46,0,...,0,0.223,157.74,1000,France,0,Island Records,2022,4,375.706
2,Trk-47Aa7523463E,Future Choice Whatever,Brandon Davis,Page Southern,2016-02-23,Rock,55.0,0.62,0.8,8,...,1,0.584,71.03,1000,Germany,0,Xl Recordings,2016,2,289.191
3,Trk-25Ada22E3B06,Bad Fall Pick Those,Corey Jones,Spring,2015-10-12,Pop,51.0,0.78,0.98,1,...,1,0.684,149.0,1000,France,0,Warner Music,2015,10,209.484
4,Trk-9245F2Ad996A,Husband,Mark Diaz,Great Prove,2022-07-08,Indie,39.0,0.74,0.18,10,...,0,0.304,155.85,2000,United States,0,Independent,2022,7,127.435


## Feature Engineering

In [35]:
# Median of the `tempo` column, intended as a reference point.
# NOTE(review): no visible cell reads `tempo_median`; the tempo categories
# defined right after this line use fixed thresholds instead - confirm whether
# this value is still needed or the thresholds were meant to derive from it.
tempo_median = df['tempo'].median()
def tempo_category(tempo, slow_max=100, medium_max=140):
    """Classify a tempo value as 'slow', 'medium' or 'fast'.

    Args:
        tempo: Numeric tempo value to classify (the notebook applies this to
            the `tempo` column; presumably beats per minute - confirm).
        slow_max: Exclusive upper bound of the 'slow' band. Defaults to 100.
        medium_max: Exclusive upper bound of the 'medium' band. Defaults to 140.

    Returns:
        'slow' if ``tempo < slow_max``, 'medium' if
        ``slow_max <= tempo < medium_max``, otherwise 'fast'.

    Raises:
        ValueError: If ``slow_max`` is greater than ``medium_max``.

    Note:
        NaN compares False against both bounds, so a NaN tempo falls through
        to 'fast'. Filter or impute missing tempos before calling this.
    """
    if slow_max > medium_max:
        raise ValueError(
            f"slow_max ({slow_max}) must not exceed medium_max ({medium_max})"
        )
    if tempo < slow_max:
        return 'slow'
    if tempo < medium_max:
        return 'medium'
    return 'fast'
# Label every track with its tempo band.
df['tempo_cat'] = df['tempo'].map(tempo_category)

# Loudness binning: three left-closed bands (right=False), i.e.
# [-inf, -10) -> low, [-10, -5) -> medium, [-5, +inf) -> high.
bins = [-np.inf, -10, -5, np.inf]
labels = ['low_loudness', 'medium_loudness', 'high_loudness']
df['loudness_cat'] = pd.cut(df['loudness'], bins, right=False, labels=labels)

# Interaction term: danceability x loudness.
# Meant to help the model pick up tracks that are both danceable and loud.
df['dance_loud_interact'] = df['danceability'].mul(df['loudness'])

In [36]:
# Quick look at the engineered columns next to the raw ones they derive from.
fe_preview_cols = ['tempo', 'tempo_cat', 'loudness', 'loudness_cat', 'dance_loud_interact']
print("Colonne dopo Feature Engineering:")
print(df[fe_preview_cols].head())

# 4. Updated feature selection.
# Numeric inputs, including the new danceability x loudness interaction.
features_num = [
    'danceability',
    'energy',
    'tempo',
    'instrumentalness',
    'duration_s',
    'dance_loud_interact',
]
# Categorical inputs, including the new tempo and loudness bands.
features_cat = [
    'key',
    'mode',
    'tempo_cat',
    'loudness_cat',
    'label',
    'explicit',
    'country',
]

Colonne dopo Feature Engineering:
    tempo tempo_cat  loudness  loudness_cat  dance_loud_interact
0   73.12      slow    -32.22  low_loudness              -4.8330
1  157.74      fast    -14.02  low_loudness              -6.1688
2   71.03      slow    -48.26  low_loudness             -29.9212
3  149.00      fast    -34.47  low_loudness             -26.8866
4  155.85      fast    -17.84  low_loudness             -13.2016


In [37]:
# Feature matrix: numeric columns first, then the categorical ones.
model_columns = [*features_num, *features_cat]
X = df[model_columns]
# Regression target.
y = df['popularity']

In [38]:
# Train/test split: hold out 20% of the rows; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: standardise the numeric columns, one-hot encode the
# categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), features_num),
        # drop='first' removes one dummy column per categorical feature.
        # handle_unknown='ignore' keeps transform() from raising when the test
        # split contains a category that never appeared in the training split;
        # such values are encoded as all zeros for that feature.
        ('cat',
         OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False),
         features_cat)
    ])


In [39]:
# Display the first rows again: the frame now also carries the engineered
# columns tempo_cat, loudness_cat and dance_loud_interact.
df.head()

Unnamed: 0,track_id,track_name,artist_name,album_name,release_date,genre,popularity,danceability,energy,key,...,stream_count,country,explicit,label,release_year,release_month,duration_s,tempo_cat,loudness_cat,dance_loud_interact
0,Trk-Bebd53Da84E1,Agent Every (0),Noah Rhodes,Beautiful Instead,2016-04-01,Pop,55.0,0.15,0.74,9,...,13000,Brazil,0,Universal Music,2016,4,234.194,slow,low_loudness,-4.833
1,Trk-6A32496762D7,Night Respond,Jennifer Cole,Table,2022-04-15,Metal,45.0,0.44,0.46,0,...,1000,France,0,Island Records,2022,4,375.706,fast,low_loudness,-6.1688
2,Trk-47Aa7523463E,Future Choice Whatever,Brandon Davis,Page Southern,2016-02-23,Rock,55.0,0.62,0.8,8,...,1000,Germany,0,Xl Recordings,2016,2,289.191,slow,low_loudness,-29.9212
3,Trk-25Ada22E3B06,Bad Fall Pick Those,Corey Jones,Spring,2015-10-12,Pop,51.0,0.78,0.98,1,...,1000,France,0,Warner Music,2015,10,209.484,fast,low_loudness,-26.8866
4,Trk-9245F2Ad996A,Husband,Mark Diaz,Great Prove,2022-07-08,Indie,39.0,0.74,0.18,10,...,2000,United States,0,Independent,2022,7,127.435,fast,low_loudness,-13.2016


In [40]:
# Apply the preprocessing: fit the transformers on the training split only,
# then reuse the fitted transformers on the test split.
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

# Report the shapes of the preprocessed matrices.
train_shape, test_shape = X_train_proc.shape, X_test_proc.shape
print(f"\nDimensione X_train preprocessato (dopo FE): {train_shape}")
print(f"Dimensione X_test preprocessato (dopo FE): {test_shape}")


Dimensione X_train preprocessato (dopo FE): (67997, 38)
Dimensione X_test preprocessato (dopo FE): (17000, 38)


In [41]:
# 7. Model training
print("\nInizio addestramento di RandomForestRegressor con nuove features...")
# 100 trees, fixed seed for reproducibility, n_jobs=-1 to use all CPU cores.
rf_params = {'n_estimators': 100, 'random_state': 42, 'n_jobs': -1}
# fit() returns the fitted estimator, so construction and fitting can be chained.
model = RandomForestRegressor(**rf_params).fit(X_train_proc, y_train)
print("Addestramento completato.")


Inizio addestramento di RandomForestRegressor con nuove features...
Addestramento completato.


In [42]:
# 8. Model evaluation: predict the target for the preprocessed test split.
y_pred = model.predict(X_test_proc)

In [43]:
# Test-set metrics: mean absolute error and coefficient of determination.
# NOTE(review): the saved run reports R2 = -0.0227. A negative R2 means the
# model does worse than always predicting the mean of the target, so the
# current feature set should be revisited before trusting the importances.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [44]:
# Print the evaluation summary, one line per item (4 decimal places).
print(
    "\n--- Risultati con Feature Engineering ---",
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"R-quadro (R2 Score): {r2:.4f}",
    sep="\n",
)


--- Risultati con Feature Engineering ---
Mean Absolute Error (MAE): 11.4621
R-quadro (R2 Score): -0.0227


In [45]:
# --- 9. Feature importance analysis ---

# 1. Rebuild the column names of the preprocessed matrix.
# The numeric columns keep their original names; they come first because the
# 'num' transformer is listed before 'cat' in the preprocessor.
feature_names = features_num

# Names generated by the fitted OneHotEncoder for the categorical columns.
cat_encoder = preprocessor.named_transformers_['cat']
one_hot_features = cat_encoder.get_feature_names_out(features_cat).tolist()

# Numeric names followed by the one-hot names.
all_feature_names = [*feature_names, *one_hot_features]

# 2. Importances from the fitted model.
importances = model.feature_importances_

# 3. Table pairing each feature name with its importance.
feature_importance_df = pd.DataFrame(
    data={'Feature': all_feature_names, 'Importance': importances}
)

# 4. Rank by importance (descending) and keep the top 10.
ranked_importances = feature_importance_df.sort_values('Importance', ascending=False)
top_10_features = ranked_importances.iloc[:10]

print("\n--- Top 10 Feature Importances nel Modello (con Feature Engineering) ---")
print(top_10_features)


--- Top 10 Feature Importances nel Modello (con Feature Engineering) ---
                  Feature  Importance
4              duration_s    0.138896
2                   tempo    0.135286
5     dance_loud_interact    0.134868
3        instrumentalness    0.129377
1                  energy    0.106448
0            danceability    0.099354
17                 mode_1    0.014537
23      label_Independent    0.009303
26  label_Universal Music    0.009119
22              label_Emi    0.009062
