In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import defaultdict

In [None]:
df_track = pd.read_csv('tracks.csv', low_memory=False)
df_track.dropna(inplace=True)  
df_track.drop_duplicates(subset='id' ,keep='first', inplace=True)
df_track.head()

In [None]:
genre_map = {
    'minimal-techno': 'Electronic',
    'hip-hop': 'Hip-Hop/Rap',
    'honky-tonk': 'Country',
    'house': 'Electronic',
    'idm': 'Electronic',
    'indian': 'World',
    'indie-pop': 'Indie',
    'indie': 'Indie',
    'industrial': 'Electronic',
    'iranian': 'World',
    'j-dance': 'Electronic',
    'j-idol': 'Pop',
    'j-pop': 'Pop',
    'j-rock': 'Rock',
    'jazz': 'Jazz',
    'k-pop': 'Pop',
    'alternative': 'Alternative',
    'ambient': 'Electronic',
    'anime': 'Pop',
    'black-metal': 'Metal',
    'bluegrass': 'Country',
    'blues': 'Blues',
    'brazil': 'World',
    'breakbeat': 'Electronic',
    'sleep': 'Ambient',
    'songwriter': 'Folk',
    'soul': 'R&B/Soul',
    'spanish': 'World',
    'study': 'Ambient',
    'swedish': 'Pop',
    'synth-pop': 'Electronic',
    'tango': 'World',
    'techno': 'Electronic',
    'trance': 'Electronic',
    'trip-hop': 'Electronic',
    'turkish': 'World',
    'world-music': 'World',
    'edm': 'Electronic',
    'electro': 'Electronic',
    'electronic': 'Electronic',
    'emo': 'Alternative',
    'folk': 'Folk',
    'forro': 'World',
    'french': 'World',
    'funk': 'Funk',
    'garage': 'Electronic',
    'german': 'World',
    'gospel': 'Gospel',
    'goth': 'Alternative',
    'grindcore': 'Metal',
    'groove': 'R&B/Soul',
    'grunge': 'Rock',
    'guitar': 'Rock',
    'acoustic': 'Folk',
    'afrobeat': 'World',
    'alt-rock': 'Rock',
    'happy': 'Pop',
    'hard-rock': 'Rock',
    'hardcore': 'Metal',
    'hardstyle': 'Electronic',
    'heavy-metal': 'Metal',
    'mandopop': 'Pop',
    'metal': 'Metal',
    'metalcore': 'Metal',
    'pop-film': 'Pop',
    'pop': 'Pop',
    'power-pop': 'Pop',
    'progressive-house': 'Electronic',
    'psych-rock': 'Rock',
    'punk-rock': 'Rock',
    'punk': 'Rock',
    'r-n-b': 'R&B/Soul',
    'reggae': 'Reggae',
    'reggaeton': 'Reggaeton',
    'rock-n-roll': 'Rock',
    'rock': 'Rock',
    'rockabilly': 'Rock',
    'romance': 'Pop',
    'sad': 'Pop',
    'salsa': 'Latino',
    'samba': 'Latino',
    'sertanejo': 'Latino',
    'show-tunes': 'Musical',
    'ska': 'Reggae',
    'mpb': 'World',
    'new-age': 'New Age',
    'kids': 'Children',
    'latin': 'Latino',
    'latino': 'Latino',
    'malay': 'World',
    'chicago-house': 'Electronic',
    'children': 'Children',
    'chill': 'Ambient',
    'classical': 'Classical',
    'club': 'Electronic',
    'comedy': 'Comedy',
    'country': 'Country',
    'opera': 'Classical',
    'pagode': 'World',
    'party': 'Pop',
    'piano': 'Classical',
    'british': 'World',
    'cantopop': 'Pop',
    'dance': 'Electronic',
    'dancehall': 'Reggae',
    'death-metal': 'Metal',
    'deep-house': 'Electronic',
    'detroit-techno': 'Electronic',
    'disco': 'Electronic',
    'disney': 'Children',
    'drum-and-bass': 'Electronic',
    'dub': 'Reggae',
    'dubstep': 'Electronic'
}

df_track['genre_cluster'] = df_track['genre'].replace(genre_map)


In [None]:
columns = ['duration_ms', 'popularity', 'danceability', 'energy', 'loudness', 'speechiness',
           'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'start_of_fade_out', 
           'n_beats']
df_sub = df_track[columns]
df_sub

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score 

from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [None]:
X = df_sub.values
y = df_track['genre_cluster'].values
X.shape, y.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)

In [None]:
X_train.shape

# Gradient Boosting 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [None]:
# Inizializziamo il modello
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=100)

# Addestriamo il modello
gbc.fit(X_train, y_train)

# Facciamo le predizioni
y_pred = gbc.predict(X_test)

# Valutiamo il modello
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print("\nClassification Report:")
print(classification_report(y_test, y_pred))




In [12]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

 Alternative       0.19      0.02      0.04       779
     Ambient       0.67      0.61      0.64       778
       Blues       0.45      0.11      0.17       234
    Children       0.53      0.39      0.45       863
   Classical       0.48      0.45      0.47       678
      Comedy       0.91      0.81      0.86       298
     Country       0.48      0.48      0.48       851
  Electronic       0.53      0.73      0.61      5973
        Folk       0.45      0.16      0.23       673
        Funk       0.51      0.24      0.33       191
      Gospel       0.33      0.20      0.25       203
 Hip-Hop/Rap       0.37      0.26      0.31       289
       Indie       0.33      0.09      0.15       340
        Jazz       0.64      0.59      0.61       220
      Latino       0.50      0.41      0.45       979
       Metal       0.62      0.67      0.65      1632
     Musical       0.41      0.22      0.29       299
   

In [13]:
from sklearn.ensemble import HistGradientBoostingClassifier

In [None]:
clf = HistGradientBoostingClassifier(learning_rate=1.0, max_depth=3, random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

# XGBOOST

In [19]:
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

Converto la mia colonna generi in numeri perchè XGBoost non accetta stringhe come input!

In [24]:
# Converte le etichette in valori numerici
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Dividi il dataset in train e test set
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=100, stratify=y_encoded)

# Inizializza e addestra il modello XGBoost
xgb_model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=100)
xgb_model.fit(X_train, y_train)

# Fai le predizioni
y_pred = xgb_model.predict(X_test)

# Valuta il modello
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.44

Classification Report:
              precision    recall  f1-score   support

           0       0.11      0.00      0.01       779
           1       0.63      0.59      0.61       778
           2       1.00      0.05      0.10       234
           3       0.52      0.35      0.42       863
           4       0.46      0.44      0.45       678
           5       0.93      0.81      0.87       298
           6       0.40      0.42      0.41       851
           7       0.50      0.73      0.59      5973
           8       0.45      0.07      0.12       673
           9       0.79      0.14      0.24       191
          10       0.36      0.12      0.18       203
          11       0.44      0.15      0.23       289
          12       0.41      0.03      0.06       340
          13       0.49      0.57      0.53       220
          14       0.51      0.34      0.41       979
          15       0.60      0.68      0.64      1632
          16       0.39      0.11      0.1

# LightGBM

In [27]:
import lightgbm as lgb

In [28]:
# Definisci le feature (X) e l'etichetta (y)
X = df_sub.values
y = df_track['genre_cluster'].values

# Converte le etichette in valori numerici
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Dividi il dataset in train e test set
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=100, stratify=y_encoded)

# Inizializza e addestra il modello LightGBM
lgb_model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=100)
lgb_model.fit(X_train, y_train)

# Fai le predizioni
y_pred = lgb_model.predict(X_test)

# Valuta il modello
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001527 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3153
[LightGBM] [Info] Number of data points in the train set: 62693, number of used features: 13
[LightGBM] [Info] Start training from score -3.541614
[LightGBM] [Info] Start training from score -3.542164
[LightGBM] [Info] Start training from score -4.745219
[LightGBM] [Info] Start training from score -3.438127
[LightGBM] [Info] Start training from score -3.680192
[LightGBM] [Info] Start training from score -4.503533
[LightGBM] [Info] Start training from score -3.453135
[LightGBM] [Info] Start training from score -1.503631
[LightGBM] [Info] Start training from score -3.687174
[LightGBM] [Info] Start training from score -4.945686
[LightGBM] [Info] Start training from score -4.886910
[LightGBM] [Info] Start training from score -4.531292
[LightGBM] [Info] Start training from score -4.370182
[LightGBM

# Cat Boost

In [32]:
from catboost import CatBoostClassifier

In [36]:
# Converte le etichette in valori numerici
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Dividi il dataset in train e test set
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=100, stratify=y_encoded)

# Inizializza e addestra il modello CatBoost
cat_model = CatBoostClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=100, verbose=0)
cat_model.fit(X_train, y_train)

# Fai le predizioni
y_pred = cat_model.predict(X_test)

# Valuta il modello
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=1))

Accuracy: 0.41

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00       779
           1       0.52      0.53      0.53       778
           2       1.00      0.00      0.00       234
           3       0.45      0.29      0.35       863
           4       0.41      0.35      0.37       678
           5       0.93      0.81      0.87       298
           6       0.42      0.35      0.38       851
           7       0.46      0.72      0.56      5973
           8       0.33      0.00      0.01       673
           9       1.00      0.01      0.02       191
          10       0.00      0.00      0.00       203
          11       0.07      0.00      0.01       289
          12       1.00      0.00      0.00       340
          13       0.48      0.40      0.44       220
          14       0.56      0.20      0.30       979
          15       0.56      0.67      0.61      1632
          16       0.47      0.03      0.0