In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
# Load the raw track table; the path is relative to the notebook's working directory.
tracks_csv = '../dataset_tab/final_tracks.csv'
df_raw = pd.read_csv(tracks_csv)

In [4]:
# Display the column labels of the loaded frame to see which fields are available.
df_raw.columns

Index(['name', 'duration_ms', 'explicit', 'popularity', 'artists',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature', 'tempo_confidence', 'time_signature_confidence',
       'key_confidence', 'mode_confidence', 'genre', 'id'],
      dtype='object')

In [5]:
# Integer-encode the 'genre' column to use it as the classification target.
lbl = LabelEncoder()
y = lbl.fit_transform(df_raw['genre'])

# Copy of the fitted class labels; saved later alongside the arrays so the
# integer codes in y can be mapped back to genre names.
enc_vect = np.array(lbl.classes_)

# Feature frame: every column except the target.
X_raw = df_raw.drop(columns='genre')

In [6]:
# Stratified 70/30 train/test split with a fixed seed for reproducibility.
# The features are taken from X_raw (the frame with 'genre' removed) rather
# than df_raw, so the target column is not carried into X_train_raw /
# X_test_raw. Both frames have the same rows in the same order, so the split
# assigns the same rows to each side as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw,
    y,
    test_size=0.3,
    stratify=y,
    random_state=100,
)

In [7]:
# Display the distinct encoded classes in the test split and how many samples each has.
np.unique(y_test, return_counts=True)

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113]),
 array([267, 294,  86, 154, 252, 216, 290, 294, 177, 234, 289, 170, 261,
        293, 280, 199, 211, 288, 236, 166,  84, 193, 197, 155, 281, 180,
        263, 274, 244,  80,  46,  97, 253, 235, 164, 287, 268, 173, 260,
        192, 233, 267, 297, 250, 265, 271, 277, 196, 235, 272, 297, 198,
        294, 195, 293, 253,  53, 179, 275, 279, 291, 295, 219, 1

## Normalizzazione e codifica one-hot

In [26]:
# Columns treated as continuous features; listed once so the train and test
# selections cannot drift apart.
continuous_cols = [
    'duration_ms', 'popularity', 'danceability', 'energy', 'loudness',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence',
    'tempo', 'tempo_confidence', 'time_signature_confidence',
    'key_confidence', 'mode_confidence',
]

X_train_cont = X_train_raw[continuous_cols]
X_test_cont = X_test_raw[continuous_cols]

In [34]:
# Columns treated as categorical features; the same list is used for both splits.
categorical_cols = ['explicit', 'key', 'mode', 'time_signature']

X_train_cat = X_train_raw[categorical_cols]
X_test_cat = X_test_raw[categorical_cols]

### Dummy Encoding per feature categoriche

In [37]:
# Cast each categorical column to pandas' 'category' dtype so that
# get_dummies expands it into indicator columns.
category_dtypes = dict.fromkeys(['explicit', 'key', 'mode', 'time_signature'], 'category')

X_train_cat = X_train_cat.astype(category_dtypes)
X_test_cat = X_test_cat.astype(category_dtypes)

In [44]:
# One-hot encode the categorical columns as float indicator columns.
X_train_cat = pd.get_dummies(X_train_cat, dtype=float)

# The test split is encoded independently, so a category level that is absent
# from one split would otherwise yield a different (or differently ordered)
# set of columns. Align the test frame to the training columns: levels missing
# from the test split become all-zero columns, and levels never seen in
# training are dropped. This keeps the positional concatenation done later
# feature-aligned between train and test.
X_test_cat = pd.get_dummies(X_test_cat, dtype=float).reindex(
    columns=X_train_cat.columns, fill_value=0.0
)

In [48]:
# Display the one-hot encoded training frame to inspect the generated indicator columns.
X_train_cat

Unnamed: 0,explicit_False,explicit_True,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,...,key_9,key_10,key_11,mode_0,mode_1,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5
18568,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
71665,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
83255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
53416,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
74580,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49638,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
25510,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
17807,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
34867,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


### Standardizzazione (media 0, varianza 1) per le feature continue

In [46]:
# Fit the scaler on the training split only, then apply those same statistics
# to both splits so no information from the test set enters the scaling.
scaler = StandardScaler().fit(X_train_cont)
X_train_cont_norm = scaler.transform(X_train_cont)
X_test_cont_norm = scaler.transform(X_test_cont)

### Rimetto tutto insieme e salvo il dataset

In [52]:
# Build the final feature matrices: scaled continuous columns first, followed
# by the one-hot indicator columns, joined column-wise.
X_train_final = np.hstack([X_train_cont_norm, X_train_cat.to_numpy()])
X_test_final = np.hstack([X_test_cont_norm, X_test_cat.to_numpy()])

In [53]:
# Display the shapes of the final matrices; both must have the same number of columns.
X_train_final.shape, X_test_final.shape

((58767, 36), (25186, 36))

In [57]:
# Persist the processed splits and the label lookup in a single .npz archive
# (np.savez appends the '.npz' extension to the given path).
# NOTE(review): if enc_vect has object dtype (string labels), loading the
# archive will presumably need allow_pickle=True; confirm with the consumers.
processed_arrays = {
    'X_train': X_train_final,
    'X_test': X_test_final,
    'y_train': y_train,
    'y_test': y_test,
    'enc_dict': np.array(enc_vect),
}
np.savez('../dataset_tab/tracks_processed', **processed_arrays)