In [43]:
import pandas as pd
import sklearn as skl
import ast
import numpy as np
import utils

# This notebook needs to be in the same folder as utils.py, genres.csv, features.csv and echonest.csv; tracks.csv is read from the audio_files subfolder.

In [45]:
# Read the three CSV tables through the project's utils.load helper.
genres, features, echonest = (
    utils.load(csv_name)
    for csv_name in ('genres.csv', 'features.csv', 'echonest.csv')
)

In [31]:
# Preview the first five rows of the feature table.
features.head(n=5)

feature,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,7.180653,5.230309,0.249321,1.34762,1.482478,0.531371,1.481593,2.691455,0.866868,1.341231,...,0.054125,0.012226,0.012111,5.75889,0.459473,0.085629,0.071289,0.0,2.089872,0.061448
3,1.888963,0.760539,0.345297,2.295201,1.654031,0.067592,1.366848,1.054094,0.108103,0.619185,...,0.063831,0.014212,0.01774,2.824694,0.466309,0.084578,0.063965,0.0,1.716724,0.06933
5,0.527563,-0.077654,-0.27961,0.685883,1.93757,0.880839,-0.923192,-0.927232,0.666617,1.038546,...,0.04073,0.012691,0.014759,6.808415,0.375,0.053114,0.041504,0.0,2.193303,0.044861
10,3.702245,-0.291193,2.196742,-0.234449,1.367364,0.998411,1.770694,1.604566,0.521217,1.982386,...,0.074358,0.017952,0.013921,21.434212,0.452148,0.077515,0.071777,0.0,3.542325,0.0408
20,-0.193837,-0.198527,0.201546,0.258556,0.775204,0.084794,-0.289294,-0.81641,0.043851,-0.804761,...,0.095003,0.022492,0.021355,16.669037,0.469727,0.047225,0.040039,0.000977,3.189831,0.030993


In [47]:
from pandas.api.types import CategoricalDtype

# Load the track metadata. The CSV carries a two-level column header and the
# first column is used as the index.
# A forward slash is used as the separator because it is accepted on Windows,
# Linux and macOS alike; the previous backslash path only resolved on Windows.
tracks = pd.read_csv('audio_files/tracks.csv', index_col=0, header=[0, 1])

# These columns are stored as the text form of Python literals; parse them
# back into Python objects.
COLUMNS = [('track', 'tags'), ('album', 'tags'), ('artist', 'tags'),
           ('track', 'genres'), ('track', 'genres_all')]
for column in COLUMNS:
    tracks[column] = tracks[column].map(ast.literal_eval)

# Date-like columns are converted to pandas datetimes.
COLUMNS = [('track', 'date_created'), ('track', 'date_recorded'),
           ('album', 'date_created'), ('album', 'date_released'),
           ('artist', 'date_created'), ('artist', 'active_year_begin'),
           ('artist', 'active_year_end')]
for column in COLUMNS:
    tracks[column] = pd.to_datetime(tracks[column])

# The subset column becomes an ORDERED categorical (small < medium < large),
# which is what makes comparisons such as `<= 'small'` meaningful later on.
SUBSETS = ('small', 'medium', 'large')
tracks['set', 'subset'] = tracks['set', 'subset'].astype(
    CategoricalDtype(categories=SUBSETS, ordered=True))

# Remaining text columns are stored as plain (unordered) categoricals.
COLUMNS = [('track', 'genre_top'), ('track', 'license'),
           ('album', 'type'), ('album', 'information'), ('artist', 'bio')]
for column in COLUMNS:
    tracks[column] = tracks[column].astype('category')

In [48]:
# Sanity checks: the feature table and the track table must have identical
# indexes, and every index value of the echonest table must exist in tracks.
np.testing.assert_array_equal(features.index, tracks.index)
assert np.all(echonest.index.isin(tracks.index))

In [71]:
# Boolean mask selecting the rows whose ('set', 'subset') value is at most 'small'.
small = tracks.loc[:, ('set', 'subset')] <= 'small'

In [67]:
# One boolean mask per value of the ('set', 'split') column.
train, val, test = (
    tracks['set', 'split'] == split_name
    for split_name in ('training', 'validation', 'test')
)

In [69]:
# Feature rows of the small subset, separated into training and test parts.
X_train, X_test = (
    features.loc[small & split_mask]
    for split_mask in (train, test)
)

In [70]:
# Matching ('track', 'genre_top') labels for the same training and test rows.
y_train, y_test = (
    tracks.loc[small & split_mask, ('track', 'genre_top')]
    for split_mask in (train, test)
)

# Prepare the features for PCA:

In [73]:
from sklearn.preprocessing import StandardScaler

# Standardise the training features before PCA.
# The fitted scaler is kept in a variable (it used to be thrown away) so that
# the exact centring/scaling learned on the training rows can later be applied
# to other data, e.g. scaler.transform(X_test).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [77]:
from sklearn import decomposition

# Fixed seed for the PCA estimators. With the default svd_solver='auto',
# scikit-learn may choose a randomized SVD; without a random_state the
# components (and the CSV files exported from them) can differ between runs.
PCA_RANDOM_STATE = 0

# Three PCA models that differ only in the number of retained components.
PCA85 = decomposition.PCA(n_components=85, random_state=PCA_RANDOM_STATE)
PCA151 = decomposition.PCA(n_components=151, random_state=PCA_RANDOM_STATE)
PCA215 = decomposition.PCA(n_components=215, random_state=PCA_RANDOM_STATE)

In [78]:
# Fit each PCA model on the training features and keep the projected data.
pComponents85, pComponents151, pComponents215 = (
    pca_model.fit_transform(X_train)
    for pca_model in (PCA85, PCA151, PCA215)
)

In [80]:
# Report, for each fitted PCA model, the total share of variance it explains.
for n_factors, fitted_pca in ((85, PCA85), (151, PCA151), (215, PCA215)):
    print(f'Pour {n_factors} facteurs : ',
          fitted_pca.explained_variance_ratio_.sum(),
          ' de l information est expliquee\n')

Pour 85 facteurs :  0.8090947500557024  de l information est expliquee

Pour 151 facteurs :  0.9078570837006089  de l information est expliquee

Pour 215 facteurs :  0.955472083818175  de l information est expliquee



In [81]:
# Wrap each projected array in a DataFrame so it can be exported.
# NOTE(review): no index is passed here, so the frames presumably get a default
# integer index rather than the track ids - confirm this is intended.
principalDf85, principalDf151, principalDf215 = (
    pd.DataFrame(data=components)
    for components in (pComponents85, pComponents151, pComponents215)
)

In [82]:
# Write each projection to its own CSV file, without the index column.
for csv_name, projection in (('85_PCA.csv', principalDf85),
                             ('151_PCA.csv', principalDf151),
                             ('215_PCA.csv', principalDf215)):
    projection.to_csv(csv_name, index=False)