Convert data/fma_metadata/tracks.csv to a simpler data/train_labels.csv file where the first column is the track_id and the second column is the target musical genre.

In [17]:
import os
import ast
import pandas as pd

tracks = pd.read_csv('data/fma_metadata/tracks.csv', index_col=0, header=[0, 1])
COLUMNS = [('track', 'tags'), ('album', 'tags'), ('artist', 'tags'), ('track', 'genres'), ('track', 'genres_all')]
for column in COLUMNS:
    tracks[column] = tracks[column].map(ast.literal_eval)

COLUMNS = [('track', 'date_created'), ('track', 'date_recorded'),
            ('album', 'date_created'), ('album', 'date_released'),
            ('artist', 'date_created'), ('artist', 'active_year_begin'),
            ('artist', 'active_year_end')]
for column in COLUMNS:
    tracks[column] = pd.to_datetime(tracks[column])

SUBSETS = ('small', 'medium', 'large')
tracks['set', 'subset'] = tracks['set', 'subset'].astype('category', categories=SUBSETS, ordered=True)

COLUMNS = [('track', 'genre_top'), ('track', 'license'), ('album', 'type'), ('album', 'information'), ('artist', 'bio')]
for column in COLUMNS:
    tracks[column] = tracks[column].astype('category')

# Faulty MP3 train files (https://github.com/mdeff/fma/issues/8).
# MP3 train file IDs with 0 second of audio.
FILES_TRAIN_NO_AUDIO = [1486, 5574, 65753, 80391, 98558, 98559, 98560, 98571,
                        99134, 105247, 108925, 126981, 127336, 133297, 143992]
# MP3 train file IDs with less than 30 seconds of audio.
FILES_TRAIN_FAULTY = FILES_TRAIN_NO_AUDIO + [98565, 98566, 98567,
                                             98568, 98569, 108924]

subset = tracks.index[tracks['set', 'subset'] <= 'medium']
labels = tracks.loc[subset, ('track', 'genre_top')]
labels.name = 'genre'
for i in FILES_TRAIN_FAULTY:
    if i in labels.index:
        labels = labels.drop(i)
train_labels = labels.sample(frac=0.8)
temp_labels = labels.drop(train_labels.index)
valid_labels = temp_labels.sample(frac=0.5)
test_labels = test_labels = temp_labels.drop(valid_labels.index)
labels.to_csv('data/labels_medium.csv', header=True)
train_labels.to_csv('data/train_labels_medium.csv', header=True)
valid_labels.to_csv('data/valid_labels_medium.csv', header=True)
test_labels.to_csv('data/test_labels_medium.csv', header=True)

features = pd.read_csv('data/features.csv', index_col=0, header=[0, 1, 2])[:25000]
train_features = features.loc[train_labels.index]
valid_features = features.loc[valid_labels.index]
test_features = features.loc[test_labels.index]

train_features.to_csv('data/train_features_medium.csv', header=True)
valid_features.to_csv('data/valid_features_medium.csv', header=True)
test_features.to_csv('data/test_features_medium.csv', header=True)

  interactivity=interactivity, compiler=compiler, result=result)
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.utils import shuffle
import pandas as pd
import numpy as np

train_labels = pd.read_csv('data/train_labels_medium.csv')
valid_labels = pd.read_csv('data/valid_labels_medium.csv')
test_labels = pd.read_csv('data/test_labels_medium.csv')

train_features = pd.read_csv('data/train_features_medium.csv', index_col=0, header=[0, 1, 2])
valid_features = pd.read_csv('data/valid_features_medium.csv', index_col=0, header=[0, 1, 2])
test_features = pd.read_csv('data/test_features_medium.csv', index_col=0, header=[0, 1, 2])

print('Music Genres:', *train_labels.genre.unique())
print("Train: ",train_labels.genre.count())
print("Valid: ",valid_labels.genre.count())
print("Test: ",test_labels.genre.count())

train_features.index = pd.Index((int(i) for i in train_features.index), name='track_id')

train_features.sort_index(inplace=True)
valid_features.sort_index(inplace=True)
test_features.sort_index(inplace=True)

train_features, train_labels = shuffle(train_features, train_labels, random_state=42)

# Standardize features by removing the mean and scaling to unit variance.
scaler = StandardScaler(copy=False)
scaler.fit_transform(train_features)
scaler.transform(valid_features)

# Train the classifier and make predictions.
clf = SVC(kernel='rbf', probability=True)
clf.fit(train_features, train_labels)
y_test = clf.predict_proba(valid_features)

Music Genres: Electronic Hip-Hop Rock Folk Instrumental Pop International Experimental Classical Old-Time / Historic Country Jazz Soul-RnB Blues Spoken Easy Listening
Train:  19983
Valid:  2498
Test:  2498


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').