# Process Data

## 0. Preliminary

In [3]:
%matplotlib inline

import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
import os
import platform
import sys
""

# Plot defaults applied to every figure drawn in this notebook.
sns.set_context("notebook", font_scale=1.5)
plt.rcParams['figure.figsize'] = (17, 5)

# Project root directory.
# If the HMAN_ROOT environment variable is set (and non-empty) it takes
# precedence, so the notebook can run on machines other than the original
# author's; otherwise the original per-platform defaults are used unchanged.
ROOT_PATH = os.environ.get("HMAN_ROOT")
if not ROOT_PATH:
    if 'Windows' in platform.platform():
        ROOT_PATH = "D:/PycharmProjects/HMAN"
    else:
        ROOT_PATH = "/home/xkliu/PycharmProjects/HMAN"

# Sub-directories of the project root used for reading and writing files below.
RAW_DATA_PATH = ROOT_PATH + "/raw_data"
DATA_PATH = ROOT_PATH + "/data"

# Switch the working directory to the project root and put it on sys.path;
# the project-local import below is resolved relative to it.
os.chdir(ROOT_PATH)
sys.path.append("./")

from kddirkit.utils import utils


## 1. Load Data

In [4]:
# Load the four metadata tables through the project loader, in a fixed order.
tracks, genres, features, echonest = (
    utils.load(RAW_DATA_PATH + relative_path)
    for relative_path in (
        '/fma_metadata/tracks.csv',
        '/fma_metadata/genres.csv',
        '/fma_metadata/features.csv',
        '/fma_metadata/echonest.csv',
    )
)

# Sanity checks: features and tracks must share exactly the same index, and
# every echonest index value must also be present in the tracks index.
np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

# Show the shape of each table as one tuple.
tuple(table.shape for table in (tracks, genres, features, echonest))

((106574, 52), (163, 4), (106574, 518), (13129, 249))

In [5]:
# Boolean Series, one entry per track: True where the ('set', 'split') value is 'training'.
tracks[('set', 'split')] == 'training'

track_id
2          True
3          True
5          True
10         True
20         True
          ...  
155316     True
155317     True
155318     True
155319     True
155320    False
Name: (set, split), Length: 106574, dtype: bool

In [6]:
# Rows whose ('set', 'subset') value is exactly 'medium'.
tracks.loc[tracks[('set', 'subset')] == 'medium']

Unnamed: 0_level_0,album,album,album,album,album,album,album,album,album,album,...,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,tags,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
3,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
134,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1126,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,943,,5,,[],Street Music
136,1,2008-11-26 01:49:19,2009-01-07,,0,58,<p>A couple of unreleased mp3s from the fellas...,3331,,[],...,,1948,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1498,,0,,[],Peel Back The Mountain Sky
139,0,2008-11-26 01:49:57,2009-01-16,,1,60,"<p>A full ensamble of strings, drums, electron...",1304,,[],...,,702,en,Attribution-Noncommercial-No Derivative Works ...,582,,2,,[],CandyAss
181,0,2008-11-26 01:52:15,2007-04-13,,0,79,<p>This Human Ear Music reissue compiles a “Be...,1339,,[],...,,1736,en,Attribution-Noncommercial-No Derivative Works ...,1339,,1,,[],Gopacapulco
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155297,0,2017-03-30 09:50:32,2017-03-30,,1,22935,<p>The stars will show you the way in this unp...,6196,,[],...,,1463,,Attribution-NonCommercial,1049,,9,,[],Nebula Reborn
155298,0,2017-03-30 10:45:13,NaT,,0,22936,,2454,,[],...,,706,,Attribution,590,,2,,[],An Idiot Abroad
155306,0,2017-03-30 10:45:13,NaT,,0,22936,,2454,,[],...,,497,,Attribution,435,,1,,[],Tiny Man
155307,0,2017-03-30 12:51:05,2017-03-27,Scott Williams,0,22937,<p>Scott welcomes Russian revolutionary avant ...,1206,Scott Williams,[],...,,630,,Creative Commons Attribution-NonCommercial-NoD...,571,,1,,[],Kolka


In [7]:
from collections import Counter

In [8]:
# Count how many tracks carry each ('set', 'subset') value.
Counter(tracks[('set', 'subset')])

Counter({'small': 8000, 'medium': 17000, 'large': 81574})

In [9]:
# (top level, sub-level) column keys kept for the exported table.
selected_columns = [
    ('album', 'id'),
    ('album', 'type'),
    ('artist', 'id'),
    ('set', 'split'),
    ('set', 'subset'),
    ('track', 'genre_top'),
    ('track', 'genres'),
    ('track', 'genres_all'),
    ('track', 'title'),
]
track_use = tracks.loc[:, selected_columns]
track_use

Unnamed: 0_level_0,album,album,artist,set,set,track,track,track,track
Unnamed: 0_level_1,id,type,id,split,subset,genre_top,genres,genres_all,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
2,1,Album,1,training,small,Hip-Hop,[21],[21],Food
3,1,Album,1,training,medium,Hip-Hop,[21],[21],Electric Ave
5,1,Album,1,training,small,Hip-Hop,[21],[21],This World
10,6,Album,6,training,small,Pop,[10],[10],Freeway
20,4,Album,4,training,large,,"[76, 103]","[17, 10, 76, 103]",Spiritual Level
...,...,...,...,...,...,...,...,...,...
155316,22940,Live Performance,24357,training,large,Rock,[25],"[25, 12]",The Auger
155317,22940,Live Performance,24357,training,large,Rock,[25],"[25, 12]",Let's Skin Ruby
155318,22940,Live Performance,24357,training,large,Rock,[25],"[25, 12]",My House Smells Like Kim Deal/Pulp
155319,22940,Live Performance,24357,training,large,Rock,[25],"[25, 12]",The Man With Two Mouths


In [10]:
# Inspect the column index of the selection; at this point it is still a
# two-level MultiIndex of (group, field) pairs.
track_use.columns

MultiIndex([( 'album',         'id'),
            ( 'album',       'type'),
            ('artist',         'id'),
            (   'set',      'split'),
            (   'set',     'subset'),
            ( 'track',  'genre_top'),
            ( 'track',     'genres'),
            ( 'track', 'genres_all'),
            ( 'track',      'title')],
           )

In [11]:
# Replace the two-level column index with flat "<group>_<field>" names,
# given in the same order as the existing columns.
flat_column_names = [
    'album_id',
    'album_type',
    'artist_id',
    'set_split',
    'set_subset',
    'track_genre_top',
    'track_genres',
    'track_genres_all',
    'track_title',
]
track_use.columns = flat_column_names
track_use

Unnamed: 0_level_0,album_id,album_type,artist_id,set_split,set_subset,track_genre_top,track_genres,track_genres_all,track_title
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,1,Album,1,training,small,Hip-Hop,[21],[21],Food
3,1,Album,1,training,medium,Hip-Hop,[21],[21],Electric Ave
5,1,Album,1,training,small,Hip-Hop,[21],[21],This World
10,6,Album,6,training,small,Pop,[10],[10],Freeway
20,4,Album,4,training,large,,"[76, 103]","[17, 10, 76, 103]",Spiritual Level
...,...,...,...,...,...,...,...,...,...
155316,22940,Live Performance,24357,training,large,Rock,[25],"[25, 12]",The Auger
155317,22940,Live Performance,24357,training,large,Rock,[25],"[25, 12]",Let's Skin Ruby
155318,22940,Live Performance,24357,training,large,Rock,[25],"[25, 12]",My House Smells Like Kim Deal/Pulp
155319,22940,Live Performance,24357,training,large,Rock,[25],"[25, 12]",The Man With Two Mouths


## 2. Export Data

In [12]:
# Write the trimmed track table to DATA_PATH. The index is written too
# (no index=False here), so track_id ends up as the first CSV column.
track_use.to_csv(DATA_PATH+'/tracks.csv')

In [13]:
# Read the exported CSV back. No index column is specified, so track_id is an
# ordinary column and the frame gets a fresh 0..n-1 integer index.
track_data =  pd.read_csv(DATA_PATH+'/tracks.csv')
track_data

Unnamed: 0,track_id,album_id,album_type,artist_id,set_split,set_subset,track_genre_top,track_genres,track_genres_all,track_title
0,2,1,Album,1,training,small,Hip-Hop,[21],[21],Food
1,3,1,Album,1,training,medium,Hip-Hop,[21],[21],Electric Ave
2,5,1,Album,1,training,small,Hip-Hop,[21],[21],This World
3,10,6,Album,6,training,small,Pop,[10],[10],Freeway
4,20,4,Album,4,training,large,,"[76, 103]","[17, 10, 76, 103]",Spiritual Level
...,...,...,...,...,...,...,...,...,...,...
106569,155316,22940,Live Performance,24357,training,large,Rock,[25],"[25, 12]",The Auger
106570,155317,22940,Live Performance,24357,training,large,Rock,[25],"[25, 12]",Let's Skin Ruby
106571,155318,22940,Live Performance,24357,training,large,Rock,[25],"[25, 12]",My House Smells Like Kim Deal/Pulp
106572,155319,22940,Live Performance,24357,training,large,Rock,[25],"[25, 12]",The Man With Two Mouths


In [14]:
# Tracks tagged with the 'small' subset, then one frame per split value.
small_data = track_data.loc[track_data['set_subset'] == 'small']
small_data_train, small_data_test, small_data_val = (
    small_data.loc[small_data['set_split'] == split_name]
    for split_name in ('training', 'test', 'validation')
)

# Write the full subset and each split without the integer index.
# NOTE(review): these files go to RAW_DATA_PATH rather than DATA_PATH - confirm this is intended.
small_data.to_csv(RAW_DATA_PATH + '/small_data.csv', index=False)
small_data_train.to_csv(RAW_DATA_PATH + '/small_data_train.csv', index=False)
small_data_test.to_csv(RAW_DATA_PATH + '/small_data_test.csv', index=False)
small_data_val.to_csv(RAW_DATA_PATH + '/small_data_val.csv', index=False)

In [15]:
# Display the 'small' subset; row labels are the positions inherited from track_data.
small_data

Unnamed: 0,track_id,album_id,album_type,artist_id,set_split,set_subset,track_genre_top,track_genres,track_genres_all,track_title
0,2,1,Album,1,training,small,Hip-Hop,[21],[21],Food
2,5,1,Album,1,training,small,Hip-Hop,[21],[21],This World
3,10,6,Album,6,training,small,Pop,[10],[10],Freeway
15,140,61,Album,54,training,small,Folk,[17],[17],Queen Of The Wires
16,141,60,Album,54,training,small,Folk,[17],[17],Ohio
...,...,...,...,...,...,...,...,...,...,...
105713,154308,22780,Album,23208,test,small,Hip-Hop,"[21, 539, 811]","[811, 539, 21]",MIA
105714,154309,22780,Album,23208,test,small,Hip-Hop,"[21, 539, 811]","[811, 539, 21]",A1 Symphony
105815,154413,22789,Live Performance,24252,training,small,Pop,[76],"[10, 76]",Do Easy
105816,154414,22789,Live Performance,24252,training,small,Pop,[76],"[10, 76]",Dead Can Dance (uncensored)


In [16]:
# Every track whose subset is not 'large', i.e. rows tagged 'small' as well as
# rows tagged 'medium', then one frame per split value.
medium_data = track_data.loc[track_data['set_subset'] != 'large']
medium_data_train, medium_data_test, medium_data_val = (
    medium_data.loc[medium_data['set_split'] == split_name]
    for split_name in ('training', 'test', 'validation')
)

# Write the full subset and each split without the integer index.
# NOTE(review): these files go to RAW_DATA_PATH rather than DATA_PATH - confirm this is intended.
medium_data.to_csv(RAW_DATA_PATH + '/medium_data.csv', index=False)
medium_data_train.to_csv(RAW_DATA_PATH + '/medium_data_train.csv', index=False)
medium_data_test.to_csv(RAW_DATA_PATH + '/medium_data_test.csv', index=False)
medium_data_val.to_csv(RAW_DATA_PATH + '/medium_data_val.csv', index=False)

# medium_data  (display left disabled)


In [17]:
# medium_all_data = track_data[track_data['set_subset'] !="large"]
# medium_all_data_train = medium_all_data[medium_all_data['set_split'] == 'training']
# medium_all_data_test = medium_all_data[medium_all_data['set_split'] == 'test']
# medium_all_data_val = medium_all_data[medium_all_data['set_split'] == 'validation']
#
# medium_all_data.to_csv(RAW_DATA_PATH + '/medium_all_data.csv', index=False)
# medium_all_data_train.to_csv(RAW_DATA_PATH + '/medium_all_data_train.csv', index=False)
# medium_all_data_test.to_csv(RAW_DATA_PATH + '/medium_all_data_test.csv', index=False)
# medium_all_data_val.to_csv(RAW_DATA_PATH + '/medium_all_data_val.csv', index=False)
# medium_all_data

In [20]:
# The 'large' set is the whole table; large_data is another name for
# track_data (no copy is made). One frame per split value follows.
large_data = track_data
large_data_train, large_data_test, large_data_val = (
    large_data.loc[large_data['set_split'] == split_name]
    for split_name in ('training', 'test', 'validation')
)

# Write the full table and each split without the integer index.
# NOTE(review): these files go to RAW_DATA_PATH rather than DATA_PATH - confirm this is intended.
large_data.to_csv(RAW_DATA_PATH + '/large_data.csv', index=False)
large_data_train.to_csv(RAW_DATA_PATH + '/large_data_train.csv', index=False)
large_data_test.to_csv(RAW_DATA_PATH + '/large_data_test.csv', index=False)
large_data_val.to_csv(RAW_DATA_PATH + '/large_data_val.csv', index=False)
large_data

Unnamed: 0,track_id,album_id,album_type,artist_id,set_split,set_subset,track_genre_top,track_genres,track_genres_all,track_title
0,2,1,Album,1,training,small,Hip-Hop,[21],[21],Food
1,3,1,Album,1,training,medium,Hip-Hop,[21],[21],Electric Ave
2,5,1,Album,1,training,small,Hip-Hop,[21],[21],This World
3,10,6,Album,6,training,small,Pop,[10],[10],Freeway
4,20,4,Album,4,training,large,,"[76, 103]","[17, 10, 76, 103]",Spiritual Level
...,...,...,...,...,...,...,...,...,...,...
106569,155316,22940,Live Performance,24357,training,large,Rock,[25],"[25, 12]",The Auger
106570,155317,22940,Live Performance,24357,training,large,Rock,[25],"[25, 12]",Let's Skin Ruby
106571,155318,22940,Live Performance,24357,training,large,Rock,[25],"[25, 12]",My House Smells Like Kim Deal/Pulp
106572,155319,22940,Live Performance,24357,training,large,Rock,[25],"[25, 12]",The Man With Two Mouths


In [None]:
# large_all_data = track_data
# large_all_data_train = large_all_data[large_all_data['set_split'] == 'training']
# large_all_data_test = large_all_data[large_all_data['set_split'] == 'test']
# large_all_data_val = large_all_data[large_all_data['set_split'] == 'validation']
#
# large_all_data.to_csv(RAW_DATA_PATH + '/large_all_data.csv', index=False)
# large_all_data_train.to_csv(RAW_DATA_PATH + '/large_all_data_train.csv', index=False)
# large_all_data_test.to_csv(RAW_DATA_PATH + '/large_all_data_test.csv', index=False)
# large_all_data_val.to_csv(RAW_DATA_PATH + '/large_all_data_val.csv', index=False)
# large_all_data