* Baixar todo o dataset
* Conseguir importar o dataset
* Inicialmente trabalhar com o dataset inteiro e utilizar as diferentes extrações de features
* Depois ir pelo segundo caminho

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import librosa
import librosa.display
from os import listdir
from os.path import isfile, join

import glob
from sklearn import svm

# Dataset

In [2]:
def import_signal(path):
    files = glob.glob(path + '*.au')
    
    audios = []
    
    for file in files:  
        s, sr = librosa.core.load(file, sr=22050)
        audios.append(s)

    return audios, sr

## pop 0

In [3]:
pop_audios, pop_sr = import_signal('dataset/pop/')

In [4]:
pop_audios_df = pd.DataFrame([[x] for x in pop_audios], columns=['x'])
pop_audios_df['y'] = 0
pop_audios_df['y2'] = 0

pop_audios_df.head()

Unnamed: 0,x,y,y2
0,"[-0.0340271, -0.043304443, -0.0463562, -0.0431...",0,0
1,"[-0.058288574, 0.01071167, 0.09915161, 0.07913...",0,0
2,"[-0.26919556, -0.24667358, -0.18579102, -0.136...",0,0
3,"[-0.2119751, -0.0675354, -0.18411255, 0.024597...",0,0
4,"[-0.02709961, -0.023712158, 0.052215576, 0.084...",0,0


## classical 1

In [5]:
classical_audios, classical_sr = import_signal('dataset/classical/')

In [6]:
classical_audios_df = pd.DataFrame([[x] for x in classical_audios], columns=['x'])
classical_audios_df['y'] = 1
classical_audios_df['y2'] = 0

classical_audios_df.head()

Unnamed: 0,x,y,y2
0,"[-0.06484985, -0.10720825, -0.109436035, -0.09...",1,0
1,"[0.007537842, 0.011444092, 0.010345459, 0.0132...",1,0
2,"[0.023345947, 0.028686523, 0.027038574, 0.0279...",1,0
3,"[0.007843018, 0.0072631836, 0.0058288574, 0.00...",1,0
4,"[-0.010772705, -0.020019531, -0.024261475, -0....",1,0


## jazz 2

In [7]:
jazz_audios, jazz_sr = import_signal('dataset/jazz/')

In [8]:
jazz_audios_df = pd.DataFrame([[x] for x in jazz_audios], columns=['x'])
jazz_audios_df['y'] = 2
jazz_audios_df['y2'] = 1

jazz_audios_df.head()

Unnamed: 0,x,y,y2
0,"[-0.031066895, -0.05078125, -0.04537964, -0.04...",2,1
1,"[0.0019836426, 0.0014343262, -0.0017089844, -0...",2,1
2,"[-0.03414917, -0.04660034, -0.027648926, -0.01...",2,1
3,"[-0.021911621, -0.03604126, -0.039001465, -0.0...",2,1
4,"[0.030456543, 0.010772705, -0.008544922, -0.02...",2,1


## rock 3

In [9]:
rock_audios, rock_sr = import_signal('dataset/rock/')

In [10]:
rock_audios_df = pd.DataFrame([[x] for x in rock_audios], columns=['x'])
rock_audios_df['y'] = 3
rock_audios_df['y2'] = 1

rock_audios_df.head()

Unnamed: 0,x,y,y2
0,"[0.035339355, 0.053375244, -0.0047912598, 0.00...",3,1
1,"[-0.07858276, -0.1638794, -0.11288452, 0.00088...",3,1
2,"[-0.026824951, -0.038757324, -0.029693604, -0....",3,1
3,"[-0.023529053, -0.03363037, -0.03527832, -0.04...",3,1
4,"[-0.026641846, -0.051208496, -0.05618286, -0.0...",3,1


In [11]:
pop_sr == classical_sr == jazz_sr == rock_sr

True

In [12]:
sr = pop_sr

# get audios df

In [13]:
audios = (
    pd.concat([
        pop_audios_df, classical_audios_df, 
        jazz_audios_df, rock_audios_df])
    .sample(frac=1).reset_index(drop=True) #shuffle
)

print(f'Rows: {len(audios)}')
audios.head()

Rows: 400


Unnamed: 0,x,y,y2
0,"[-0.0340271, -0.043304443, -0.0463562, -0.0431...",0,0
1,"[-0.0010070801, 0.0022888184, 0.00390625, 0.00...",1,0
2,"[-0.049987793, -0.041931152, -0.029937744, -0....",2,1
3,"[0.063201904, 0.05630493, 0.04446411, 0.031097...",2,1
4,"[0.07803345, -0.03765869, 0.12664795, 0.165618...",0,0


# split into train and test

In [14]:
msk = np.random.rand(len(audios)) < 0.8

train = audios[msk]
test = audios[~msk]

print(f'Rows train:{len(train)}')
print(f'Rows test:{len(test)}')

Rows train:329
Rows test:71


In [15]:
train.groupby('y').count()

Unnamed: 0_level_0,x,y2
y,Unnamed: 1_level_1,Unnamed: 2_level_1
0,86,86
1,81,81
2,81,81
3,81,81


In [16]:
test.groupby('y').count()

Unnamed: 0_level_0,x,y2
y,Unnamed: 1_level_1,Unnamed: 2_level_1
0,14,14
1,19,19
2,19,19
3,19,19


In [17]:
train.groupby('y2').count()

Unnamed: 0_level_0,x,y
y2,Unnamed: 1_level_1,Unnamed: 2_level_1
0,167,167
1,162,162


In [18]:
test.groupby('y2').count()

Unnamed: 0_level_0,x,y
y2,Unnamed: 1_level_1,Unnamed: 2_level_1
0,33,33
1,38,38


# Feature extraction

## MFCC

In [19]:
mfcc_list_train = []

for x in train['x']:
      
    mfcc = np.array(librosa.feature.mfcc(y=x, sr=sr)).flatten()
    zeros = np.zeros((26280 - len(mfcc)))
    mfcc_list_train.append(np.concatenate((mfcc, zeros), axis=0))
    
train['mfcc'] = mfcc_list_train
train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,x,y,y2,mfcc
0,"[-0.0340271, -0.043304443, -0.0463562, -0.0431...",0,0,"[-236.56199645996094, -257.2580261230469, -276..."
2,"[-0.049987793, -0.041931152, -0.029937744, -0....",2,1,"[-254.18899536132812, -235.4429473876953, -214..."
3,"[0.063201904, 0.05630493, 0.04446411, 0.031097...",2,1,"[-262.9224548339844, -275.30096435546875, -292..."
5,"[0.01651001, 0.02923584, 0.030151367, 0.035400...",2,1,"[-240.9464111328125, -252.67112731933594, -264..."
6,"[-0.02456665, -0.04626465, -0.039642334, -0.02...",1,0,"[-202.40521240234375, -222.25177001953125, -25..."


In [20]:
mfcc_list_test = []

for x in test['x']:
    
    mfcc = np.array(librosa.feature.mfcc(y=x, sr=sr)).flatten()
    zeros = np.zeros((26280 - len(mfcc)))
    mfcc_list_test.append(np.concatenate((mfcc, zeros), axis=0))
    
test['mfcc'] = mfcc_list_test
test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,x,y,y2,mfcc
1,"[-0.0010070801, 0.0022888184, 0.00390625, 0.00...",1,0,"[-351.47119140625, -365.6391906738281, -379.80..."
4,"[0.07803345, -0.03765869, 0.12664795, 0.165618...",0,0,"[25.607858657836914, 26.773752212524414, 15.20..."
7,"[0.079071045, 0.099090576, 0.037872314, 0.0298...",3,1,"[-104.98056030273438, -111.75321960449219, -12..."
12,"[-0.092285156, -0.20037842, -0.27676392, -0.13...",3,1,"[-27.37483024597168, -50.10458755493164, -93.5..."
19,"[0.049346924, 0.045135498, 0.029785156, 0.0246...",0,0,"[14.973041534423828, 66.89509582519531, 54.783..."


## LPC

In [21]:
lpc_list = []

for x in train['x']:
    
    lpc = librosa.lpc(x, 6)
    lpc_list.append(lpc)
    
train['lpc'] = lpc_list
train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,x,y,y2,mfcc,lpc
0,"[-0.0340271, -0.043304443, -0.0463562, -0.0431...",0,0,"[-236.56199645996094, -257.2580261230469, -276...","[1.0, -0.77118856, -0.19470054, -0.09308255, -..."
2,"[-0.049987793, -0.041931152, -0.029937744, -0....",2,1,"[-254.18899536132812, -235.4429473876953, -214...","[1.0, -3.170267, 4.4571915, -3.5584393, 1.5654..."
3,"[0.063201904, 0.05630493, 0.04446411, 0.031097...",2,1,"[-262.9224548339844, -275.30096435546875, -292...","[1.0, -2.671005, 3.6828148, -3.4371247, 2.0726..."
5,"[0.01651001, 0.02923584, 0.030151367, 0.035400...",2,1,"[-240.9464111328125, -252.67112731933594, -264...","[1.0, -2.1440706, 2.2017953, -1.7921026, 1.484..."
6,"[-0.02456665, -0.04626465, -0.039642334, -0.02...",1,0,"[-202.40521240234375, -222.25177001953125, -25...","[1.0, -2.359103, 2.510035, -2.1399875, 1.95702..."


In [22]:
lpc_list = []

for x in test['x']:
    
    lpc = librosa.lpc(x, 6)
    lpc_list.append(lpc)
    
test['lpc'] = lpc_list
test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,x,y,y2,mfcc,lpc
1,"[-0.0010070801, 0.0022888184, 0.00390625, 0.00...",1,0,"[-351.47119140625, -365.6391906738281, -379.80...","[1.0, -1.7944953, 1.0551944, -0.13227506, -0.0..."
4,"[0.07803345, -0.03765869, 0.12664795, 0.165618...",0,0,"[25.607858657836914, 26.773752212524414, 15.20...","[1.0, -0.8433099, -0.13616365, 0.062086865, 0...."
7,"[0.079071045, 0.099090576, 0.037872314, 0.0298...",3,1,"[-104.98056030273438, -111.75321960449219, -12...","[1.0, -1.7010816, 1.5733303, -1.4908944, 1.187..."
12,"[-0.092285156, -0.20037842, -0.27676392, -0.13...",3,1,"[-27.37483024597168, -50.10458755493164, -93.5...","[1.0, -1.97621, 2.1082149, -1.842086, 1.440225..."
19,"[0.049346924, 0.045135498, 0.029785156, 0.0246...",0,0,"[14.973041534423828, 66.89509582519531, 54.783...","[1.0, -0.384666, -0.39819127, -0.17220089, 0.0..."


# First Scenario

## Plot functions

In [23]:
# pesquisar

## Using MFCC

In [24]:
model_mfcc = svm.SVC()
model_mfcc.fit(train['mfcc'].to_list(), train['y'].to_list())



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [25]:
print('Train accuracy: {:.1%}'.format(model_mfcc.score(train['mfcc'].to_list(), train['y'].to_list())))
print('Test accuracy: {:.1%}'.format(model_mfcc.score(test['mfcc'].to_list(), test['y'].to_list())))

Train accuracy: 100.0%
Test accuracy: 21.1%


## Using LPC

In [26]:
model_lpc = svm.SVC()
model_lpc.fit(train['lpc'].to_list(), train['y'].to_list())



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [27]:
print('Train accuracy: {:.1%}'.format(model_lpc.score(train['lpc'].to_list(), train['y'].to_list())))
print('Test accuracy: {:.1%}'.format(model_lpc.score(test['lpc'].to_list(), test['y'].to_list())))

Train accuracy: 71.4%
Test accuracy: 57.7%


## Using MFCC and LPC

# Second Scenario

In [77]:
def second_scenario_classification(train, test, feature_ex_svm1='lpc', feature_ex_svm23='mfcc'):
    
    # SVM1
    model1 = svm.SVC()
    model1.fit(train[f'{feature_ex_svm1}'].to_list(), train['y2'].to_list())
    train['y2_predicted'] = model1.predict(train[f'{feature_ex_svm1}'].to_list())
    
    #SVM2
    train_2 = train[train['y2_predicted']==0]
    train_2 = train_2[train_2['y'].isin([0,1])]
    test_2 = test[test['y2']==0]
    model2 = svm.SVC()
    model2.fit(train_2[f'{feature_ex_svm23}'].to_list(), train_2['y'].to_list())
    
    #SVM3
    train_3 = train[train['y2_predicted']==1]
    train_3 = train_3[train_3['y'].isin([2,3])]
    test_3 = test[test['y2']==1]
    model3 = svm.SVC()
    model3.fit(train_3[f'{feature_ex_svm23}'].to_list(), train_3['y'].to_list())
    
    train_score = (
        model2.score(train_2[f'{feature_ex_svm23}'].to_list(), train_2['y'].to_list()) * 0.5 +
        model3.score(train_3[f'{feature_ex_svm23}'].to_list(), train_3['y'].to_list()) * 0.5
    )
    
    test_score = (
        model2.score(test_2[f'{feature_ex_svm23}'].to_list(), test_2['y'].to_list()) * 0.5 +
        model3.score(test_3[f'{feature_ex_svm23}'].to_list(), test_3['y'].to_list()) * 0.5
    )
    
    print('Train accuracy: {:.1%}'.format(train_score))
    print('Test accuracy: {:.1%}'.format(test_score))

In [78]:
second_scenario_classification(train, test, feature_ex_svm1='lpc', feature_ex_svm23='mfcc')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Train accuracy: 100.0%
Test accuracy: 46.2%


In [79]:
second_scenario_classification(train, test, feature_ex_svm1='lpc', feature_ex_svm23='lpc')

Train accuracy: 88.8%
Test accuracy: 82.7%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [80]:
second_scenario_classification(train, test, feature_ex_svm1='mfcc', feature_ex_svm23='mfcc')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Train accuracy: 100.0%
Test accuracy: 47.5%


In [81]:
second_scenario_classification(train, test, feature_ex_svm1='mfcc', feature_ex_svm23='lpc')



Train accuracy: 88.3%
Test accuracy: 85.3%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
