## Process the GTZAN Dataset
- Read the GTZAN dataset
- Extract features and resample them to a fixed length
- Save the features and labels to NumPy files

In [1]:
import numpy as np
import librosa
import os
from tqdm import tqdm
from scipy import signal
import warnings
warnings.filterwarnings('ignore')

# extract and resample features
def featureCal(y,sr,times_len):
    """Build a fixed-length feature matrix for one audio signal.

    Four librosa features are computed and stacked along the feature axis
    (axis 0) in this order: chroma, spectral centroid, MFCC (20 coefficients),
    spectral contrast. The stacked matrix is then resampled along the time
    axis (axis 1) so that every signal yields the same number of frames.

    Args:
        y: audio time series, as returned by ``librosa.load``.
        sr: sampling rate of ``y``.
        times_len: number of time frames in the returned matrix.

    Returns:
        2-D array of shape ``(n_features, times_len)``.
    """
    # librosa >= 0.10 accepts the audio and sample rate as keyword arguments
    # only, so every call passes them by name.
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)               # chroma
    spectral_center = librosa.feature.spectral_centroid(y=y, sr=sr)     # spectral centroid
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)                  # mfcc
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)   # spectral contrast

    # One concatenation instead of growing the array feature by feature.
    features = np.concatenate(
        [chroma_stft, spectral_center, mfcc, spectral_contrast], axis=0
    )

    # resample already returns exactly `times_len` samples along axis 1,
    # so no further slicing is needed.
    return signal.resample(features, times_len, axis=1)

# Root folder of the GTZAN collection: one sub-folder per genre.
DATA_ROOT = '../input/gtzan-genre-collection/genres/'

# Keep sub-folders only: a stray file next to the genre folders would make
# the os.listdir call on it below fail and would shift the label indices.
# Sorting makes the genre -> label index mapping deterministic.
genres = sorted(
    entry for entry in os.listdir(DATA_ROOT)
    if os.path.isdir(os.path.join(DATA_ROOT, entry))
)
print(genres)

# read gtzan dataset
features = []   # one feature matrix per audio file
labels = []     # index of the file's genre in `genres`
for i,genre in enumerate(genres):
    genre_dir = os.path.join(DATA_ROOT, genre)
    files = sorted(os.listdir(genre_dir))
    for file in tqdm(files):
        y,sr = librosa.load(os.path.join(genre_dir, file))
        feature = featureCal(y,sr,times_len=256)
        features.append(feature)
        labels.append(i)

# Every matrix has the same shape (fixed times_len), so they can be stacked
# into a single array with the file index as the first axis.
features = np.stack(features)
labels = np.array(labels)
np.save("features.npy",features)
np.save("labels.npy",labels)

In [47]:
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler

# Load the arrays written by the feature-extraction cell.
# (An earlier variant read them from ../input/cnnmusic/ instead.)
features = np.load("./features.npy")
labels = np.load("./labels.npy")

# Swap the last two axes so the layout becomes (files, time_series, features)
features = features.transpose(0, 2, 1)

# Standardize each file on its own: the scaler is re-fit for every sample,
# so each feature column of a file ends up with zero mean and unit variance.
scaler = StandardScaler()
for idx, sample in enumerate(features):
    features[idx] = scaler.fit_transform(sample)
print(features.shape)

# Split off the held-out set: 70% for train, 30% for test
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=0,
)
print(x_train.shape)
print(y_train.shape)

## Train model
    Model Summary:
    _________________________________________________________________
    Layer (type)                Output Shape              Param #
    =================================================================
    conv1d_1 (Conv1D)          (None, 252, 64)           12864
    _________________________________________________________________
    dropout_1 (Dropout)        (None, 252, 64)           0
    _________________________________________________________________
    max_pooling1d_1 (MaxPoolin  (None, 126, 64)          0
    g1D)
    _________________________________________________________________
    conv1d_2 (Conv1D)          (None, 122, 32)           10272
    _________________________________________________________________
    conv1d_3 (Conv1D)          (None, 118, 16)           2576
    _________________________________________________________________
    dropout_2 (Dropout)        (None, 118, 16)           0
    _________________________________________________________________
    global_average_pooling1d_1  (None, 16)               0
    (GlobalAveragePooling1D)
    _________________________________________________________________
    flatten_1 (Flatten)        (None, 16)                0
    _________________________________________________________________
    dense_1 (Dense)            (None, 10)                170
    =================================================================
    Total params: 25,882
    Trainable params: 25,882
    Non-trainable params: 0
    ________________________________
    Training X shape: (700, 256, 40)
    Training Y shape: (700)
    ________________________________
    Test X shape: (300, 256, 40)
    Test Y shape: (300)

In [51]:
from keras import models
from keras import layers

# 1-D convolutions slide along the time axis of each sample.
model = models.Sequential([
    # input_shape = (time_series, features)
    layers.Conv1D(64, 5, activation='relu', input_shape=x_train.shape[1:]),
    layers.Dropout(0.8),
    layers.MaxPooling1D(pool_size=2),
    layers.Conv1D(32, 5, activation='relu'),
    layers.Conv1D(16, 5, activation='relu'),
    layers.Dropout(0.8),
    # Average over time, leaving one value per filter.
    layers.GlobalAveragePooling1D(),
    # flatten for dense input
    layers.Flatten(),
    # One output per genre class.
    layers.Dense(10, activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',   # labels are integer class indices
    metrics=['accuracy'],
)
model.summary()

# Train on the training split, then score the held-out split.
history = model.fit(x_train, y_train, epochs=400, batch_size=32)
test_loss, test_acc = model.evaluate(x_test, y_test)
print('test_acc: ',test_acc)

# Keep the per-class probabilities for the analysis below and persist the model.
predictions = model.predict(x_test)
model.save('Audino_CNN')

## Assess classification accuracy

In [52]:
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

# Genre names, ordered so that position == integer label.
classlist = [
    'blues', 'classical', 'country', 'disco', 'hiphop',
    'jazz', 'metal', 'pop', 'reggae', 'rock',
]

# The predicted class is the one with the highest softmax output.
predixt = predictions.argmax(axis=1)

# Show the first 20 test samples side by side with their predictions.
preview = zip(y_test[:20], predixt[:20])
for i, j in preview:
    print(f"Predict:{classlist[int(j)]:<10}Real:{classlist[int(i)]:<10}Status:{i==j}")

# Rows are the real classes, columns the predicted ones.
matrix = confusion_matrix(y_test, predixt)

# Visualize the confusion matrix as an annotated heatmap.
plt.figure(figsize=(10, 10))
ax = sns.heatmap(
    matrix,
    cmap='Blues',
    annot=True,
    fmt='.20g',
    xticklabels=classlist,
    yticklabels=classlist,
)
ax.set_xlabel('Predict')
ax.set_ylabel('Real')

# Persist the raw counts (np.save appends the .npy extension).
np.save('matrix', matrix)