# Code For The Entire Procedure

### Import all libraries nessecary. Librosa is the library that executes algorithms to extract feature from .wav sample audio files.

In [1]:
import os
import librosa
import math
import json
import librosa.display
import IPython.display as ipd
import numpy as np
import matplotlib.pyplot as plt

### Each audio track is 30 seconds long. Define that the sampling rate is 1 second per sample.  At each second, the Band Energy Ratio (BER), Mel-Frequency Cspstral Coefficients (MFCCs), their derivatives, spectral centriods (SC), and zero-crossing (ZCF) rates are taken.

### Each track will have 30 sample extractions (and so each track maps to the 30 of them), where each sample contains multiple features. The feature dimensions for each audio track after feature generation is 30-by-the-count-of-all-features-per-sampling.

### There are 1000 tracks and 10 genres in total, where each 100 tracks are labeled to one genre. Each track is 30 seconds long, and each track will be sampled 30 times, 1 time per second.

### The 1000 tracks with all their sampled features is stored in "data.json" file.

In [2]:
DATASET_PATH = "genres"
JSON_PATH = "data.json"

SAMPLE_RATE = 22050
DURATION = 30
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION
SEGMENTS_PER_TRACK = 30

FRAME_SIZE = 2048
HOP_SIZE = 512

num_samples_per_segment = int(SAMPLES_PER_TRACK / SEGMENTS_PER_TRACK)
expected_num_mfcc_vectors_per_segment = math.ceil(num_samples_per_segment / HOP_SIZE)


def calculate_split_frequency_bin(spectrogram, split_frequency, sample_rate):
    
    frequency_range = sample_rate/2
    frequency_delta_per_bin = frequency_range / spectrogram.shape[0]
    split_frequency_bin = np.floor(split_frequency / frequency_delta_per_bin)
    return int(split_frequency_bin)

def calculate_band_energy_ratio(spectrogram, split_frequency, sample_rate):
    
    split_frequency_bin = calculate_split_frequency_bin(spectrogram, split_frequency, sample_rate)
    
    power_spec = np.abs(spectrogram)**2
    power_spec = power_spec.T
    
    band_energy_ratio = []
    
    for frequency_in_frame in power_spec:
        sum_power_low_frequencies = np.sum(frequency_in_frame[:split_frequency_bin])
        sum_power_high_frequencies = np.sum(frequency_in_frame[split_frequency_bin:])
        ber_current_frame = sum_power_high_frequencies / sum_power_low_frequencies
        if math.isnan(ber_current_frame):
            band_energy_ratio.append(100)
        else:
            band_energy_ratio.append(ber_current_frame)
        
    return np.array(band_energy_ratio)

def process_audio_data(dataset_path=DATASET_PATH,
                       json_path=JSON_PATH,
                       n_mfcc=13,
                       n_fft=FRAME_SIZE,
                       hop_length=HOP_SIZE,
                       num_segments=SEGMENTS_PER_TRACK):
    
    data = {
        "mapping": [],
        "tracks": [],
        "labels": []
    }
    
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):
        
        if dirpath is not dataset_path:
            dirpath_components = dirpath.split("/")
            semantic_label = dirpath_components[-1]
            
            data["mapping"].append(semantic_label)
            print("\nProcessing {}".format(semantic_label))
            
            for f in filenames:
                
                file_path = os.path.join(dirpath, f)
                signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)
                
                track = {
                    "mfcc_m": [],
                    "mfcc_s": [],
                    "mfcc1_m": [],
                    "mfcc1_s": [],
                    "mfcc2_m": [],
                    "mfcc2_s": [],
                    "ber1000_m": [],
                    "ber1000_s": [],
                    "ber2000_m": [],
                    "ber2000_s": [],
                    "ber3000_m": [],
                    "ber3000_s": [],
                    "centroid_m": [],
                    "centroid_s": [],
                    "bandwidth_m": [],
                    "bandwidth_s": [],
                    "zcr_m": [],
                    "zcr_s": [],
                    "labels": []
                }
                
                for s in range(num_segments):
                    
                    start_sample = num_samples_per_segment * s
                    finish_sample = start_sample + num_samples_per_segment
                    
                    mfcc = librosa.feature.mfcc(signal[start_sample:finish_sample], 
                                                sr=sr,
                                                n_fft=n_fft,
                                                n_mfcc=n_mfcc,
                                                hop_length=hop_length)
                    mfccs = mfcc.T
                    
                    if len(mfccs)==expected_num_mfcc_vectors_per_segment:
                        
                        track["mfcc_m"].append(np.mean(mfccs, 0).tolist())
                        track["mfcc_s"].append(np.std(mfccs, 0).tolist())
                        
                        delta_mfccs = librosa.feature.delta(mfcc).T
                        delta2_mfccs = librosa.feature.delta(mfcc, order=2).T
                        
                        track["mfcc1_m"].append(np.mean(delta_mfccs, 0).tolist())
                        track["mfcc1_s"].append(np.std(delta_mfccs, 0).tolist())
                        track["mfcc2_m"].append(np.mean(delta2_mfccs, 0).tolist())
                        track["mfcc2_s"].append(np.std(delta2_mfccs, 0).tolist())
                        
                        s_music = librosa.stft(signal[start_sample:finish_sample], n_fft=n_fft, hop_length=hop_length)
                    
                        ber_music1000 = calculate_band_energy_ratio(s_music, 1000, sr) 
                        track["ber1000_m"].append(np.mean(ber_music1000).tolist())
                        track["ber1000_s"].append(np.std(ber_music1000).tolist())
                        
                        ber_music2000 = calculate_band_energy_ratio(s_music, 2000, sr) 
                        track["ber2000_m"].append(np.mean(ber_music2000).tolist())
                        track["ber2000_s"].append(np.std(ber_music2000).tolist())
                        
                        ber_music3000 = calculate_band_energy_ratio(s_music, 3000, sr) 
                        track["ber3000_m"].append(np.mean(ber_music3000).tolist())
                        track["ber3000_s"].append(np.std(ber_music3000).tolist())

                        sc_music = librosa.feature.spectral_centroid(y=signal[start_sample:finish_sample], sr=sr, n_fft=n_fft, hop_length=hop_length)[0]
                        track["centroid_m"].append(np.mean(sc_music).tolist())
                        track["centroid_s"].append(np.std(sc_music).tolist())

                        bdwh_music = librosa.feature.spectral_bandwidth(y=signal[start_sample:finish_sample], sr=sr, n_fft=n_fft, hop_length=hop_length)[0]
                        track["bandwidth_m"].append(np.mean(bdwh_music).tolist())
                        track["bandwidth_s"].append(np.std(bdwh_music).tolist())
                        
                        zcr_music = librosa.feature.zero_crossing_rate(y=signal[start_sample:finish_sample], frame_length=n_fft, hop_length=hop_length)[0]
                        track["zcr_m"].append(np.mean(zcr_music).tolist())
                        track["zcr_s"].append(np.std(zcr_music).tolist())
                        
                        track["labels"].append(i-1)
                
                data["tracks"].append(track)
                data["labels"].append(i-1)
                
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

In [3]:
if __name__=="__main__":
    process_audio_data()


Processing pop

Processing metal

Processing disco

Processing blues

Processing reggae

Processing classical

Processing rock

Processing hiphop


  ber_current_frame = sum_power_high_frequencies / sum_power_low_frequencies



Processing country

Processing jazz


## Perform K-means clustering algorithm based on each 1-second sample, asserting that each 1-second sample (of all 1000 tracks, 30 samples per track, 30000 total) has enough information that each sample can be disdinguished and classified among the other 1-second samples. If this assertion is true, then the clustering results from the K-means algorithm may show noticeable patterns.

In [4]:
data = json.load(open("data.json"))

data_features_by_segments = [];
count = 0;

for j in range(len(data["tracks"])):

    samples = np.array([data["tracks"][j][i] for i in data["tracks"][j]], dtype=object)
    new_new_samples = np.asarray([samples[i] for i in range(len(samples))], dtype=object)
    first_row = np.asarray([new_new_samples[0][i] for i in range(len(new_new_samples[0]))], dtype=object)
    second_row = np.asarray([new_new_samples[1][i] for i in range(len(new_new_samples[1]))], dtype=object)
    third_row = np.asarray([new_new_samples[2][i] for i in range(len(new_new_samples[2]))], dtype=object)
    fourth_row = np.asarray([new_new_samples[3][i] for i in range(len(new_new_samples[3]))], dtype=object)
    fifth_row = np.asarray([new_new_samples[4][i] for i in range(len(new_new_samples[4]))], dtype=object)
    sixth_row = np.asarray([new_new_samples[5][i] for i in range(len(new_new_samples[5]))], dtype=object)
    rest = new_new_samples[6:].T

    dataset = np.concatenate((first_row, second_row, third_row, fourth_row, fifth_row, sixth_row, rest), 1)
    
    if count==0:
        data_features_by_segments = dataset
        count-=1
    else:
        data_features_by_segments = np.concatenate((data_features_by_segments, dataset), 0)

print(data_features_by_segments.shape)


(29991, 91)


In [5]:
def normalize(x):
    return (x-np.mean(x))/np.std(x)

def K_Means(X, K, error_threshold):
    
    m = np.zeros((K, X.shape[1]))
    sample_window_size = int(X.shape[0]/K)
    
    for i in range(0, K):
        m[i] = X[np.random.choice(sample_window_size, 1, replace=False) + i*sample_window_size]
    
    D = 1000000
    difference = 10.0
    
    while difference > error_threshold:
    
        current_distortion = 0
        C = {n: [] for n in range(K)}
        
        for row in X:
            error = np.sum((row[:-1]-m[:,:-1])**2, axis=1)**0.5
            min_index = np.argmin(error)
            current_distortion += error[min_index]
            C[min_index].append(row) 
            
        for samples_by_cluster in C:
            temp = np.asarray(C[samples_by_cluster])
            m[samples_by_cluster,:-1] = np.mean(temp[:,:-1],0)
        
        difference = abs(current_distortion-D)
        D = current_distortion

    return(m, D)

In [6]:
normalized_dataset_by_segments = data_features_by_segments
normalized_dataset_by_segments[:,:-1] = normalize(normalized_dataset_by_segments[:,:-1])

In [7]:
def k_means_and_display_by_number_of_cluster_centers(centroids, normalized_dataset, error_threshold):

    m, D = K_Means(normalized_dataset, centroids, error_threshold)

    genre_count = np.zeros((centroids, len(data["mapping"])))
    genres = np.asarray(data["mapping"])

    for idx in range(normalized_dataset.shape[0]):
        distance = np.sum((normalized_dataset[idx,:-1]-m[:,:-1])**2, axis=1)
        centroid = np.argmin(distance)
        genre = normalized_dataset[idx][-1]
        genre_count[centroid][int(genre)] += 1

    print('\t',end="")

    genres[5] = "classic"

    for genre in genres:
        print(genre+'\t',end="")
    print('\n')

    for i in range(genre_count.shape[0]):
        print('K = %d \t'%i,end="")
        for j in range(genre_count.shape[1]):
            print(str(int(genre_count[i,j]))+'\t',end="")
        print("\n")


In [8]:
for number_of_centroids in range(2, len(data["mapping"])+1):
    print("\nTrying with " + str(number_of_centroids) + " centroids:\n")
    k_means_and_display_by_number_of_cluster_centers(number_of_centroids, normalized_dataset_by_segments, 5)
    print("Try Complete.\n")
    


Trying with 2 centroids:

	pop	metal	disco	blues	reggae	classic	rock	hiphop	country	jazz	

K = 0 	252	710	638	2284	1545	2909	1415	694	2045	2205	

K = 1 	2748	2290	2361	716	1455	89	1584	2304	952	795	

Try Complete.


Trying with 3 centroids:

	pop	metal	disco	blues	reggae	classic	rock	hiphop	country	jazz	

K = 0 	2216	485	1011	43	564	14	483	976	368	361	

K = 1 	711	2364	1863	1428	1652	300	1881	1797	1241	934	

K = 2 	73	151	125	1529	784	2684	635	225	1388	1705	

Try Complete.


Trying with 4 centroids:

	pop	metal	disco	blues	reggae	classic	rock	hiphop	country	jazz	

K = 0 	808	2040	1587	592	909	71	1177	1525	685	519	

K = 1 	283	764	742	1306	1427	581	1252	774	1289	1037	

K = 2 	26	46	24	1092	287	2337	306	64	849	1226	

K = 3 	1883	150	646	10	377	9	264	635	174	218	

Try Complete.


Trying with 5 centroids:

	pop	metal	disco	blues	reggae	classic	rock	hiphop	country	jazz	

K = 0 	1554	73	461	5	278	3	160	452	93	139	

K = 1 	954	1230	1096	174	579	23	670	1078	475	421	

K = 2 	125	216	252	1014	1

## Then, perform K-means clustering algorithm based on each track, where the integrated information (mean, min, max, standard derivation) of 30 samples per track is the base unit for the track-based processing. 

## Track-based processing, unlike sample-based processing (demonstrated above), is intuitively more relevant to music genre classification, where each track is treated as an individual unit rather than just 1 second of each track (that the target of classification is the tracks, not the one-seconds from each track).

In [9]:
data_features_by_tracks = [];

count = 1;

for j in range(len(data["tracks"])):

    samples = np.array([data["tracks"][j][i] for i in data["tracks"][j]], dtype=object)
    new_new_samples = np.asarray([samples[i] for i in range(len(samples))], dtype=object)
    first_row = np.asarray([new_new_samples[0][i] for i in range(len(new_new_samples[0]))], dtype=object)
    second_row = np.asarray([new_new_samples[1][i] for i in range(len(new_new_samples[1]))], dtype=object)
    third_row = np.asarray([new_new_samples[2][i] for i in range(len(new_new_samples[2]))], dtype=object)
    fourth_row = np.asarray([new_new_samples[3][i] for i in range(len(new_new_samples[3]))], dtype=object)
    fifth_row = np.asarray([new_new_samples[4][i] for i in range(len(new_new_samples[4]))], dtype=object)
    sixth_row = np.asarray([new_new_samples[5][i] for i in range(len(new_new_samples[5]))], dtype=object)
    rest = new_new_samples[6:-1].T
    labels = new_new_samples[-1].T

    dataset = np.concatenate((first_row, second_row, third_row, fourth_row, fifth_row, sixth_row, rest), 1);
    dataset = dataset.T
    
    dataset_by_tracks = np.array([]);
    
    for k in range(dataset.shape[0]):
        

        dataset_by_tracks = np.append(dataset_by_tracks, np.min(dataset[k]))
        dataset_by_tracks = np.append(dataset_by_tracks, np.max(dataset[k]))
        dataset_by_tracks = np.append(dataset_by_tracks, np.mean(dataset[k]))

    dataset_by_tracks = np.append(dataset_by_tracks, labels[0])
    
    if count==1:
        data_features_by_tracks = dataset_by_tracks
        count-=1
    elif count==0:
        data_features_by_tracks = np.concatenate(([data_features_by_tracks,]*1, [dataset_by_tracks,]*1), 0)
        count-=1
    else:
        data_features_by_tracks = np.concatenate((data_features_by_tracks, [dataset_by_tracks,]*1), 0)

print(data_features_by_tracks.shape)


(1000, 271)


In [10]:
normalized_dataset_by_tracks = data_features_by_tracks
normalized_dataset_by_tracks[:,:-1] = normalize(normalized_dataset_by_tracks[:,:-1])

In [11]:
for number_of_centroids in range(2, len(data["mapping"])+1):
    print("\nTrying with " + str(number_of_centroids) + " centroids:\n")
    k_means_and_display_by_number_of_cluster_centers(number_of_centroids, normalized_dataset_by_tracks, 0)
    print("Try Complete.\n")
    


Trying with 2 centroids:

	pop	metal	disco	blues	reggae	classic	rock	hiphop	country	jazz	

K = 0 	95	75	82	21	46	2	53	79	31	25	

K = 1 	5	25	18	79	54	98	47	21	69	75	

Try Complete.


Trying with 3 centroids:

	pop	metal	disco	blues	reggae	classic	rock	hiphop	country	jazz	

K = 0 	78	6	30	0	15	0	11	33	12	11	

K = 1 	0	5	3	57	27	96	21	9	51	62	

K = 2 	22	89	67	43	58	4	68	58	37	27	

Try Complete.


Trying with 4 centroids:

	pop	metal	disco	blues	reggae	classic	rock	hiphop	country	jazz	

K = 0 	77	5	29	0	14	0	10	30	8	11	

K = 1 	0	2	0	35	5	83	7	0	21	40	

K = 2 	18	79	54	25	30	2	52	50	23	14	

K = 3 	5	14	17	40	51	15	31	20	48	35	

Try Complete.


Trying with 5 centroids:

	pop	metal	disco	blues	reggae	classic	rock	hiphop	country	jazz	

K = 0 	71	5	23	0	12	0	8	20	4	8	

K = 1 	2	83	31	29	5	2	40	25	12	9	

K = 2 	21	1	29	0	30	1	17	34	15	9	

K = 3 	0	2	0	35	5	83	7	0	20	40	

K = 4 	6	9	17	36	48	14	28	21	49	34	

Try Complete.


Trying with 6 centroids:

	pop	metal	disco	blues	reggae	classic	rock	

## Procedure complete. Observations are listed in the report.