# Create the dataset by following the feature extraction in the paper

## Dependencies

In [1]:
import os
import pandas as pd
import librosa as lr
import librosa.feature as lrf
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt

## Loading Data

In [2]:
# Sampling rate in Hz; noted by the author as identical for every file.
SR = 22050

# Root folder of the dataset; each entry directly below it is treated as a genre.
signals_path = "gtzan_dataset/genres_original"

# One joined path per directory entry found under the dataset root.
genre_paths = list(
    map(lambda entry: os.path.join(signals_path, entry), os.listdir(signals_path))
)

def contains_dsstore(s):
    '''
    Tell whether any component of the path `s` is the macOS metadata file
    ".DS_Store".

    The match is on whole path components, so "a/.DS_Store" and
    "a/.DS_Store/b" match while "a/x.DS_Store" does not.

    Parameters
    ----------
    s : str
        A file or directory path.

    Returns
    -------
    bool
        True if one path component is exactly ".DS_Store", False otherwise.
    '''
    # Paths in this file are built both with os.path.join and with a literal
    # "/", so accept the platform separator as well as its alternate one
    # instead of assuming "/" only. Where os.altsep is None the path is split
    # on os.sep alone.
    normalized = s.replace(os.altsep, os.sep) if os.altsep else s
    return ".DS_Store" in normalized.split(os.sep)

def generate_file_paths(genre_paths):
    '''
    List the path of every entry inside the given genre directories.

    Entries are not filtered, so file-system artefacts such as ".DS_Store"
    are included if they are present in a directory.

    Parameters
    ----------
    genre_paths : iterable of str
        Directories to scan; each is listed with os.listdir.

    Returns
    -------
    list of str
        "<genre_path>/<entry>" for every entry, sorted in ascending order.
    '''
    # The accumulator used to be appended to without ever being created, which
    # raised NameError on the first call; build the list locally instead.
    return sorted(
        genre + "/" + file
        for genre in genre_paths
        for file in os.listdir(genre)
    )


def load_signals(genre_paths):
    '''
    Load every audio file found in the given genre directories.

    Paths for which contains_dsstore() is true are skipped, both at the
    directory level and at the file level.

    Parameters
    ----------
    genre_paths : iterable of str
        One directory per genre. The last component of each path is used as
        the genre label.

    Returns
    -------
    numpy.ndarray
        Object array of shape (n_files, 2). Column 0 holds the genre label,
        column 1 the first element returned by lr.load for that file. Rows
        are sorted by genre label in ascending order; within a genre they
        follow the sorted file names.
    '''
    signals = []
    for genre_path in genre_paths:
        # Skip file system specific paths.
        if contains_dsstore(genre_path):
            continue
        # Take the directory name itself as the label. Indexing a "/"-split at
        # a fixed position only worked for one specific directory depth and
        # broke for paths joined with another separator.
        genre = os.path.basename(os.path.normpath(genre_path))
        # os.listdir returns entries in arbitrary order; sort them so that the
        # resulting dataset is reproducible across machines and runs.
        for file in sorted(os.listdir(genre_path)):
            file_path = genre_path + "/" + file
            # Skip file system specific paths.
            if contains_dsstore(file_path):
                continue
            signal = lr.load(file_path)[0]
            signals.append((genre, signal))

    # Stable sort by genre name (ascending).
    signals.sort(key=lambda entry: entry[0])

    # Fill an explicit object array: waveforms may differ in length, and
    # letting np.array infer a shape from such ragged rows is an error on
    # recent NumPy versions.
    table = np.empty((len(signals), 2), dtype=object)
    for row, (genre, signal) in enumerate(signals):
        table[row, 0] = genre
        table[row, 1] = signal
    return table

# Load the whole dataset once; each entry of `signals` pairs a genre label
# (index 0) with the loaded waveform (index 1).
signals = load_signals(genre_paths)



## Feature Extraction

In [3]:
def feature_extraction(signal):
    '''
    Extract the features used in the paper for a single audio signal.

    Frame-level descriptors (each summarised by its mean and its standard
    deviation, in this order):

    - Timbral texture: spectral centroid, spectral rolloff, flux (librosa's
      onset strength envelope), zero crossings, and the first five MFCC
      coefficients.
    - Pitch content: tempo, 12 chroma components (one per note), the first 10
      mel bands of the dB-scaled mel spectrogram, and 6 tonnetz dimensions.

    Rhythmic content (appended after the summaries): the mean of the
    tempogram, the two largest values of the onset strength envelope (A0, A1)
    and their ratio RA = A1 / A0.

    NOTE: the spread statistic is np.std, i.e. a standard deviation, even
    though the dataset built from this output labels those columns "Variance".

    Parameters
    ----------
    signal : numpy.ndarray
        Audio time series as returned by lr.load.

    Returns
    -------
    numpy.ndarray
        1-D array: (mean, std) for each of the 38 descriptors, followed by the
        4 rhythmic values (80 values in total).
    '''
    # Default librosa parameters for the functions are as follows:
    # sr=22050, n_fft=2048, hop_length=512

    # Feature Parameters from the paper
    n_mfcc = 5 # first 5 coefficients perform better for genre classification
    n_mel = 10
    n_tonnetz = 6

    # The signal is passed by keyword (y=...) throughout: recent librosa
    # releases accept it only as a keyword argument, and older releases accept
    # the keyword form as well.

    # Computed once and reused: it is both the "flux" descriptor and the
    # source of the peak features further down.
    onset_envelope = lr.onset.onset_strength(y=signal)

    feature_vector = []

    # Timbral Texture Features
    feature_vector.append(lrf.spectral_centroid(y=signal)) # Spectral Centroid
    feature_vector.append(lrf.spectral_rolloff(y=signal)) # Rolloff
    feature_vector.append(onset_envelope) # Flux
    feature_vector.append(lr.zero_crossings(signal)) # Zero Crossings
    feature_vector.extend(lrf.mfcc(y=signal, n_mfcc=n_mfcc)) # First 5 MFCCs

    # Pitch Content Features
    # NOTE(review): newer librosa releases may expose tempo estimation under
    # librosa.feature instead of librosa.beat - confirm against the installed
    # version.
    feature_vector.append(lr.beat.tempo(y=signal)) # Tempo
    feature_vector.extend(lrf.chroma_stft(y=signal, n_chroma=12)) # 12 chroma bins, one per note
    feature_vector.extend(lr.power_to_db(lrf.melspectrogram(y=signal))[:n_mel, :]) # First 10 mel bands (dB)
    feature_vector.extend(lrf.tonnetz(y=signal)[:n_tonnetz, :]) # 6 tonnetz dimensions

    # Summarise every descriptor by its mean and standard deviation.
    extracted_features = []
    for feature in feature_vector:
        extracted_features.append(np.mean(feature))
        extracted_features.append(np.std(feature))

    # Rhythmic Content Features
    # Envelope values sorted from largest to smallest; the first two are used
    # as the peak amplitudes.
    peaks = -np.sort(-onset_envelope.flatten())
    A0 = peaks[0]
    A1 = peaks[1]
    RA = A1/A0

    extracted_features.append(np.mean(lrf.tempogram(y=signal)))
    extracted_features.append(A0)
    extracted_features.append(A1)
    extracted_features.append(RA)

    return np.array(extracted_features)

def feature_extraction_raw(signal):
    '''
    Extract the same features as feature_extraction, but summarise each
    frame-level descriptor by its mean only (no spread statistic).

    It was mainly used for debugging and validation between different datasets.

    Parameters
    ----------
    signal : numpy.ndarray
        Audio time series as returned by lr.load.

    Returns
    -------
    numpy.ndarray
        1-D array: the mean of each of the 38 descriptors (spectral centroid,
        rolloff, flux, zero crossings, 5 MFCCs, tempo, 12 chroma bins, 10 mel
        bands, 6 tonnetz dimensions), followed by the tempogram mean, the two
        largest onset strength values (A0, A1) and their ratio A1 / A0
        (42 values in total).
    '''
    # Default librosa parameters for the functions are as follows:
    # sr=22050, n_fft=2048, hop_length=512

    # Feature Parameters from the paper
    n_mfcc = 5 # first 5 coefficients perform better for genre classification
    n_mel = 10
    n_tonnetz = 6

    # The signal is passed by keyword (y=...) throughout: recent librosa
    # releases accept it only as a keyword argument, and older releases accept
    # the keyword form as well.

    # Computed once and reused: it is both the "flux" descriptor and the
    # source of the peak features further down.
    onset_envelope = lr.onset.onset_strength(y=signal)

    feature_vector = []

    # Timbral Texture Features
    feature_vector.append(lrf.spectral_centroid(y=signal)) # Spectral Centroid
    feature_vector.append(lrf.spectral_rolloff(y=signal)) # Rolloff
    feature_vector.append(onset_envelope) # Flux
    feature_vector.append(lr.zero_crossings(signal)) # Zero Crossings
    feature_vector.extend(lrf.mfcc(y=signal, n_mfcc=n_mfcc)) # First 5 MFCCs

    # Pitch Content Features
    # NOTE(review): newer librosa releases may expose tempo estimation under
    # librosa.feature instead of librosa.beat - confirm against the installed
    # version.
    feature_vector.append(lr.beat.tempo(y=signal)) # Tempo
    feature_vector.extend(lrf.chroma_stft(y=signal, n_chroma=12)) # First 12 Chroma keys for 12 notes.
    feature_vector.extend(lr.power_to_db(lrf.melspectrogram(y=signal))[:n_mel, :]) # First 10 mel bands (dB)
    feature_vector.extend(lrf.tonnetz(y=signal)[:n_tonnetz, :]) # 6 tonnetz dimensions

    # Summarise every descriptor by its mean.
    extracted_features = [np.mean(feature) for feature in feature_vector]

    # Rhythmic Content Features
    # Envelope values sorted from largest to smallest; the first two are used
    # as the peak amplitudes.
    peaks = -np.sort(-onset_envelope.flatten())
    A0 = peaks[0]
    A1 = peaks[1]
    RA = A1/A0

    extracted_features.append(np.mean(lrf.tempogram(y=signal)))
    extracted_features.append(A0)
    extracted_features.append(A1)
    extracted_features.append(RA)

    return np.array(extracted_features)

In [4]:
# Build one feature row per loaded track, then stack the rows into a 2-D array.
feature_rows = []
for signal in signals:
    waveform = signal[1]  # index 0 is the genre label, index 1 the audio
    feature_rows.append(feature_extraction(waveform))
extracted_features = np.array(feature_rows)

## Dataset Creation (CSV file for features)

In [5]:
# Descriptors that are summarised per track, in the order in which
# feature_extraction emits them.
summary_targets = ["Spectral Centroid", "Spectral Rolloff", "Spectral Flux", "Zero Crossings"]
summary_targets += ["MFCC #{}".format(index) for index in range(1, 6)]
summary_targets += ["Tempo"]
summary_targets += ["ChromaSTFT #{}".format(index) for index in range(1, 13)]
summary_targets += ["MelScale #{}".format(index) for index in range(1, 11)]
summary_targets += ["Tonnetz #{}".format(index) for index in range(1, 7)]

# Two columns per descriptor, followed by the four rhythmic columns.
# NOTE: the "Variance ..." columns hold the np.std values produced by
# feature_extraction, i.e. standard deviations.
feature_columns = []
for target in summary_targets:
    feature_columns.append("Mean " + target)
    feature_columns.append("Variance " + target)
feature_columns += ["Beat Histogram Sum", "BH Peak #1", "BH Peak #2", "BH Peak Ratio #1"]

df = pd.DataFrame(extracted_features, columns=feature_columns)
# Attach the genre label of every track as the last column.
df["Genre"] = signals[:, 0]

In [6]:
# Write the feature table to disk; index=False leaves the row index out of the file.
df.to_csv("dataset_gtzan.csv", index=False)

In [10]:
# Display up to the first 999 rows of the feature table (notebook output).
df.head(999)

Unnamed: 0,Mean Spectral Centroid,Variance Spectral Centroid,Mean Spectral Rolloff,Variance Spectral Rolloff,Mean Spectral Flux,Variance Spectral Flux,Mean Zero Crossings,Variance Zero Crossings,Mean MFCC #1,Variance MFCC #1,...,Variance Tonnetz #4,Mean Tonnetz #5,Variance Tonnetz #5,Mean Tonnetz #6,Variance Tonnetz #6,Beat Histogram Sum,BH Peak #1,BH Peak #2,BH Peak Ratio #1,Genre
0,570.040355,323.613897,927.651390,705.168708,0.813978,1.357986,0.021729,0.145797,-350.434662,51.552490,...,0.130899,-0.005321,0.039374,0.012748,0.030627,0.097238,18.117476,16.737059,0.923807,blues
1,1441.899257,622.389586,3082.803607,1481.811552,1.435319,1.628892,0.050946,0.219888,-155.558823,55.480629,...,0.125156,0.007895,0.032792,0.005047,0.032068,0.145336,15.125031,13.971641,0.923743,blues
2,1945.533108,384.131048,4174.593628,899.048142,1.392661,1.483896,0.085923,0.280250,-82.999718,50.050198,...,0.065041,-0.000592,0.020806,0.004117,0.017707,0.157225,11.824595,8.863704,0.749599,blues
3,2278.961252,413.395578,5198.594860,754.756161,1.326435,1.197424,0.092627,0.289909,-109.512589,28.744064,...,0.089479,0.004153,0.023568,0.005059,0.022940,0.186434,7.920386,6.736348,0.850507,blues
4,2333.510454,299.691821,4942.594552,693.969775,1.480888,1.214917,0.123987,0.329567,-2.510370,27.075670,...,0.055069,0.012361,0.016483,-0.000886,0.019928,0.198708,6.146051,6.122344,0.996143,blues
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,1867.082394,631.244890,4270.518802,1714.234772,1.388507,1.597367,0.057059,0.231954,-123.998718,49.103390,...,0.106757,0.007178,0.032550,-0.002983,0.030904,0.138249,10.277828,9.833135,0.956733,rock
995,2158.206042,499.372945,4245.912995,961.745988,1.192738,0.833018,0.121725,0.326968,-55.294132,35.307655,...,0.081412,-0.002376,0.021627,0.000041,0.022703,0.218358,7.871684,6.534355,0.830109,rock
996,2188.143060,565.473131,4432.584050,1179.301346,1.226849,0.966531,0.117682,0.322231,-56.918018,44.935715,...,0.077917,0.001694,0.021165,0.016555,0.020915,0.201039,7.513299,6.999835,0.931659,rock
997,2788.250576,531.002638,5831.991493,1115.141943,1.149338,0.715216,0.135109,0.341840,-28.014147,25.742531,...,0.088187,-0.001782,0.025949,-0.001694,0.022840,0.240391,5.986727,5.573112,0.930911,rock


In [7]:
# Build one mean-only feature row per loaded track, then stack the rows.
raw_feature_rows = []
for signal in signals:
    waveform = signal[1]  # index 0 is the genre label, index 1 the audio
    raw_feature_rows.append(feature_extraction_raw(waveform))
extracted_features_raw = np.array(raw_feature_rows)

In [8]:
# Descriptors that are averaged per track, in the order in which
# feature_extraction_raw emits them.
raw_targets = ["Spectral Centroid", "Spectral Rolloff", "Spectral Flux", "Zero Crossings"]
raw_targets += ["MFCC #{}".format(index) for index in range(1, 6)]
raw_targets += ["Tempo"]
raw_targets += ["ChromaSTFT #{}".format(index) for index in range(1, 13)]
raw_targets += ["MelScale #{}".format(index) for index in range(1, 11)]
raw_targets += ["Tonnetz #{}".format(index) for index in range(1, 7)]

# One "Mean ..." column per descriptor, followed by the four rhythmic columns.
raw_columns = ["Mean " + target for target in raw_targets]
raw_columns += ["Beat Histogram Sum", "BH Peak #1", "BH Peak #2", "BH Peak Ratio #1"]

df_raw = pd.DataFrame(extracted_features_raw, columns=raw_columns)
# Attach the genre label of every track as the last column.
df_raw["Genre"] = signals[:, 0]

In [9]:
# Write the mean-only feature table to disk; index=False leaves the row index out of the file.
df_raw.to_csv("dataset_gtzan_raw.csv", index=False)