In [1]:
import os
import librosa
import librosa.display
import warnings
warnings.filterwarnings('ignore')
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import kurtosis
from scipy.stats import skew
import csv

In [2]:
# File locations. `general` is presumably the Google Drive mount point of the
# runtime (TODO confirm); the other two paths are derived from it.
general = "/content/drive"
audio_path = f"{general}/Othercomputers/MacBookAir/사운드"   # folder holding the input audio files
output_path = f"{general}/MyDrive/audio_output"             # folder reserved for generated output

In [3]:
def song_names(audio_path):
    """List the files found directly inside a folder of audio files.

    Parameters
    ----------
    audio_path : str
        Path of the folder that holds the audio files.

    Returns
    -------
    songs : list of str
        File names, sorted alphabetically so repeated runs give the same order.
    paths : list of str
        Full path of every entry in ``songs`` (same order).

    Notes
    -----
    Sub-directories and hidden entries (names starting with ``.``, such as the
    ``.DS_Store`` files macOS creates) are skipped: they are not audio files
    and would make the later loading step fail.
    """
    songs = []
    paths = []

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for file in sorted(os.listdir(audio_path)):
        path = os.path.join(audio_path, file)
        if file.startswith('.') or not os.path.isfile(path):
            continue
        songs.append(file)
        paths.append(path)

    return songs, paths

In [4]:
def soundwave(paths, sr=22050, duration=60):
    """Load every audio file and strip its leading/trailing silence.

    Parameters
    ----------
    paths : list of str
        Paths of the audio files to load.
    sr : int or None, optional
        Sampling rate passed to ``librosa.load``. Defaults to 22050;
        44100 is another usable value.
    duration : float or None, optional
        Maximum number of seconds passed to ``librosa.load`` for each file.
        Defaults to 60.

    Returns
    -------
    audio_files : list of numpy.ndarray
        One trimmed waveform per input path, in the same order.
    sr : int or None
        Sampling rate reported by the last ``librosa.load`` call, or the
        requested ``sr`` when ``paths`` is empty.
    """
    audio_files = []
    # Start from the requested rate so an empty `paths` still has a value to return.
    loaded_sr = sr

    for path in paths:
        y, loaded_sr = librosa.load(path, sr=sr, duration=duration)
        y, _ = librosa.effects.trim(y)
        audio_files.append(y)

    return audio_files, loaded_sr

In [5]:
def get_features(y, sr):
    """Summarise one audio signal as a flat dictionary of scalar descriptors.

    Several librosa descriptors are computed, each is flattened to a vector,
    and every vector is reduced to six statistics (max, min, mean, std,
    kurtosis, skew). An estimated tempo is added at the end.

    Parameters
    ----------
    y : numpy.ndarray
        Audio time series.
    sr : int
        Sampling rate of ``y``.

    Returns
    -------
    dict
        Keys of the form ``'<descriptor>_<statistic>'`` for every descriptor
        (``centroid``, ``roloff``, ``flux``, ``rmse``, ``zcr``, ``contrast``,
        ``bandwidth``, ``flatness``, ``chroma_stft``, ``sample_silence``,
        ``harm``, ``perc``, ``mfcc_<i>``) plus a single ``'tempo'`` key.

    Raises
    ------
    ValueError
        If ``y`` is ``None`` or empty.
    """
    if y is None or len(y) == 0:
        raise ValueError('get_features() needs a non-empty audio signal')

    # Placeholders fix the order of the keys (and therefore of the resulting
    # columns); every one of them is filled in below.
    features = {'centroid': None, 'roloff': None, 'flux': None, 'rmse': None,
                'zcr': None, 'contrast': None, 'bandwidth': None, 'flatness': None, 'chroma_stft': None}

    # Count silence: number of samples that trimming would remove.
    # This is a single number, so its six "statistics" describe one value only.
    y_sound, _ = librosa.effects.trim(y=y)
    features['sample_silence'] = len(y) - len(y_sound)

    # Using librosa to calculate the features. The audio is always passed by
    # keyword because recent librosa releases no longer accept it positionally.
    features['chroma_stft'] = librosa.feature.chroma_stft(y=y, sr=sr).ravel()
    features['centroid'] = librosa.feature.spectral_centroid(y=y, sr=sr).ravel()
    features['roloff'] = librosa.feature.spectral_rolloff(y=y, sr=sr).ravel()
    features['zcr'] = librosa.feature.zero_crossing_rate(y=y).ravel()
    features['rmse'] = librosa.feature.rms(y=y).ravel()
    features['flux'] = librosa.onset.onset_strength(y=y, sr=sr).ravel()
    features['contrast'] = librosa.feature.spectral_contrast(y=y, sr=sr).ravel()
    features['bandwidth'] = librosa.feature.spectral_bandwidth(y=y, sr=sr).ravel()
    features['flatness'] = librosa.feature.spectral_flatness(y=y).ravel()

    # Harmonic / percussive components (extra descriptors).
    harm, perc = librosa.effects.hpss(y=y)
    features['harm'] = harm.ravel()
    features['perc'] = perc.ravel()

    # MFCC treatment: one descriptor per coefficient row (n_mfcc=20 by default).
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    for idx, v_mfcc in enumerate(mfcc):
        features['mfcc_{}'.format(idx)] = v_mfcc.ravel()

    # Get statistics from the vectors
    def get_moments(descriptors):
        """Reduce every descriptor to max/min/mean/std/kurtosis/skew."""
        result = {}
        for k, v in descriptors.items():
            result['{}_max'.format(k)] = np.max(v)
            result['{}_min'.format(k)] = np.min(v)
            result['{}_mean'.format(k)] = np.mean(v)
            result['{}_std'.format(k)] = np.std(v)
            result['{}_kurtosis'.format(k)] = kurtosis(v)
            result['{}_skew'.format(k)] = skew(v)
        return result

    dict_agg_features = get_moments(features)

    # librosa.beat.tempo is deprecated in favour of librosa.feature.tempo;
    # use the new location when this librosa version provides it.
    tempo_fn = getattr(librosa.feature, 'tempo', None) or librosa.beat.tempo
    dict_agg_features['tempo'] = tempo_fn(y=y, sr=sr)[0]

    return dict_agg_features

In [6]:
def final_extraction(dataset_dir=audio_path):
    """Extract the aggregated features of every audio file in a folder.

    Parameters
    ----------
    dataset_dir : str, optional
        Folder containing the audio files. Defaults to the value the
        module-level ``audio_path`` had when this function was defined.

    Returns
    -------
    pandas.DataFrame
        One row per audio file, in the order returned by ``song_names``;
        the columns are the keys produced by ``get_features``.
    """
    # Honour the argument: the folder passed by the caller is the one scanned.
    _, paths = song_names(dataset_dir)
    audio_files, sr = soundwave(paths)

    last_features = [get_features(audio, sr) for audio in audio_files]
    return pd.DataFrame(last_features)

In [7]:
%%time

# Run the whole extraction on the default audio folder; the cell magic above
# reports how long it took.
df_features = final_extraction()

CPU times: user 44.6 s, sys: 2.59 s, total: 47.2 s
Wall time: 49.6 s


In [8]:
# Persist the feature table as CSV in the current working directory,
# leaving out the row index.
df_features.to_csv(path_or_buf='Data.csv', index=False)

In [9]:
# Reload the feature table from the CSV file (rebinds `df_features`).
df_features = pd.read_csv(filepath_or_buffer='Data.csv')

In [10]:
# Preview only the first rows so the output stays small however many files were processed.
df_features.head()

Unnamed: 0,centroid_max,centroid_min,centroid_mean,centroid_std,centroid_kurtosis,centroid_skew,roloff_max,roloff_min,roloff_mean,roloff_std,...,mfcc_18_std,mfcc_18_kurtosis,mfcc_18_skew,mfcc_19_max,mfcc_19_min,mfcc_19_mean,mfcc_19_std,mfcc_19_kurtosis,mfcc_19_skew,tempo
0,4770.576476,147.877628,537.633031,488.240796,31.902617,5.412102,8591.748047,193.798828,829.326111,1062.228696,...,9.537965,0.10189,0.259081,32.9337,-33.39628,-4.97907,10.530147,-0.016696,0.28385,123.046875
1,908.569644,326.772485,566.82338,96.506892,-0.25479,0.01637,2002.587891,495.263672,930.468578,182.601987,...,9.587304,0.182229,-0.226135,26.164366,-34.581207,-7.516415,10.248731,-0.49589,0.072928,95.703125
2,5025.18553,157.374058,1407.873315,911.82719,0.64902,1.084272,9302.34375,118.432617,3051.481988,2372.126604,...,11.190249,-0.229197,0.051315,25.897486,-47.45894,-4.988096,10.317096,1.159996,-0.471385,86.132812
3,4143.25661,373.214565,1432.712155,512.005474,1.676911,0.756034,7568.920898,430.664062,2976.247002,1339.364612,...,8.298755,0.688311,0.505163,23.312233,-29.086233,-6.071716,8.93317,-0.340493,-0.008454,129.199219
4,3914.887835,332.882843,795.306111,393.694988,4.989771,2.008197,6815.258789,473.730469,1482.228753,1011.831476,...,7.554245,0.334192,-0.348806,15.17337,-25.707623,-5.705788,6.344338,0.020395,0.174523,129.199219


In [11]:
# Show every feature column name as a plain Python list.
df_features.columns.tolist()

['centroid_max',
 'centroid_min',
 'centroid_mean',
 'centroid_std',
 'centroid_kurtosis',
 'centroid_skew',
 'roloff_max',
 'roloff_min',
 'roloff_mean',
 'roloff_std',
 'roloff_kurtosis',
 'roloff_skew',
 'flux_max',
 'flux_min',
 'flux_mean',
 'flux_std',
 'flux_kurtosis',
 'flux_skew',
 'rmse_max',
 'rmse_min',
 'rmse_mean',
 'rmse_std',
 'rmse_kurtosis',
 'rmse_skew',
 'zcr_max',
 'zcr_min',
 'zcr_mean',
 'zcr_std',
 'zcr_kurtosis',
 'zcr_skew',
 'contrast_max',
 'contrast_min',
 'contrast_mean',
 'contrast_std',
 'contrast_kurtosis',
 'contrast_skew',
 'bandwidth_max',
 'bandwidth_min',
 'bandwidth_mean',
 'bandwidth_std',
 'bandwidth_kurtosis',
 'bandwidth_skew',
 'flatness_max',
 'flatness_min',
 'flatness_mean',
 'flatness_std',
 'flatness_kurtosis',
 'flatness_skew',
 'chroma_stft_max',
 'chroma_stft_min',
 'chroma_stft_mean',
 'chroma_stft_std',
 'chroma_stft_kurtosis',
 'chroma_stft_skew',
 'sample_silence_max',
 'sample_silence_min',
 'sample_silence_mean',
 'sample_silenc