In [1]:
import librosa
import numpy as np
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
from natsort import natsorted
from tqdm import tqdm

In [2]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Static features extraction

In [3]:
# Prefixes of the summarised features, in output-column order. Every prefix
# contributes three columns: <prefix>_mean, <prefix>_std and <prefix>_var.
_STATIC_STAT_PREFIXES = (
    'chroma_stft', 'chroma_cq', 'chroma_cens', 'melspectrogram', 'mfcc',
    'mfcc_delta', 'cent', 'spec_bw', 'contrast', 'rolloff', 'poly',
    'tonnetz', 'zcr', 'harm', 'perc', 'frame',
)


def _static_feature_columns():
    """Return the output column names of extract_static_feature, in order."""
    columns = ['music_ID', 'tempo', 'total_beats', 'average_beats']
    for prefix in _STATIC_STAT_PREFIXES:
        columns.extend([prefix + '_mean', prefix + '_std', prefix + '_var'])
    return columns


def _static_song_features(y, sr):
    """Compute the scalar features of one audio signal.

    Parameters
    ----------
    y : audio samples as returned by ``librosa.load``.
    sr : sampling rate of ``y``.

    Returns
    -------
    dict mapping every column name except ``music_ID`` to a scalar.
    """
    S = np.abs(librosa.stft(y))  # magnitude spectrogram, shared by contrast/poly
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    onset_frames = librosa.onset.onset_detect(y=y, sr=sr)

    # Raw (non-summarised) features keyed by their column prefix.
    raw = {
        'chroma_stft': librosa.feature.chroma_stft(y=y, sr=sr),
        'chroma_cq': librosa.feature.chroma_cqt(y=y, sr=sr),
        'chroma_cens': librosa.feature.chroma_cens(y=y, sr=sr),
        'melspectrogram': librosa.feature.melspectrogram(y=y, sr=sr),
        'mfcc': mfcc,
        'mfcc_delta': librosa.feature.delta(mfcc),
        'cent': librosa.feature.spectral_centroid(y=y, sr=sr),
        'spec_bw': librosa.feature.spectral_bandwidth(y=y, sr=sr),
        'contrast': librosa.feature.spectral_contrast(S=S, sr=sr),
        'rolloff': librosa.feature.spectral_rolloff(y=y, sr=sr),
        'poly': librosa.feature.poly_features(S=S, sr=sr),
        'tonnetz': librosa.feature.tonnetz(y=y, sr=sr),
        'zcr': librosa.feature.zero_crossing_rate(y),
        'harm': librosa.effects.harmonic(y),
        'perc': librosa.effects.percussive(y),
        # Only the first 20 detected onsets are summarised.
        'frame': librosa.frames_to_time(onset_frames[:20], sr=sr),
    }

    features = {
        # beat_track may hand back the tempo as a 1-element array; store a scalar.
        'tempo': float(np.atleast_1d(tempo)[0]),
        # NOTE(review): this sums the values in `beats` (presumably frame
        # indices), it does not count the beats. Kept as-is so the column stays
        # comparable with previously exported data -- confirm the intent.
        'total_beats': sum(beats),
        'average_beats': np.average(beats),
    }
    for prefix in _STATIC_STAT_PREFIXES:
        values = raw[prefix]
        features[prefix + '_mean'] = np.mean(values)
        features[prefix + '_std'] = np.std(values)
        features[prefix + '_var'] = np.var(values)
    return features


def extract_static_feature(path_chorus, path_VA):
    """Extract one row of song-level audio features per annotated ``.mp3`` clip.

    Files in ``path_chorus`` are visited in natural sort order. A file is
    processed only if the part of its name before the first ``.`` (the
    music_ID) appears in the first column of
    ``<path_VA>/static_annotations_std.csv``.

    Parameters
    ----------
    path_chorus : str
        Directory containing the ``.mp3`` clips.
    path_VA : str
        Directory containing ``static_annotations_std.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per processed clip, indexed by music_ID. Columns are
        ``music_ID``, ``tempo``, ``total_beats``, ``average_beats`` followed by
        ``<feature>_mean``, ``<feature>_std`` and ``<feature>_var`` for each
        summarised feature. Empty (columns only) when nothing matches.

    Notes
    -----
    * Rows are collected as dicts and turned into a DataFrame once, because
      ``Series.set_value`` no longer exists in pandas >= 1.0.
    * The annotation file is read once instead of once per clip.
    * Clips are matched by id membership rather than by a running row
      counter, so a missing audio file no longer causes every following clip
      to be skipped, and running past the last annotation row no longer raises
      ``IndexError``.
    """
    annotations = pd.read_csv(
        os.path.join(path_VA, 'static_annotations_std.csv'), header=None
    )
    # The file is read without a header and the previous implementation started
    # matching at row 1, so row 0 is treated as the header line and skipped.
    annotated_ids = set(annotations.iloc[1:, 0].astype(str).str.strip())

    records = []
    song_ids = []
    for mp3_file in tqdm(natsorted(os.listdir(path_chorus))):
        if not mp3_file.endswith(".mp3"):
            continue

        # The song number (music_ID) is taken from the file name.
        song_id = os.path.basename(mp3_file).split('.')[0]
        if song_id not in annotated_ids:
            continue

        y, sr = librosa.load(os.path.join(path_chorus, mp3_file))

        record = {'music_ID': song_id}
        record.update(_static_song_features(y, sr))
        records.append(record)
        song_ids.append(song_id)

    return pd.DataFrame(records, index=song_ids, columns=_static_feature_columns())


In [4]:
# Run the static (song-level) feature extraction.
# NOTE(review): both paths below are absolute and specific to one machine;
# change them before running this cell anywhere else.
# The saved output of this cell shows a run of about 1h34m over 794 directory entries.

# Alternative data location (disabled):
#path_chorus = '/Users/gioelepozzi/Desktop/MasterThesis/code/features_extraction/data'

path_chorus = '/Users/gioelepozzi/Desktop/data/chorus'
path_VA = '/Users/gioelepozzi/Desktop/data/annotations'

static = extract_static_feature(path_chorus, path_VA)

100%|██████████| 794/794 [1:34:26<00:00,  7.14s/it]


In [5]:
# Write the static feature table to a CSV file in the working directory.
# index=False leaves the DataFrame index out of the file.

static.to_csv('static_features.csv', index=False)
# JSON export is disabled (note that it refers to `feature_set`, not `static`):
#feature_set.to_json('Emotion_features.json')

# Dynamic features extraction

In [6]:
def window_with_overlap(a, window, stride):
    """Cut a 1-D array into fixed-length windows taken every ``stride`` items.

    Parameters
    ----------
    a : numpy.ndarray
        Input array; ``a.size`` and ``a.strides[0]`` are used, so it is treated
        as one-dimensional.
    window : int
        Number of elements per window.
    stride : int
        Number of elements between the starts of consecutive windows.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(nrows, window)`` where
        ``nrows = (a.size - window) // stride + 1``. Trailing elements that do
        not fill a whole window are dropped. When ``a`` is shorter than
        ``window`` an empty ``(0, window)`` array of the same dtype is returned
        (the row count would otherwise go negative and ``as_strided`` would
        raise ``ValueError``).

    Notes
    -----
    For non-empty results the rows are a strided view into ``a``, not copies:
    they share memory with ``a`` and with each other when ``stride < window``,
    so treat the result as read-only.
    """
    if a.size < window:
        return np.empty((0, window), dtype=a.dtype)

    nrows = ((a.size - window) // stride) + 1
    item_step = a.strides[0]  # bytes between consecutive elements of `a`
    # Row i starts stride*i elements into `a`; each row walks `window` elements.
    return np.lib.stride_tricks.as_strided(
        a, shape=(nrows, window), strides=(stride * item_step, item_step)
    )

In [8]:
def _dynamic_frame_features(frame, sr):
    """Compute the scalar features of one audio window.

    Parameters
    ----------
    frame : 1-D array of audio samples (one window of a clip).
    sr : sampling rate of ``frame``.

    Returns
    -------
    dict mapping feature column names to scalars, in output-column order.
    """
    S = np.abs(librosa.stft(frame))  # magnitude spectrogram, shared by contrast/poly
    tempo, beats = librosa.beat.beat_track(y=frame, sr=sr)
    mfcc = librosa.feature.mfcc(y=frame, sr=sr)
    onset_frames = librosa.onset.onset_detect(y=frame, sr=sr)

    # (column prefix, raw feature) pairs; each becomes <prefix>_mean/_std/_var.
    summarised = [
        ('chroma_stft', librosa.feature.chroma_stft(y=frame, sr=sr)),
        ('chroma_cq', librosa.feature.chroma_cqt(y=frame, sr=sr)),
        ('chroma_cens', librosa.feature.chroma_cens(y=frame, sr=sr)),
        ('melspectrogram', librosa.feature.melspectrogram(y=frame, sr=sr)),
        ('mfcc', mfcc),
        ('mfcc_delta', librosa.feature.delta(mfcc)),
        ('cent', librosa.feature.spectral_centroid(y=frame, sr=sr)),
        ('spec_bw', librosa.feature.spectral_bandwidth(y=frame, sr=sr)),
        ('contrast', librosa.feature.spectral_contrast(S=S, sr=sr)),
        ('rolloff', librosa.feature.spectral_rolloff(y=frame, sr=sr)),
        ('poly', librosa.feature.poly_features(S=S, sr=sr)),
        ('tonnetz', librosa.feature.tonnetz(y=frame, sr=sr)),
        ('zcr', librosa.feature.zero_crossing_rate(frame)),
        ('harm', librosa.effects.harmonic(frame)),
        ('perc', librosa.effects.percussive(frame)),
        # Only the first 20 detected onsets are summarised.
        ('frame', librosa.frames_to_time(onset_frames[:20], sr=sr)),
    ]

    features = {
        # beat_track may hand back the tempo as a 1-element array; store a scalar.
        'tempo': float(np.atleast_1d(tempo)[0]),
        # NOTE(review): this sums the values in `beats` (presumably frame
        # indices), it does not count the beats. Kept as-is so the column stays
        # comparable with previously exported data -- confirm the intent.
        'total_beats': sum(beats),
        'average_beats': np.average(beats),
    }
    for prefix, values in summarised:
        features[prefix + '_mean'] = np.mean(values)
        features[prefix + '_std'] = np.std(values)
        features[prefix + '_var'] = np.var(values)
    return features


def extract_dynamic_feature(path_chorus, path_VA, window, stride):
    """Extract windowed audio features for every annotated ``.mp3`` clip.

    Files in ``path_chorus`` are visited in natural sort order. A file is
    processed only if the part of its name before the first ``.`` (the
    music_ID) appears in the first column of
    ``<path_VA>/static_annotations_std.csv``. Each processed clip is cut into
    windows of ``window`` samples taken every ``stride`` samples, and one
    feature row is produced per window.

    Parameters
    ----------
    path_chorus : str
        Directory containing the ``.mp3`` clips.
    path_VA : str
        Directory containing ``static_annotations_std.csv``.
    window : int
        Window length in samples.
    stride : int
        Hop between consecutive windows in samples.

    Returns
    -------
    pandas.DataFrame
        One row per (clip, window) with columns ``music_ID``, ``frameTime``,
        ``tempo``, ``total_beats``, ``average_beats`` and
        ``<feature>_mean/_std/_var`` for each summarised feature. The index is
        a plain 0..n-1 range. Empty when nothing matches.

    Notes
    -----
    * ``frameTime`` is the end of the window in seconds,
      ``(fidx * stride + window) / sr``. For ``window=22050``,
      ``stride=11025`` at a 22050 Hz sampling rate this equals the formerly
      hard-coded ``fidx * 0.5 + 1``.
    * Rows are collected in a list and converted once; ``DataFrame.append``
      no longer exists in pandas >= 2.0 and was quadratic when called per row.
    * The annotation file is read once instead of once per clip, and clips are
      matched by id membership rather than by a running row counter.
    * Clips shorter than one window yield no rows instead of raising.
    """
    annotations = pd.read_csv(
        os.path.join(path_VA, 'static_annotations_std.csv'), header=None
    )
    # The file is read without a header and the previous implementation started
    # matching at row 1, so row 0 is treated as the header line and skipped.
    annotated_ids = set(annotations.iloc[1:, 0].astype(str).str.strip())

    records = []
    for mp3_file in tqdm(natsorted(os.listdir(path_chorus))):
        if not mp3_file.endswith(".mp3"):
            continue

        # The song number (music_ID) is taken from the file name.
        song_id = os.path.basename(mp3_file).split('.')[0]
        if song_id not in annotated_ids:
            continue

        y, sr = librosa.load(os.path.join(path_chorus, mp3_file))

        # A clip shorter than one window has no complete frame to analyse.
        if len(y) < window:
            continue

        frames = window_with_overlap(y, window, stride)
        for fidx, frame in enumerate(frames):
            record = {
                'music_ID': song_id,
                'frameTime': (fidx * stride + window) / sr,
            }
            record.update(_dynamic_frame_features(frame, sr))
            records.append(record)

    return pd.DataFrame(records)

In [None]:
# Run the dynamic (windowed) feature extraction.
# window and stride are sample counts: windows of 22050 samples taken every
# 11025 samples, i.e. consecutive windows overlap by half.
# NOTE(review): both paths below are absolute and specific to one machine;
# change them before running this cell anywhere else.

# Alternative data location (disabled):
#path_chorus = '/Users/gioelepozzi/Desktop/MasterThesis/code/features_extraction/data'

path_chorus = '/Users/gioelepozzi/Desktop/data/chorus'
path_VA = '/Users/gioelepozzi/Desktop/data/annotations'

dynamic = extract_dynamic_feature(path_chorus, path_VA, window=22050, stride=11025)


  0%|          | 0/794 [00:00<?, ?it/s][A

  0%|          | 2/794 [00:29<3:20:53, 15.22s/it][A
  0%|          | 3/794 [01:11<5:05:51, 23.20s/it][A
  1%|          | 4/794 [01:24<4:28:24, 20.39s/it][A
  1%|          | 5/794 [01:38<4:01:45, 18.38s/it][A
  1%|          | 6/794 [01:57<4:03:26, 18.54s/it][A
  1%|          | 7/794 [02:08<3:33:16, 16.26s/it][A
  1%|          | 8/794 [02:23<3:28:36, 15.92s/it][A
  1%|▏         | 10/794 [02:36<2:51:15, 13.11s/it][A
  1%|▏         | 11/794 [02:52<3:01:11, 13.88s/it][A
  2%|▏         | 12/794 [03:04<2:54:39, 13.40s/it][A
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)

  2%|▏         | 14/794 [03:46<3:48:17, 17.56s/it][A
  2%|▏         | 15/794 [04:11<4:18:10, 19.89s/it][A
  2%|▏         | 16/794 [04:36<4:36:38, 21.33s/it][A
  2%|▏         | 17/794 [04:56<4:32:40, 21.06s/it][A
  2%|▏         | 18/794 [05:36<5:43:58, 26.60s/it][A
  2%|▏         | 19/794 [05:57<5:23:44, 25.06s/it][A
  3%|▎         | 20/794 [06:11<4:39:22,

 19%|█▊        | 148/794 [47:33<3:29:24, 19.45s/it][A
 19%|█▉        | 149/794 [47:46<3:08:02, 17.49s/it][A
 19%|█▉        | 150/794 [48:02<3:03:06, 17.06s/it][A
 19%|█▉        | 151/794 [48:13<2:42:25, 15.16s/it][A
 19%|█▉        | 152/794 [48:28<2:40:17, 14.98s/it][A
 19%|█▉        | 153/794 [48:44<2:44:18, 15.38s/it][A
 19%|█▉        | 154/794 [48:53<2:24:34, 13.55s/it][A
 20%|█▉        | 155/794 [49:02<2:09:33, 12.16s/it][A
 20%|█▉        | 156/794 [49:20<2:28:17, 13.95s/it][A
 20%|█▉        | 157/794 [49:38<2:40:36, 15.13s/it][A
 20%|█▉        | 158/794 [49:54<2:41:28, 15.23s/it][A
 20%|██        | 159/794 [50:08<2:39:02, 15.03s/it][A
 20%|██        | 160/794 [50:24<2:41:48, 15.31s/it][A
 20%|██        | 161/794 [50:35<2:28:33, 14.08s/it][A
 20%|██        | 162/794 [50:46<2:16:17, 12.94s/it][A
 21%|██        | 163/794 [50:59<2:17:26, 13.07s/it][A
 21%|██        | 164/794 [51:08<2:05:43, 11.97s/it][A
 21%|██        | 165/794 [51:26<2:22:57, 13.64s/it][A
 21%|██   

 38%|███▊      | 304/794 [1:31:51<2:45:54, 20.31s/it][A
 38%|███▊      | 305/794 [1:32:00<2:16:48, 16.79s/it][A
 39%|███▊      | 306/794 [1:32:17<2:18:38, 17.05s/it][A
 39%|███▊      | 307/794 [1:32:31<2:10:47, 16.11s/it][A
 39%|███▉      | 309/794 [1:32:52<1:56:37, 14.43s/it][A
 39%|███▉      | 310/794 [1:33:24<2:37:59, 19.58s/it][A
 39%|███▉      | 311/794 [1:33:37<2:22:37, 17.72s/it][A
 39%|███▉      | 312/794 [1:33:50<2:10:47, 16.28s/it][A
 39%|███▉      | 313/794 [1:34:07<2:12:22, 16.51s/it][A
 40%|███▉      | 315/794 [1:34:21<1:48:28, 13.59s/it][A
 40%|███▉      | 316/794 [1:34:47<2:17:15, 17.23s/it][A
 40%|███▉      | 317/794 [1:35:12<2:35:21, 19.54s/it][A
 40%|████      | 318/794 [1:35:31<2:34:11, 19.44s/it][A
 40%|████      | 319/794 [1:35:53<2:40:30, 20.27s/it][A
 40%|████      | 320/794 [1:36:09<2:29:47, 18.96s/it][A
 40%|████      | 321/794 [1:36:29<2:31:34, 19.23s/it][A
 41%|████      | 322/794 [1:36:54<2:44:44, 20.94s/it][A
 41%|████      | 323/794 [1:37:

In [None]:
# Write the dynamic feature table to a CSV file in the working directory.
# index=False leaves the DataFrame index out of the file.

dynamic.to_csv('dynamic_features.csv', index=False)
# JSON export is disabled (note that it refers to `feature_set`, not `dynamic`):
#feature_set.to_json('Emotion_features.json')