In [1]:
import numpy as np
import os, glob
import os.path
import ffmpeg
from pydub import AudioSegment
import pydub
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt

# MP3 파일 wav 형식으로 변환하기

In [2]:
# Convert the MP3 files of every emotion folder to WAV format.
# The WAV files are written into the same folder as their MP3 sources.
emotion_class = ['Exciting','Fear','Neutral','Relaxation','Sad','Tension']

# Running counter shared by all folders so each exported WAV gets a unique numeric name.
i = 0
for emotion in emotion_class:
    target_dir = r'Music_Video_Emotion_Dataset-master/' + emotion +'mp3'
    # Only feed real MP3 inputs to the decoder: the exported WAVs live in this same
    # folder, so an unfiltered listing would break (or duplicate work) when the cell
    # is re-run. Sorting makes the numbering deterministic across runs/platforms.
    files = sorted(name for name in os.listdir(target_dir)
                   if name.lower().endswith('.mp3'))
    for music in files:
        file_name = os.path.join(target_dir, music)
        wav_name = os.path.join(target_dir, str(i) + '.wav')
        AudioSegment.from_mp3(file_name).export(wav_name, format='wav')
        i += 1

# Mel spectrogram 만들기

In [3]:
# ---- Model architecture ----
NUM_FRAMES = 96          # Number of frames in one input mel-spectrogram patch.
NUM_BANDS = 64           # Number of frequency bands in one input patch.
EMBEDDING_SIZE = 128     # Width of the embedding layer.

# ---- Feature / example generation ----
SAMPLE_RATE = 16000
STFT_WINDOW_LENGTH_SECONDS = 0.025
STFT_HOP_LENGTH_SECONDS = 0.010
NUM_MEL_BINS = NUM_BANDS
MEL_MIN_HZ = 125
MEL_MAX_HZ = 7500
LOG_OFFSET = 0.01        # Added before the log to keep it finite on silent bins.
# One example spans 96 frames of 10 ms each; consecutive examples do not overlap.
EXAMPLE_WINDOW_SECONDS = 0.96
EXAMPLE_HOP_SECONDS = 0.96

# ---- Embedding post-processing ----
PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors'
PCA_MEANS_NAME = 'pca_means'
QUANTIZE_MIN_VAL = -2.0
QUANTIZE_MAX_VAL = 2.0

# ---- Training ----
INIT_STDDEV = 0.01       # Standard deviation for weight initialisation.
LEARNING_RATE = 0.0001   # Adam learning rate.
ADAM_EPSILON = 1e-8      # Adam epsilon.

# ---- Op / tensor / feature names ----
INPUT_OP_NAME = 'vggish/input_features'
INPUT_TENSOR_NAME = '{}:0'.format(INPUT_OP_NAME)
OUTPUT_OP_NAME = 'vggish/embedding'
OUTPUT_TENSOR_NAME = '{}:0'.format(OUTPUT_OP_NAME)
AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding'

In [4]:
import numpy as np


def frame(data, window_length, hop_length):
    """Slice `data` along its first axis into fixed-length, possibly overlapping frames.

    Args:
        data: np.ndarray; the first axis is the one that gets framed.
        window_length: number of entries of the first axis in each frame.
        hop_length: step (in entries of the first axis) between frame starts.

    Returns:
        An array of shape (num_frames, window_length) + data.shape[1:].
        Only complete frames are produced, so trailing entries that do not
        fill a whole window are dropped. Input shorter than one window
        yields zero frames.
    """
    num_samples = data.shape[0]
    # Clamp at zero: for inputs much shorter than one window the formula goes
    # negative, which would otherwise surface as a "negative dimensions"
    # ValueError from as_strided instead of an empty result.
    num_frames = max(
        0, 1 + int(np.floor((num_samples - window_length) / hop_length)))
    shape = (num_frames, window_length) + data.shape[1:]
    strides = (data.strides[0] * hop_length,) + data.strides

    # as_strided builds the frames as a view onto `data` (no copy), so
    # overlapping frames share memory with each other and with the input.
    return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)


def periodic_hann(window_length):
    """Return a raised-cosine window of `window_length` points.

    The cosine period equals `window_length` itself, so sample k is
    0.5 - 0.5 * cos(2*pi*k / window_length) for k = 0 .. window_length - 1.

    Args:
        window_length: number of points in the window.

    Returns:
        1-D np.ndarray with `window_length` entries.
    """
    phase_step = 2 * np.pi / window_length
    phases = phase_step * np.arange(window_length)
    return 0.5 - (0.5 * np.cos(phases))


def stft_magnitude(signal, fft_length,
                   hop_length=None,
                   window_length=None):
    """Compute the magnitude of the short-time Fourier transform of `signal`.

    The signal is cut into frames with frame(), each frame is multiplied by
    the periodic_hann() window, and the absolute value of the real FFT of
    each windowed frame is returned.

    Args:
        signal: np.ndarray of input samples.
        fft_length: FFT size handed to np.fft.rfft (cast to int).
        hop_length: step between frame starts, in samples.
        window_length: frame length, in samples.

    Returns:
        np.ndarray of magnitudes, one row per frame.

    NOTE(review): hop_length and window_length default to None but are passed
    straight through to frame()/periodic_hann(); callers appear to be expected
    to always supply both.
    """
    chunks = frame(signal, window_length, hop_length)
    taper = periodic_hann(window_length)
    spectrum = np.fft.rfft(chunks * taper, int(fft_length))
    return np.abs(spectrum)


# Constants of the mel-scale mapping: mel = Q * ln(1 + hz / break_frequency).
_MEL_BREAK_FREQUENCY_HERTZ = 700.0
_MEL_HIGH_FREQUENCY_Q = 1127.0


def hertz_to_mel(frequencies_hertz):
    """Map frequencies in hertz onto the mel scale.

    Args:
        frequencies_hertz: scalar or np.ndarray of frequencies in Hz.

    Returns:
        Value(s) of the same shape as the input, computed as
        _MEL_HIGH_FREQUENCY_Q * ln(1 + hz / _MEL_BREAK_FREQUENCY_HERTZ).
    """
    relative = frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ
    return _MEL_HIGH_FREQUENCY_Q * np.log(1.0 + relative)


def spectrogram_to_mel_matrix(num_mel_bins=20,
                              num_spectrogram_bins=129,
                              audio_sample_rate=8000,
                              lower_edge_hertz=125.0,
                              upper_edge_hertz=3800.0):
    """Build the matrix that projects linear-frequency bins onto mel bands.

    Each mel band is a triangle (in the mel domain) spanning three
    consecutive band edges that are evenly spaced in mel between
    `lower_edge_hertz` and `upper_edge_hertz`.

    Args:
        num_mel_bins: number of mel bands (columns of the result).
        num_spectrogram_bins: number of linear-frequency bins (rows of the
            result), assumed to cover 0 .. Nyquist evenly.
        audio_sample_rate: sample rate in Hz; Nyquist is half of it.
        lower_edge_hertz: lower edge of the lowest band, in Hz.
        upper_edge_hertz: upper edge of the highest band, in Hz.

    Returns:
        np.ndarray of shape (num_spectrogram_bins, num_mel_bins).

    Raises:
        ValueError: if lower_edge_hertz is negative, is not below
            upper_edge_hertz, or upper_edge_hertz exceeds Nyquist.
    """
    nyquist_hertz = audio_sample_rate / 2.
    if lower_edge_hertz < 0.0:
        raise ValueError("lower_edge_hertz %.1f must be >= 0" % lower_edge_hertz)
    if lower_edge_hertz >= upper_edge_hertz:
        raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" %
                         (lower_edge_hertz, upper_edge_hertz))
    if upper_edge_hertz > nyquist_hertz:
        raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" %
                         (upper_edge_hertz, nyquist_hertz))

    # Mel position of every linear-frequency bin, as a column for broadcasting.
    bin_mels = hertz_to_mel(
        np.linspace(0.0, nyquist_hertz, num_spectrogram_bins))[:, np.newaxis]

    # num_mel_bins + 2 edges: band k uses edges k (lower), k+1 (center), k+2 (upper).
    edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz),
                            hertz_to_mel(upper_edge_hertz), num_mel_bins + 2)
    lower_mel = edges_mel[:-2]
    center_mel = edges_mel[1:-1]
    upper_mel = edges_mel[2:]

    # Rising and falling sides of every triangle, for all bins and bands at
    # once. The line segments are linear in the *mel* domain, not in hertz.
    rising = (bin_mels - lower_mel) / (center_mel - lower_mel)
    falling = (upper_mel - bin_mels) / (upper_mel - center_mel)

    # Intersect the two sides with each other and with zero.
    mel_weights_matrix = np.maximum(0.0, np.minimum(rising, falling))

    # HTK excludes the spectrogram DC bin; make sure it always gets a zero
    # coefficient.
    mel_weights_matrix[0, :] = 0.0
    return mel_weights_matrix


def log_mel_spectrogram(data,
                        audio_sample_rate=8000,
                        log_offset=0.0,
                        window_length_secs=0.025,
                        hop_length_secs=0.010,
                        **kwargs):
    """Convert a waveform into a log-magnitude mel spectrogram.

    Args:
        data: np.ndarray of waveform samples.
        audio_sample_rate: sample rate of `data` in Hz.
        log_offset: constant added to the mel magnitudes before the log.
        window_length_secs: STFT window duration in seconds.
        hop_length_secs: STFT hop duration in seconds.
        **kwargs: forwarded to spectrogram_to_mel_matrix().

    Returns:
        np.ndarray with one row per STFT frame and one column per mel band.
    """
    win_samples = int(round(audio_sample_rate * window_length_secs))
    hop_samples = int(round(audio_sample_rate * hop_length_secs))

    # FFT size: two raised to the ceiling of log2(window length in samples).
    fft_exponent = int(np.ceil(np.log(win_samples) / np.log(2.0)))
    fft_length = 2 ** fft_exponent

    magnitudes = stft_magnitude(
        data,
        fft_length=fft_length,
        hop_length=hop_samples,
        window_length=win_samples)

    mel_matrix = spectrogram_to_mel_matrix(
        num_spectrogram_bins=magnitudes.shape[1],
        audio_sample_rate=audio_sample_rate,
        **kwargs)

    mel_magnitudes = np.dot(magnitudes, mel_matrix)
    return np.log(mel_magnitudes + log_offset)

In [5]:
from __future__ import division
import resampy

In [6]:
def preprocess_sound(data, sample_rate):
    """Turn a raw waveform into a batch of fixed-size log-mel examples.

    Args:
        data: np.ndarray waveform. An input with more than one dimension is
            first averaged across axis 1 (presumably the channel axis --
            confirm against the caller's array layout).
        sample_rate: sample rate of `data` in Hz; the signal is resampled to
            SAMPLE_RATE when it differs.

    Returns:
        np.ndarray of shape (num_examples, frames_per_example, NUM_MEL_BINS),
        where frames_per_example is EXAMPLE_WINDOW_SECONDS expressed in STFT
        hops.
    """
    # Collapse multi-dimensional input to a single 1-D signal.
    if data.ndim > 1:
        data = np.mean(data, axis=1)

    # Bring the signal to the rate the feature parameters are defined for.
    if sample_rate != SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, SAMPLE_RATE)

    # Log-mel features, one row per STFT hop.
    mel_params = dict(
        audio_sample_rate=SAMPLE_RATE,
        log_offset=LOG_OFFSET,
        window_length_secs=STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=NUM_MEL_BINS,
        lower_edge_hertz=MEL_MIN_HZ,
        upper_edge_hertz=MEL_MAX_HZ)
    log_mel = log_mel_spectrogram(data, **mel_params)

    # Group the feature rows into examples; window and hop are converted from
    # seconds to feature rows using the feature rate (rows per second).
    features_sample_rate = 1.0 / STFT_HOP_LENGTH_SECONDS
    rows_per_example = int(round(EXAMPLE_WINDOW_SECONDS * features_sample_rate))
    rows_per_hop = int(round(EXAMPLE_HOP_SECONDS * features_sample_rate))
    return frame(
        log_mel,
        window_length=rows_per_example,
        hop_length=rows_per_hop)

In [82]:
import random

# Build train / test / validation sets of log-mel patches.
# Whole tracks (not individual patches) are assigned to a split, so patches of
# one track never end up in two different splits.
emotion_class = ['Exciting','Fear','Neutral','Relaxation','Sad','Tension']

SPLIT_SEED = 1004     # fixed seed -> the same split on every Restart & Run All
TRAIN_CUTOFF = 0.70   # draw <= 0.70         -> train (~70 %)
TEST_CUTOFF = 0.85    # 0.70 < draw <= 0.85  -> test  (~15 %); the rest -> val (~15 %)

# Private generator so seeding here does not disturb the global `random` state.
split_rng = random.Random(SPLIT_SEED)

# Collect per-track arrays in lists and concatenate once at the end; growing an
# array with np.append inside the loop copies everything on every iteration.
split_patches = {'train': [], 'test': [], 'val': []}
split_labels = {'train': [], 'test': [], 'val': []}

for label, emotion in enumerate(emotion_class):
    target_dir = r'Music_Video_Emotion_Dataset-master/' + emotion + 'mp3'
    # The conversion cell writes a .wav next to every source .mp3 in this same
    # folder. Loading every directory entry would therefore load each track
    # twice (and could place the two copies in different splits), so only the
    # converted WAV files are used. Sorted for a deterministic order.
    files = sorted(name for name in os.listdir(target_dir)
                   if name.lower().endswith('.wav'))
    for music in files:
        scale_file = os.path.join(target_dir, music)
        scale, sr = librosa.load(scale_file)
        # Log-mel patches for this track: shape (num_patches, 96, 64).
        mel_spectrogram = preprocess_sound(scale, sr)

        # Draw ONCE per track. Calling random.random() separately in each
        # branch condition compares against independent numbers and skews the
        # split to roughly 70 / 7.65 / 22.35 instead of 70 / 15 / 15.
        draw = split_rng.random()
        if draw <= TRAIN_CUTOFF:
            split = 'train'
        elif draw <= TEST_CUTOFF:
            split = 'test'
        else:
            split = 'val'

        split_patches[split].append(mel_spectrogram)
        # Every patch of the track carries the track's emotion index.
        split_labels[split].extend([label] * mel_spectrogram.shape[0])

# Leading empty float64 block keeps shape/dtype well-defined even for a split
# that received no track at all.
_empty_patches = np.empty((0, NUM_FRAMES, NUM_BANDS))
X_train, X_test, X_val = (
    np.concatenate([_empty_patches] + split_patches[name], axis=0)
    for name in ('train', 'test', 'val')
)
y_train, y_test, y_val = (
    split_labels[name] for name in ('train', 'test', 'val')
)

In [46]:
from sklearn.model_selection import train_test_split

# NOTE: the former `train_test_split(X, y, test_size=0.2, ...)` call was removed
# from this cell. `X` and `y` are not defined anywhere in this notebook, so the
# cell raised NameError on a fresh "Restart & Run All". Had it run, it would also
# have overwritten X_train / X_test / y_train / y_test from the per-track split
# built in the previous cell with a patch-level shuffle, which lets patches of
# one track appear in both train and test.

In [85]:
# Give every split a trailing axis of length 1: (n, 96, 64) -> (n, 96, 64, 1).
X_train, X_test, X_val = (
    patches.reshape((patches.shape[0], 96, 64, 1))
    for patches in (X_train, X_test, X_val)
)

In [86]:
from keras.utils import np_utils

# One-hot encode the integer emotion labels.
# num_classes is pinned to the number of emotions: without it to_categorical
# infers the width from the largest label present in *that* split, so a split
# that happens to contain no track of the last class gets fewer columns than
# the others (and an empty split fails outright).
num_classes = len(emotion_class)
y_train = np_utils.to_categorical(y_train, num_classes=num_classes)
y_test = np_utils.to_categorical(y_test, num_classes=num_classes)
y_val = np_utils.to_categorical(y_val, num_classes=num_classes)

In [48]:
# Persist every split as an .npz archive with keys 'X' (patches) and 'Y' (labels).
np.savez('music_train.npz',X=X_train,Y=y_train)
np.savez('music_test.npz',X=X_test,Y=y_test)
# The validation split is built, reshaped and one-hot encoded above as well;
# save it too so it is not lost when the kernel shuts down.
np.savez('music_val.npz',X=X_val,Y=y_val)