In [None]:
import os
import time

import IPython
import librosa
import matplotlib.pyplot as plt
import norbert
import numpy as np
import pandas as pd
import soundfile as sf
import tensorflow as tf

%matplotlib inline
tf.compat.v1.disable_eager_execution()
# Directory the notebook was started from; later cells build paths relative to it.
loc = os.getcwd()

# GPU-related environment variables that are set before the session is created.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "TF_FORCE_GPU_ALLOW_GROWTH": "true",
})

# TF1-style session whose config requests 99 % of the GPU memory per process.
gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.99)
config = tf.compat.v1.ConfigProto(gpu_options=gpu_options)
sess = tf.compat.v1.Session(config=config)

# Set after the session exists, exactly as before.
os.environ.update({"TF_CUDNN_WORKSPACE_LIMIT_IN_MB": "256"})

In [None]:
def calc_sdr(src, y_pred):
    """Return the signal-to-distortion ratio (SDR) of an estimate, in dB.

    The ratio compares the energy of the reference with the energy of the
    estimation error::

        SDR = 10 * log10( sum(src ** 2) / sum((src - y_pred) ** 2) )

    Parameters
    ----------
    src : array_like
        Reference signal.
    y_pred : array_like
        Estimated signal; must be broadcastable against ``src``.

    Returns
    -------
    float
        SDR in decibels. Higher is better; 0 dB means the error carries as
        much energy as the reference.
    """
    src = np.asarray(src, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # A tiny constant keeps the ratio finite for a silent reference or a
    # perfect estimate (zero error energy).
    eps = np.finfo(float).eps

    signal_energy = np.sum(src ** 2)
    distortion_energy = np.sum((src - y_pred) ** 2)

    return 10 * np.log10((signal_energy + eps) / (distortion_energy + eps))

In [None]:
class Generator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` producing random (mixture, vocals) magnitude-STFT batches.

    Each batch is built from ``batch_size`` randomly chosen songs. A random
    excerpt is cut from the mixture, the same excerpt is cut from the vocals
    file, and both are converted to magnitude spectrograms. Half of the
    excerpts in a batch are additionally passed through ``audio_scale``.

    Parameters
    ----------
    inp_files : sequence of str
        Paths of the mixture audio files.
    out_files : sequence of str
        Paths of the vocals audio files, index-aligned with ``inp_files``.
    batch_size : int
        Number of excerpts per batch.
    steps : int
        Value reported by ``__len__`` (batches per epoch).
    n_songs : int, optional
        Songs are sampled from the first ``n_songs`` entries of the file
        lists. Defaults to ``len(inp_files)``.
    """

    def __init__(self, inp_files, out_files, batch_size, steps, n_songs=None):
        super().__init__()
        self.batch_size = batch_size
        self.inp_files = inp_files
        self.out_files = out_files
        self.steps = steps
        # Fall back to every listed song so the generator can also be built
        # without an explicit song count.
        self.n_songs = len(inp_files) if n_songs is None else n_songs

        # Set the frame length
        self.n_fft = 4096
        # Set the hop length
        self.hop_length = self.n_fft // 4
        # Set the sampling Frequency
        self.sr = 44100
        # Define the number of time frames needed
        self.time = 256
        # Duration of one STFT hop in seconds
        self.dur = self.hop_length / self.sr
        # Duration (seconds) of the audio excerpt that yields `self.time` frames
        self.seconds = np.round(self.time * self.dur, 4)

        # Output STFT feature shape would be
        # Time Steps x Frequency Bins x n_channels

    def __len__(self):
        """Number of batches per epoch."""
        return self.steps

    def audio_scale(self, audio, amplitude):
        """Multiply ``audio`` by ``amplitude`` times its own RMS value.

        NOTE(review): the gain grows with the signal's RMS, and mixture and
        vocals each use their own RMS. If the intent was to normalise to a
        target level, or to apply one common gain to both signals, this needs
        revisiting; confirm before changing.
        """
        scale_factor = amplitude * np.sqrt(np.mean(audio * audio))
        scaled_audio = audio * scale_factor

        return scaled_audio

    def _read_mono(self, path):
        """Read an audio file and return it as a 1-D mono signal."""
        audio, _ = sf.read(path)
        # Single-channel files already come back 1-D; only multi-channel
        # data has a channel axis to average over.
        if audio.ndim > 1:
            audio = np.mean(audio, axis=1)
        return audio

    def _features(self, segment):
        """Magnitude STFT of ``segment`` as (time, frequency) without the top bin."""
        spec = librosa.stft(segment, n_fft=self.n_fft, hop_length=self.hop_length)
        return np.abs(spec.T)[:, :-1]

    def read_data(self, x):
        """Build one batch from the song indices in ``x``.

        Parameters
        ----------
        x : sequence of int
            ``batch_size`` indices into ``inp_files`` / ``out_files``.

        Returns
        -------
        tuple of np.ndarray
            ``(inp, out)``: mixture and vocals features, each with a trailing
            channel axis of size 1.

        Raises
        ------
        ValueError
            If a mixture file is too short to cut an excerpt from.
        """
        inp = []
        out = []

        # Exactly half of the batch (in random positions) gets rescaled.
        augment = np.zeros(self.batch_size)
        augment[self.batch_size // 2:] = 1
        np.random.shuffle(augment)

        # One amplitude per batch entry; values are distinct whenever the pool
        # is large enough, otherwise sampling falls back to replacement.
        gain_pool = np.arange(0.5, 1.25, 0.01)
        gains = np.random.choice(gain_pool, size=self.batch_size,
                                 replace=self.batch_size > len(gain_pool))

        for i in range(self.batch_size):
            # Read the input mixture audio as mono
            mixture = self._read_mono(self.inp_files[x[i]])
            duration = len(mixture) / self.sr

            # Excerpts start on whole seconds and must fit inside the file.
            n_starts = int(duration - self.seconds)
            if n_starts < 1:
                raise ValueError(
                    f"{self.inp_files[x[i]]} is too short for a "
                    f"{self.seconds} s excerpt"
                )
            index = np.random.randint(n_starts)
            start = index * self.sr
            stop = int((index + self.seconds) * self.sr)

            temp = mixture[start:stop]
            if augment[i]:
                temp = self.audio_scale(temp, gains[i])
            # Calculate the STFT features on the go
            inp.append(self._features(temp))

            # Read the same excerpt of the source vocals as the ground truth
            temp = self._read_mono(self.out_files[x[i]])[start:stop]
            if augment[i]:
                temp = self.audio_scale(temp, gains[i])
            out.append(self._features(temp))

        inp = np.expand_dims(np.array(inp), axis=-1)
        out = np.expand_dims(np.array(out), axis=-1)

        return inp, out

    def __getitem__(self, idx):
        """Return a random batch; the requested batch index ``idx`` is ignored."""
        # Distinct songs per batch when possible; with fewer songs than batch
        # entries, songs are allowed to repeat instead of failing.
        songs = np.random.choice(self.n_songs, size=self.batch_size,
                                 replace=self.batch_size > self.n_songs)
        inp, out = self.read_data(songs)

        return inp, out

In [None]:
def get_model(input_shape=(None, 2048, 1)):
    """Build the U-Net style encoder/decoder that maps a spectrogram to a spectrogram.

    Parameters
    ----------
    input_shape : tuple, optional
        ``(time_frames, frequency_bins, channels)`` of one input. The time
        axis defaults to ``None`` so the same network accepts any number of
        frames. Because of the four 2x2 poolings, both spatial sizes must be
        divisible by 16 for the skip concatenations to line up.

    Returns
    -------
    tf.keras.models.Model
        Uncompiled model whose output has a single channel and the same
        spatial size as the input.

    Notes
    -----
    Each encoder block normalises its own output before pooling and before
    it is used as a skip connection. Weights saved from a variant without
    those normalisation layers will not match this topology.
    """
    # The layer classes are taken from tf.keras directly so the function does
    # not depend on names being imported elsewhere in the notebook.
    layers = tf.keras.layers

    weight_decay = 0
    reg = tf.keras.regularizers.l2(weight_decay)

    def conv_bn(tensor, filters):
        """3x3 same-padded ReLU convolution followed by batch normalisation."""
        tensor = layers.Conv2D(filters, (3, 3), activation='relu', padding='same',
                               kernel_regularizer=reg)(tensor)
        return layers.BatchNormalization()(tensor)

    def upsample(tensor, filters):
        """2x2 transposed convolution that doubles both spatial sizes."""
        return layers.Conv2DTranspose(filters, (2, 2), strides=(2, 2),
                                      padding='same')(tensor)

    input_layer = layers.Input(shape=list(input_shape))

    # Encoder: four blocks of (conv-BN, conv-BN, 2x2 max-pool).
    model = conv_bn(input_layer, 32)
    block1 = conv_bn(model, 32)
    pool1 = layers.MaxPooling2D((2, 2), strides=(2, 2))(block1)

    model = conv_bn(pool1, 64)
    block2 = conv_bn(model, 64)
    pool2 = layers.MaxPooling2D((2, 2), strides=(2, 2))(block2)

    model = conv_bn(pool2, 128)
    block3 = conv_bn(model, 128)
    pool3 = layers.MaxPooling2D((2, 2), strides=(2, 2))(block3)

    model = conv_bn(pool3, 256)
    block4 = conv_bn(model, 256)
    pool4 = layers.MaxPooling2D((2, 2), strides=(2, 2))(block4)

    # Bottleneck
    model = conv_bn(pool4, 512)

    # End of Encoder

    # Start of Decoder: upsample, concatenate the matching encoder block,
    # then refine with conv-BN.
    model = upsample(model, 256)
    model = layers.Concatenate()([model, block4])
    model = conv_bn(model, 256)

    model = upsample(model, 128)
    model = layers.Concatenate()([model, block3])
    model = conv_bn(model, 128)

    model = upsample(model, 64)
    # NOTE(review): unlike the other decoder stages, this one has no conv-BN
    # after the concatenation. Left as is; confirm whether that is intended.
    model = layers.Concatenate()([model, block2])

    model = upsample(model, 32)
    model = layers.Concatenate()([model, block1])
    model = conv_bn(model, 32)

    # Linear single-channel output layer.
    out = layers.Conv2D(1, (3, 3), padding='same', kernel_regularizer=reg)(model)
    model = tf.keras.models.Model(input_layer, out)

    return model

In [None]:
# Training hyper-parameters for the DSD100 run.
learning_rate = 1e-03
batch_size = 32
epochs = 100
train_steps = 500
valid_steps = 100

# Dataset layout: one folder per song under Mixtures/ and Sources/.
train_inp_path = r"D:\Voice Datasets\DSD100\Mixtures\Dev"
train_out_path = r"D:\Voice Datasets\DSD100\Sources\Dev"

test_inp_path = r"D:\Voice Datasets\DSD100\Mixtures\Test"
test_out_path = r"D:\Voice Datasets\DSD100\Sources\Test"

train_inp_filenames = []
train_out_filenames = []

test_inp_filenames = []
test_out_filenames = []

# Mixture and source folders are paired by their sorted order.
for i, j in zip(sorted(os.listdir(train_inp_path)), sorted(os.listdir(train_out_path))):
    train_inp_filenames.append(os.path.join(train_inp_path, i, "mixture.wav"))
    train_out_filenames.append(os.path.join(train_out_path, j, "vocals.wav"))

for i, j in zip(sorted(os.listdir(test_inp_path)), sorted(os.listdir(test_out_path))):
    test_inp_filenames.append(os.path.join(test_inp_path, i, "mixture.wav"))
    test_out_filenames.append(os.path.join(test_out_path, j, "vocals.wav"))

# The generator needs the number of songs to sample from; use every listed song.
train_gen = Generator(train_inp_filenames, train_out_filenames, batch_size, train_steps,
                      len(train_inp_filenames))
valid_gen = Generator(test_inp_filenames, test_out_filenames, batch_size, valid_steps,
                      len(test_inp_filenames))

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
# opt = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)

model = get_model()
# summary() prints by itself and returns None, so it is not wrapped in print().
model.summary()
model.compile(opt, loss="mean_squared_error")

# Define the keras call back for model checkpoint
# NOTE(review): the checkpoint paths point at a "Text Summarization" folder;
# confirm that this is where the separation weights should go.
model_checkpoint1 = tf.keras.callbacks.ModelCheckpoint(
    r"D:\NLP Weights\Text Summarization\summarization_valid.h5",
    monitor='val_loss', save_best_only=True, mode='min')

model_checkpoint2 = tf.keras.callbacks.ModelCheckpoint(
    r"D:\NLP Weights\Text Summarization\summarization_train.h5",
    monitor='loss', save_best_only=True, mode='min')

# Built with os.path.join so the string needs no backslash escape.
log_dir = os.path.join(
    "logs", f'Vocal Separation - {time.strftime("%H-%M-%S", time.localtime())}')
tensorboard = tf.compat.v1.keras.callbacks.TensorBoard(log_dir=log_dir, write_grads=True)

# NOTE(review): steps_per_epoch is 100 although train_steps is 500; confirm
# which one is intended.
history = model.fit(train_gen,
                    epochs=epochs,
                    validation_data=valid_gen,
                    steps_per_epoch=100,
                    callbacks=[model_checkpoint1, model_checkpoint2, tensorboard]
                    )

# Persist the per-epoch metrics next to the notebook.
hist_df = pd.DataFrame(history.history)
hist_csv_file = os.path.join(loc, r'history.csv')
hist_df.to_csv(hist_csv_file, index=False)

In [None]:
# To run inference without training first, load a saved model instead:
# model = tf.keras.models.load_model("valid_model.h5")

# Song to separate; change the path to try another track.
path = r"D:\Music\Yaar Azhaippadhu.mp3"

# Load the song at a 44.1 kHz sample rate and offer it for playback.
audio, sr = librosa.load(path, sr=44100)
IPython.display.Audio(data=audio, rate=sr)

In [None]:
# Set the frame length
n_fft = 4096
# Set the hop length
hop_length = n_fft // 4
# Set the sampling Frequency
sr = 44100
# Number of STFT frames fed to the model per chunk. Deliberately not called
# `time`, so the `time` module imported at the top of the notebook stays usable.
n_frames = 256
# Duration of one STFT hop in seconds
dur = hop_length / sr
# Duration (seconds) of the audio chunk that yields `n_frames` frames
seconds = np.round(n_frames * dur, 4)
chunk = int(sr * seconds)

final_out = []
# Walk over the song in full, non-overlapping chunks; a trailing partial
# chunk is skipped because it would not give the expected number of frames.
for start in range(0, len(audio) - chunk + 1, chunk):
    segment = audio[start:start + chunk]
    inp = np.abs(librosa.stft(segment, n_fft=n_fft, hop_length=hop_length).T)[:, :-1]
    y_pred = model.predict(np.expand_dims(inp, axis=(0, -1)))[0, :, :, 0]

    # The prediction is used as a magnitude, so negative values are clipped.
    magnitude = np.maximum(y_pred.T, 0)
    # Put back the frequency bin dropped above (as zeros) so Griffin-Lim sees
    # n_fft // 2 + 1 bins and infers the same n_fft that produced the features.
    magnitude = np.pad(magnitude, ((0, 1), (0, 0)), mode="constant")
    # `length` keeps each reconstructed chunk as long as its input chunk, so
    # the concatenated output stays aligned with `audio`.
    final_out.append(
        librosa.griffinlim(magnitude, hop_length=hop_length, length=len(segment))
    )

if not final_out:
    raise ValueError(
        f"audio is shorter than one {seconds} s chunk; nothing to separate"
    )
final_out = np.concatenate(final_out)

In [None]:
# Listen to the Griffin-Lim reconstruction of the predicted vocals.
IPython.display.Audio(data=final_out, rate=sr)

In [None]:
# norbert expects (frames, bins, channels) for the mixture and
# (frames, bins, channels, sources) for the source spectrograms, whereas
# librosa returns (bins, frames); hence the transposes.
x = librosa.stft(audio[:len(final_out)]).T[..., np.newaxis].astype(np.complex128)
v = np.abs(librosa.stft(final_out)).T[..., np.newaxis, np.newaxis]

# With a single source the Wiener mask is ~1 everywhere and the "estimate"
# is just the mixture. Adding a residual (accompaniment) source gives the
# filter something to separate the vocals from.
v = norbert.residual_model(v, x)

y = norbert.wiener(v, x)
# y = norbert.softmask(v, x, logit=0.4)

# Source 0 is the vocals; transpose back to librosa's (bins, frames) layout.
estimate = librosa.istft(y[:, :, 0, 0].T)

In [None]:
# Listen to the Wiener-filtered vocal estimate.
IPython.display.Audio(data=estimate, rate=sr)

In [None]:
# Hyper-parameters for the MUSDB 18 run.
learning_rate = 1e-04
batch_size = 2
epochs = 500
train_steps = 500
valid_steps = 100

# Number of songs each generator samples from.
train_songs = 100
valid_songs = 50

# Dataset folders live next to the notebook; each song has its own sub-folder.
train_path = os.path.join(loc, r"MUSDB 18\train")
test_path = os.path.join(loc, r"MUSDB 18\test")

# train_path = r"D:\Downloads\MUSDB 18\train"
# test_path = r"D:\Downloads\MUSDB 18\test"

train_dirs = [os.path.join(train_path, name) for name in sorted(os.listdir(train_path))]
test_dirs = [os.path.join(test_path, name) for name in sorted(os.listdir(test_path))]

# Mixture / vocals file lists, index-aligned per song folder.
train_inp_filenames = [os.path.join(song, "mixture.wav") for song in train_dirs]
train_out_filenames = [os.path.join(song, "vocals.wav") for song in train_dirs]

test_inp_filenames = [os.path.join(song, "mixture.wav") for song in test_dirs]
test_out_filenames = [os.path.join(song, "vocals.wav") for song in test_dirs]

train_gen = Generator(train_inp_filenames, train_out_filenames,
                      batch_size, train_steps, train_songs)
valid_gen = Generator(test_inp_filenames, test_out_filenames,
                      batch_size, valid_steps, valid_songs)

In [None]:
sr = 44100
normalize = False
x, y = valid_gen[0]


def _spectrogram_to_audio(magnitude):
    """Turn one (frames, bins, 1) magnitude spectrogram into a playable waveform.

    The generator yields spectrograms, not audio, and ``IPython.display.Audio``
    only accepts 1-D or 2-D sample arrays, so the spectrogram is inverted with
    Griffin-Lim first.
    """
    # Drop the channel axis and go to librosa's (bins, frames) layout, then
    # re-append the frequency bin the generator removed so Griffin-Lim infers
    # the generator's n_fft.
    spec = np.pad(magnitude[..., 0].T, ((0, 1), (0, 0)), mode="constant")
    wave = librosa.griffinlim(spec, hop_length=valid_gen.hop_length)
    # With normalize=False the player requires samples within [-1, 1].
    return np.clip(wave, -1.0, 1.0)


for i, j in zip(x, y):
    # NOTE(review): this compares the mixture spectrogram (i) with the vocals
    # spectrogram (j); confirm that this is the intended pair for the metric.
    sdr = calc_sdr(i, j)
    print(f"SDR : {sdr} dB")
    IPython.display.display(
        IPython.display.Audio(_spectrogram_to_audio(i), rate=sr, normalize=normalize))
    IPython.display.display(
        IPython.display.Audio(_spectrogram_to_audio(j), rate=sr, normalize=normalize))
    print("\n")

In [None]:
# Draw one training batch: x holds the generator's inputs, y its targets.
x, y = train_gen[0]

In [None]:
# Show, for every batch entry, input (left) and target (right) side by side,
# restricted to frequency bins 1024 and above.
for sample in range(len(x)):
    fig, (ax_input, ax_target) = plt.subplots(1, 2, figsize=(15, 9))
    ax_input.imshow(x[sample, :, 1024:, 0])
    ax_target.imshow(y[sample, :, 1024:, 0])
    plt.show()