In [None]:
# Mount Google Drive at /content/gdrive/ so the archives and dataset paths
# referenced by the following cells are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive/')

In [2]:
!unzip -q ./gdrive/My\ Drive/DL2/DL_AV_sep/DL_AV_sep.zip

In [None]:
!unzip -q ./gdrive/My\ Drive/DL2/DL_AV_sep/data.zip

In [None]:
# Install the project's Python dependencies. requirements.txt is resolved relative
# to the current working directory (presumably extracted from DL_AV_sep.zip — confirm).
!pip install -r requirements.txt

In [None]:
# Install the project in development mode via the setup.py in the current working
# directory (presumably this is what makes the `av_sep` imports below resolve — confirm).
!python setup.py develop

In [7]:
# Root data directory on the mounted Drive. Keep the trailing slash: later cells
# build sub-paths by plain string concatenation (e.g. rootpath + 'audio').
rootpath = "/content/gdrive/My Drive/DL2/DL_AV_sep/data/"

In [None]:
!pip install youtube-dl
!apt install ffmpeg
!apt -qq install -y sox

In [None]:
import os
from av_sep.preprocessing.audio.audio_downloader import AudioDownloader
from av_sep.preprocessing.audio.audio_norm import AudioNorm
from av_sep.preprocessing.audio.build_audio_database import AudioDatabaseBuilder

from av_sep.preprocessing.av_log.gentxtnew import GenLogs

import av_sep.preprocessing.lib.AVHandler as avh

from av_sep.preprocessing.video.video_download import VideoDownloader
from av_sep.preprocessing.video.MTCNN_detect import MtcnnDetector
from av_sep.preprocessing.video.frame_inspector import FrameInspector


In [None]:
    # Settings handed unchanged to the av_sep preprocessing stages below.
    av_range = (0, 1000)
    max_num_sample = 50000

    # Output folders for the downloaded audio and video.
    for subdir in ('audio', 'video'):
        avh.mkdir(rootpath + subdir)

    # Stages sharing the (rootpath, av_range) call signature, in execution order:
    # download audio -> normalize audio -> download video ->
    # detect and crop faces (MTCNN) -> inspect frames.
    range_stages = (
        AudioDownloader,
        AudioNorm,
        VideoDownloader,
        MtcnnDetector,
        FrameInspector,
    )
    for stage in range_stages:
        stage.run(rootpath, av_range)

    # Create the audio database.
    AudioDatabaseBuilder.run(rootpath, av_range, max_num_sample)

    # Generate the log files read by the data generator.
    GenLogs.run(rootpath)

In [None]:
# NOTE(review): presumably loads and exercises a pretrained model using files under
# `rootpath` — confirm against av_sep.models.pretrain_model.pretrain_load_test.
from av_sep.models.pretrain_model.pretrain_load_test import LoadPretrained

LoadPretrained.run(rootpath)

In [5]:
from keras.models import Sequential
from keras import optimizers
from keras.layers import Input, Dense, Convolution2D, Bidirectional, concatenate
from keras.layers import Flatten, BatchNormalization, ReLU, Reshape, Lambda, TimeDistributed
from keras.models import Model, load_model
from keras.layers.recurrent import LSTM
from keras.initializers import he_normal, glorot_uniform
from keras.callbacks import ModelCheckpoint, LearningRateScheduler
from keras.callbacks import TensorBoard
import tensorflow as tf
import os
from av_sep.models.lib.MyGenerator import AVGenerator


def AV_model(people_num=2):
    """Build the (uncompiled) audio-visual separation network.

    An audio stream (15 dilated Conv2D -> BatchNorm -> ReLU blocks) and a
    shared video stream (6 such blocks, applied once per speaker) are
    concatenated along the feature axis, passed through a bidirectional LSTM
    and three dense layers, and projected to one mask per speaker.

    Intermediate tensor shapes are printed while the graph is built.

    Args:
        people_num: number of speakers. Sets the size of the last axis of the
            video input and of the predicted mask.

    Returns:
        A Keras ``Model`` taking ``[audio_input, video_input]`` with shapes
        ``(298, 257, 2)`` and ``(75, 1, 1792, people_num)`` (batch axis
        omitted) and producing a tensor of shape
        ``(298, 257, 2, people_num)``. The model is returned uncompiled; the
        caller chooses the optimizer and loss.
    """
    def UpSampling2DBilinear(size):
        # Resizes the two middle axes of a 4-D tensor to `size` using
        # tf.image.resize with its default method.
        return Lambda(lambda x: tf.image.resize(x, size))

    def sliced(x, index):
        # Select one entry along the last axis (one speaker of the video input).
        return x[..., index]

    def conv_bn_relu(x, filters, kernel_size, dilation_rate, name):
        # One Conv2D -> BatchNormalization -> ReLU unit. Only the convolution is
        # explicitly named; BN/ReLU keep Keras' automatic names, so the creation
        # order (conv, bn, relu) must stay as it is.
        x = Convolution2D(filters, kernel_size=kernel_size, strides=(1, 1), padding='same',
                          dilation_rate=dilation_rate, name=name)(x)
        x = BatchNormalization()(x)
        return ReLU()(x)

    # --------------------------- AS start ---------------------------
    # (filters, kernel_size, dilation_rate) for as_conv1 .. as_conv15.
    audio_conv_specs = [
        (96, (1, 7), (1, 1)),
        (96, (7, 1), (1, 1)),
        (96, (5, 5), (1, 1)),
        (96, (5, 5), (2, 1)),
        (96, (5, 5), (4, 1)),
        (96, (5, 5), (8, 1)),
        (96, (5, 5), (16, 1)),
        (96, (5, 5), (32, 1)),
        (96, (5, 5), (1, 1)),
        (96, (5, 5), (2, 2)),
        (96, (5, 5), (4, 4)),
        (96, (5, 5), (8, 8)),
        (96, (5, 5), (16, 16)),
        (96, (5, 5), (32, 32)),
        (8, (1, 1), (1, 1)),
    ]

    audio_input = Input(shape=(298, 257, 2))
    print('as_0:', audio_input.shape)

    audio_features = audio_input
    for idx, (filters, kernel_size, dilation_rate) in enumerate(audio_conv_specs, start=1):
        audio_features = conv_bn_relu(audio_features, filters, kernel_size, dilation_rate,
                                      name='as_conv%d' % idx)
        print('as_%d:' % idx, audio_features.shape)

    # Fold the 257 x 8 (frequency x channel) axes into one feature axis per time step.
    AS_out = Reshape((298, 8 * 257))(audio_features)
    print('AS_out:', AS_out.shape)
    # --------------------------- AS end ---------------------------

    # --------------------------- VS_model start ---------------------------
    # (kernel_size, dilation_rate) for vs_conv1 .. vs_conv6; every layer has 512 filters.
    video_conv_specs = [
        ((7, 1), (1, 1)),
        ((5, 1), (1, 1)),
        ((5, 1), (2, 1)),
        ((5, 1), (4, 1)),
        ((5, 1), (8, 1)),
        ((5, 1), (16, 1)),
    ]

    # A single Sequential instance, so its weights are shared across speakers.
    VS_model = Sequential()
    for idx, (kernel_size, dilation_rate) in enumerate(video_conv_specs, start=1):
        VS_model.add(Convolution2D(512, kernel_size=kernel_size, strides=(1, 1), padding='same',
                                   dilation_rate=dilation_rate, name='vs_conv%d' % idx))
        VS_model.add(BatchNormalization())
        VS_model.add(ReLU())
    # Stretch the 75 video steps to the 298 audio steps: treat (75, 512) as a
    # one-channel image, resize it, then drop the channel axis again.
    VS_model.add(Reshape((75, 512, 1)))
    VS_model.add(UpSampling2DBilinear((298, 512)))
    VS_model.add(Reshape((298, 512)))
    # --------------------------- VS_model end ---------------------------

    video_input = Input(shape=(75, 1, 1792, people_num))
    AVfusion_list = [AS_out]
    for i in range(people_num):
        single_input = Lambda(sliced, arguments={'index': i})(video_input)
        AVfusion_list.append(VS_model(single_input))

    # Concatenate audio and per-speaker video features along the feature axis.
    AVfusion = concatenate(AVfusion_list, axis=2)
    AVfusion = TimeDistributed(Flatten())(AVfusion)
    print('AVfusion:', AVfusion.shape)

    # NOTE(review): input_shape=(298, 8 * 257) is the audio-only width, while the
    # fused tensor also carries 512 features per speaker. Kept as in the original;
    # it appears to have no effect here — confirm before removing.
    lstm = Bidirectional(LSTM(1024, input_shape=(298, 8 * 257), return_sequences=True), merge_mode='sum')(AVfusion)
    print('lstm:', lstm.shape)

    # Three 1024-unit ReLU layers, each with its own fixed initializer seed.
    dense_out = lstm
    for idx, seed in enumerate((27, 42, 65), start=1):
        dense_out = Dense(1024, name='fc%d' % idx, activation='relu',
                          kernel_initializer=he_normal(seed=seed))(dense_out)
        print('fc%d:' % idx, dense_out.shape)

    # Linear projection to 257 * 2 values per speaker and time step.
    complex_mask = Dense(257 * 2 * people_num, name="complex_mask", kernel_initializer=glorot_uniform(seed=87))(dense_out)
    print('complex_mask:', complex_mask.shape)

    complex_mask_out = Reshape((298, 257, 2, people_num))(complex_mask)
    print('complex_mask_out:', complex_mask_out.shape)

    # Returned uncompiled on purpose: the training code compiles it with its own loss.
    model = Model(inputs=[audio_input, video_input], outputs=complex_mask_out)
    return model



Using TensorFlow backend.


In [None]:
from av_sep.models.lib import model_ops
from keras.callbacks import ModelCheckpoint, LearningRateScheduler
from keras.models import load_model
from av_sep.models.lib.MyGenerator import AVGenerator
from keras.callbacks import TensorBoard
from keras import optimizers
import os
from av_sep.models.lib.model_loss import audio_discriminate_loss2 as audio_loss
import tensorflow as tf

# Alias of the notebook-level `rootpath`; the model and database paths below are derived from it.
ROOTPATH = rootpath

#############################################################
# automatically change lr
def scheduler(epoch):
    """Return the learning rate to use for ``epoch`` (step decay).

    The base rate is 1e-5. From epoch 5 on it is divided by 5, and from
    epoch 10 on it is divided by 10 (relative to the base rate, not cumulatively).
    """
    base_lr = 0.00001
    if epoch >= 10:
        return base_lr / 10
    if epoch >= 5:
        return base_lr / 5
    return base_lr

# create AV model
#############################################################
RESTORE = False
# If set to True, training continues from the latest checkpoint found in `path`;
# `initial_epoch` is then read from that checkpoint's file name.
# Remember to raise `epochs` if the previous run already reached it.

# hyperparameters
people_num = 2
epochs = 50
initial_epoch = 0
batch_size = 4  # 4 to feed one 16G GPU
gamma_loss = 0.1
beta_loss = gamma_loss*2

# physical devices option to accelerate training process
workers = 1 # num of core
use_multiprocessing = False
NUM_GPU = 1

# PATH
path = ROOTPATH + 'saved_AV_models'  # model path
database_dir_path = ROOTPATH
#############################################################

# create folder to save models
if not os.path.exists(path):
    os.makedirs(path)
    print('create folder to save models')
# Checkpoint name layout: AVmodel-<people>p-<epoch>-<val_loss>.h5
filepath = path + "/AVmodel-" + str(people_num) + "p-{epoch:03d}-{val_loss:.5f}.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# The scheduler sets the learning rate at the start of every epoch, so the
# optimizer's default learning rate below is not the one actually used.
rlr = LearningRateScheduler(scheduler, verbose=1)
#############################################################
# read train and val file name
# format: mix.npy single.npy single.npy
with open((database_dir_path+'audio/AVdataset_train.txt'), 'r') as t:
    trainfile = t.readlines()
with open((database_dir_path+'audio/AVdataset_val.txt'), 'r') as v:
    valfile = v.readlines()
# ///////////////////////////////////////////////////////// #

# Build or restore the network. It is bound to `av_model` (not `AV_model`) so the
# factory function is not overwritten and this cell can be re-run.
# NOTE(review): any later cell that relied on `AV_model` being the model instance
# should use `av_model` instead.
if RESTORE:
    latest_file = model_ops.latest_file(path+'/')
    # compile=False: the checkpoint was trained with a custom loss that is not in
    # custom_objects, and the model is compiled again right below anyway.
    av_model = load_model(latest_file, custom_objects={"tf": tf}, compile=False)
    # Field 2 of the base name is the epoch. Counting from the front of the base
    # name keeps this correct when val_loss is negative (which adds an extra '-')
    # or when a directory name contains '-'.
    info = os.path.basename(latest_file.strip()).split('-')
    initial_epoch = int(info[2])
else:
    av_model = AV_model(people_num)

train_generator = AVGenerator(trainfile,database_dir_path= database_dir_path, batch_size=batch_size, shuffle=True)
val_generator = AVGenerator(valfile,database_dir_path=database_dir_path, batch_size=batch_size, shuffle=True)

# With more than one GPU the model is wrapped by the project's multi-GPU helper;
# otherwise the model itself is trained. Everything after this line is shared.
if NUM_GPU > 1:
    training_model = model_ops.ModelMGPU(av_model, NUM_GPU)
else:
    training_model = av_model

adam = optimizers.Adam()
loss = audio_loss(gamma=gamma_loss, beta=beta_loss, num_speaker=people_num)
training_model.compile(optimizer=adam, loss=loss)

# summary() prints the table itself and returns None, so it is not wrapped in print().
av_model.summary()

training_model.fit_generator(generator=train_generator,
                             validation_data=val_generator,
                             epochs=epochs,
                             workers=workers,
                             use_multiprocessing=use_multiprocessing,
                             callbacks=[TensorBoard(log_dir='./log_AV'), checkpoint, rlr],
                             initial_epoch=initial_epoch
                             )


as_0: (None, 298, 257, 2)
as_1: (None, 298, 257, 96)
as_2: (None, 298, 257, 96)
as_3: (None, 298, 257, 96)
as_4: (None, 298, 257, 96)
as_5: (None, 298, 257, 96)
as_6: (None, 298, 257, 96)
as_7: (None, 298, 257, 96)
as_8: (None, 298, 257, 96)
as_9: (None, 298, 257, 96)
as_10: (None, 298, 257, 96)
as_11: (None, 298, 257, 96)
as_12: (None, 298, 257, 96)
as_13: (None, 298, 257, 96)
as_14: (None, 298, 257, 96)
as_15: (None, 298, 257, 8)
AS_out: (None, 298, 2056)
AVfusion: (None, 298, 3080)
lstm: (None, 298, 1024)
fc1: (None, 298, 1024)
fc2: (None, 298, 1024)
fc3: (None, 298, 1024)
complex_mask: (None, 298, 1028)
complex_mask_out: (None, 298, 257, 2, 2)
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 298, 257, 2)  0                                            
______________________________




Epoch 00001: val_loss improved from inf to 0.42258, saving model to /content/gdrive/My Drive/DL2/DL_AV_sep/data/saved_AV_models/AVmodel-2p-001-0.42258.h5
Epoch 2/50

Epoch 00002: LearningRateScheduler setting learning rate to 1e-05.

Epoch 00002: val_loss did not improve from 0.42258
Epoch 3/50

Epoch 00003: LearningRateScheduler setting learning rate to 1e-05.

Epoch 00003: val_loss did not improve from 0.42258
Epoch 4/50

Epoch 00004: LearningRateScheduler setting learning rate to 1e-05.

Epoch 00004: val_loss improved from 0.42258 to 0.35225, saving model to /content/gdrive/My Drive/DL2/DL_AV_sep/data/saved_AV_models/AVmodel-2p-004-0.35225.h5
Epoch 5/50

Epoch 00005: LearningRateScheduler setting learning rate to 1e-05.

Epoch 00005: val_loss improved from 0.35225 to 0.34394, saving model to /content/gdrive/My Drive/DL2/DL_AV_sep/data/saved_AV_models/AVmodel-2p-005-0.34394.h5
Epoch 6/50

Epoch 00006: LearningRateScheduler setting learning rate to 2.0000000000000003e-06.

Epoch 0000