## Generate video by convolutional LSTM network using Keras

Predicts the next frame of a video that contains moving squares
* Author: Gao Yang
* Note:
    * 40 minutes per epoch on Intel i5 CPU (12000 minutes for 300 epochs, too long!)
    * 2.5 minutes per epoch on Tesla K80 (Floydhub)
    * ? seconds per epoch on Titan (Lab)

#### Set up the environment

In [None]:
from __future__ import division, print_function
import keras
from keras.models import Sequential
from keras.layers.convolutional import Conv3D
from keras.layers.convolutional_recurrent import ConvLSTM2D
from keras.layers.normalization import BatchNormalization

import numpy as np
import matplotlib.pyplot as plt

#### Generate artificial movie
* noisy_movies: x_train
* shifted_movies: y_train
* 3 to 7 moving squares inside
* 4x4 or 6x6 pixels each (half-width of 2 or 3 around the square's position)
* moving linearly
* first create movie with bigger size=80x80 then crop it to 40x40

In [None]:
def _clamped_span(center, half_width):
    """Return the slice [center - half_width, center + half_width) floored at 0.

    A negative slice bound would be interpreted by NumPy as counting from the
    end of the axis, which would draw the square on the opposite side of the
    canvas once it has drifted past the top/left edge. Bounds beyond the far
    edge need no handling because NumPy truncates them.
    """
    return slice(max(center - half_width, 0), max(center + half_width, 0))


def generate_movies(num_sample=1200, num_frame=15):
    """Generate synthetic movies of linearly moving squares.

    Each movie is drawn on an 80x80 canvas and cropped to the central 40x40
    window. Every movie holds 3 to 7 squares with a half-width of 2 or 3
    pixels, each moving by -1, 0 or +1 pixel per frame along each axis.

    Args:
        num_sample: number of movies to generate.
        num_frame: number of frames per movie.

    Returns:
        A tuple ``(noisy_movies, shifted_movies)`` of float64 arrays, both of
        shape ``(num_sample, num_frame, 40, 40, 1)``.

        * ``noisy_movies`` are the inputs: the squares at frame ``t``, where
          on a random half of the (square, frame) pairs a slightly larger
          patch is offset by +0.2 or -0.2. Values >= 1 are clipped to 1.
        * ``shifted_movies`` are the targets: the same squares at their
          frame ``t + 1`` positions, without noise (values are 0 or 1).
    """
    row = 80
    col = 80
    canvas_shape = (num_sample, num_frame, row, col, 1)

    # Inputs and targets must live in two separate buffers; binding both names
    # to one array would make the targets identical to the noisy inputs.
    noisy_movies = np.zeros(canvas_shape, dtype=np.float64)
    shifted_movies = np.zeros(canvas_shape, dtype=np.float64)

    for i in range(num_sample):

        # add squares
        n = np.random.randint(3, 8)

        for j in range(n):
            # initial position
            x_start = np.random.randint(20, 60)
            y_start = np.random.randint(20, 60)

            # direction
            x_direction = np.random.randint(0, 3) - 1  # [-1,0,1]
            y_direction = np.random.randint(0, 3) - 1

            # size (half-width of the square)
            size_square = np.random.randint(2, 4)

            for index_frame in range(num_frame):

                # label the positions of squares on each frame
                x_shift = x_start + x_direction * index_frame
                y_shift = y_start + y_direction * index_frame

                noisy_movies[i, index_frame,
                             _clamped_span(x_shift, size_square),
                             _clamped_span(y_shift, size_square), 0] += 1

                # add noise: a patch one pixel wider than the square on each
                # side is brightened or darkened by 0.2
                if np.random.randint(0, 2):
                    noise_f = (-1) ** np.random.randint(0, 2)
                    noisy_movies[i, index_frame,
                                 _clamped_span(x_shift, size_square + 1),
                                 _clamped_span(y_shift, size_square + 1), 0] += noise_f * 0.2

                # shift the ground truth by 1
                x_shift = x_start + x_direction * (index_frame + 1)
                y_shift = y_start + y_direction * (index_frame + 1)
                shifted_movies[i, index_frame,
                               _clamped_span(x_shift, size_square),
                               _clamped_span(y_shift, size_square), 0] += 1

    # crop to 40x40
    noisy_movies = noisy_movies[:, :, 20:60, 20:60, :]
    shifted_movies = shifted_movies[:, :, 20:60, 20:60, :]
    # overlapping squares accumulate above 1; saturate them
    noisy_movies[noisy_movies >= 1] = 1
    shifted_movies[shifted_movies >= 1] = 1

    return noisy_movies, shifted_movies

# build the synthetic dataset: noisy input movies and their next-frame targets
dataset = generate_movies()
noisy_movies, shifted_movies = dataset

print('Videos are generated.')

#### Build LSTM conv network

In [None]:
model = Sequential()

# Four identical ConvLSTM2D + BatchNormalization stages. Only the first stage
# declares the input shape: (any number of frames, 40, 40, 1 channel).
for stage in range(4):
    first_layer_args = {'input_shape': (None, 40, 40, 1)} if stage == 0 else {}
    model.add(ConvLSTM2D(filters=40,
                         kernel_size=(3, 3),
                         padding='same',
                         return_sequences=True,
                         **first_layer_args))
    model.add(BatchNormalization())

# A 3D convolution with a sigmoid maps the 40 feature maps of every frame
# back to a single channel.
output_layer = Conv3D(filters=1,
                      kernel_size=(3, 3, 3),
                      activation='sigmoid',
                      padding='same',
                      data_format='channels_last')
model.add(output_layer)

model.summary()

#### Train the model on GPU (Floydhub or Local)

In [None]:
model.compile(optimizer='adadelta', loss='binary_crossentropy')

# Only the first 1000 movies are passed to fit(); 5% of those are held out
# by Keras for validation.
train_inputs = noisy_movies[:1000]
train_targets = shifted_movies[:1000]

# Timing note: at 2.5 min per epoch, 72 epochs take 72*2.5=180min=3hr.
fit_options = dict(batch_size=10,
                   epochs=300,
                   verbose=1,
                   validation_split=0.05)
model.fit(train_inputs, train_targets, **fit_options)

#### Save the model (Only execute this cell when using Floydhub)

In [None]:
## training is done
# !mkdir keras_lstm_conv_video
## save weights
# model.save_weights('./keras_lstm_conv_video/model_lstm_conv_weights.h5')
## then download the weights.h5 from floydhub

#### Save the model (Only execute this cell when using Local GPU)

In [None]:
# training is done
import os

# Create the output folder with the standard library instead of the `!mkdir`
# shell magic: this also runs outside IPython and does not complain when the
# folder already exists (e.g. when the cell is executed a second time).
weights_dir = './keras_lstm_conv_video'
if not os.path.isdir(weights_dir):
    os.makedirs(weights_dir)

# save weights
model.save_weights(weights_dir + '/model_lstm_conv_weights.h5')

#### Reload the model (optional)

In [None]:
from keras.models import load_model

# Machine-specific absolute folder that holds the saved weights; the trailing
# slash matters because file names are appended by plain concatenation.
working_pwd = '/Users/Yang/Projects/keras-examples/keras_lstm_conv_video/'

# Restore the trained weights into the already-built model.
weights_file = working_pwd + 'model_lstm_conv_weights.h5'
model.load_weights(weights_file)

#### Test the model on a movie

In [None]:
# index of the movie used for the demonstration
example_mov = 1004

# seed the rollout with the first 7 frames of that movie
track = noisy_movies[example_mov, :7]

for step in range(16):
    # run the model on everything accumulated so far (with a batch axis added)
    predicted_frames = model.predict(track[np.newaxis], verbose=1)
    # keep only the output for the last time step and append it to the track
    next_frame = predicted_frames[:, -1]
    track = np.concatenate((track, next_frame), axis=0)

#### Then compare the predictions with the truth

In [None]:
# Input frames of the example movie, shown for the first two ground-truth panels.
track2 = noisy_movies[example_mov][::, ::, ::, ::]

# NOTE: `working_pwd` is defined in the "Reload the model" cell; run that cell
# (or define the output folder) before this one.
for i in range(15):
    fig = plt.figure(figsize=(10, 5))

    # Left panel: frames 0-6 are the seed frames given to the model,
    # frames 7+ are the frames the model produced.
    ax = fig.add_subplot(121)

    if i >= 7:
        ax.text(1, 3, 'Predictions!', fontsize=20, color='w')
    else:
        ax.text(1, 3, 'Initial Trajactory', fontsize=20, color='w')

    toplot = track[i, :, :, 0]
    ax.imshow(toplot)

    # Right panel: reference frame for the same time step.
    ax = fig.add_subplot(122)
    ax.text(1, 3, 'Ground Truth', fontsize=20, color='w')

    toplot = track2[i, :, :, 0]
    if i >= 2:
        # shifted_movies[i - 1] is the target of frame i - 1, i.e. the squares
        # drawn at their frame-i positions.
        toplot = shifted_movies[example_mov][i - 1, :, :, 0]

    ax.imshow(toplot)
    fig.savefig(working_pwd + '{:03d}_animate.png'.format(i + 1))
    # The message template needs the frame number filled in; without .format()
    # the literal '{:03d}' placeholder was printed.
    print('Image {:03d} : animation finished.'.format(i + 1))