Commit f7ad20b · fanlijie committed on Apr 2, 2018 · 0 parents
Showing 15 changed files with 1,107 additions and 0 deletions.
README.md
# End-to-End Learning of Motion Representation for Video Understanding

This repository contains implementation code for the project 'End-to-End Learning of Motion Representation for Video Understanding' (CVPR 2018).

Project page: http://lijiefan.me/project_webpage/TVNet_cvpr/index.html
## Prerequisites
#### TensorFlow
We use TensorFlow (https://www.tensorflow.org) for our implementation.

#### MATLAB (optional)
TVNet results are saved as `.mat` files, and MATLAB is used for their [visualization](http://sintel.is.tue.mpg.de).
## Installation
Our current release has been tested on Ubuntu 16.04.

#### Clone the repository
```
git clone https://github.com/LijieFan/tvnet.git
```
## Steps to run

#### I) Put input frames in `frame/img1.png` and `frame/img2.png`.

#### II) Use TVNet to generate motion representation

The demo script (`demo.py`) has the following options:
- `--scale`: number of scales in TVNet (default: 1)
- `--warp`: number of warpings in TVNet (default: 1)
- `--iteration`: number of iterations in TVNet (default: 50)
- `--gpu`: the gpu to run on (0-indexed, -1 for CPU)

Sample usages include
- Generate motion representation for the frames in `frame/img1.png` and `frame/img2.png`:

```
python demo.py --scale 1 --warp 1 --iteration 50 --gpu 1
```
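- Run the demo on CPU, using the `--gpu` option documented above:

```
python demo.py --scale 1 --warp 1 --iteration 50 --gpu -1
```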
#### III) Check results and visualization

- TVNet results are saved in `result/result.mat`.
- Use the MPI-Sintel toolbox for result visualization: in MATLAB, run `visualize/visualize.m`.
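If MATLAB is not available, the saved flow can also be rendered directly in Python. Below is a minimal sketch of our own (not part of this repository), assuming NumPy, SciPy and Matplotlib are installed; the `'flow'` key matches what `demo.py` writes:

```python
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
from matplotlib.colors import hsv_to_rgb

# Load the flow saved by demo.py: shape (h, w, 2) with u1, u2 stacked.
flow = sio.loadmat('result/result.mat')['flow']
u, v = flow[..., 0], flow[..., 1]

# Standard optical-flow rendering: hue encodes direction, brightness magnitude.
mag = np.sqrt(u ** 2 + v ** 2)
hue = (np.arctan2(v, u) + np.pi) / (2 * np.pi)
val = mag / (mag.max() + 1e-8)
plt.imshow(hsv_to_rgb(np.stack([hue, np.ones_like(hue), val], axis=-1)))
plt.axis('off')
plt.show()
```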
## Sample input & output

<table>
<tr>
<td><img src="frame/img1.png" height="160"></td>
<td><img src="frame/img2.png" height="160"></td>
<td><img src="result/result.png" height="160"></td>
</tr>
</table>
## Reference

Lijie Fan, Wenbing Huang, Chuang Gan, Stefano Ermon, Boqing Gong, Junzhou Huang. "End-to-End Learning of Motion Representation for Video Understanding." CVPR 2018.
demo.py
import os
import cv2
import numpy as np
import tensorflow as tf
import scipy.io as sio
from tvnet import TVNet

flags = tf.app.flags
flags.DEFINE_integer("scale", 5, "TVNet scale [5]")
flags.DEFINE_integer("warp", 5, "TVNet warp [5]")
flags.DEFINE_integer("iteration", 50, "TVNet iteration [50]")
flags.DEFINE_string("gpu", '0', "gpu to use [0]")
FLAGS = flags.FLAGS

scale = FLAGS.scale
warp = FLAGS.warp
iteration = FLAGS.iteration
if int(FLAGS.gpu) > -1:
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu
else:
    os.environ['CUDA_VISIBLE_DEVICES'] = ''  # hide all GPUs: run on CPU

print('TVNet Params:\n scale: %d\n warp: %d\n iteration: %d\nUsing gpu: %s'
      % (scale, warp, iteration, FLAGS.gpu))

# load the input frame pair
img1 = cv2.imread('frame/img1.png')
img2 = cv2.imread('frame/img2.png')
h, w, c = img1.shape

# build the TVNet graph; u1 and u2 are the two flow components
x1 = tf.placeholder(shape=[1, h, w, 3], dtype=tf.float32)
x2 = tf.placeholder(shape=[1, h, w, 3], dtype=tf.float32)
tvnet = TVNet()
u1, u2, rho = tvnet.tvnet_flow(x1, x2, max_scales=scale,
                               warps=warp,
                               max_iterations=iteration)

# initialize the session
sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True),
                                        allow_soft_placement=True))
sess.run(tf.global_variables_initializer())

# run the model on the frame pair
u1_np, u2_np = sess.run([u1, u2], feed_dict={x1: img1[np.newaxis, ...], x2: img2[np.newaxis, ...]})

u1_np = np.squeeze(u1_np)
u2_np = np.squeeze(u2_np)
flow_mat = np.zeros([h, w, 2])
flow_mat[:, :, 0] = u1_np
flow_mat[:, :, 1] = u2_np

# save the estimated flow as a .mat file for visualization
if not os.path.exists('result'):
    os.mkdir('result')
res_path = os.path.join('result', 'result.mat')
sio.savemat(res_path, {'flow': flow_mat})
(Binary files in this commit are not shown.)
spatial_transformer.py
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import tensorflow as tf
def transformer(U, theta, out_size, name='SpatialTransformer', **kwargs):
    """Spatial Transformer Layer

    Implements a spatial transformer layer as described in [1]_.
    Based on [2]_ and edited by David Dao for Tensorflow.

    Parameters
    ----------
    U : float
        The output of a convolutional net should have the
        shape [num_batch, height, width, num_channels].
    theta: float
        In this modified version, a per-pixel offset field of shape
        [num_batch, 2, height * width] that is added to the sampling grid
        (the original affine layer expected a localisation network output
        of shape [num_batch, 6]).
    out_size: tuple of two ints
        The size of the output of the network (height, width)

    References
    ----------
    .. [1] Spatial Transformer Networks
           Max Jaderberg, Karen Simonyan, Andrew Zisserman, Koray Kavukcuoglu
           Submitted on 5 Jun 2015
    .. [2] https://github.com/skaae/transformer_network/blob/master/transformerlayer.py

    Notes
    -----
    To initialize the original affine network to the identity transform,
    init ``theta`` to:
        identity = np.array([[1., 0., 0.],
                             [0., 1., 0.]])
        identity = identity.flatten()
        theta = tf.Variable(initial_value=identity)
    """

    def _repeat(x, n_repeats):
        # Repeat each element of x n_repeats times,
        # e.g. _repeat([0, 1], 3) -> [0, 0, 0, 1, 1, 1].
        with tf.variable_scope('_repeat'):
            rep = tf.transpose(
                tf.expand_dims(tf.ones(shape=tf.stack([n_repeats, ])), 1), [1, 0])
            rep = tf.cast(rep, 'int32')
            x = tf.matmul(tf.reshape(x, (-1, 1)), rep)
            return tf.reshape(x, [-1])

    def _interpolate(im, x, y, out_size):
        with tf.variable_scope('_interpolate'):
            # constants
            num_batch = tf.shape(im)[0]
            height = tf.shape(im)[1]
            width = tf.shape(im)[2]
            channels = tf.shape(im)[3]

            x = tf.cast(x, 'float32')
            y = tf.cast(y, 'float32')
            height_f = tf.cast(height, 'float32')
            width_f = tf.cast(width, 'float32')
            out_height = out_size[0]
            out_width = out_size[1]
            zero = tf.zeros([], dtype='int32')
            max_y = tf.cast(tf.shape(im)[1] - 1, 'int32')
            max_x = tf.cast(tf.shape(im)[2] - 1, 'int32')

            # scale indices from [-1, 1] to [0, width/height - 1]
            x = (x + 1.0) * (width_f - 1) / 2.0
            y = (y + 1.0) * (height_f - 1) / 2.0

            # do sampling
            x0 = tf.cast(tf.floor(x), 'int32')
            x1 = x0 + 1
            y0 = tf.cast(tf.floor(y), 'int32')
            y1 = y0 + 1

            x0 = tf.clip_by_value(x0, zero, max_x - 1)
            x1 = tf.clip_by_value(x1, zero, max_x)
            y0 = tf.clip_by_value(y0, zero, max_y - 1)
            y1 = tf.clip_by_value(y1, zero, max_y)
            dim2 = width
            dim1 = width * height
            base = _repeat(tf.range(num_batch) * dim1, out_height * out_width)
            base_y0 = base + y0 * dim2
            base_y1 = base + y1 * dim2
            idx_a = base_y0 + x0
            idx_b = base_y1 + x0
            idx_c = base_y0 + x1
            idx_d = base_y1 + x1

            # use indices to look up pixels in the flat image and restore
            # the channel dimension
            im_flat = tf.reshape(im, tf.stack([-1, channels]))
            im_flat = tf.cast(im_flat, 'float32')
            Ia = tf.gather(im_flat, idx_a)
            Ib = tf.gather(im_flat, idx_b)
            Ic = tf.gather(im_flat, idx_c)
            Id = tf.gather(im_flat, idx_d)

            # and finally calculate interpolated values (bilinear weights)
            x0_f = tf.cast(x0, 'float32')
            x1_f = tf.cast(x1, 'float32')
            y0_f = tf.cast(y0, 'float32')
            y1_f = tf.cast(y1, 'float32')
            wa = tf.expand_dims(((x1_f - x) * (y1_f - y)), 1)
            wb = tf.expand_dims(((x1_f - x) * (y - y0_f)), 1)
            wc = tf.expand_dims(((x - x0_f) * (y1_f - y)), 1)
            wd = tf.expand_dims(((x - x0_f) * (y - y0_f)), 1)
            output = tf.add_n([wa * Ia, wb * Ib, wc * Ic, wd * Id])
            return output

    def _meshgrid(height, width):
        with tf.variable_scope('_meshgrid'):
            # This should be equivalent to:
            #  x_t, y_t = np.meshgrid(np.linspace(-1, 1, width),
            #                         np.linspace(-1, 1, height))
            #  ones = np.ones(np.prod(x_t.shape))
            #  grid = np.vstack([x_t.flatten(), y_t.flatten(), ones])
            x_t = tf.matmul(tf.ones(shape=tf.stack([height, 1])),
                            tf.transpose(tf.expand_dims(tf.linspace(-1.0, 1.0, width), 1), [1, 0]))
            y_t = tf.matmul(tf.expand_dims(tf.linspace(-1.0, 1.0, height), 1),
                            tf.ones(shape=tf.stack([1, width])))

            x_t_flat = tf.reshape(x_t, (1, -1))
            y_t_flat = tf.reshape(y_t, (1, -1))

            # The homogeneous row of ones is dropped here: the grid holds
            # only (x, y) coordinates, since theta is used as an additive
            # offset field rather than an affine matrix.
            # ones = tf.ones_like(x_t_flat)
            # grid = tf.concat(axis=0, values=[x_t_flat, y_t_flat, ones])
            grid = tf.concat(axis=0, values=[x_t_flat, y_t_flat])
            return grid

    def _transform(theta, input_dim, out_size):
        with tf.variable_scope('_transform'):
            num_batch = tf.shape(input_dim)[0]
            height = tf.shape(input_dim)[1]
            width = tf.shape(input_dim)[2]
            num_channels = tf.shape(input_dim)[3]
            # theta = tf.reshape(theta, (-1, 2, 3))
            theta = tf.cast(theta, 'float32')

            # base grid of (x_t, y_t) coordinates, cf. eq (1) in ref [1]
            height_f = tf.cast(height, 'float32')
            width_f = tf.cast(width, 'float32')
            out_height = out_size[0]
            out_width = out_size[1]
            grid = _meshgrid(out_height, out_width)
            grid = tf.expand_dims(grid, 0)
            grid = tf.reshape(grid, [-1])
            grid = tf.tile(grid, tf.stack([num_batch]))
            # grid = tf.reshape(grid, tf.stack([num_batch, 3, -1]))
            grid = tf.reshape(grid, tf.stack([num_batch, 2, -1]))

            # Sampling coordinates: instead of the affine transform
            # A x (x_t, y_t, 1)^T of the original STN, this version adds
            # theta to the base grid, i.e. theta acts as a dense (x, y)
            # offset (flow) field.
            # T_g = tf.matmul(theta, grid)
            T_g = theta + grid
            x_s = tf.slice(T_g, [0, 0, 0], [-1, 1, -1])
            y_s = tf.slice(T_g, [0, 1, 0], [-1, 1, -1])
            x_s_flat = tf.reshape(x_s, [-1])
            y_s_flat = tf.reshape(y_s, [-1])

            input_transformed = _interpolate(
                input_dim, x_s_flat, y_s_flat,
                out_size)

            output = tf.reshape(
                input_transformed, tf.stack([num_batch, out_height, out_width, num_channels]))

            return output

    with tf.variable_scope(name):
        output = _transform(theta, U, out_size)
        return output


def batch_transformer(U, thetas, out_size, name='BatchSpatialTransformer'):
    """Batch Spatial Transformer Layer

    Parameters
    ----------
    U : float
        tensor of inputs [num_batch, height, width, num_channels]
    thetas : float
        a set of transformations for each input [num_batch, num_transforms, 6]
    out_size : int
        the size of the output [out_height, out_width]

    Returns: float
        Tensor of size [num_batch * num_transforms, out_height, out_width, num_channels]
    """
    with tf.variable_scope(name):
        num_batch, num_transforms = map(int, thetas.get_shape().as_list()[:2])
        indices = [[i] * num_transforms for i in range(num_batch)]
        input_repeated = tf.gather(U, tf.reshape(indices, [-1]))
        return transformer(input_repeated, thetas, out_size)
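As a quick smoke test of the modified layer (our own sketch, not part of the repository): because `_transform` computes `T_g = theta + grid` rather than an affine matmul, an all-zero offset field should reproduce the input image up to bilinear resampling.

```python
import numpy as np
import tensorflow as tf
from spatial_transformer import transformer

h, w = 4, 6
im = tf.constant(np.random.rand(1, h, w, 3), dtype=tf.float32)
theta = tf.zeros([1, 2, h * w])          # per-pixel (x, y) offsets in normalized grid units
warped = transformer(im, theta, (h, w))  # zero offsets -> (near-)identity warp

with tf.Session() as sess:
    print(sess.run(warped).shape)        # (1, 4, 6, 3)
```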