From f82420be4a065a9cd192b88be118a9a27e8eb399 Mon Sep 17 00:00:00 2001 From: LiuYang <651636074@qq.com> Date: Sun, 6 Aug 2017 12:49:48 +0800 Subject: [PATCH] first init code --- closed_form_matting.py | 58 +++++++ deep_photostyle.py | 111 +++++++++++++ photo_style.py | 327 ++++++++++++++++++++++++++++++++++++ smooth_local_affine.py | 370 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 866 insertions(+) create mode 100755 closed_form_matting.py create mode 100644 deep_photostyle.py create mode 100644 photo_style.py create mode 100644 smooth_local_affine.py diff --git a/closed_form_matting.py b/closed_form_matting.py new file mode 100755 index 0000000..a62703c --- /dev/null +++ b/closed_form_matting.py @@ -0,0 +1,58 @@ +from __future__ import division +import argparse +import os +import scipy.misc as spm +import scipy.ndimage as spi +import scipy.sparse as sps +import numpy as np +import tensorflow as tf + +def getlaplacian1(i_arr, consts, epsilon=1e-5, win_rad=1): + neb_size = (win_rad * 2 + 1) ** 2 + h, w, c = i_arr.shape + img_size = w * h + consts = spi.morphology.grey_erosion(consts, footprint=np.ones(shape=(win_rad * 2 + 1, win_rad * 2 + 1))) + + indsM = np.reshape(np.array(range(img_size)), newshape=(h, w), order='F') + tlen = int((-consts[win_rad:-win_rad, win_rad:-win_rad] + 1).sum() * (neb_size ** 2)) + row_inds = np.zeros(tlen) + col_inds = np.zeros(tlen) + vals = np.zeros(tlen) + l = 0 + for j in range(win_rad, w - win_rad): + for i in range(win_rad, h - win_rad): + if consts[i, j]: + continue + win_inds = indsM[i - win_rad:i + win_rad + 1, j - win_rad: j + win_rad + 1] + win_inds = win_inds.ravel(order='F') + win_i = i_arr[i - win_rad:i + win_rad + 1, j - win_rad: j + win_rad + 1, :] + win_i = win_i.reshape((neb_size, c), order='F') + win_mu = np.mean(win_i, axis=0).reshape(c, 1) + win_var = np.linalg.inv( + np.matmul(win_i.T, win_i) / neb_size - np.matmul(win_mu, win_mu.T) + epsilon / neb_size * np.identity( + c)) + + win_i2 = win_i - np.repeat(win_mu.transpose(), neb_size, 0) + tvals = (1 + np.matmul(np.matmul(win_i2, win_var), win_i2.T)) / neb_size + + ind_mat = np.broadcast_to(win_inds, (neb_size, neb_size)) + row_inds[l: (neb_size ** 2 + l)] = ind_mat.ravel(order='C') + col_inds[l: neb_size ** 2 + l] = ind_mat.ravel(order='F') + vals[l: neb_size ** 2 + l] = tvals.ravel(order='F') + l += neb_size ** 2 + + vals = vals.ravel(order='F')[0: l] + row_inds = row_inds.ravel(order='F')[0: l] + col_inds = col_inds.ravel(order='F')[0: l] + a_sparse = sps.csr_matrix((vals, (row_inds, col_inds)), shape=(img_size, img_size)) + + sum_a = a_sparse.sum(axis=1).T.tolist()[0] + a_sparse = sps.diags([sum_a], [0], shape=(img_size, img_size)) - a_sparse + + return a_sparse + +def getLaplacian(img): + h, w, _ = img.shape + coo = getlaplacian1(img, np.zeros(shape=(h, w)), 1e-5, 1).tocoo() + indices = np.mat([coo.row, coo.col]).transpose() + return tf.SparseTensor(indices, coo.data, coo.shape) diff --git a/deep_photostyle.py b/deep_photostyle.py new file mode 100644 index 0000000..35eafe9 --- /dev/null +++ b/deep_photostyle.py @@ -0,0 +1,111 @@ +import argparse +from PIL import Image +import numpy as np +from photo_style import stylize + +parser = argparse.ArgumentParser() +# Input Options +parser.add_argument("--content_image_path", dest='content_image_path', nargs='?', + help="Path to the content image") +parser.add_argument("--style_image_path", dest='style_image_path', nargs='?', + help="Path to the style image") +parser.add_argument("--content_seg_path", dest='content_seg_path', nargs='?', + help="Path to the style segmentation") +parser.add_argument("--style_seg_path", dest='style_seg_path', nargs='?', + help="Path to the style segmentation") +parser.add_argument("--init_image_path", dest='init_image_path', nargs='?', + help="Path to init image", default="") +parser.add_argument("--output_image", dest='output_image', nargs='?', + help='Path to output the stylized image', default="best_stylized.png") + +# Training Optimizer Options +parser.add_argument("--max_iter", dest='max_iter', nargs='?', type=int, + help='maximum image iteration', default=1000) +parser.add_argument("--learning_rate", dest='learning_rate', nargs='?', type=float, + help='learning rate for adam optimizer', default=1.0) +parser.add_argument("--print_iter", dest='print_iter', nargs='?', type=int, + help='print loss per iterations', default=1) +# Note the result might not be smooth enough since not applying smooth for temp result +parser.add_argument("--save_iter", dest='save_iter', nargs='?', type=int, + help='save temporary result per iterations', default=100) +parser.add_argument("--lbfgs", dest='lbfgs', nargs='?', + help="True=lbfgs, False=Adam", default=True) + +# Weight Options +parser.add_argument("--content_weight", dest='content_weight', nargs='?', type=float, + help="weight of content loss", default=5e0) +parser.add_argument("--style_weight", dest='style_weight', nargs='?', type=float, + help="weight of style loss", default=1e2) +parser.add_argument("--tv_weight", dest='tv_weight', nargs='?', type=float, + help="weight of total variational loss", default=1e-3) +parser.add_argument("--affine_weight", dest='affine_weight', nargs='?', type=float, + help="weight of affine loss", default=1e4) + +# Style Options +parser.add_argument("--style_option", dest='style_option', nargs='?', type=int, + help="0=non-Matting, 1=only Matting, 2=first non-Matting, then Matting", default=0) +parser.add_argument("--apply_smooth", dest='apply_smooth', nargs='?', + help="if apply local affine smooth", default=True) + +# Smoothing Argument +parser.add_argument("--f_radius", dest='f_radius', nargs='?', type=int, + help="smooth argument", default=15) +parser.add_argument("--f_edge", dest='f_edge', nargs='?', type=float, + help="smooth argument", default=1e-1) + +args = parser.parse_args() + +def main(): + if args.style_option == 0: + best_image_bgr = stylize(args, False) + result = Image.fromarray(np.uint8(np.clip(best_image_bgr[:, :, ::-1], 0, 255.0))) + result.save(args.output_image) + elif args.style_option == 1: + best_image_bgr = stylize(args, True) + if not args.apply_smooth: + result = Image.fromarray(np.uint8(np.clip(best_image_bgr[:, :, ::-1], 0, 255.0))) + result.save(args.output_image) + else: + # Pycuda runtime incompatible with Tensorflow + from smooth_local_affine import smooth_local_affine + content_input = np.array(Image.open(args.content_image_path).convert("RGB"), dtype=np.float32) + # RGB to BGR + content_input = content_input[:, :, ::-1] + # H * W * C to C * H * W + content_input = content_input.transpose((2, 0, 1)) + input_ = np.ascontiguousarray(content_input, dtype=np.float32) / 255. + + _, H, W = np.shape(input_) + + output_ = np.ascontiguousarray(best_image_bgr.transpose((2, 0, 1)), dtype=np.float32) / 255. + best_ = smooth_local_affine(output_, input_, 1e-7, 3, H, W, args.f_radius, args.f_edge).transpose(1, 2, 0) + result = Image.fromarray(np.uint8(np.clip(best_ * 255., 0, 255.))) + result.save(args.output_image) + elif args.style_option == 2: + tmp_image_bgr = stylize(args, False) + result = Image.fromarray(np.uint8(np.clip(tmp_image_bgr[:, :, ::-1], 0, 255.0))) + result.save("./tmp_result.png") + + args.init_image_path = "./tmp_result.png" + best_image_bgr = stylize(args, True) + if not args.apply_smooth: + result = Image.fromarray(np.uint8(np.clip(best_image_bgr[:, :, ::-1], 0, 255.0))) + result.save(args.output_image) + else: + from smooth_local_affine import smooth_local_affine + content_input = np.array(Image.open(args.content_image_path).convert("RGB"), dtype=np.float32) + # RGB to BGR + content_input = content_input[:, :, ::-1] + # H * W * C to C * H * W + content_input = content_input.transpose((2, 0, 1)) + input_ = np.ascontiguousarray(content_input, dtype=np.float32) / 255. + + _, H, W = np.shape(input_) + + output_ = np.ascontiguousarray(best_image_bgr.transpose((2, 0, 1)), dtype=np.float32) / 255. + best_ = smooth_local_affine(output_, input_, 1e-7, 3, H, W, args.f_radius, args.f_edge).transpose(1, 2, 0) + result = Image.fromarray(np.uint8(np.clip(best_ * 255., 0, 255.))) + result.save(args.output_image) + +if __name__ == "__main__": + main() diff --git a/photo_style.py b/photo_style.py new file mode 100644 index 0000000..5a12634 --- /dev/null +++ b/photo_style.py @@ -0,0 +1,327 @@ +from __future__ import division + +import numpy as np +import tensorflow as tf +import scipy.misc +from vgg19.vgg import Vgg19 +from PIL import Image +import time +from closed_form_matting import * +from operator import mul +import math +from functools import partial +import copy + +VGG_MEAN = [103.939, 116.779, 123.68] + +def rgb2bgr(rgb, vgg_mean=True): + if vgg_mean: + return rgb[:, :, ::-1] - VGG_MEAN + else: + return rgb[:, :, ::-1] + +def bgr2rgb(bgr, vgg_mean=False): + if vgg_mean: + return bgr[:, :, ::-1] + VGG_MEAN + else: + return bgr[:, :, ::-1] + +def load_seg(content_seg_path, style_seg_path, content_shape, style_shape): + color_codes = ['BLUE', 'GREEN', 'BLACK', 'WHITE', 'RED', 'YELLOW', 'GREY', 'LIGHT_BLUE', 'PURPLE'] + def _extract_mask(seg, color_str): + h, w, c = np.shape(seg) + if color_str == "BLUE": + mask_r = (seg[:, :, 0] < 0.1).astype(np.uint8) + mask_g = (seg[:, :, 1] < 0.1).astype(np.uint8) + mask_b = (seg[:, :, 2] > 0.9).astype(np.uint8) + elif color_str == "GREEN": + mask_r = (seg[:, :, 0] < 0.1).astype(np.uint8) + mask_g = (seg[:, :, 1] > 0.9).astype(np.uint8) + mask_b = (seg[:, :, 2] < 0.1).astype(np.uint8) + elif color_str == "BLACK": + mask_r = (seg[:, :, 0] < 0.1).astype(np.uint8) + mask_g = (seg[:, :, 1] < 0.1).astype(np.uint8) + mask_b = (seg[:, :, 2] < 0.1).astype(np.uint8) + elif color_str == "WHITE": + mask_r = (seg[:, :, 0] > 0.9).astype(np.uint8) + mask_g = (seg[:, :, 1] > 0.9).astype(np.uint8) + mask_b = (seg[:, :, 2] > 0.9).astype(np.uint8) + elif color_str == "RED": + mask_r = (seg[:, :, 0] > 0.9).astype(np.uint8) + mask_g = (seg[:, :, 1] < 0.1).astype(np.uint8) + mask_b = (seg[:, :, 2] < 0.1).astype(np.uint8) + elif color_str == "YELLOW": + mask_r = (seg[:, :, 0] > 0.9).astype(np.uint8) + mask_g = (seg[:, :, 1] > 0.9).astype(np.uint8) + mask_b = (seg[:, :, 2] < 0.1).astype(np.uint8) + elif color_str == "GREY": + mask_r = np.multiply((seg[:, :, 0] > 0.4).astype(np.uint8), \ + (seg[:, :, 0] < 0.6).astype(np.uint8)) + mask_g = np.multiply((seg[:, :, 1] > 0.4).astype(np.uint8), \ + (seg[:, :, 1] < 0.6).astype(np.uint8)) + mask_b = np.multiply((seg[:, :, 2] > 0.4).astype(np.uint8), \ + (seg[:, :, 2] < 0.6).astype(np.uint8)) + elif color_str == "LIGHT_BLUE": + mask_r = (seg[:, :, 0] < 0.1).astype(np.uint8) + mask_g = (seg[:, :, 1] > 0.9).astype(np.uint8) + mask_b = (seg[:, :, 2] > 0.9).astype(np.uint8) + elif color_str == "PURPLE": + mask_r = (seg[:, :, 0] > 0.9).astype(np.uint8) + mask_g = (seg[:, :, 1] < 0.1).astype(np.uint8) + mask_b = (seg[:, :, 2] > 0.9).astype(np.uint8) + return np.multiply(np.multiply(mask_r, mask_g), mask_b).astype(np.float32) + + # PIL resize has different order of np.shape + content_seg = np.array(Image.open(content_seg_path).convert("RGB").resize(content_shape, resample=Image.BILINEAR), dtype=np.float32) / 255.0 + style_seg = np.array(Image.open(style_seg_path).convert("RGB").resize(style_shape, resample=Image.BILINEAR), dtype=np.float32) / 255.0 + + color_content_masks = [] + color_style_masks = [] + for i in xrange(len(color_codes)): + color_content_masks.append(tf.expand_dims(tf.expand_dims(tf.constant(_extract_mask(content_seg, color_codes[i])), 0), -1)) + color_style_masks.append(tf.expand_dims(tf.expand_dims(tf.constant(_extract_mask(style_seg, color_codes[i])), 0), -1)) + + return color_content_masks, color_style_masks + +def gram_matrix(activations): + height = tf.shape(activations)[1] + width = tf.shape(activations)[2] + num_channels = tf.shape(activations)[3] + gram_matrix = tf.transpose(activations, [0, 3, 1, 2]) + gram_matrix = tf.reshape(gram_matrix, [num_channels, width * height]) + gram_matrix = tf.matmul(gram_matrix, gram_matrix, transpose_b=True) + return gram_matrix + +def content_loss(const_layer, var_layer, weight): + return tf.reduce_mean(tf.squared_difference(const_layer, var_layer)) * weight + +def style_loss(CNN_structure, const_layers, var_layers, content_segs, style_segs, weight): + loss_styles = [] + layer_count = float(len(const_layers)) + layer_index = 0 + + _, content_seg_height, content_seg_width, _ = content_segs[0].get_shape().as_list() + _, style_seg_height, style_seg_width, _ = style_segs[0].get_shape().as_list() + for layer_name in CNN_structure: + layer_name = layer_name[layer_name.find("/") + 1:] + + # downsampling segmentation + if "pool" in layer_name: + content_seg_width, content_seg_height = int(math.ceil(content_seg_width / 2)), int(math.ceil(content_seg_height / 2)) + style_seg_width, style_seg_height = int(math.ceil(style_seg_width / 2)), int(math.ceil(style_seg_height / 2)) + + for i in xrange(len(content_segs)): + content_segs[i] = tf.image.resize_bilinear(content_segs[i], tf.constant((content_seg_height, content_seg_width))) + style_segs[i] = tf.image.resize_bilinear(style_segs[i], tf.constant((style_seg_height, style_seg_width))) + + elif "conv" in layer_name: + for i in xrange(len(content_segs)): + # have some differences on border with torch + content_segs[i] = tf.nn.avg_pool(tf.pad(content_segs[i], [[0, 0], [1, 1], [1, 1], [0, 0]], "CONSTANT"), \ + ksize=[1, 3, 3, 1], strides=[1, 1, 1, 1], padding='VALID') + style_segs[i] = tf.nn.avg_pool(tf.pad(style_segs[i], [[0, 0], [1, 1], [1, 1], [0, 0]], "CONSTANT"), \ + ksize=[1, 3, 3, 1], strides=[1, 1, 1, 1], padding='VALID') + + if layer_name == var_layers[layer_index].name[var_layers[layer_index].name.find("/") + 1: ]: + print "Setting up style layer: <{}>".format(layer_name) + const_layer = const_layers[layer_index] + var_layer = var_layers[layer_index] + + layer_index = layer_index + 1 + + layer_style_loss = 0.0 + for content_seg, style_seg in zip(content_segs, style_segs): + gram_matrix_const = gram_matrix(tf.multiply(const_layer, style_seg)) + style_mask_mean = tf.reduce_mean(style_seg) + gram_matrix_const = tf.cond(tf.greater(style_mask_mean, 0.), + lambda: gram_matrix_const / (tf.to_float(tf.size(const_layer)) * style_mask_mean), + lambda: gram_matrix_const + ) + + gram_matrix_var = gram_matrix(tf.multiply(var_layer, content_seg)) + content_mask_mean = tf.reduce_mean(content_seg) + gram_matrix_var = tf.cond(tf.greater(content_mask_mean, 0.), + lambda: gram_matrix_var / (tf.to_float(tf.size(var_layer)) * content_mask_mean), + lambda: gram_matrix_var + ) + + diff_style_sum = tf.reduce_mean(tf.squared_difference(gram_matrix_const, gram_matrix_var)) * content_mask_mean + + layer_style_loss += diff_style_sum + + loss_styles.append(layer_style_loss * weight) + return loss_styles + +def total_variation_loss(output, weight): + shape = output.get_shape() + tv_loss = tf.reduce_sum((output[:, :-1, :-1, :] - output[:, :-1, 1:, :]) * (output[:, :-1, :-1, :] - output[:, :-1, 1:, :]) + \ + (output[:, :-1, :-1, :] - output[:, 1:, :-1, :]) * (output[:, :-1, :-1, :] - output[:, 1:, :-1, :])) / 2.0 + return tv_loss * weight + +def affine_loss(output, M, weight): + loss_affine = 0.0 + output_t = output / 255. + for Vc in tf.unstack(output_t, axis=-1): + Vc_ravel = tf.reshape(tf.transpose(Vc), [-1]) + loss_affine += tf.matmul(tf.expand_dims(Vc_ravel, 0), tf.sparse_tensor_dense_matmul(M, tf.expand_dims(Vc_ravel, -1))) + + return loss_affine * weight + +def save_result(img_, str_): + result = Image.fromarray(np.uint8(np.clip(img_, 0, 255.0))) + result.save(str_) + +iter_count = 0 +min_loss, best_image = float("inf"), None +def print_loss(args, loss_content, loss_styles_list, loss_tv, loss_affine, overall_loss, output_image): + global iter_count, min_loss, best_image + if iter_count % args.print_iter == 0: + print 'Iteration {} / {}\n\tContent loss: {}'.format(iter_count, args.max_iter, loss_content) + for j, style_loss in enumerate(loss_styles_list): + print '\tStyle {} loss: {}'.format(j + 1, style_loss) + print '\tTV loss: {}'.format(loss_tv) + print '\tAffine loss: {}'.format(loss_affine) + print '\tTotal loss: {}'.format(overall_loss - loss_affine) + + if overall_loss < min_loss: + min_loss, best_image = overall_loss, output_image + + if iter_count % args.save_iter == 0: + save_result(best_image[:, :, ::-1], 'out_iter_{}.png'.format(iter_count + args.max_iter)) + + iter_count += 1 + +def stylize(args, Matting): + config = tf.ConfigProto() + config.gpu_options.allow_growth=True + sess = tf.Session(config=config) + + start = time.time() + # prepare input images + content_image = np.array(Image.open(args.content_image_path).convert("RGB"), dtype=np.float32) + content_width, content_height = content_image.shape[1], content_image.shape[0] + + if Matting: + M = tf.to_float(getLaplacian(content_image / 255.)) + + content_image = rgb2bgr(content_image) + content_image = content_image.reshape((1, content_height, content_width, 3)).astype(np.float32) + + style_image = rgb2bgr(np.array(Image.open(args.style_image_path).convert("RGB"), dtype=np.float32)) + style_width, style_height = style_image.shape[1], style_image.shape[0] + style_image = style_image.reshape((1, style_height, style_width, 3)).astype(np.float32) + + content_masks, style_masks = load_seg(args.content_seg_path, args.style_seg_path, [content_width, content_height], [style_width, style_height]) + + if not args.init_image_path: + if Matting: + print (": Apply Matting with random init") + init_image = np.random.randn(1, content_height, content_width, 3).astype(np.float32) * 0.0001 + else: + init_image = np.expand_dims(rgb2bgr(np.array(Image.open(args.init_image_path).convert("RGB"), dtype=np.float32)).astype(np.float32), 0) + + mean_pixel = tf.constant(VGG_MEAN) + input_image = tf.Variable(init_image) + + with tf.name_scope("constant"): + vgg_const = Vgg19() + vgg_const.build(tf.constant(content_image), clear_data=False) + + content_fv = sess.run(vgg_const.conv4_2) + content_layer_const = tf.constant(content_fv) + + vgg_const.build(tf.constant(style_image)) + style_layers_const = [vgg_const.conv1_1, vgg_const.conv2_1, vgg_const.conv3_1, vgg_const.conv4_1, vgg_const.conv5_1] + style_fvs = sess.run(style_layers_const) + style_layers_const = [tf.constant(fv) for fv in style_fvs] + + with tf.name_scope("variable"): + vgg_var = Vgg19() + vgg_var.build(input_image) + + # which layers we want to use? + style_layers_var = [vgg_var.conv1_1, vgg_var.conv2_1, vgg_var.conv3_1, vgg_var.conv4_1, vgg_var.conv5_1] + content_layer_var = vgg_var.conv4_2 + + # The whole CNN structure to downsample mask + layer_structure_all = [layer.name for layer in vgg_var.get_all_layers()] + + # Content Loss + loss_content = content_loss(content_layer_const, content_layer_var, float(args.content_weight)) + + # Style Loss + loss_styles_list = style_loss(layer_structure_all, style_layers_const, style_layers_var, content_masks, style_masks, float(args.style_weight)) + loss_style = 0.0 + for loss in loss_styles_list: + loss_style += loss + + input_image_plus = tf.squeeze(input_image + mean_pixel, [0]) + + # Affine Loss + if Matting: + loss_affine = affine_loss(input_image_plus, M, args.affine_weight) + else: + loss_affine = tf.constant(0.00001) # junk value + + # Total Variational Loss + loss_tv = total_variation_loss(input_image, float(args.tv_weight)) + + if args.lbfgs: + if not Matting: + overall_loss = loss_content + loss_tv + loss_style + else: + overall_loss = loss_content + loss_style + loss_tv + loss_affine + + optimizer = tf.contrib.opt.ScipyOptimizerInterface(overall_loss, method='L-BFGS-B', options={'maxiter': args.max_iter, 'disp': 0}) + sess.run(tf.global_variables_initializer()) + print_loss_partial = partial(print_loss, args) + optimizer.minimize(sess, fetches=[loss_content, loss_styles_list, loss_tv, loss_affine, overall_loss, input_image_plus], loss_callback=print_loss_partial) + + best_result = copy.deepcopy(best_image) + global min_loss, best_image, iter_count + iter_count = 0 + min_loss, best_image = float("inf"), None + return best_result + else: + VGGNetLoss = loss_content + loss_tv + loss_style + optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08) + VGG_grads = optimizer.compute_gradients(VGGNetLoss, [input_image]) + + if Matting: + b, g, r = tf.unstack(input_image_plus / 255., axis=-1) + b_gradient = tf.transpose(tf.reshape(2 * tf.sparse_tensor_dense_matmul(M, tf.expand_dims(tf.reshape(tf.transpose(b), [-1]), -1)), [content_width, content_height])) + g_gradient = tf.transpose(tf.reshape(2 * tf.sparse_tensor_dense_matmul(M, tf.expand_dims(tf.reshape(tf.transpose(g), [-1]), -1)), [content_width, content_height])) + r_gradient = tf.transpose(tf.reshape(2 * tf.sparse_tensor_dense_matmul(M, tf.expand_dims(tf.reshape(tf.transpose(r), [-1]), -1)), [content_width, content_height])) + + Matting_grad = tf.expand_dims(tf.stack([b_gradient, g_gradient, r_gradient], axis=-1), 0) / 255. * args.affine_weight + VGGMatting_grad = [(VGG_grad[0] + Matting_grad, VGG_grad[1]) for VGG_grad in VGG_grads] + + train_op = optimizer.apply_gradients(VGGMatting_grad) + else: + train_op = optimizer.apply_gradients(VGG_grads) + + sess.run(tf.global_variables_initializer()) + min_loss, best_image = float("inf"), None + for i in xrange(1, args.max_iter): + _, loss_content_, loss_styles_list_, loss_tv_, loss_affine_, overall_loss_, output_image_ = sess.run([ + train_op, loss_content, loss_styles_list, loss_tv, loss_affine, VGGNetLoss, input_image_plus + ]) + if i % args.print_iter == 0: + print 'Iteration {} / {}\n\tContent loss: {}'.format(i, args.max_iter, loss_content_) + for j, style_loss_ in enumerate(loss_styles_list_): + print '\tStyle {} loss: {}'.format(j + 1, style_loss_) + print '\tTV loss: {}'.format(loss_tv_) + if Matting: + print '\tAffine loss: {}'.format(loss_affine_) + print '\tTotal loss: {}'.format(overall_loss_ - loss_tv_) + + if overall_loss_ < min_loss: + min_loss, best_image = overall_loss_, output_image_ + + if i % args.save_iter == 0 and i != 0: + save_result(best_image[:, :, ::-1], 'out_iter_{}.png'.format(i)) + + return best_image + +if __name__ == "__main__": + stylize() diff --git a/smooth_local_affine.py b/smooth_local_affine.py new file mode 100644 index 0000000..acc41e5 --- /dev/null +++ b/smooth_local_affine.py @@ -0,0 +1,370 @@ +import numpy as np +from PIL import Image +import pycuda.autoinit +import pycuda.driver as drv +import scipy.io +from pycuda.compiler import SourceModule + +def smooth_local_affine(output_, input_, epsilon, patch, h, w, f_r, f_e): + mod = SourceModule(""" + + #include + #include + #include + #include + #include + #include + + #define TB 256 + #define EPS 1e-7 + + __device__ bool InverseMat4x4(double m_in[4][4], double inv_out[4][4]) { + double m[16], inv[16]; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + m[i * 4 + j] = m_in[i][j]; + } + } + + inv[0] = m[5] * m[10] * m[15] - + m[5] * m[11] * m[14] - + m[9] * m[6] * m[15] + + m[9] * m[7] * m[14] + + m[13] * m[6] * m[11] - + m[13] * m[7] * m[10]; + + inv[4] = -m[4] * m[10] * m[15] + + m[4] * m[11] * m[14] + + m[8] * m[6] * m[15] - + m[8] * m[7] * m[14] - + m[12] * m[6] * m[11] + + m[12] * m[7] * m[10]; + + inv[8] = m[4] * m[9] * m[15] - + m[4] * m[11] * m[13] - + m[8] * m[5] * m[15] + + m[8] * m[7] * m[13] + + m[12] * m[5] * m[11] - + m[12] * m[7] * m[9]; + + inv[12] = -m[4] * m[9] * m[14] + + m[4] * m[10] * m[13] + + m[8] * m[5] * m[14] - + m[8] * m[6] * m[13] - + m[12] * m[5] * m[10] + + m[12] * m[6] * m[9]; + + inv[1] = -m[1] * m[10] * m[15] + + m[1] * m[11] * m[14] + + m[9] * m[2] * m[15] - + m[9] * m[3] * m[14] - + m[13] * m[2] * m[11] + + m[13] * m[3] * m[10]; + + inv[5] = m[0] * m[10] * m[15] - + m[0] * m[11] * m[14] - + m[8] * m[2] * m[15] + + m[8] * m[3] * m[14] + + m[12] * m[2] * m[11] - + m[12] * m[3] * m[10]; + + inv[9] = -m[0] * m[9] * m[15] + + m[0] * m[11] * m[13] + + m[8] * m[1] * m[15] - + m[8] * m[3] * m[13] - + m[12] * m[1] * m[11] + + m[12] * m[3] * m[9]; + + inv[13] = m[0] * m[9] * m[14] - + m[0] * m[10] * m[13] - + m[8] * m[1] * m[14] + + m[8] * m[2] * m[13] + + m[12] * m[1] * m[10] - + m[12] * m[2] * m[9]; + + inv[2] = m[1] * m[6] * m[15] - + m[1] * m[7] * m[14] - + m[5] * m[2] * m[15] + + m[5] * m[3] * m[14] + + m[13] * m[2] * m[7] - + m[13] * m[3] * m[6]; + + inv[6] = -m[0] * m[6] * m[15] + + m[0] * m[7] * m[14] + + m[4] * m[2] * m[15] - + m[4] * m[3] * m[14] - + m[12] * m[2] * m[7] + + m[12] * m[3] * m[6]; + + inv[10] = m[0] * m[5] * m[15] - + m[0] * m[7] * m[13] - + m[4] * m[1] * m[15] + + m[4] * m[3] * m[13] + + m[12] * m[1] * m[7] - + m[12] * m[3] * m[5]; + + inv[14] = -m[0] * m[5] * m[14] + + m[0] * m[6] * m[13] + + m[4] * m[1] * m[14] - + m[4] * m[2] * m[13] - + m[12] * m[1] * m[6] + + m[12] * m[2] * m[5]; + + inv[3] = -m[1] * m[6] * m[11] + + m[1] * m[7] * m[10] + + m[5] * m[2] * m[11] - + m[5] * m[3] * m[10] - + m[9] * m[2] * m[7] + + m[9] * m[3] * m[6]; + + inv[7] = m[0] * m[6] * m[11] - + m[0] * m[7] * m[10] - + m[4] * m[2] * m[11] + + m[4] * m[3] * m[10] + + m[8] * m[2] * m[7] - + m[8] * m[3] * m[6]; + + inv[11] = -m[0] * m[5] * m[11] + + m[0] * m[7] * m[9] + + m[4] * m[1] * m[11] - + m[4] * m[3] * m[9] - + m[8] * m[1] * m[7] + + m[8] * m[3] * m[5]; + + inv[15] = m[0] * m[5] * m[10] - + m[0] * m[6] * m[9] - + m[4] * m[1] * m[10] + + m[4] * m[2] * m[9] + + m[8] * m[1] * m[6] - + m[8] * m[2] * m[5]; + + double det = m[0] * inv[0] + m[1] * inv[4] + m[2] * inv[8] + m[3] * inv[12]; + + if (abs(det) < 1e-9) { + return false; + } + + + det = 1.0 / det; + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + inv_out[i][j] = inv[i * 4 + j] * det; + } + } + + return true; + } + + __global__ void best_local_affine_kernel( + float *output, float *input, float *affine_model, + int h, int w, float epsilon, int kernel_radius + ) + { + int size = h * w; + int id = blockIdx.x * blockDim.x + threadIdx.x; + + if (id < size) { + int x = id % w, y = id / w; + + double Mt_M[4][4] = {}; // 4x4 + double invMt_M[4][4] = {}; + double Mt_S[3][4] = {}; // RGB -> 1x4 + double A[3][4] = {}; + for (int i = 0; i < 4; i++) + for (int j = 0; j < 4; j++) { + Mt_M[i][j] = 0, invMt_M[i][j] = 0; + if (i != 3) { + Mt_S[i][j] = 0, A[i][j] = 0; + if (i == j) + Mt_M[i][j] = 1e-3; + } + } + + for (int dy = -kernel_radius; dy <= kernel_radius; dy++) { + for (int dx = -kernel_radius; dx <= kernel_radius; dx++) { + + int xx = x + dx, yy = y + dy; + int id2 = yy * w + xx; + + if (0 <= xx && xx < w && 0 <= yy && yy < h) { + + Mt_M[0][0] += input[id2 + 2*size] * input[id2 + 2*size]; + Mt_M[0][1] += input[id2 + 2*size] * input[id2 + size]; + Mt_M[0][2] += input[id2 + 2*size] * input[id2]; + Mt_M[0][3] += input[id2 + 2*size]; + + Mt_M[1][0] += input[id2 + size] * input[id2 + 2*size]; + Mt_M[1][1] += input[id2 + size] * input[id2 + size]; + Mt_M[1][2] += input[id2 + size] * input[id2]; + Mt_M[1][3] += input[id2 + size]; + + Mt_M[2][0] += input[id2] * input[id2 + 2*size]; + Mt_M[2][1] += input[id2] * input[id2 + size]; + Mt_M[2][2] += input[id2] * input[id2]; + Mt_M[2][3] += input[id2]; + + Mt_M[3][0] += input[id2 + 2*size]; + Mt_M[3][1] += input[id2 + size]; + Mt_M[3][2] += input[id2]; + Mt_M[3][3] += 1; + + Mt_S[0][0] += input[id2 + 2*size] * output[id2 + 2*size]; + Mt_S[0][1] += input[id2 + size] * output[id2 + 2*size]; + Mt_S[0][2] += input[id2] * output[id2 + 2*size]; + Mt_S[0][3] += output[id2 + 2*size]; + + Mt_S[1][0] += input[id2 + 2*size] * output[id2 + size]; + Mt_S[1][1] += input[id2 + size] * output[id2 + size]; + Mt_S[1][2] += input[id2] * output[id2 + size]; + Mt_S[1][3] += output[id2 + size]; + + Mt_S[2][0] += input[id2 + 2*size] * output[id2]; + Mt_S[2][1] += input[id2 + size] * output[id2]; + Mt_S[2][2] += input[id2] * output[id2]; + Mt_S[2][3] += output[id2]; + } + } + } + + bool success = InverseMat4x4(Mt_M, invMt_M); + + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 4; j++) { + for (int k = 0; k < 4; k++) { + A[i][j] += invMt_M[j][k] * Mt_S[i][k]; + } + } + } + + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 4; j++) { + int affine_id = i * 4 + j; + affine_model[12 * id + affine_id] = A[i][j]; + } + } + + + + } + return ; + } + + __global__ void bilateral_smooth_kernel( + float *affine_model, float *filtered_affine_model, float *guide, + int h, int w, int kernel_radius, float sigma1, float sigma2 + ) + { + int id = blockIdx.x * blockDim.x + threadIdx.x; + int size = h * w; + if (id < size) { + int x = id % w; + int y = id / w; + + double sum_affine[12] = {}; + double sum_weight = 0; + for (int dx = -kernel_radius; dx <= kernel_radius; dx++) { + for (int dy = -kernel_radius; dy <= kernel_radius; dy++) { + int yy = y + dy, xx = x + dx; + int id2 = yy * w + xx; + if (0 <= xx && xx < w && 0 <= yy && yy < h) { + float color_diff1 = guide[yy*w + xx] - guide[y*w + x]; + float color_diff2 = guide[yy*w + xx + size] - guide[y*w + x + size]; + float color_diff3 = guide[yy*w + xx + 2*size] - guide[y*w + x + 2*size]; + float color_diff_sqr = + (color_diff1*color_diff1 + color_diff2*color_diff2 + color_diff3*color_diff3) / 3; + + float v1 = exp(-(dx * dx + dy * dy) / (2 * sigma1 * sigma1)); + float v2 = exp(-(color_diff_sqr) / (2 * sigma2 * sigma2)); + float weight = v1 * v2; + + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 4; j++) { + int affine_id = i * 4 + j; + sum_affine[affine_id] += weight * affine_model[id2*12 + affine_id]; + } + } + sum_weight += weight; + } + } + } + + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 4; j++) { + int affine_id = i * 4 + j; + filtered_affine_model[id*12 + affine_id] = sum_affine[affine_id] / sum_weight; + } + } + } + return ; + } + + __global__ void reconstruction_best_kernel( + float *input, float *filtered_affine_model, float *filtered_best_output, + int h, int w + ) + { + int id = blockIdx.x * blockDim.x + threadIdx.x; + int size = h * w; + if (id < size) { + double out1 = + input[id + 2*size] * filtered_affine_model[id*12 + 0] + // A[0][0] + + input[id + size] * filtered_affine_model[id*12 + 1] + // A[0][1] + + input[id] * filtered_affine_model[id*12 + 2] + // A[0][2] + + filtered_affine_model[id*12 + 3]; //A[0][3]; + double out2 = + input[id + 2*size] * filtered_affine_model[id*12 + 4] + //A[1][0] + + input[id + size] * filtered_affine_model[id*12 + 5] + //A[1][1] + + input[id] * filtered_affine_model[id*12 + 6] + //A[1][2] + + filtered_affine_model[id*12 + 7]; //A[1][3]; + double out3 = + input[id + 2*size] * filtered_affine_model[id*12 + 8] + //A[2][0] + + input[id + size] * filtered_affine_model[id*12 + 9] + //A[2][1] + + input[id] * filtered_affine_model[id*12 + 10] + //A[2][2] + + filtered_affine_model[id*12 + 11]; // A[2][3]; + + filtered_best_output[id] = out1; + filtered_best_output[id + size] = out2; + filtered_best_output[id + 2*size] = out3; + } + return ; + } + """) + _best_local_affine_kernel = mod.get_function("best_local_affine_kernel") + _bilateral_smooth_kernel = mod.get_function("bilateral_smooth_kernel") + _reconstruction_best_kernel = mod.get_function("reconstruction_best_kernel") + + filter_radius = f_r + sigma1, sigma2 = filter_radius / 3, f_e + + filtered_best_output = np.zeros(np.shape(input_), dtype=np.float32) + affine_model = np.zeros((h * w, 12), dtype=np.float32) + filtered_affine_model = np.zeros((h * w, 12), dtype=np.float32) + + radius = (patch - 1) / 2 + + _best_local_affine_kernel( + drv.InOut(output_), drv.InOut(input_), drv.InOut(affine_model), + np.int32(h), np.int32(w), np.float32(epsilon), np.int32(radius), block=(256, 1, 1), grid=((h * w) / 256 + 1, 1) + ) + + _bilateral_smooth_kernel( + drv.InOut(affine_model), drv.InOut(filtered_affine_model), + drv.InOut(input_), np.int32(h), np.int32(w), np.int32(f_r), np.float32(sigma1), np.float32(sigma2), + block=(256, 1, 1), grid=((h * w) / 256 + 1, 1) + ) + _reconstruction_best_kernel( + drv.InOut(input_), drv.InOut(filtered_affine_model), drv.InOut(filtered_best_output), + np.int32(h), np.int32(w), block=(256, 1, 1), grid=((h * w) / 256 + 1, 1) + ) + return filtered_best_output + +if __name__ == "__main__": + X = scipy.io.loadmat("./best3_t_1000.mat") + output_ = np.ascontiguousarray(X['output'], dtype=np.float32) / 256. + # output_ = np.ascontiguousarray(np.array(Image.open("test2.png").convert("RGB"), dtype=np.float32)[:, :, ::-1].transpose((2, 0, 1)), dtype=np.float32) / 256. + input_ = np.ascontiguousarray(np.array(Image.open("./examples/input/in3.png").convert("RGB"), dtype=np.float32)[:, :, ::-1].transpose((2, 0, 1)), dtype=np.float32)/256. + c, h, w = np.shape(input_) + best = smooth_local_affine(output_, input_, 1e-7, 3, h, w, 15, 0.01).transpose(1, 2, 0) + best_img = Image.fromarray(np.uint8(np.clip(best * 256, 0, 255.0))) + best_img.save("./best2.png")