Commit: Add files via upload

JaywongWang committed Oct 6, 2018
1 parent 7ba20de commit 954f3f1
Showing 7 changed files with 1,924 additions and 0 deletions.
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2018 Jingwen Wang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
41 changes: 41 additions & 0 deletions README.md
@@ -0,0 +1,41 @@
# DenseVideoCaptioning

TensorFlow implementation of the paper [Bidirectional Attentive Fusion with Context Gating for Dense Video Captioning](https://arxiv.org/abs/1804.00100) by Jingwen Wang *et al.*, *CVPR* 2018.

### Data Preparation

Please download the annotation data and C3D features from the [ActivityNet Captions](https://cs.stanford.edu/people/ranjaykrishna/densevid/) website.

Please follow the script dataset/ActivityNet_Captions/preprocess/anchors/get_anchors.py to obtain the clustered anchors and their pos/neg weights (used to handle the class imbalance problem); a sketch of the clustering step is given below. The generated files are already provided in dataset/ActivityNet_Captions/preprocess/anchors/.
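
For orientation, here is a minimal sketch of what the clustering step might look like, assuming K-means over the ground-truth segment durations. The file names and the number of clusters are illustrative; get_anchors.py remains the reference.

```python
# Hypothetical sketch of anchor clustering; see get_anchors.py for the real logic.
import json
import numpy as np
from sklearn.cluster import KMeans

NUM_ANCHORS = 120  # illustrative; the actual count is set in get_anchors.py

# ActivityNet Captions annotation format:
# {video_id: {'duration': ..., 'timestamps': [[s, e], ...], 'sentences': [...]}}
data = json.load(open('train.json'))
durations = [abs(t2 - t1) for anno in data.values() for t1, t2 in anno['timestamps']]

kmeans = KMeans(n_clusters=NUM_ANCHORS).fit(np.asarray(durations).reshape(-1, 1))
anchors = sorted(float(c[0]) for c in kmeans.cluster_centers_)

# one anchor length per line, as expected by data_provider.py
with open('anchors.txt', 'w') as f:
    for anchor in anchors:
        f.write('%f\n' % anchor)
```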

Please follow the script dataset/ActivityNet_Captions/preprocess/build_vocab.py to build the word dictionary and to encode the train/val/test sentences.
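
A minimal sketch of the vocabulary/encoding step, assuming a simple frequency cutoff; the cutoff value and special tokens are assumptions, and build_vocab.py remains the reference.

```python
# Hypothetical sketch of vocabulary building; see build_vocab.py for the real logic.
import json
from collections import Counter

MIN_COUNT = 5  # assumed frequency cutoff

data = json.load(open('train.json'))
counter = Counter()
for anno in data.values():
    for sentence in anno['sentences']:
        counter.update(sentence.lower().strip('. ').split())

# special tokens and their ids are assumptions; build_vocab.py may differ
vocab = {'<PAD>': 0, '<BOS>': 1, '<EOS>': 2, '<UNK>': 3}
for word, count in counter.items():
    if count >= MIN_COUNT:
        vocab[word] = len(vocab)

def encode(sentence):
    """Encode one sentence as a list of word ids (cf. encoded_sentences.json)."""
    words = sentence.lower().strip('. ').split()
    return [vocab['<BOS>']] + [vocab.get(w, vocab['<UNK>']) for w in words] + [vocab['<EOS>']]
```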

### Hyper Parameters

The configuration used in my experiments is given in opt.py, including the model setup, training options, and testing options.
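
For reference, these are the option keys that data_provider.py actually reads, plus the two training flags discussed below. The values shown are illustrative; opt.py is authoritative.

```python
# Illustrative subset of the options dict; the real values live in opt.py.
options = {
    'batch_size': 1,                 # data_provider.py asserts batch_size == 1
    'caption_data_root': 'dataset/ActivityNet_Captions/preprocess/',        # assumed path
    'feature_data_path': 'dataset/ActivityNet_Captions/features/c3d.hdf5',  # assumed path
    'localization_data_path': 'dataset/ActivityNet_Captions/captions/',     # assumed path
    'caption_seq_len': 30,           # maximum encoded caption length (assumed value)
    'proposal_tiou_threshold': 0.5,  # tIoU above which an anchor counts as positive
    'caption_tiou_threshold': 0.8,   # tIoU above which a proposal feeds the captioner (assumed value)
    'train_proposal': True,          # see Training below
    'train_caption': False,
    'print_debug': False,
}
```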

### Training

Train the dense-captioning model using the script train.py.

First pre-train the proposal module for around 5 epochs by setting train_proposal=True and train_caption=False. Then train the whole dense-captioning model by setting train_proposal=True and train_caption=True, as sketched below.
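
Schematically, the two-stage schedule amounts to the following (flag names as in opt.py; in practice you edit opt.py and rerun train.py):

```python
# Stage 1: pre-train the proposal module only (~5 epochs).
options['train_proposal'] = True
options['train_caption'] = False

# Stage 2: train the whole dense-captioning model jointly.
options['train_proposal'] = True
options['train_caption'] = True
```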

### Prediction

Follow the script test.py to generate proposal predictions and to evaluate them.
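
The prediction file is assumed to follow the ActivityNet Captions challenge submission format consumed by the densevid_eval scripts; the video id, sentence, and timestamps below are made up.

```python
# Hypothetical shape of a prediction file for the densevid_eval evaluator.
import json

submission = {
    'version': 'VERSION 1.0',
    'results': {
        'v_uqiMw7tQ1Cc': [
            {'sentence': 'A man is playing a guitar.', 'timestamp': [0.83, 19.86]},
        ],
    },
    'external_data': {'used': False, 'details': ''},
}
json.dump(submission, open('predictions.json', 'w'))
```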

### Evaluation

Please note that the official evaluation metric has been [updated](https://github.com/ranjaykrishna/densevid_eval/commit/bbbd49d31a038acf2642f7ae158bb6b9da6937fc) (line 194). The paper reports the old metric; results from different methods remain comparable, since all CVPR 2018 papers report the old metric.

### Results

The predicted results for the val/test sets can be found in results/.

### Dependencies

tensorflow==1.0.1

python==2.7.5

Other versions may also work.
268 changes: 268 additions & 0 deletions data_provider.py
@@ -0,0 +1,268 @@
"""
Data provider for the built models
"""

import os
import json
import random
import time  # needed by the end-of-epoch sleep in iterate_batch

import h5py
import numpy as np

from opt import *

np.set_printoptions(threshold=np.inf)

class DataProvision:
def __init__(self, options):
assert options['batch_size'] == 1
self._options = options
self._splits = {'train':'train', 'val':'val_1'}

self._ids = {} # video ids
captions = {}
self._sizes = {}
print('Loading paragraph data ...')
for split in self._splits:
            tmp_ids = open(os.path.join(self._options['caption_data_root'], split, 'ids.txt'), 'r').readlines()
            tmp_ids = [vid.strip() for vid in tmp_ids]
            self._ids[split] = tmp_ids

self._sizes[split] = len(self._ids[split])

tmp_captions = json.load(open(os.path.join(self._options['caption_data_root'], split, 'encoded_sentences.json'), 'r'))
captions[split] = {tmp_ids[i]:tmp_captions[i] for i in range(len(tmp_ids))}

        # merge the caption dictionaries from all splits
        self._captions = {}
        for split in self._splits:
            self._captions.update(captions[split])


# feature dictionary
print('Loading c3d features ...')
        features = h5py.File(self._options['feature_data_path'], 'r')
        self._feature_ids = list(features.keys())
        self._features = {video_id: np.asarray(list(features[video_id].values())[0])
                          for video_id in self._feature_ids}


# load label weight data
print('Loading label weight data ...')
self._proposal_weight = json.load(open(os.path.join(self._options['caption_data_root'], 'anchors', 'weights.json')))
        if self._options['proposal_tiou_threshold'] != 0.5:
            raise ValueError('Might need to recalculate class weights to handle imbalanced data')

# get anchors
print('Loading anchor data ...')
anchor_path = os.path.join(self._options['caption_data_root'], 'anchors', 'anchors.txt')
anchors = open(anchor_path).readlines()
self._anchors = [float(line.strip()) for line in anchors]
# time stamp data
print('Loading localization data ...')
self._localization = {}
for split in self._splits:
data = json.load(open(os.path.join(self._options['localization_data_path'], '%s.json'%self._splits[split])))
self._localization[split] = data


print('Done loading.')



def get_size(self, split):
return self._sizes[split]

def get_ids(self, split):
return self._ids[split]

def get_anchors(self):
return self._anchors

def get_localization(self):
return self._localization

# process caption batch data into standard format
def process_batch_paragraph(self, batch_paragraph):
paragraph_length = []
caption_length = []
for captions in batch_paragraph:
paragraph_length.append(len(captions))
cap_len = []
for caption in captions:
cap_len.append(len(caption))

caption_length.append(cap_len)

caption_num = len(batch_paragraph[0])
input_idx = np.zeros((len(batch_paragraph), caption_num, self._options['caption_seq_len']), dtype='int32')
input_mask = np.zeros_like(input_idx)

for i, captions in enumerate(batch_paragraph):
for j in range(caption_num):
caption = captions[j]
effective_len = min(caption_length[i][j], self._options['caption_seq_len'])
input_idx[i, j, 0:effective_len] = caption[:effective_len]
input_mask[i, j, 0:effective_len-1] = 1

return input_idx, input_mask

# provide batch data
def iterate_batch(self, split, batch_size):

ids = list(self._ids[split])

if split == 'train':
print('Randomly shuffle training data ...')
random.shuffle(ids)

current = 0

while True:

batch_paragraph = []
batch_feature_fw = []
batch_feature_bw = []
batch_proposal_fw = []
batch_proposal_bw = []

            # train in pairs: one caption serves as the common ground truth for both directions
            batch_proposal_caption_fw = []  # 0/1 flags: whether the LSTM state at this step feeds the captioning module (based on tIoU)
            batch_proposal_caption_bw = []  # index of the corresponding backward feature

i = 0 # batch_size = 1
vid = ids[i+current]
feature_fw = self._features[vid]
feature_len = feature_fw.shape[0]

if 'print_debug' in self._options and self._options['print_debug']:
print('vid: %s'%vid)
print('feature_len: %d'%feature_len)

feature_bw = np.flip(feature_fw, axis=0)

batch_feature_fw.append(feature_fw)
batch_feature_bw.append(feature_bw)


localization = self._localization[split][vid]
timestamps = localization['timestamps']
duration = localization['duration']

# start and end time of the video stream
start_time = 0.
end_time = duration


n_anchors = len(self._anchors)
# ground truth proposal
gt_proposal_fw = np.zeros(shape=(feature_len, n_anchors), dtype='int32')
gt_proposal_bw = np.zeros(shape=(feature_len, n_anchors), dtype='int32')
# ground truth proposal for feeding into captioning module
gt_proposal_caption_fw = np.zeros(shape=(feature_len, ), dtype='int32')
# corresponding backward index
gt_proposal_caption_bw = np.zeros(shape=(feature_len, ), dtype='int32')
# ground truth encoded caption in each time step
gt_caption = [[0] for i in range(feature_len)]

paragraph = self._captions[vid]

assert self._options['caption_tiou_threshold'] >= self._options['proposal_tiou_threshold']

# calculate ground truth labels
for stamp_id, stamp in enumerate(timestamps):
                t1, t2 = stamp[0], stamp[1]
                # ensure start <= end
                start, end = min(t1, t2), max(t1, t2)

start_bw = duration - end
end_bw = duration - start


                # skip segments that extend beyond the end of the video stream
                if end > end_time or start > end_time:
                    continue

end_feat_id = max(int(round(end*feature_len/duration)-1), 0)
start_feat_id = max(int(round(start*feature_len/duration) - 1), 0)

mid_feature_id = int(round(((1.-self._options['proposal_tiou_threshold'])*end + self._options['proposal_tiou_threshold']*start) * feature_len / duration)) - 1
mid_feature_id = max(0, mid_feature_id)

for i in range(mid_feature_id, feature_len):
overlap = False
for anchor_id, anchor in enumerate(self._anchors):
end_pred = (float(i+1)/feature_len) * duration
start_pred = end_pred - anchor

intersection = max(0, min(end, end_pred) - max(start, start_pred))
union = min(max(end, end_pred) - min(start, start_pred), end-start + end_pred-start_pred)
iou = float(intersection) / (union + 1e-8)


if iou > self._options['proposal_tiou_threshold']:
overlap = True
                            # the corresponding label of the backward LSTM
i_bw = feature_len - 1 - (start_feat_id+end_feat_id-i)
i_bw = max(min(i_bw, feature_len-1), 0)

                            # count a positive only when the predicted end passes the GT end by at most 5 steps
                            if end_pred >= end and i - end_feat_id <= 5:
gt_proposal_fw[i, anchor_id] = 1
gt_proposal_bw[i_bw, anchor_id] = 1


if iou > self._options['caption_tiou_threshold']:
gt_proposal_caption_fw[i] = 1
gt_proposal_caption_bw[i] = i_bw
gt_caption[i] = paragraph[stamp_id]

elif overlap:
break


batch_proposal_fw.append(gt_proposal_fw)
batch_proposal_bw.append(gt_proposal_bw)
batch_proposal_caption_fw.append(gt_proposal_caption_fw)
batch_proposal_caption_bw.append(gt_proposal_caption_bw)
batch_paragraph.append(gt_caption)

batch_caption, batch_caption_mask = self.process_batch_paragraph(batch_paragraph)

batch_feature_fw = np.asarray(batch_feature_fw, dtype='float32')
batch_feature_bw = np.asarray(batch_feature_bw, dtype='float32')
batch_caption = np.asarray(batch_caption, dtype='int32')
batch_caption_mask = np.asarray(batch_caption_mask, dtype='int32')

batch_proposal_fw = np.asarray(batch_proposal_fw, dtype='int32')
batch_proposal_bw = np.asarray(batch_proposal_bw, dtype='int32')
batch_proposal_caption_fw = np.asarray(batch_proposal_caption_fw, dtype='int32')
batch_proposal_caption_bw = np.asarray(batch_proposal_caption_bw, dtype='int32')


            # serve the batch as a dict of numpy arrays
            batch_data = {
                'video_feat_fw': batch_feature_fw,
                'video_feat_bw': batch_feature_bw,
                'caption': batch_caption,
                'caption_mask': batch_caption_mask,
                'proposal_fw': batch_proposal_fw,
                'proposal_bw': batch_proposal_bw,
                'proposal_caption_fw': batch_proposal_caption_fw,
                'proposal_caption_bw': batch_proposal_caption_bw,
                'proposal_weight': np.array(self._proposal_weight),
            }


yield batch_data

current = current + batch_size

if current + batch_size > self.get_size(split):
current = 0
# at the end of list, shuffle it
if split == 'train':
print('Randomly shuffle training data ...')
random.shuffle(ids)
print('The new shuffled ids are:')
print('%s, %s, %s, ..., %s'%(ids[0], ids[1], ids[2], ids[-1]))
time.sleep(3)
else:
break


