Commit: Add files via upload

JaywongWang committed Oct 6, 2018
1 parent 7ba20de commit 954f3f1
Showing 7 changed files with 1,924 additions and 0 deletions.
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2018 Jingwen Wang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
41 changes: 41 additions & 0 deletions README.md
@@ -0,0 +1,41 @@
# DenseVideoCaptioning

TensorFlow implementation of the paper [Bidirectional Attentive Fusion with Context Gating for Dense Video Captioning](https://arxiv.org/abs/1804.00100) by Jingwen Wang *et al.*, *CVPR* 2018.

### Data Preparation

Please download the annotation data and C3D features from the [ActivityNet Captions](https://cs.stanford.edu/people/ranjaykrishna/densevid/) website.

Please follow the script dataset/ActivityNet_Captions/preprocess/anchors/get_anchors.py to obtain the clustered anchors and their pos/neg weights (used to handle the class imbalance problem); a sketch of the clustering step is given below. The generated files are already provided in dataset/ActivityNet_Captions/preprocess/anchors/.
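
For orientation, here is a minimal sketch of what the clustering step might look like, assuming K-means over the ground-truth segment durations. The file names and the number of clusters are illustrative; get_anchors.py remains the reference.

```python
# Hypothetical sketch of anchor clustering; see get_anchors.py for the real logic.
import json
import numpy as np
from sklearn.cluster import KMeans

NUM_ANCHORS = 120  # illustrative; the actual count is set in get_anchors.py

# ActivityNet Captions annotation format:
# {video_id: {'duration': ..., 'timestamps': [[s, e], ...], 'sentences': [...]}}
data = json.load(open('train.json'))
durations = [abs(t2 - t1) for anno in data.values() for t1, t2 in anno['timestamps']]

kmeans = KMeans(n_clusters=NUM_ANCHORS).fit(np.asarray(durations).reshape(-1, 1))
anchors = sorted(float(c[0]) for c in kmeans.cluster_centers_)

# one anchor length per line, as expected by data_provider.py
with open('anchors.txt', 'w') as f:
    for anchor in anchors:
        f.write('%f\n' % anchor)
```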

Please follow the script dataset/ActivityNet_Captions/preprocess/build_vocab.py to build the word dictionary and to encode the train/val/test sentences.
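
A minimal sketch of the vocabulary/encoding step, assuming a simple frequency cutoff; the cutoff value and special tokens are assumptions, and build_vocab.py remains the reference.

```python
# Hypothetical sketch of vocabulary building; see build_vocab.py for the real logic.
import json
from collections import Counter

MIN_COUNT = 5  # assumed frequency cutoff

data = json.load(open('train.json'))
counter = Counter()
for anno in data.values():
    for sentence in anno['sentences']:
        counter.update(sentence.lower().strip('. ').split())

# special tokens and their ids are assumptions; build_vocab.py may differ
vocab = {'<PAD>': 0, '<BOS>': 1, '<EOS>': 2, '<UNK>': 3}
for word, count in counter.items():
    if count >= MIN_COUNT:
        vocab[word] = len(vocab)

def encode(sentence):
    """Encode one sentence as a list of word ids (cf. encoded_sentences.json)."""
    words = sentence.lower().strip('. ').split()
    return [vocab['<BOS>']] + [vocab.get(w, vocab['<UNK>']) for w in words] + [vocab['<EOS>']]
```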

### Hyper Parameters

The configuration used in my experiments is given in opt.py, including the model setup, training options, and testing options.
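
For reference, these are the option keys that data_provider.py actually reads, plus the two training flags discussed below. The values shown are illustrative; opt.py is authoritative.

```python
# Illustrative subset of the options dict; the real values live in opt.py.
options = {
    'batch_size': 1,                 # data_provider.py asserts batch_size == 1
    'caption_data_root': 'dataset/ActivityNet_Captions/preprocess/',        # assumed path
    'feature_data_path': 'dataset/ActivityNet_Captions/features/c3d.hdf5',  # assumed path
    'localization_data_path': 'dataset/ActivityNet_Captions/captions/',     # assumed path
    'caption_seq_len': 30,           # maximum encoded caption length (assumed value)
    'proposal_tiou_threshold': 0.5,  # tIoU above which an anchor counts as positive
    'caption_tiou_threshold': 0.8,   # tIoU above which a proposal feeds the captioner (assumed value)
    'train_proposal': True,          # see Training below
    'train_caption': False,
    'print_debug': False,
}
```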

### Training

Train the dense-captioning model using the script train.py.

First pre-train the proposal module for around 5 epochs by setting train_proposal=True and train_caption=False. Then train the whole dense-captioning model by setting train_proposal=True and train_caption=True, as sketched below.
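
Schematically, the two-stage schedule amounts to the following (flag names as in opt.py; in practice you edit opt.py and rerun train.py):

```python
# Stage 1: pre-train the proposal module only (~5 epochs).
options['train_proposal'] = True
options['train_caption'] = False

# Stage 2: train the whole dense-captioning model jointly.
options['train_proposal'] = True
options['train_caption'] = True
```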

### Prediction

Follow the script test.py to generate proposal predictions and to evaluate them.
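
The prediction file is assumed to follow the ActivityNet Captions challenge submission format consumed by the densevid_eval scripts; the video id, sentence, and timestamps below are made up.

```python
# Hypothetical shape of a prediction file for the densevid_eval evaluator.
import json

submission = {
    'version': 'VERSION 1.0',
    'results': {
        'v_uqiMw7tQ1Cc': [
            {'sentence': 'A man is playing a guitar.', 'timestamp': [0.83, 19.86]},
        ],
    },
    'external_data': {'used': False, 'details': ''},
}
json.dump(submission, open('predictions.json', 'w'))
```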

### Evaluation

Please note that the official evaluation metric has been [updated](https://github.com/ranjaykrishna/densevid_eval/commit/bbbd49d31a038acf2642f7ae158bb6b9da6937fc) (line 194). The paper reports the old metric; results from different methods remain comparable, since all CVPR 2018 papers report the old metric.

### Results

The predicted results for the val/test sets can be found in results/.

### Dependencies

tensorflow==1.0.1

python==2.7.5

Other versions may also work.
268 changes: 268 additions & 0 deletions data_provider.py
@@ -0,0 +1,268 @@
"""
Data provider for the built models
"""

import os
import json
import random
import time  # needed by the end-of-epoch sleep in iterate_batch

import h5py
import numpy as np

from opt import *

np.set_printoptions(threshold=np.inf)

class DataProvision:
def __init__(self, options):
assert options['batch_size'] == 1
self._options = options
self._splits = {'train':'train', 'val':'val_1'}

self._ids = {} # video ids
captions = {}
self._sizes = {}
print('Loading paragraph data ...')
for split in self._splits:
            tmp_ids = open(os.path.join(self._options['caption_data_root'], split, 'ids.txt'), 'r').readlines()
            tmp_ids = [vid.strip() for vid in tmp_ids]
            self._ids[split] = tmp_ids

self._sizes[split] = len(self._ids[split])

tmp_captions = json.load(open(os.path.join(self._options['caption_data_root'], split, 'encoded_sentences.json'), 'r'))
captions[split] = {tmp_ids[i]:tmp_captions[i] for i in range(len(tmp_ids))}

        # merge the caption dictionaries from all splits
        self._captions = {}
        for split in self._splits:
            self._captions.update(captions[split])


# feature dictionary
print('Loading c3d features ...')
        features = h5py.File(self._options['feature_data_path'], 'r')
        self._feature_ids = list(features.keys())
        self._features = {video_id: np.asarray(list(features[video_id].values())[0])
                          for video_id in self._feature_ids}


# load label weight data
print('Loading label weight data ...')
self._proposal_weight = json.load(open(os.path.join(self._options['caption_data_root'], 'anchors', 'weights.json')))
        if self._options['proposal_tiou_threshold'] != 0.5:
            raise ValueError('Might need to recalculate class weights to handle imbalanced data')

# get anchors
print('Loading anchor data ...')
anchor_path = os.path.join(self._options['caption_data_root'], 'anchors', 'anchors.txt')
anchors = open(anchor_path).readlines()
self._anchors = [float(line.strip()) for line in anchors]
# time stamp data
print('Loading localization data ...')
self._localization = {}
for split in self._splits:
data = json.load(open(os.path.join(self._options['localization_data_path'], '%s.json'%self._splits[split])))
self._localization[split] = data


print('Done loading.')



def get_size(self, split):
return self._sizes[split]

def get_ids(self, split):
return self._ids[split]

def get_anchors(self):
return self._anchors

def get_localization(self):
return self._localization

# process caption batch data into standard format
def process_batch_paragraph(self, batch_paragraph):
paragraph_length = []
caption_length = []
for captions in batch_paragraph:
paragraph_length.append(len(captions))
cap_len = []
for caption in captions:
cap_len.append(len(caption))

caption_length.append(cap_len)

caption_num = len(batch_paragraph[0])
input_idx = np.zeros((len(batch_paragraph), caption_num, self._options['caption_seq_len']), dtype='int32')
input_mask = np.zeros_like(input_idx)

for i, captions in enumerate(batch_paragraph):
for j in range(caption_num):
caption = captions[j]
effective_len = min(caption_length[i][j], self._options['caption_seq_len'])
input_idx[i, j, 0:effective_len] = caption[:effective_len]
input_mask[i, j, 0:effective_len-1] = 1

return input_idx, input_mask

# provide batch data
def iterate_batch(self, split, batch_size):

ids = list(self._ids[split])

if split == 'train':
print('Randomly shuffle training data ...')
random.shuffle(ids)

current = 0

while True:

batch_paragraph = []
batch_feature_fw = []
batch_feature_bw = []
batch_proposal_fw = []
batch_proposal_bw = []

            # train in pairs: one caption serves as the common ground truth for both directions
            batch_proposal_caption_fw = []  # 0/1 flags: whether the LSTM state at this step feeds the captioning module (based on tIoU)
            batch_proposal_caption_bw = []  # index of the corresponding backward feature

i = 0 # batch_size = 1
vid = ids[i+current]
feature_fw = self._features[vid]
feature_len = feature_fw.shape[0]

if 'print_debug' in self._options and self._options['print_debug']:
print('vid: %s'%vid)
print('feature_len: %d'%feature_len)

feature_bw = np.flip(feature_fw, axis=0)

batch_feature_fw.append(feature_fw)
batch_feature_bw.append(feature_bw)


localization = self._localization[split][vid]
timestamps = localization['timestamps']
duration = localization['duration']

# start and end time of the video stream
start_time = 0.
end_time = duration


n_anchors = len(self._anchors)
# ground truth proposal
gt_proposal_fw = np.zeros(shape=(feature_len, n_anchors), dtype='int32')
gt_proposal_bw = np.zeros(shape=(feature_len, n_anchors), dtype='int32')
# ground truth proposal for feeding into captioning module
gt_proposal_caption_fw = np.zeros(shape=(feature_len, ), dtype='int32')
# corresponding backward index
gt_proposal_caption_bw = np.zeros(shape=(feature_len, ), dtype='int32')
# ground truth encoded caption in each time step
gt_caption = [[0] for i in range(feature_len)]

paragraph = self._captions[vid]

assert self._options['caption_tiou_threshold'] >= self._options['proposal_tiou_threshold']

# calculate ground truth labels
for stamp_id, stamp in enumerate(timestamps):
                t1, t2 = stamp[0], stamp[1]
                # ensure start <= end
                start, end = min(t1, t2), max(t1, t2)

start_bw = duration - end
end_bw = duration - start


                # skip segments that extend beyond the end of the video stream
                if end > end_time or start > end_time:
                    continue

end_feat_id = max(int(round(end*feature_len/duration)-1), 0)
start_feat_id = max(int(round(start*feature_len/duration) - 1), 0)

mid_feature_id = int(round(((1.-self._options['proposal_tiou_threshold'])*end + self._options['proposal_tiou_threshold']*start) * feature_len / duration)) - 1
mid_feature_id = max(0, mid_feature_id)

for i in range(mid_feature_id, feature_len):
overlap = False
for anchor_id, anchor in enumerate(self._anchors):
end_pred = (float(i+1)/feature_len) * duration
start_pred = end_pred - anchor

intersection = max(0, min(end, end_pred) - max(start, start_pred))
union = min(max(end, end_pred) - min(start, start_pred), end-start + end_pred-start_pred)
iou = float(intersection) / (union + 1e-8)


if iou > self._options['proposal_tiou_threshold']:
overlap = True
                            # the corresponding label of the backward LSTM
i_bw = feature_len - 1 - (start_feat_id+end_feat_id-i)
i_bw = max(min(i_bw, feature_len-1), 0)

                            # count a positive only when the predicted end passes the GT end by at most 5 steps
                            if end_pred >= end and i - end_feat_id <= 5:
gt_proposal_fw[i, anchor_id] = 1
gt_proposal_bw[i_bw, anchor_id] = 1


if iou > self._options['caption_tiou_threshold']:
gt_proposal_caption_fw[i] = 1
gt_proposal_caption_bw[i] = i_bw
gt_caption[i] = paragraph[stamp_id]

elif overlap:
break


batch_proposal_fw.append(gt_proposal_fw)
batch_proposal_bw.append(gt_proposal_bw)
batch_proposal_caption_fw.append(gt_proposal_caption_fw)
batch_proposal_caption_bw.append(gt_proposal_caption_bw)
batch_paragraph.append(gt_caption)

batch_caption, batch_caption_mask = self.process_batch_paragraph(batch_paragraph)

batch_feature_fw = np.asarray(batch_feature_fw, dtype='float32')
batch_feature_bw = np.asarray(batch_feature_bw, dtype='float32')
batch_caption = np.asarray(batch_caption, dtype='int32')
batch_caption_mask = np.asarray(batch_caption_mask, dtype='int32')

batch_proposal_fw = np.asarray(batch_proposal_fw, dtype='int32')
batch_proposal_bw = np.asarray(batch_proposal_bw, dtype='int32')
batch_proposal_caption_fw = np.asarray(batch_proposal_caption_fw, dtype='int32')
batch_proposal_caption_bw = np.asarray(batch_proposal_caption_bw, dtype='int32')


            # serve the batch as a dict of numpy arrays
            batch_data = {
                'video_feat_fw': batch_feature_fw,
                'video_feat_bw': batch_feature_bw,
                'caption': batch_caption,
                'caption_mask': batch_caption_mask,
                'proposal_fw': batch_proposal_fw,
                'proposal_bw': batch_proposal_bw,
                'proposal_caption_fw': batch_proposal_caption_fw,
                'proposal_caption_bw': batch_proposal_caption_bw,
                'proposal_weight': np.array(self._proposal_weight),
            }


yield batch_data

current = current + batch_size

if current + batch_size > self.get_size(split):
current = 0
# at the end of list, shuffle it
if split == 'train':
print('Randomly shuffle training data ...')
random.shuffle(ids)
print('The new shuffled ids are:')
print('%s, %s, %s, ..., %s'%(ids[0], ids[1], ids[2], ids[-1]))
time.sleep(3)
else:
break


