# Dataset Creation 

This notebook works with the following files:
- `resultsAnnotation.tsv`, 
- `datasetAnnotation.tsv`, 
- `taskAnnotation.tsv`, 
- `paper_links.tsv`, 
- `TDM_taxonomy.tsv`, 
- `TDMs_taxonomy.tsv` 
- `paper_name_taxonomy.tsv` 

These files were created mostly from the `evaluation-tables.json` file published by [paperswithcode](https://paperswithcode.com/about).

In [31]:
# imports
import os
import re

import ipdb
from sklearn.model_selection import train_test_split

In [1]:
def _read_lines(tsv_path):
    """Return the lines of ``tsv_path`` without their line terminators.

    The file is opened with ``errors='replace'`` so undecodable bytes do not
    abort the load.
    """
    with open(tsv_path, errors='replace') as handle:
        return handle.read().splitlines()


# Raw annotation / taxonomy tables, kept as one string per line.
resultsAnnotation = _read_lines("../data/resultsAnnotation.tsv")
datasetAnnotation = _read_lines("../data/datasetAnnotation.tsv")
taskAnnotation = _read_lines("../data/taskAnnotation.tsv")
TDM_taxonomy = _read_lines("../data/TDM_taxonomy.tsv")
paper_name_taxonomy = _read_lines("../data/paper_name_taxonomy.tsv")

In [2]:
# Sample record: "<pdf name>\t<Task>#<Dataset>#<Metric>#<score>", with several results joined by "$".
resultsAnnotation[5]

'1510.05067v4.pdf\tHandwritten Digit Recognition#MNIST#PERCENTAGE ERROR#0.91$Image Classification#STL-10#Percentage correct#57.32$Image Classification#CIFAR-100#Percentage correct#48.75$Image Classification#SVHN#Percentage error#10.16$Image Classification#CIFAR-10#Percentage correct#80.98'

In [3]:
# Sample record: "<pdf name>\t<Dataset>#<Dataset>#..." (dataset names joined by "#").
datasetAnnotation[5]

'1510.05067v4.pdf\tMNIST#STL-10#CIFAR-100#SVHN#CIFAR-10'

In [4]:
# Sample record: "<pdf name>\t<Task>".
taskAnnotation[5]

'1510.05067v4.pdf\tHandwritten Digit Recognition'

In [5]:
# Sample record: "<Task>#<Dataset>#<Metric>\t<count>"; create_training_data compares this count to leaderboard_threshold.
TDM_taxonomy[9]

'Deblurring#HIDE (trained on GOPRO)#PSNR (sRGB)\t8'

In [6]:
# Sample record: "<pdf name>\t<integer>" (presumably a per-paper count; TODO confirm what it counts).
paper_name_taxonomy[5]

'1510.05067v4.pdf\t5'

In [301]:
def _paper_id(file_name):
    """Strip any directory part and the last extension: ``'a/1510.05067v4.pdf' -> '1510.05067v4'``."""
    return '.'.join(file_name.split('/')[-1].split('.')[:-1])


def create_training_data(path_to_resultsAnnotation, path_to_TDM_taxonomy, path_parsed_files,
                         output_dir, test_set_portion=0.2,
                         leaderboard_threshold=5, num_negative_instances=5, allowed_unknown=10,
                         random_state=None):
    """Write ``<output_dir>/train.tsv`` with positive and negative (paper, TDM) rows.

    Each output row is ``<label>\\t<paper id>\\t<Task#Dataset#Metric>\\t<paper text>``
    (no header row), where ``<label>`` is ``true`` or ``false``.

    For every parsed paper in the train split that also appears in
    ``resultsAnnotation.tsv``:

    * one ``true`` row per distinct annotated TDM that is kept in the taxonomy;
    * if none of its TDMs is kept, one ``true`` row with the label ``unknown``,
      for at most ``allowed_unknown`` papers in total;
    * ``num_negative_instances`` ``false`` rows, taken in sorted order from the
      kept taxonomy labels, skipping the labels written as ``true`` for that
      paper so no (paper, TDM) pair gets both labels.

    Parameters
    ----------
    path_to_resultsAnnotation : str
        Directory containing ``resultsAnnotation.tsv``
        (``<pdf name>\\t<Task>#<Dataset>#<Metric>#<score>$...`` per line).
    path_to_TDM_taxonomy : str
        Directory containing ``TDM_taxonomy.tsv``
        (``<Task>#<Dataset>#<Metric>\\t<count>`` per line).
    path_parsed_files : str
        Directory with one parsed text file per paper. A trailing slash is
        optional.
    output_dir : str
        Directory that receives ``train.tsv``. An existing file is overwritten.
    test_set_portion : float
        Currently unused. The held-out fraction is fixed at 10% and the
        held-out split is not written anywhere.
        NOTE(review): wire this in once the intended split size is confirmed.
    leaderboard_threshold : int
        Minimum taxonomy count for a TDM label to be kept.
    num_negative_instances : int
        Number of ``false`` rows written per paper.
    allowed_unknown : int
        Maximum number of ``unknown`` rows written over the whole run.
    random_state : int or None
        Forwarded to ``train_test_split``. ``None`` keeps the previous
        non-reproducible shuffling.

    Raises
    ------
    ValueError
        If a count in ``TDM_taxonomy.tsv`` is not an integer.
    """
    # Paper id -> raw "$"-separated annotation string.
    with open(f"{path_to_resultsAnnotation}/resultsAnnotation.tsv", errors='replace') as f:
        resultsAnnotation = f.read().splitlines()

    paper_TDM = {}
    for line in resultsAnnotation:
        fields = line.split("\t")
        if len(fields) != 2:
            continue
        title, TDMSList = fields
        paper_TDM[_paper_id(title)] = TDMSList

    # TDM label -> count, restricted to labels at or above the threshold.
    with open(f"{path_to_TDM_taxonomy}/TDM_taxonomy.tsv", errors='replace') as f:
        TDM_taxonomy = f.read().splitlines()

    TDM_taxonomy_dict = {}
    for line in TDM_taxonomy:
        fields = line.split("\t")
        if len(fields) != 2:
            # Malformed line: skip it instead of dropping into a debugger.
            continue
        TDM, count = fields
        count = int(count)
        if count >= leaderboard_threshold:
            TDM_taxonomy_dict[TDM] = count

    # Loop-invariant, so it is built once rather than once per paper.
    negative_pool = sorted(TDM_taxonomy_dict)

    # Sorted so that a fixed random_state gives the same split on every machine.
    list_parsed_pdf = sorted(name for name in os.listdir(path_parsed_files)
                             if name != '.ipynb_checkpoints')

    # ToDo: would a stratified split on the label be interesting?
    train, _valid = train_test_split(list_parsed_pdf, test_size=10/100, shuffle=True,
                                     random_state=random_state)

    unknown_count = 0
    # "w" replaces the previous remove-then-append sequence; the file is opened once.
    with open(os.path.join(output_dir, "train.tsv"), "w", encoding="utf-8") as text_file:
        for paper in train:
            paper_id = _paper_id(paper)
            if paper_id not in paper_TDM:
                print(f"Paper {paper_id} not in the resultsAnnotation.tsv file")
                continue

            with open(os.path.join(path_parsed_files, paper), errors='replace') as f:
                content = ' '.join(f.read().splitlines())
            # Collapse everything outside the whitelist (including tabs, which
            # would otherwise break the TSV columns) into single spaces.
            content = re.sub(r"[^a-zA-Z0-9?,'’‘´`%]+", ' ', content).strip()

            positives = set()
            for contrib in paper_TDM[paper_id].split("$"):
                parts = contrib.split("#")
                if len(parts) != 4:
                    continue
                tdm = "#".join(parts[:3])  # drop the score
                if tdm in positives or tdm not in TDM_taxonomy_dict:
                    continue
                positives.add(tdm)
                text_file.write(f"true\t{paper_id}\t{tdm}\t{content}\n")

            if not positives and unknown_count < allowed_unknown:
                unknown_count += 1
                text_file.write(f"true\t{paper_id}\tunknown\t{content}\n")

            # The first N non-positive labels always lie within the first
            # N + len(positives) entries of the pool.
            candidates = negative_pool[:num_negative_instances + len(positives)]
            negatives = [tdm for tdm in candidates if tdm not in positives]
            for tdm in negatives[:num_negative_instances]:
                text_file.write(f"false\t{paper_id}\t{tdm}\t{content}\n")
                        
        

In [302]:
# Absolute, user-specific directories holding parsed paper texts; adjust them when running elsewhere.
# Each ends with "/" because create_training_data appends the file name directly to the directory string.
# Only path_grobid_full_txt is passed to create_training_data in the cells shown here.
path_grobid_full_txt = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/paperwithcode/pdf_txt/"
path_latex_source_tex = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/exp/arxiv_src/"
path_latex_source_pandoc_txt = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/exp/arxiv_src_txt/"

In [303]:
# Build ../data/train.tsv from the parsed texts under path_grobid_full_txt.
create_training_data(
    path_to_resultsAnnotation="../data/",
    path_to_TDM_taxonomy="../data/",
    path_parsed_files=path_grobid_full_txt,
    output_dir="../data/",
    leaderboard_threshold=5,
    num_negative_instances=10,
    allowed_unknown=10,
)

> [0;32m<ipython-input-301-3e5d9fbdc62d>[0m(55)[0;36mcreate_training_data[0;34m()[0m
[0;32m     54 [0;31m[0;34m[0m[0m
[0m[0;32m---> 55 [0;31m        [0mpaper_id[0m [0;34m=[0m [0;34m'.'[0m[0;34m.[0m[0mjoin[0m[0;34m([0m[0mpaper[0m[0;34m.[0m[0msplit[0m[0;34m([0m[0;34m'/'[0m[0;34m)[0m[0;34m[[0m[0;34m-[0m[0;36m1[0m[0;34m][0m[0;34m.[0m[0msplit[0m[0;34m([0m[0;34m'.'[0m[0;34m)[0m[0;34m[[0m[0;34m:[0m[0;34m-[0m[0;36m1[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     56 [0;31m[0;34m[0m[0m
[0m


ipdb>  l


[1;32m     50 [0m        [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m     51 [0m[0;31m#         content = re.sub(r"[^a-zA-Z0-9?,'’‘´`%]+", '', content).strip()[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m     52 [0m[0;31m#         content = re.sub(r"[\n]+", '', content).strip()[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m     53 [0m[0;31m#         content = ' '.join("None")[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m     54 [0m[0;34m[0m[0m
[0;32m---> 55 [0;31m        [0mpaper_id[0m [0;34m=[0m [0;34m'.'[0m[0;34m.[0m[0mjoin[0m[0;34m([0m[0mpaper[0m[0;34m.[0m[0msplit[0m[0;34m([0m[0;34m'/'[0m[0;34m)[0m[0;34m[[0m[0;34m-[0m[0;36m1[0m[0;34m][0m[0;34m.[0m[0msplit[0m[0;34m([0m[0;34m'.'[0m[0;34m)[0m[0;34m[[0m[0;34m:[0m[0;34m-[0m[0;36m1[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m     56 [0m[0;34m[0m[0m
[1;32m     57 [0m        [0mnot_seen[

ipdb>  content


'Title\tEvaluation of Deep Convolutional Nets for Document Image Classification and Retrieval Abstract:\tThis paper presents a new state-of-the-art for document image classification and retrieval, using features learned by deep convolutional neural networks (CNNs). In object and scene analysis, deep neural nets are capable of learning a hierarchical chain of abstraction from pixel inputs to concise and descriptive representations. The current work explores this capacity in the realm of document analysis, and confirms that this representation strategy is superior to a variety of popular hand-crafted alternatives. Experiments also show that (i) features extracted from CNNs are robust to compression, (ii) CNNs trained on non-document images transfer well to document analysis tasks, and (iii) enforcing region-specific feature-learning is unnecessary given sufficient training data. This work also makes available a new labelled subset of the IIT-CDIP collection, containing 400,000 document i

ipdb>  content = re.sub(r"[^a-zA-Z0-9?,'’‘´`%]+", ' ', content).strip()
ipdb>  content


"Title Evaluation of Deep Convolutional Nets for Document Image Classification and Retrieval Abstract This paper presents a new state of the art for document image classification and retrieval, using features learned by deep convolutional neural networks CNNs In object and scene analysis, deep neural nets are capable of learning a hierarchical chain of abstraction from pixel inputs to concise and descriptive representations The current work explores this capacity in the realm of document analysis, and confirms that this representation strategy is superior to a variety of popular hand crafted alternatives Experiments also show that i features extracted from CNNs are robust to compression, ii CNNs trained on non document images transfer well to document analysis tasks, and iii enforcing region specific feature learning is unnecessary given sufficient training data This work also makes available a new labelled subset of the IIT CDIP collection, containing 400,000 document images across 16

ipdb>  q


BdbQuit: 

## View created data

In [288]:
import pandas as pd

In [296]:
# train.tsv is written without a header row, so the column names are supplied here.
train_columns = ["label", "title", "TDM", "Context"]
train = pd.read_csv("../data/train.tsv", sep="\t", names=train_columns)

In [298]:
# Preview the first rows: one row per (paper, TDM) pair with its True/False label.
train.head()

Unnamed: 0,label,title,TDM,Context
0,True,1903.10176v3,Image Super-Resolution#Set5 - 8x upscaling#PSNR,TitleDeepREDDeepImagePriorPoweredbyREDAbstract...
1,True,1903.10176v3,Image Super-Resolution#Set14 - 8x upscaling#PSNR,TitleDeepREDDeepImagePriorPoweredbyREDAbstract...
2,True,1903.10176v3,Image Super-Resolution#Set5 - 4x upscaling#PSNR,TitleDeepREDDeepImagePriorPoweredbyREDAbstract...
3,True,1903.10176v3,Image Super-Resolution#Set14 - 4x upscaling#PSNR,TitleDeepREDDeepImagePriorPoweredbyREDAbstract...
4,False,1903.10176v3,3D Absolute Human Pose Estimation#Human3.6M#MRPE,TitleDeepREDDeepImagePriorPoweredbyREDAbstract...


In [291]:
# train.tail()

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,label,title,TDM,Context
False,1903.11816v1,3D Face Reconstruction#AFLW2000-3D#Mean NME,Title,FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation Abstract:,"Modern approaches for semantic segmentation usually employ dilated convolutions in the backbone to extract highresolution feature maps, which brings heavy computation complexity and memory footprint. To replace the time and memory consuming dilated convolutions, we propose a novel joint upsampling module named Joint Pyramid Upsampling (JPU) by formulating the task of extracting highresolution feature maps into a joint upsampling problem. With the proposed JPU, our method reduces the computation complexity by more than three times without performance loss. Experiments show that JPU is superior to other upsampling modules, which can be plugged into many existing approaches to reduce computation complexity and improve performance. By replacing dilated convolutions with the proposed JPU module, our method achieves the state-of-the-art performance in Pascal Context dataset (mIoU of 53.13%) and ADE20K dataset (final score of 0.5584) while running 3 times faster. Code is available in https://github.com/wuhuikai/FastFCN . Introduction","Semantic segmentation [23, 40, 4] is one of the fundamental tasks in computer vision, with the goal of assigning a semantic label to each pixel of an image. Modern approaches usually employ a Fully Convolution Network (FCN) [22] to address this task, achieving tremendous success among several segmentation benchmarks. The original FCN is proposed by Long et al. [22] , which is transformed from a Convolutional Neural Network (CNN) [16, 15] designed for image classification. Inheriting from the design for image classification, the original FCN downsamples the input image progressively by stride convolutions and/or spatial pooling layers, resulting in a final feature map in low resolution. 
Although the final feature map encodes rich semantic information, the fine image structure information is lost, leading to inaccurate predictions around the object boundaries. As shown in Figure 1a , the original FCN typically downsamples the input image 5 times, reducing the spatial resolution of the final feature map by a factor of 32. To obtain a high-resolution final feature map, [3, 28, 18, 30, 27] employ the original FCN as the encoder to capture high-level semantic information, and a decoder is designed to gradually recover the spatial information by combining multi-level feature maps from the encoder. As shown in Figure 1b , we term such methods EncoderDecoder, of which the final prediction generated by the decoder is in high resolution. Alternatively, DeepLab [5] removes the last two downsampling operations from the original FCN and introduces dilated (atrous) convolutions to maintain the receptive field of view unchanged. 1 Following DeepLab, [38, 6, 36] employ a multi-scale context module on top of the final feature map, outperforming most EncoderDecoder methods significantly on several segmentation benchmarks. As shown in Figure 1c , the spatial resolution of the last feature map in DilatedFCN is 4 times larger than that in the original FCN, thus maintaining more structure and location information. The dilated convolutions play an important role in maintaining the spatial resolution of the final feature map, leading to superior performance compared to most methods in EncoderDecoder. However, the introduced dilated convolutions bring heavy computation complexity and memory footprint, which limit the usage in many real-time applications. Taking ResNet-101 [13] as an example, compared to the original FCN, 23 residual blocks (69 convolution layers) in DilatedFCN require to take 4 times more computation resources and memory usages, and 3 residual blocks (9 convolution layers) need to take 16 times more resources. 
We aim at tackling the aforementioned issue caused by dilated convolutions in this paper. To achieve this, we propose a novel joint upsampling module to replace the time and memory consuming dilated convolutions, namely Joint Pyramid Upsampling (JPU). As a result, our method employs the original FCN as the backbone while applying JPU to upsample the low-resolution final feature map with output stride (OS) 32, resulting in a high-resolution feature map (OS=8). Accordingly, the computation time and memory footprint of the whole segmentation framework is dramatically reduced. Meanwhile, there's no performance loss when replacing the dilated convolutions with the proposed JPU. We attribute this to the ability of JPU to exploit multiscale context across multi-level feature maps. To validate the effectiveness of our method, we first conduct a systematical experiment, showing that the proposed JPU can replace dilated convolutions in several popular approaches without performance loss. We then test the proposed method on several segmentation benchmarks. Results show that our method achieves the state-of-the-art performance while running more than 3 times faster. Concretely, we outperform all the baselines on Pascal Context dataset [23] by a large margin, which achieves the state-ofthe-art performance with mIoU of 53.13%. On ADE20K dataset [40] , we obtain the mIoU of 42.75% with ResNet-50 as the backbone, which sets a new record on the val set. Moreover, our method with ResNet-101 achieves the stateof-the-art performance in the test set of ADE20K dataset. In summary, our contributions are three folds, which are: (1) We propose a computationally efficient joint upsampling module named JPU to replace the time and memory consuming dilated convolutions in the backbone. (2) Based on the proposed JPU, the computation time and memory footprint of the whole segmentation framework can be reduced by a factor of more than 3 and meanwhile achieves better performance. 
(3) Our method achieves the new state-ofthe-art performance in both Pascal Context dataset (mIoU of 53.13%) and ADE20K dataset (mIoU of 42.75% with ResNet-50 as the backbone on the val set and final score of 0.5584 with ResNet-101 on the test set). Related Work","In this section, we first give an overview on methods for semantic segmentation, which can be categorized into two directions. We then introduce some related works on upsampling. Semantic Segmentation","FCNs [22] have achieved huge success in semantic segmentation. Following FCN, there're two prominent directions, namely DilatedFCN and EncoderDecoder. Dilated-FCNs [11, 34, 7, 6, 38, 36, 5] utilize dilated convolutions to keep the receptive field of view and employ a multi-scale context module to process high-level feature maps. Alternatively, EncoderDecoders [24, 28, 18, 1, 26, 12, 33, 37] propose to utilize an encoder to extract multi-level feature maps, which are then combined into the final prediction by a decoder. DilatedFCN In order to capture multi-scale context information on the high-resolution final feature map, PSP-Net [38] performs pooling operations at multiple grid scales while DeepLabV3 [6] employs parallel atrous convolutions with different rates named ASPP. Alternatively, EncNet [36] utilizes the Context Encoding Module to capture global contextual information. Differently, our method proposes a joint upsampling module named JPU to replace the dilated convolutions in the backbone of DilatedFCNs, which can JPU Encoding/PSP/ASPP Head C o n v 5 , 3 2 x C o n v 4 , 1 6 x C o n v 3 , 8 x C o n v 2 , 4 x C o n v 1 , 2 x 8 x 8 x Figure 2 : Framework Overview of Our Method. Our method employs the same backbone as the original FCN. After the backbone, a novel upsampling module named Joint Pyramid Upsampling (JPU) is proposed, which takes the last three feature maps as the inputs and generates a high-resolution feature map. 
A multi-scale/global context module is then employed to produce the final label map. Best viewed in color. reduce computation complexity dramatically without performance loss. EncoderDecoder To gradually recover the spatial information, [28] introduces skip connections to construct U-Net, which combines the encoder features and the corresponding decoder activations. [18] proposes a multipath refinement network, which explicitly exploits all the information available along the down-sampling process. DeepLabV3+ [8] combines the advantages of DilatedFCN and EncoderDecoder, which employs DeepLabV3 as the encoder. Our method is complementary to DeepLabV3+, which can reduce the computation overload of DeepLabV3 without performance loss. Upsampling","In our method, we propose a module to upsample a lowresolution feature map given high-resolution feature maps as guidance, which is closely related to joint upsampling as well as data-dependent upsampling. Joint Upsampling In the literature of image processing, joint upsampling aims at leveraging the guidance image as a prior and transferring the structural details from the guidance image to the target image. [17] constructs a joint filter based on CNNs, which learns to recover the structure details in the guidance image. [31] proposes an end-to-end trainable guided filtering module, which upsamples a lowresolution image conditionally. Our method is related to the aforementioned approaches. However, the proposed JPU is designed for processing feature maps with a large number of channels while [17, 31] are specially designed for pro-cessing 3-channel images, which fail to capture the complex relations in high dimensional feature maps. Besides, the motivation and target of our method is completely different. 
Data-Dependent Upsampling DUpsampling [29] is also related to our method, which takes advantages of the redundancy in the segmentation label space and is able to recover the pixel-wise prediction from low-resolution outputs of CNNs. Compared to our method, DUpsampling has a strong dependency on the label space, which generalizes poorly to a larger or more complex label space. Method","In this section, we first introduce the most popular methods for semantic segmentation, named DilatedFCNs. We then reform the architecture of DilatedFCNs with a novel joint upsampling module, Joint Pyramid Upsampling (JPU). Finally, we discuss the proposed JPU in details, before which joint upsampling, dilated convolution and stride convolution are briefly introduced. DilatedFCN","To exploit Deep CNNs in semantic segmentation, Long et al. [22] transform the CNN designed for image classification into FCN. Taking ResNet-101 as an example, the original CNN contains 5 convolution stages, a global average pooling layer and a linear layer. To construct an FCN, the global average pooling layer and the linear layer are replaced by a convolution layer, which is used to generate the final label map, as shown in Figure 1a . Between each two consecutive convolution stages, stride convolutions and/or spatial pooling layers are employed, resulting in 5 feature maps with gradually reduced spatial resolutions. The spatial resolution of the last feature map in FCN is reduced by a factor of 32, leading to inaccurate predictions about the locations and details. To obtain a final feature map with high resolution, DeepLab [5] removes the downsampling operations before the last two feature maps, as shown in Figure 1c . Besides, the convolution layers inside the last two convolution stages are replaced by dilated convolutions to maintain the receptive field of view, thus named Dilated-FCN. 
As a result, the resolution of the last feature map is reduced by a factor of 8, which reserves more location and detail information. Following DeepLab, [38, 6] propose a multi-scale context module to capture context information from the last feature map, achieving tremendous success in several segmentation benchmarks. The Framework of Our Method","To obtain a high-resolution final feature map, methods in DilatedFCN remove the last two downsampling operations from the original FCN, which bring in heavy computation complexity and memory footprint due to the enlarged feature maps. In this paper, we aim at seeking an alternative way to approximate the final feature map of DilatedFCN without computation and memory overload. Meanwhile, we expect the performance of our method to be as good as that of the original DilatedFCNs. To achieve this, we first put back all the stride convolutions removed by DilatedFCN, while replacing all the dilated convolutions with regular convolution layers. As shown in Figure 2 , the backbone of our method is the same as that of the original FCN, where the spatial resolutions of the five feature maps (Conv1−Conv5) are gradually reduced by a factor of 2. To obtain a feature map similar to the final feature map of DilatedFCN, we propose a novel module named Joint Pyramid Upsampling (JPU), which takes the last three feature maps (Conv3−Conv5) as inputs. Then a multi-scale context module (PSP [38] /ASPP [6] ) or a global context module (Encoding [36] ) is employed to produce the final predictions. Compared to DilatedFCN, our method takes 4 times fewer computation and memory resources in 23 residual blocks (69 layers) and 16 times fewer in 3 blocks (9 layers) when the backbone is ResNet-101. Thus, our method runs much faster than DilatedFCN while consuming less memory. Joint Pyramid Upsampling","The proposed JPU is designed for generating a feature map that approximates the activations of the final feature map from the backbone of DilatedFCN. 
Such a problem can be reformulated into joint upsampling, which is then resolved by a CNN designed for this task. Background","Joint Upsampling Given a low-resolution target image and a high-resolution guidance image, joint upsampling aims at generating a high-resolution target image by transferring details and structures from the guidance image. Generally, the low-resolution target image y l is generated by employing a transformation f (•) on the low-resolution guidance image x l , i.e. y l = f (x l ). Given x l and y l , we are required to obtain a transformationf ( y h =f (x h ), wheref (•) = argmin h(•)∈H ||y l − h(x l )||, (1) where H is a set of all possible transformation functions, and || • || is a pre-defined distance metric. Dilated Convolution Dilated convolution is introduced in DeepLab [5] for obtaining high-resolution feature maps while maintaining the receptive field of view. Figure 3a gives an illustration of the dilated convolution in 1D (dilation rate = 2), which can be divided into the following three steps: (1) split the input feature f in into two groups f 0 in and f 1 in according to the parity of the index, (2) process each feature with the same convolution layer, resulting in f 0 out and f 1 out , and (3) merge the two generated features interlaced to obtain the output feature f out . Stride Convolution Stride convolution is proposed to transform the input feature into an output feature with reduced spatial resolution, which is equivalent to the following two steps as shown in Figure 3b: (1) process the input feature f in with a regular convolution to obtain the intermediate feature f m , and (2) remove the elements with an odd index, resulting in f out . Reformulating into Joint Upsampling","The differences between the backbone of our method and DilatedFCN lie on the last two convolution stages. 
Taking the 4th convolution stage (Conv4) as an example, in Dilat-edFCN, the input feature map is first processed by a regular convolution layer, followed by a series of dilated convolutions (d=2). Differently, our method first processes the input feature map with a stride convolution (s=2), and then employs several regular convolutions to generate the output. Formally, given the input feature map x, the output feature map y d in DilatedFCN is obtained as follows: Fig 3a) , = ! ""# ! $%& ! ""# ! $%& ! ""# ' ! ""# ( ! $%& ' ! $%& ( Split Merge Conv DilatedConv, d=2 y d = x → C r → C d → ...... → C d n = x → C r → SC r M → ...... → SC r M n (Fig 3a) = x → C r → S → C r → ...... → C r n → M = y m → S → C n r → M = {y 0 m , y 1 m } → C n r → M ( while in our method, the output feature map y s is generated as follows: (Fig 3b) . y s = x → C s → C r → ...... → C r n = x → C r → R → C r → ...... → C r n (Fig 3b) = y m → R → C n r = y 0 m → C n r (3) C r , C d , and C s represent a regular/dilated/stride convolution respectively, and C n r is n layers of regular convolutions. S, M and R are split, merge, and reduce operations in Figure 3 , where adjacent S and M operations can be canceled out. Notably, the convolutions in Equations 2 and 3 are in 1D, which is for simplicity. Similar results can be obtained for 2D convolutions. The aforementioned equations show that y s and y d can be obtained with the same function C n r with different inputs: y 0 m and y m , where the former is downsampled from the latter. Thus, given x and y s , the feature map y that ap-proximates y d can be obtained as follows: EQUATION which is the same as the joint upsampling problem defined in Equation 1. Similar conclusions can be easily obtained for the 5th convolution stage (Conv5). Solving with CNNs","Equation 4 is an optimization problem, which takes lots of time to converge through the iterative gradient descent. 
Alternatively, we propose to approximate the optimization process with a CNN module. To achieve this, we first require to generate y m given x, as shown in Equation 4. Then, features from y 0 m and y s need to be gathered for learning the mappingĥ. Finally, a convolution block is required to transform the gathered features into the final prediction y. Following the aforementioned analysis, we design the JPU module as in Figure 4 . Concretely, each input feature map is firstly processed by a regular convolution block (Fig. 4a) , which is designed for (1) generating y m given x, and (2) transforming f m into an embedding space with reduced dimensions. As a result, all the input features are mapped into the same space, which enables a better fusion and reduces the computation complexity. Then, the generated feature maps are upsampled and concatenated, resulting in y c (Fig. 4b) . Four separable convolutions [14, 9] the convolution with dilation rate 1 is employed to capture the relation between y 0 m and the rest part of y m , as shown by the blue box in Figure 5 . Alternatively, the convolutions with dilation rate 2, 4 and 8 are designed for learning the mappingĥ to transform y 0 m into y s , as shown by the green boxes in Figure 5 . Thus, JPU can extract multi-scale context information from multi-level feature maps, which leads to a better performance. This is significantly different from ASPP [6] , which only exploit the information in the last feature map. The extracted features encode the mapping between y 0 m and y s as well as the relation between y 0 m and the rest part of y m . Thus, another regular convolution block is employed, which transforms the features into the final predictions ( Fig. 4c) . Notably, the proposed JPU module solves two closely related joint upsampling problems jointly, which are (1) upsampling Conv4 based on Conv3 (the 4th convolution stage), and (2) upscaling Conv5 with the guidance of the enlarged Conv4 (the 5th convolution stage). 
Experiment","In this section, we first introduce the datase...",Dataset Pascal Context dataset [23] is based o...,To show the effectiveness of the proposed meth...,"Pascal Context In Table 1 , our method employs..."
False,1903.11816v1,3D Face Reconstruction#AFLW2000-3D#NME,Title,FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation Abstract:,"Modern approaches for semantic segmentation usually employ dilated convolutions in the backbone to extract highresolution feature maps, which brings heavy computation complexity and memory footprint. To replace the time and memory consuming dilated convolutions, we propose a novel joint upsampling module named Joint Pyramid Upsampling (JPU) by formulating the task of extracting highresolution feature maps into a joint upsampling problem. With the proposed JPU, our method reduces the computation complexity by more than three times without performance loss. Experiments show that JPU is superior to other upsampling modules, which can be plugged into many existing approaches to reduce computation complexity and improve performance. By replacing dilated convolutions with the proposed JPU module, our method achieves the state-of-the-art performance in Pascal Context dataset (mIoU of 53.13%) and ADE20K dataset (final score of 0.5584) while running 3 times faster. Code is available in https://github.com/wuhuikai/FastFCN . Introduction","Semantic segmentation [23, 40, 4] is one of the fundamental tasks in computer vision, with the goal of assigning a semantic label to each pixel of an image. Modern approaches usually employ a Fully Convolution Network (FCN) [22] to address this task, achieving tremendous success among several segmentation benchmarks. The original FCN is proposed by Long et al. [22] , which is transformed from a Convolutional Neural Network (CNN) [16, 15] designed for image classification. Inheriting from the design for image classification, the original FCN downsamples the input image progressively by stride convolutions and/or spatial pooling layers, resulting in a final feature map in low resolution. 
Although the final feature map encodes rich semantic information, the fine image structure information is lost, leading to inaccurate predictions around the object boundaries. As shown in Figure 1a , the original FCN typically downsamples the input image 5 times, reducing the spatial resolution of the final feature map by a factor of 32. To obtain a high-resolution final feature map, [3, 28, 18, 30, 27] employ the original FCN as the encoder to capture high-level semantic information, and a decoder is designed to gradually recover the spatial information by combining multi-level feature maps from the encoder. As shown in Figure 1b , we term such methods EncoderDecoder, of which the final prediction generated by the decoder is in high resolution. Alternatively, DeepLab [5] removes the last two downsampling operations from the original FCN and introduces dilated (atrous) convolutions to maintain the receptive field of view unchanged. 1 Following DeepLab, [38, 6, 36] employ a multi-scale context module on top of the final feature map, outperforming most EncoderDecoder methods significantly on several segmentation benchmarks. As shown in Figure 1c , the spatial resolution of the last feature map in DilatedFCN is 4 times larger than that in the original FCN, thus maintaining more structure and location information. The dilated convolutions play an important role in maintaining the spatial resolution of the final feature map, leading to superior performance compared to most methods in EncoderDecoder. However, the introduced dilated convolutions bring heavy computation complexity and memory footprint, which limit the usage in many real-time applications. Taking ResNet-101 [13] as an example, compared to the original FCN, 23 residual blocks (69 convolution layers) in DilatedFCN require to take 4 times more computation resources and memory usages, and 3 residual blocks (9 convolution layers) need to take 16 times more resources. 
We aim at tackling the aforementioned issue caused by dilated convolutions in this paper. To achieve this, we propose a novel joint upsampling module to replace the time and memory consuming dilated convolutions, namely Joint Pyramid Upsampling (JPU). As a result, our method employs the original FCN as the backbone while applying JPU to upsample the low-resolution final feature map with output stride (OS) 32, resulting in a high-resolution feature map (OS=8). Accordingly, the computation time and memory footprint of the whole segmentation framework is dramatically reduced. Meanwhile, there's no performance loss when replacing the dilated convolutions with the proposed JPU. We attribute this to the ability of JPU to exploit multiscale context across multi-level feature maps. To validate the effectiveness of our method, we first conduct a systematical experiment, showing that the proposed JPU can replace dilated convolutions in several popular approaches without performance loss. We then test the proposed method on several segmentation benchmarks. Results show that our method achieves the state-of-the-art performance while running more than 3 times faster. Concretely, we outperform all the baselines on Pascal Context dataset [23] by a large margin, which achieves the state-ofthe-art performance with mIoU of 53.13%. On ADE20K dataset [40] , we obtain the mIoU of 42.75% with ResNet-50 as the backbone, which sets a new record on the val set. Moreover, our method with ResNet-101 achieves the stateof-the-art performance in the test set of ADE20K dataset. In summary, our contributions are three folds, which are: (1) We propose a computationally efficient joint upsampling module named JPU to replace the time and memory consuming dilated convolutions in the backbone. (2) Based on the proposed JPU, the computation time and memory footprint of the whole segmentation framework can be reduced by a factor of more than 3 and meanwhile achieves better performance. 
(3) Our method achieves the new state-ofthe-art performance in both Pascal Context dataset (mIoU of 53.13%) and ADE20K dataset (mIoU of 42.75% with ResNet-50 as the backbone on the val set and final score of 0.5584 with ResNet-101 on the test set). Related Work","In this section, we first give an overview on methods for semantic segmentation, which can be categorized into two directions. We then introduce some related works on upsampling. Semantic Segmentation","FCNs [22] have achieved huge success in semantic segmentation. Following FCN, there're two prominent directions, namely DilatedFCN and EncoderDecoder. Dilated-FCNs [11, 34, 7, 6, 38, 36, 5] utilize dilated convolutions to keep the receptive field of view and employ a multi-scale context module to process high-level feature maps. Alternatively, EncoderDecoders [24, 28, 18, 1, 26, 12, 33, 37] propose to utilize an encoder to extract multi-level feature maps, which are then combined into the final prediction by a decoder. DilatedFCN In order to capture multi-scale context information on the high-resolution final feature map, PSP-Net [38] performs pooling operations at multiple grid scales while DeepLabV3 [6] employs parallel atrous convolutions with different rates named ASPP. Alternatively, EncNet [36] utilizes the Context Encoding Module to capture global contextual information. Differently, our method proposes a joint upsampling module named JPU to replace the dilated convolutions in the backbone of DilatedFCNs, which can JPU Encoding/PSP/ASPP Head C o n v 5 , 3 2 x C o n v 4 , 1 6 x C o n v 3 , 8 x C o n v 2 , 4 x C o n v 1 , 2 x 8 x 8 x Figure 2 : Framework Overview of Our Method. Our method employs the same backbone as the original FCN. After the backbone, a novel upsampling module named Joint Pyramid Upsampling (JPU) is proposed, which takes the last three feature maps as the inputs and generates a high-resolution feature map. 
A multi-scale/global context module is then employed to produce the final label map. Best viewed in color. reduce computation complexity dramatically without performance loss. EncoderDecoder To gradually recover the spatial information, [28] introduces skip connections to construct U-Net, which combines the encoder features and the corresponding decoder activations. [18] proposes a multipath refinement network, which explicitly exploits all the information available along the down-sampling process. DeepLabV3+ [8] combines the advantages of DilatedFCN and EncoderDecoder, which employs DeepLabV3 as the encoder. Our method is complementary to DeepLabV3+, which can reduce the computation overload of DeepLabV3 without performance loss. Upsampling","In our method, we propose a module to upsample a lowresolution feature map given high-resolution feature maps as guidance, which is closely related to joint upsampling as well as data-dependent upsampling. Joint Upsampling In the literature of image processing, joint upsampling aims at leveraging the guidance image as a prior and transferring the structural details from the guidance image to the target image. [17] constructs a joint filter based on CNNs, which learns to recover the structure details in the guidance image. [31] proposes an end-to-end trainable guided filtering module, which upsamples a lowresolution image conditionally. Our method is related to the aforementioned approaches. However, the proposed JPU is designed for processing feature maps with a large number of channels while [17, 31] are specially designed for pro-cessing 3-channel images, which fail to capture the complex relations in high dimensional feature maps. Besides, the motivation and target of our method is completely different. 
Data-Dependent Upsampling DUpsampling [29] is also related to our method, which takes advantages of the redundancy in the segmentation label space and is able to recover the pixel-wise prediction from low-resolution outputs of CNNs. Compared to our method, DUpsampling has a strong dependency on the label space, which generalizes poorly to a larger or more complex label space. Method","In this section, we first introduce the most popular methods for semantic segmentation, named DilatedFCNs. We then reform the architecture of DilatedFCNs with a novel joint upsampling module, Joint Pyramid Upsampling (JPU). Finally, we discuss the proposed JPU in details, before which joint upsampling, dilated convolution and stride convolution are briefly introduced. DilatedFCN","To exploit Deep CNNs in semantic segmentation, Long et al. [22] transform the CNN designed for image classification into FCN. Taking ResNet-101 as an example, the original CNN contains 5 convolution stages, a global average pooling layer and a linear layer. To construct an FCN, the global average pooling layer and the linear layer are replaced by a convolution layer, which is used to generate the final label map, as shown in Figure 1a . Between each two consecutive convolution stages, stride convolutions and/or spatial pooling layers are employed, resulting in 5 feature maps with gradually reduced spatial resolutions. The spatial resolution of the last feature map in FCN is reduced by a factor of 32, leading to inaccurate predictions about the locations and details. To obtain a final feature map with high resolution, DeepLab [5] removes the downsampling operations before the last two feature maps, as shown in Figure 1c . Besides, the convolution layers inside the last two convolution stages are replaced by dilated convolutions to maintain the receptive field of view, thus named Dilated-FCN. 
As a result, the resolution of the last feature map is reduced by a factor of 8, which reserves more location and detail information. Following DeepLab, [38, 6] propose a multi-scale context module to capture context information from the last feature map, achieving tremendous success in several segmentation benchmarks. The Framework of Our Method","To obtain a high-resolution final feature map, methods in DilatedFCN remove the last two downsampling operations from the original FCN, which bring in heavy computation complexity and memory footprint due to the enlarged feature maps. In this paper, we aim at seeking an alternative way to approximate the final feature map of DilatedFCN without computation and memory overload. Meanwhile, we expect the performance of our method to be as good as that of the original DilatedFCNs. To achieve this, we first put back all the stride convolutions removed by DilatedFCN, while replacing all the dilated convolutions with regular convolution layers. As shown in Figure 2 , the backbone of our method is the same as that of the original FCN, where the spatial resolutions of the five feature maps (Conv1−Conv5) are gradually reduced by a factor of 2. To obtain a feature map similar to the final feature map of DilatedFCN, we propose a novel module named Joint Pyramid Upsampling (JPU), which takes the last three feature maps (Conv3−Conv5) as inputs. Then a multi-scale context module (PSP [38] /ASPP [6] ) or a global context module (Encoding [36] ) is employed to produce the final predictions. Compared to DilatedFCN, our method takes 4 times fewer computation and memory resources in 23 residual blocks (69 layers) and 16 times fewer in 3 blocks (9 layers) when the backbone is ResNet-101. Thus, our method runs much faster than DilatedFCN while consuming less memory. Joint Pyramid Upsampling","The proposed JPU is designed for generating a feature map that approximates the activations of the final feature map from the backbone of DilatedFCN. 
Such a problem can be reformulated into joint upsampling, which is then resolved by a CNN designed for this task. Background","Joint Upsampling Given a low-resolution target image and a high-resolution guidance image, joint upsampling aims at generating a high-resolution target image by transferring details and structures from the guidance image. Generally, the low-resolution target image y l is generated by employing a transformation f (•) on the low-resolution guidance image x l , i.e. y l = f (x l ). Given x l and y l , we are required to obtain a transformationf ( y h =f (x h ), wheref (•) = argmin h(•)∈H ||y l − h(x l )||, (1) where H is a set of all possible transformation functions, and || • || is a pre-defined distance metric. Dilated Convolution Dilated convolution is introduced in DeepLab [5] for obtaining high-resolution feature maps while maintaining the receptive field of view. Figure 3a gives an illustration of the dilated convolution in 1D (dilation rate = 2), which can be divided into the following three steps: (1) split the input feature f in into two groups f 0 in and f 1 in according to the parity of the index, (2) process each feature with the same convolution layer, resulting in f 0 out and f 1 out , and (3) merge the two generated features interlaced to obtain the output feature f out . Stride Convolution Stride convolution is proposed to transform the input feature into an output feature with reduced spatial resolution, which is equivalent to the following two steps as shown in Figure 3b: (1) process the input feature f in with a regular convolution to obtain the intermediate feature f m , and (2) remove the elements with an odd index, resulting in f out . Reformulating into Joint Upsampling","The differences between the backbone of our method and DilatedFCN lie on the last two convolution stages. 
Taking the 4th convolution stage (Conv4) as an example, in Dilat-edFCN, the input feature map is first processed by a regular convolution layer, followed by a series of dilated convolutions (d=2). Differently, our method first processes the input feature map with a stride convolution (s=2), and then employs several regular convolutions to generate the output. Formally, given the input feature map x, the output feature map y d in DilatedFCN is obtained as follows: Fig 3a) , = ! ""# ! $%& ! ""# ! $%& ! ""# ' ! ""# ( ! $%& ' ! $%& ( Split Merge Conv DilatedConv, d=2 y d = x → C r → C d → ...... → C d n = x → C r → SC r M → ...... → SC r M n (Fig 3a) = x → C r → S → C r → ...... → C r n → M = y m → S → C n r → M = {y 0 m , y 1 m } → C n r → M ( while in our method, the output feature map y s is generated as follows: (Fig 3b) . y s = x → C s → C r → ...... → C r n = x → C r → R → C r → ...... → C r n (Fig 3b) = y m → R → C n r = y 0 m → C n r (3) C r , C d , and C s represent a regular/dilated/stride convolution respectively, and C n r is n layers of regular convolutions. S, M and R are split, merge, and reduce operations in Figure 3 , where adjacent S and M operations can be canceled out. Notably, the convolutions in Equations 2 and 3 are in 1D, which is for simplicity. Similar results can be obtained for 2D convolutions. The aforementioned equations show that y s and y d can be obtained with the same function C n r with different inputs: y 0 m and y m , where the former is downsampled from the latter. Thus, given x and y s , the feature map y that ap-proximates y d can be obtained as follows: EQUATION which is the same as the joint upsampling problem defined in Equation 1. Similar conclusions can be easily obtained for the 5th convolution stage (Conv5). Solving with CNNs","Equation 4 is an optimization problem, which takes lots of time to converge through the iterative gradient descent. 
Alternatively, we propose to approximate the optimization process with a CNN module. To achieve this, we first require to generate y m given x, as shown in Equation 4. Then, features from y 0 m and y s need to be gathered for learning the mappingĥ. Finally, a convolution block is required to transform the gathered features into the final prediction y. Following the aforementioned analysis, we design the JPU module as in Figure 4 . Concretely, each input feature map is firstly processed by a regular convolution block (Fig. 4a) , which is designed for (1) generating y m given x, and (2) transforming f m into an embedding space with reduced dimensions. As a result, all the input features are mapped into the same space, which enables a better fusion and reduces the computation complexity. Then, the generated feature maps are upsampled and concatenated, resulting in y c (Fig. 4b) . Four separable convolutions [14, 9] the convolution with dilation rate 1 is employed to capture the relation between y 0 m and the rest part of y m , as shown by the blue box in Figure 5 . Alternatively, the convolutions with dilation rate 2, 4 and 8 are designed for learning the mappingĥ to transform y 0 m into y s , as shown by the green boxes in Figure 5 . Thus, JPU can extract multi-scale context information from multi-level feature maps, which leads to a better performance. This is significantly different from ASPP [6] , which only exploit the information in the last feature map. The extracted features encode the mapping between y 0 m and y s as well as the relation between y 0 m and the rest part of y m . Thus, another regular convolution block is employed, which transforms the features into the final predictions ( Fig. 4c) . Notably, the proposed JPU module solves two closely related joint upsampling problems jointly, which are (1) upsampling Conv4 based on Conv3 (the 4th convolution stage), and (2) upscaling Conv5 with the guidance of the enlarged Conv4 (the 5th convolution stage). 
Experiment","In this section, we first introduce the datase...",Dataset Pascal Context dataset [23] is based o...,To show the effectiveness of the proposed meth...,"Pascal Context In Table 1 , our method employs..."
False,1903.11816v1,3D Face Reconstruction#Florence#Average 3D Error,Title,FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation Abstract:,"Modern approaches for semantic segmentation usually employ dilated convolutions in the backbone to extract highresolution feature maps, which brings heavy computation complexity and memory footprint. To replace the time and memory consuming dilated convolutions, we propose a novel joint upsampling module named Joint Pyramid Upsampling (JPU) by formulating the task of extracting highresolution feature maps into a joint upsampling problem. With the proposed JPU, our method reduces the computation complexity by more than three times without performance loss. Experiments show that JPU is superior to other upsampling modules, which can be plugged into many existing approaches to reduce computation complexity and improve performance. By replacing dilated convolutions with the proposed JPU module, our method achieves the state-of-the-art performance in Pascal Context dataset (mIoU of 53.13%) and ADE20K dataset (final score of 0.5584) while running 3 times faster. Code is available in https://github.com/wuhuikai/FastFCN . Introduction","Semantic segmentation [23, 40, 4] is one of the fundamental tasks in computer vision, with the goal of assigning a semantic label to each pixel of an image. Modern approaches usually employ a Fully Convolution Network (FCN) [22] to address this task, achieving tremendous success among several segmentation benchmarks. The original FCN is proposed by Long et al. [22] , which is transformed from a Convolutional Neural Network (CNN) [16, 15] designed for image classification. Inheriting from the design for image classification, the original FCN downsamples the input image progressively by stride convolutions and/or spatial pooling layers, resulting in a final feature map in low resolution. 
Although the final feature map encodes rich semantic information, the fine image structure information is lost, leading to inaccurate predictions around the object boundaries. As shown in Figure 1a , the original FCN typically downsamples the input image 5 times, reducing the spatial resolution of the final feature map by a factor of 32. To obtain a high-resolution final feature map, [3, 28, 18, 30, 27] employ the original FCN as the encoder to capture high-level semantic information, and a decoder is designed to gradually recover the spatial information by combining multi-level feature maps from the encoder. As shown in Figure 1b , we term such methods EncoderDecoder, of which the final prediction generated by the decoder is in high resolution. Alternatively, DeepLab [5] removes the last two downsampling operations from the original FCN and introduces dilated (atrous) convolutions to maintain the receptive field of view unchanged. 1 Following DeepLab, [38, 6, 36] employ a multi-scale context module on top of the final feature map, outperforming most EncoderDecoder methods significantly on several segmentation benchmarks. As shown in Figure 1c , the spatial resolution of the last feature map in DilatedFCN is 4 times larger than that in the original FCN, thus maintaining more structure and location information. The dilated convolutions play an important role in maintaining the spatial resolution of the final feature map, leading to superior performance compared to most methods in EncoderDecoder. However, the introduced dilated convolutions bring heavy computation complexity and memory footprint, which limit the usage in many real-time applications. Taking ResNet-101 [13] as an example, compared to the original FCN, 23 residual blocks (69 convolution layers) in DilatedFCN require to take 4 times more computation resources and memory usages, and 3 residual blocks (9 convolution layers) need to take 16 times more resources. 
We aim at tackling the aforementioned issue caused by dilated convolutions in this paper. To achieve this, we propose a novel joint upsampling module to replace the time and memory consuming dilated convolutions, namely Joint Pyramid Upsampling (JPU). As a result, our method employs the original FCN as the backbone while applying JPU to upsample the low-resolution final feature map with output stride (OS) 32, resulting in a high-resolution feature map (OS=8). Accordingly, the computation time and memory footprint of the whole segmentation framework is dramatically reduced. Meanwhile, there's no performance loss when replacing the dilated convolutions with the proposed JPU. We attribute this to the ability of JPU to exploit multiscale context across multi-level feature maps. To validate the effectiveness of our method, we first conduct a systematical experiment, showing that the proposed JPU can replace dilated convolutions in several popular approaches without performance loss. We then test the proposed method on several segmentation benchmarks. Results show that our method achieves the state-of-the-art performance while running more than 3 times faster. Concretely, we outperform all the baselines on Pascal Context dataset [23] by a large margin, which achieves the state-ofthe-art performance with mIoU of 53.13%. On ADE20K dataset [40] , we obtain the mIoU of 42.75% with ResNet-50 as the backbone, which sets a new record on the val set. Moreover, our method with ResNet-101 achieves the stateof-the-art performance in the test set of ADE20K dataset. In summary, our contributions are three folds, which are: (1) We propose a computationally efficient joint upsampling module named JPU to replace the time and memory consuming dilated convolutions in the backbone. (2) Based on the proposed JPU, the computation time and memory footprint of the whole segmentation framework can be reduced by a factor of more than 3 and meanwhile achieves better performance. 
(3) Our method achieves the new state-ofthe-art performance in both Pascal Context dataset (mIoU of 53.13%) and ADE20K dataset (mIoU of 42.75% with ResNet-50 as the backbone on the val set and final score of 0.5584 with ResNet-101 on the test set). Related Work","In this section, we first give an overview on methods for semantic segmentation, which can be categorized into two directions. We then introduce some related works on upsampling. Semantic Segmentation","FCNs [22] have achieved huge success in semantic segmentation. Following FCN, there're two prominent directions, namely DilatedFCN and EncoderDecoder. Dilated-FCNs [11, 34, 7, 6, 38, 36, 5] utilize dilated convolutions to keep the receptive field of view and employ a multi-scale context module to process high-level feature maps. Alternatively, EncoderDecoders [24, 28, 18, 1, 26, 12, 33, 37] propose to utilize an encoder to extract multi-level feature maps, which are then combined into the final prediction by a decoder. DilatedFCN In order to capture multi-scale context information on the high-resolution final feature map, PSP-Net [38] performs pooling operations at multiple grid scales while DeepLabV3 [6] employs parallel atrous convolutions with different rates named ASPP. Alternatively, EncNet [36] utilizes the Context Encoding Module to capture global contextual information. Differently, our method proposes a joint upsampling module named JPU to replace the dilated convolutions in the backbone of DilatedFCNs, which can JPU Encoding/PSP/ASPP Head C o n v 5 , 3 2 x C o n v 4 , 1 6 x C o n v 3 , 8 x C o n v 2 , 4 x C o n v 1 , 2 x 8 x 8 x Figure 2 : Framework Overview of Our Method. Our method employs the same backbone as the original FCN. After the backbone, a novel upsampling module named Joint Pyramid Upsampling (JPU) is proposed, which takes the last three feature maps as the inputs and generates a high-resolution feature map. 
A multi-scale/global context module is then employed to produce the final label map. Best viewed in color. reduce computation complexity dramatically without performance loss. EncoderDecoder To gradually recover the spatial information, [28] introduces skip connections to construct U-Net, which combines the encoder features and the corresponding decoder activations. [18] proposes a multipath refinement network, which explicitly exploits all the information available along the down-sampling process. DeepLabV3+ [8] combines the advantages of DilatedFCN and EncoderDecoder, which employs DeepLabV3 as the encoder. Our method is complementary to DeepLabV3+, which can reduce the computation overload of DeepLabV3 without performance loss. Upsampling","In our method, we propose a module to upsample a lowresolution feature map given high-resolution feature maps as guidance, which is closely related to joint upsampling as well as data-dependent upsampling. Joint Upsampling In the literature of image processing, joint upsampling aims at leveraging the guidance image as a prior and transferring the structural details from the guidance image to the target image. [17] constructs a joint filter based on CNNs, which learns to recover the structure details in the guidance image. [31] proposes an end-to-end trainable guided filtering module, which upsamples a lowresolution image conditionally. Our method is related to the aforementioned approaches. However, the proposed JPU is designed for processing feature maps with a large number of channels while [17, 31] are specially designed for pro-cessing 3-channel images, which fail to capture the complex relations in high dimensional feature maps. Besides, the motivation and target of our method is completely different. 
Data-Dependent Upsampling DUpsampling [29] is also related to our method, which takes advantages of the redundancy in the segmentation label space and is able to recover the pixel-wise prediction from low-resolution outputs of CNNs. Compared to our method, DUpsampling has a strong dependency on the label space, which generalizes poorly to a larger or more complex label space. Method","In this section, we first introduce the most popular methods for semantic segmentation, named DilatedFCNs. We then reform the architecture of DilatedFCNs with a novel joint upsampling module, Joint Pyramid Upsampling (JPU). Finally, we discuss the proposed JPU in details, before which joint upsampling, dilated convolution and stride convolution are briefly introduced. DilatedFCN","To exploit Deep CNNs in semantic segmentation, Long et al. [22] transform the CNN designed for image classification into FCN. Taking ResNet-101 as an example, the original CNN contains 5 convolution stages, a global average pooling layer and a linear layer. To construct an FCN, the global average pooling layer and the linear layer are replaced by a convolution layer, which is used to generate the final label map, as shown in Figure 1a . Between each two consecutive convolution stages, stride convolutions and/or spatial pooling layers are employed, resulting in 5 feature maps with gradually reduced spatial resolutions. The spatial resolution of the last feature map in FCN is reduced by a factor of 32, leading to inaccurate predictions about the locations and details. To obtain a final feature map with high resolution, DeepLab [5] removes the downsampling operations before the last two feature maps, as shown in Figure 1c . Besides, the convolution layers inside the last two convolution stages are replaced by dilated convolutions to maintain the receptive field of view, thus named Dilated-FCN. 
As a result, the resolution of the last feature map is reduced by a factor of 8, which reserves more location and detail information. Following DeepLab, [38, 6] propose a multi-scale context module to capture context information from the last feature map, achieving tremendous success in several segmentation benchmarks. The Framework of Our Method","To obtain a high-resolution final feature map, methods in DilatedFCN remove the last two downsampling operations from the original FCN, which bring in heavy computation complexity and memory footprint due to the enlarged feature maps. In this paper, we aim at seeking an alternative way to approximate the final feature map of DilatedFCN without computation and memory overload. Meanwhile, we expect the performance of our method to be as good as that of the original DilatedFCNs. To achieve this, we first put back all the stride convolutions removed by DilatedFCN, while replacing all the dilated convolutions with regular convolution layers. As shown in Figure 2 , the backbone of our method is the same as that of the original FCN, where the spatial resolutions of the five feature maps (Conv1−Conv5) are gradually reduced by a factor of 2. To obtain a feature map similar to the final feature map of DilatedFCN, we propose a novel module named Joint Pyramid Upsampling (JPU), which takes the last three feature maps (Conv3−Conv5) as inputs. Then a multi-scale context module (PSP [38] /ASPP [6] ) or a global context module (Encoding [36] ) is employed to produce the final predictions. Compared to DilatedFCN, our method takes 4 times fewer computation and memory resources in 23 residual blocks (69 layers) and 16 times fewer in 3 blocks (9 layers) when the backbone is ResNet-101. Thus, our method runs much faster than DilatedFCN while consuming less memory. Joint Pyramid Upsampling","The proposed JPU is designed for generating a feature map that approximates the activations of the final feature map from the backbone of DilatedFCN. 
Such a problem can be reformulated into joint upsampling, which is then resolved by a CNN designed for this task. Background","Joint Upsampling Given a low-resolution target image and a high-resolution guidance image, joint upsampling aims at generating a high-resolution target image by transferring details and structures from the guidance image. Generally, the low-resolution target image y l is generated by employing a transformation f (•) on the low-resolution guidance image x l , i.e. y l = f (x l ). Given x l and y l , we are required to obtain a transformationf ( y h =f (x h ), wheref (•) = argmin h(•)∈H ||y l − h(x l )||, (1) where H is a set of all possible transformation functions, and || • || is a pre-defined distance metric. Dilated Convolution Dilated convolution is introduced in DeepLab [5] for obtaining high-resolution feature maps while maintaining the receptive field of view. Figure 3a gives an illustration of the dilated convolution in 1D (dilation rate = 2), which can be divided into the following three steps: (1) split the input feature f in into two groups f 0 in and f 1 in according to the parity of the index, (2) process each feature with the same convolution layer, resulting in f 0 out and f 1 out , and (3) merge the two generated features interlaced to obtain the output feature f out . Stride Convolution Stride convolution is proposed to transform the input feature into an output feature with reduced spatial resolution, which is equivalent to the following two steps as shown in Figure 3b: (1) process the input feature f in with a regular convolution to obtain the intermediate feature f m , and (2) remove the elements with an odd index, resulting in f out . Reformulating into Joint Upsampling","The differences between the backbone of our method and DilatedFCN lie on the last two convolution stages. 
Taking the 4th convolution stage (Conv4) as an example, in Dilat-edFCN, the input feature map is first processed by a regular convolution layer, followed by a series of dilated convolutions (d=2). Differently, our method first processes the input feature map with a stride convolution (s=2), and then employs several regular convolutions to generate the output. Formally, given the input feature map x, the output feature map y d in DilatedFCN is obtained as follows: Fig 3a) , = ! ""# ! $%& ! ""# ! $%& ! ""# ' ! ""# ( ! $%& ' ! $%& ( Split Merge Conv DilatedConv, d=2 y d = x → C r → C d → ...... → C d n = x → C r → SC r M → ...... → SC r M n (Fig 3a) = x → C r → S → C r → ...... → C r n → M = y m → S → C n r → M = {y 0 m , y 1 m } → C n r → M ( while in our method, the output feature map y s is generated as follows: (Fig 3b) . y s = x → C s → C r → ...... → C r n = x → C r → R → C r → ...... → C r n (Fig 3b) = y m → R → C n r = y 0 m → C n r (3) C r , C d , and C s represent a regular/dilated/stride convolution respectively, and C n r is n layers of regular convolutions. S, M and R are split, merge, and reduce operations in Figure 3 , where adjacent S and M operations can be canceled out. Notably, the convolutions in Equations 2 and 3 are in 1D, which is for simplicity. Similar results can be obtained for 2D convolutions. The aforementioned equations show that y s and y d can be obtained with the same function C n r with different inputs: y 0 m and y m , where the former is downsampled from the latter. Thus, given x and y s , the feature map y that ap-proximates y d can be obtained as follows: EQUATION which is the same as the joint upsampling problem defined in Equation 1. Similar conclusions can be easily obtained for the 5th convolution stage (Conv5). Solving with CNNs","Equation 4 is an optimization problem, which takes lots of time to converge through the iterative gradient descent. 
Alternatively, we propose to approximate the optimization process with a CNN module. To achieve this, we first require to generate y m given x, as shown in Equation 4. Then, features from y 0 m and y s need to be gathered for learning the mappingĥ. Finally, a convolution block is required to transform the gathered features into the final prediction y. Following the aforementioned analysis, we design the JPU module as in Figure 4 . Concretely, each input feature map is firstly processed by a regular convolution block (Fig. 4a) , which is designed for (1) generating y m given x, and (2) transforming f m into an embedding space with reduced dimensions. As a result, all the input features are mapped into the same space, which enables a better fusion and reduces the computation complexity. Then, the generated feature maps are upsampled and concatenated, resulting in y c (Fig. 4b) . Four separable convolutions [14, 9] the convolution with dilation rate 1 is employed to capture the relation between y 0 m and the rest part of y m , as shown by the blue box in Figure 5 . Alternatively, the convolutions with dilation rate 2, 4 and 8 are designed for learning the mappingĥ to transform y 0 m into y s , as shown by the green boxes in Figure 5 . Thus, JPU can extract multi-scale context information from multi-level feature maps, which leads to a better performance. This is significantly different from ASPP [6] , which only exploit the information in the last feature map. The extracted features encode the mapping between y 0 m and y s as well as the relation between y 0 m and the rest part of y m . Thus, another regular convolution block is employed, which transforms the features into the final predictions ( Fig. 4c) . Notably, the proposed JPU module solves two closely related joint upsampling problems jointly, which are (1) upsampling Conv4 based on Conv3 (the 4th convolution stage), and (2) upscaling Conv5 with the guidance of the enlarged Conv4 (the 5th convolution stage). 
Experiment","In this section, we first introduce the datase...",Dataset Pascal Context dataset [23] is based o...,To show the effectiveness of the proposed meth...,"Pascal Context In Table 1 , our method employs..."
False,1903.11816v1,3D Face Reconstruction#Florence#Mean NME,Title,FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation Abstract:,"Modern approaches for semantic segmentation usually employ dilated convolutions in the backbone to extract highresolution feature maps, which brings heavy computation complexity and memory footprint. To replace the time and memory consuming dilated convolutions, we propose a novel joint upsampling module named Joint Pyramid Upsampling (JPU) by formulating the task of extracting highresolution feature maps into a joint upsampling problem. With the proposed JPU, our method reduces the computation complexity by more than three times without performance loss. Experiments show that JPU is superior to other upsampling modules, which can be plugged into many existing approaches to reduce computation complexity and improve performance. By replacing dilated convolutions with the proposed JPU module, our method achieves the state-of-the-art performance in Pascal Context dataset (mIoU of 53.13%) and ADE20K dataset (final score of 0.5584) while running 3 times faster. Code is available in https://github.com/wuhuikai/FastFCN . Introduction","Semantic segmentation [23, 40, 4] is one of the fundamental tasks in computer vision, with the goal of assigning a semantic label to each pixel of an image. Modern approaches usually employ a Fully Convolution Network (FCN) [22] to address this task, achieving tremendous success among several segmentation benchmarks. The original FCN is proposed by Long et al. [22] , which is transformed from a Convolutional Neural Network (CNN) [16, 15] designed for image classification. Inheriting from the design for image classification, the original FCN downsamples the input image progressively by stride convolutions and/or spatial pooling layers, resulting in a final feature map in low resolution. 
Although the final feature map encodes rich semantic information, the fine image structure information is lost, leading to inaccurate predictions around the object boundaries. As shown in Figure 1a , the original FCN typically downsamples the input image 5 times, reducing the spatial resolution of the final feature map by a factor of 32. To obtain a high-resolution final feature map, [3, 28, 18, 30, 27] employ the original FCN as the encoder to capture high-level semantic information, and a decoder is designed to gradually recover the spatial information by combining multi-level feature maps from the encoder. As shown in Figure 1b , we term such methods EncoderDecoder, of which the final prediction generated by the decoder is in high resolution. Alternatively, DeepLab [5] removes the last two downsampling operations from the original FCN and introduces dilated (atrous) convolutions to maintain the receptive field of view unchanged. 1 Following DeepLab, [38, 6, 36] employ a multi-scale context module on top of the final feature map, outperforming most EncoderDecoder methods significantly on several segmentation benchmarks. As shown in Figure 1c , the spatial resolution of the last feature map in DilatedFCN is 4 times larger than that in the original FCN, thus maintaining more structure and location information. The dilated convolutions play an important role in maintaining the spatial resolution of the final feature map, leading to superior performance compared to most methods in EncoderDecoder. However, the introduced dilated convolutions bring heavy computation complexity and memory footprint, which limit the usage in many real-time applications. Taking ResNet-101 [13] as an example, compared to the original FCN, 23 residual blocks (69 convolution layers) in DilatedFCN require to take 4 times more computation resources and memory usages, and 3 residual blocks (9 convolution layers) need to take 16 times more resources. 
We aim at tackling the aforementioned issue caused by dilated convolutions in this paper. To achieve this, we propose a novel joint upsampling module to replace the time and memory consuming dilated convolutions, namely Joint Pyramid Upsampling (JPU). As a result, our method employs the original FCN as the backbone while applying JPU to upsample the low-resolution final feature map with output stride (OS) 32, resulting in a high-resolution feature map (OS=8). Accordingly, the computation time and memory footprint of the whole segmentation framework is dramatically reduced. Meanwhile, there's no performance loss when replacing the dilated convolutions with the proposed JPU. We attribute this to the ability of JPU to exploit multiscale context across multi-level feature maps. To validate the effectiveness of our method, we first conduct a systematical experiment, showing that the proposed JPU can replace dilated convolutions in several popular approaches without performance loss. We then test the proposed method on several segmentation benchmarks. Results show that our method achieves the state-of-the-art performance while running more than 3 times faster. Concretely, we outperform all the baselines on Pascal Context dataset [23] by a large margin, which achieves the state-ofthe-art performance with mIoU of 53.13%. On ADE20K dataset [40] , we obtain the mIoU of 42.75% with ResNet-50 as the backbone, which sets a new record on the val set. Moreover, our method with ResNet-101 achieves the stateof-the-art performance in the test set of ADE20K dataset. In summary, our contributions are three folds, which are: (1) We propose a computationally efficient joint upsampling module named JPU to replace the time and memory consuming dilated convolutions in the backbone. (2) Based on the proposed JPU, the computation time and memory footprint of the whole segmentation framework can be reduced by a factor of more than 3 and meanwhile achieves better performance. 
(3) Our method achieves the new state-ofthe-art performance in both Pascal Context dataset (mIoU of 53.13%) and ADE20K dataset (mIoU of 42.75% with ResNet-50 as the backbone on the val set and final score of 0.5584 with ResNet-101 on the test set). Related Work","In this section, we first give an overview on methods for semantic segmentation, which can be categorized into two directions. We then introduce some related works on upsampling. Semantic Segmentation","FCNs [22] have achieved huge success in semantic segmentation. Following FCN, there're two prominent directions, namely DilatedFCN and EncoderDecoder. Dilated-FCNs [11, 34, 7, 6, 38, 36, 5] utilize dilated convolutions to keep the receptive field of view and employ a multi-scale context module to process high-level feature maps. Alternatively, EncoderDecoders [24, 28, 18, 1, 26, 12, 33, 37] propose to utilize an encoder to extract multi-level feature maps, which are then combined into the final prediction by a decoder. DilatedFCN In order to capture multi-scale context information on the high-resolution final feature map, PSP-Net [38] performs pooling operations at multiple grid scales while DeepLabV3 [6] employs parallel atrous convolutions with different rates named ASPP. Alternatively, EncNet [36] utilizes the Context Encoding Module to capture global contextual information. Differently, our method proposes a joint upsampling module named JPU to replace the dilated convolutions in the backbone of DilatedFCNs, which can JPU Encoding/PSP/ASPP Head C o n v 5 , 3 2 x C o n v 4 , 1 6 x C o n v 3 , 8 x C o n v 2 , 4 x C o n v 1 , 2 x 8 x 8 x Figure 2 : Framework Overview of Our Method. Our method employs the same backbone as the original FCN. After the backbone, a novel upsampling module named Joint Pyramid Upsampling (JPU) is proposed, which takes the last three feature maps as the inputs and generates a high-resolution feature map. 
A multi-scale/global context module is then employed to produce the final label map. Best viewed in color. reduce computation complexity dramatically without performance loss. EncoderDecoder To gradually recover the spatial information, [28] introduces skip connections to construct U-Net, which combines the encoder features and the corresponding decoder activations. [18] proposes a multipath refinement network, which explicitly exploits all the information available along the down-sampling process. DeepLabV3+ [8] combines the advantages of DilatedFCN and EncoderDecoder, which employs DeepLabV3 as the encoder. Our method is complementary to DeepLabV3+, which can reduce the computation overload of DeepLabV3 without performance loss. Upsampling","In our method, we propose a module to upsample a lowresolution feature map given high-resolution feature maps as guidance, which is closely related to joint upsampling as well as data-dependent upsampling. Joint Upsampling In the literature of image processing, joint upsampling aims at leveraging the guidance image as a prior and transferring the structural details from the guidance image to the target image. [17] constructs a joint filter based on CNNs, which learns to recover the structure details in the guidance image. [31] proposes an end-to-end trainable guided filtering module, which upsamples a lowresolution image conditionally. Our method is related to the aforementioned approaches. However, the proposed JPU is designed for processing feature maps with a large number of channels while [17, 31] are specially designed for pro-cessing 3-channel images, which fail to capture the complex relations in high dimensional feature maps. Besides, the motivation and target of our method is completely different. 
Data-Dependent Upsampling DUpsampling [29] is also related to our method, which takes advantages of the redundancy in the segmentation label space and is able to recover the pixel-wise prediction from low-resolution outputs of CNNs. Compared to our method, DUpsampling has a strong dependency on the label space, which generalizes poorly to a larger or more complex label space. Method","In this section, we first introduce the most popular methods for semantic segmentation, named DilatedFCNs. We then reform the architecture of DilatedFCNs with a novel joint upsampling module, Joint Pyramid Upsampling (JPU). Finally, we discuss the proposed JPU in details, before which joint upsampling, dilated convolution and stride convolution are briefly introduced. DilatedFCN","To exploit Deep CNNs in semantic segmentation, Long et al. [22] transform the CNN designed for image classification into FCN. Taking ResNet-101 as an example, the original CNN contains 5 convolution stages, a global average pooling layer and a linear layer. To construct an FCN, the global average pooling layer and the linear layer are replaced by a convolution layer, which is used to generate the final label map, as shown in Figure 1a . Between each two consecutive convolution stages, stride convolutions and/or spatial pooling layers are employed, resulting in 5 feature maps with gradually reduced spatial resolutions. The spatial resolution of the last feature map in FCN is reduced by a factor of 32, leading to inaccurate predictions about the locations and details. To obtain a final feature map with high resolution, DeepLab [5] removes the downsampling operations before the last two feature maps, as shown in Figure 1c . Besides, the convolution layers inside the last two convolution stages are replaced by dilated convolutions to maintain the receptive field of view, thus named Dilated-FCN. 
As a result, the resolution of the last feature map is reduced by a factor of 8, which reserves more location and detail information. Following DeepLab, [38, 6] propose a multi-scale context module to capture context information from the last feature map, achieving tremendous success in several segmentation benchmarks. The Framework of Our Method","To obtain a high-resolution final feature map, methods in DilatedFCN remove the last two downsampling operations from the original FCN, which bring in heavy computation complexity and memory footprint due to the enlarged feature maps. In this paper, we aim at seeking an alternative way to approximate the final feature map of DilatedFCN without computation and memory overload. Meanwhile, we expect the performance of our method to be as good as that of the original DilatedFCNs. To achieve this, we first put back all the stride convolutions removed by DilatedFCN, while replacing all the dilated convolutions with regular convolution layers. As shown in Figure 2 , the backbone of our method is the same as that of the original FCN, where the spatial resolutions of the five feature maps (Conv1−Conv5) are gradually reduced by a factor of 2. To obtain a feature map similar to the final feature map of DilatedFCN, we propose a novel module named Joint Pyramid Upsampling (JPU), which takes the last three feature maps (Conv3−Conv5) as inputs. Then a multi-scale context module (PSP [38] /ASPP [6] ) or a global context module (Encoding [36] ) is employed to produce the final predictions. Compared to DilatedFCN, our method takes 4 times fewer computation and memory resources in 23 residual blocks (69 layers) and 16 times fewer in 3 blocks (9 layers) when the backbone is ResNet-101. Thus, our method runs much faster than DilatedFCN while consuming less memory. Joint Pyramid Upsampling","The proposed JPU is designed for generating a feature map that approximates the activations of the final feature map from the backbone of DilatedFCN. 
Such a problem can be reformulated into joint upsampling, which is then resolved by a CNN designed for this task. Background","Joint Upsampling Given a low-resolution target image and a high-resolution guidance image, joint upsampling aims at generating a high-resolution target image by transferring details and structures from the guidance image. Generally, the low-resolution target image y l is generated by employing a transformation f (•) on the low-resolution guidance image x l , i.e. y l = f (x l ). Given x l and y l , we are required to obtain a transformationf ( y h =f (x h ), wheref (•) = argmin h(•)∈H ||y l − h(x l )||, (1) where H is a set of all possible transformation functions, and || • || is a pre-defined distance metric. Dilated Convolution Dilated convolution is introduced in DeepLab [5] for obtaining high-resolution feature maps while maintaining the receptive field of view. Figure 3a gives an illustration of the dilated convolution in 1D (dilation rate = 2), which can be divided into the following three steps: (1) split the input feature f in into two groups f 0 in and f 1 in according to the parity of the index, (2) process each feature with the same convolution layer, resulting in f 0 out and f 1 out , and (3) merge the two generated features interlaced to obtain the output feature f out . Stride Convolution Stride convolution is proposed to transform the input feature into an output feature with reduced spatial resolution, which is equivalent to the following two steps as shown in Figure 3b: (1) process the input feature f in with a regular convolution to obtain the intermediate feature f m , and (2) remove the elements with an odd index, resulting in f out . Reformulating into Joint Upsampling","The differences between the backbone of our method and DilatedFCN lie on the last two convolution stages. 
Taking the 4th convolution stage (Conv4) as an example, in Dilat-edFCN, the input feature map is first processed by a regular convolution layer, followed by a series of dilated convolutions (d=2). Differently, our method first processes the input feature map with a stride convolution (s=2), and then employs several regular convolutions to generate the output. Formally, given the input feature map x, the output feature map y d in DilatedFCN is obtained as follows: Fig 3a) , = ! ""# ! $%& ! ""# ! $%& ! ""# ' ! ""# ( ! $%& ' ! $%& ( Split Merge Conv DilatedConv, d=2 y d = x → C r → C d → ...... → C d n = x → C r → SC r M → ...... → SC r M n (Fig 3a) = x → C r → S → C r → ...... → C r n → M = y m → S → C n r → M = {y 0 m , y 1 m } → C n r → M ( while in our method, the output feature map y s is generated as follows: (Fig 3b) . y s = x → C s → C r → ...... → C r n = x → C r → R → C r → ...... → C r n (Fig 3b) = y m → R → C n r = y 0 m → C n r (3) C r , C d , and C s represent a regular/dilated/stride convolution respectively, and C n r is n layers of regular convolutions. S, M and R are split, merge, and reduce operations in Figure 3 , where adjacent S and M operations can be canceled out. Notably, the convolutions in Equations 2 and 3 are in 1D, which is for simplicity. Similar results can be obtained for 2D convolutions. The aforementioned equations show that y s and y d can be obtained with the same function C n r with different inputs: y 0 m and y m , where the former is downsampled from the latter. Thus, given x and y s , the feature map y that ap-proximates y d can be obtained as follows: EQUATION which is the same as the joint upsampling problem defined in Equation 1. Similar conclusions can be easily obtained for the 5th convolution stage (Conv5). Solving with CNNs","Equation 4 is an optimization problem, which takes lots of time to converge through the iterative gradient descent. 
Alternatively, we propose to approximate the optimization process with a CNN module. To achieve this, we first require to generate y m given x, as shown in Equation 4. Then, features from y 0 m and y s need to be gathered for learning the mappingĥ. Finally, a convolution block is required to transform the gathered features into the final prediction y. Following the aforementioned analysis, we design the JPU module as in Figure 4 . Concretely, each input feature map is firstly processed by a regular convolution block (Fig. 4a) , which is designed for (1) generating y m given x, and (2) transforming f m into an embedding space with reduced dimensions. As a result, all the input features are mapped into the same space, which enables a better fusion and reduces the computation complexity. Then, the generated feature maps are upsampled and concatenated, resulting in y c (Fig. 4b) . Four separable convolutions [14, 9] the convolution with dilation rate 1 is employed to capture the relation between y 0 m and the rest part of y m , as shown by the blue box in Figure 5 . Alternatively, the convolutions with dilation rate 2, 4 and 8 are designed for learning the mappingĥ to transform y 0 m into y s , as shown by the green boxes in Figure 5 . Thus, JPU can extract multi-scale context information from multi-level feature maps, which leads to a better performance. This is significantly different from ASPP [6] , which only exploit the information in the last feature map. The extracted features encode the mapping between y 0 m and y s as well as the relation between y 0 m and the rest part of y m . Thus, another regular convolution block is employed, which transforms the features into the final predictions ( Fig. 4c) . Notably, the proposed JPU module solves two closely related joint upsampling problems jointly, which are (1) upsampling Conv4 based on Conv3 (the 4th convolution stage), and (2) upscaling Conv5 with the guidance of the enlarged Conv4 (the 5th convolution stage). 
Experiment","In this section, we first introduce the datase...",Dataset Pascal Context dataset [23] is based o...,To show the effectiveness of the proposed meth...,"Pascal Context In Table 1 , our method employs..."
False,1903.11816v1,3D Face Reconstruction#NoW Benchmark#Mean Reconstruction Error (mm),Title,FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation Abstract:,"Modern approaches for semantic segmentation usually employ dilated convolutions in the backbone to extract highresolution feature maps, which brings heavy computation complexity and memory footprint. To replace the time and memory consuming dilated convolutions, we propose a novel joint upsampling module named Joint Pyramid Upsampling (JPU) by formulating the task of extracting highresolution feature maps into a joint upsampling problem. With the proposed JPU, our method reduces the computation complexity by more than three times without performance loss. Experiments show that JPU is superior to other upsampling modules, which can be plugged into many existing approaches to reduce computation complexity and improve performance. By replacing dilated convolutions with the proposed JPU module, our method achieves the state-of-the-art performance in Pascal Context dataset (mIoU of 53.13%) and ADE20K dataset (final score of 0.5584) while running 3 times faster. Code is available in https://github.com/wuhuikai/FastFCN . Introduction","Semantic segmentation [23, 40, 4] is one of the fundamental tasks in computer vision, with the goal of assigning a semantic label to each pixel of an image. Modern approaches usually employ a Fully Convolution Network (FCN) [22] to address this task, achieving tremendous success among several segmentation benchmarks. The original FCN is proposed by Long et al. [22] , which is transformed from a Convolutional Neural Network (CNN) [16, 15] designed for image classification. Inheriting from the design for image classification, the original FCN downsamples the input image progressively by stride convolutions and/or spatial pooling layers, resulting in a final feature map in low resolution. 
Although the final feature map encodes rich semantic information, the fine image structure information is lost, leading to inaccurate predictions around the object boundaries. As shown in Figure 1a , the original FCN typically downsamples the input image 5 times, reducing the spatial resolution of the final feature map by a factor of 32. To obtain a high-resolution final feature map, [3, 28, 18, 30, 27] employ the original FCN as the encoder to capture high-level semantic information, and a decoder is designed to gradually recover the spatial information by combining multi-level feature maps from the encoder. As shown in Figure 1b , we term such methods EncoderDecoder, of which the final prediction generated by the decoder is in high resolution. Alternatively, DeepLab [5] removes the last two downsampling operations from the original FCN and introduces dilated (atrous) convolutions to maintain the receptive field of view unchanged. 1 Following DeepLab, [38, 6, 36] employ a multi-scale context module on top of the final feature map, outperforming most EncoderDecoder methods significantly on several segmentation benchmarks. As shown in Figure 1c , the spatial resolution of the last feature map in DilatedFCN is 4 times larger than that in the original FCN, thus maintaining more structure and location information. The dilated convolutions play an important role in maintaining the spatial resolution of the final feature map, leading to superior performance compared to most methods in EncoderDecoder. However, the introduced dilated convolutions bring heavy computation complexity and memory footprint, which limit the usage in many real-time applications. Taking ResNet-101 [13] as an example, compared to the original FCN, 23 residual blocks (69 convolution layers) in DilatedFCN require to take 4 times more computation resources and memory usages, and 3 residual blocks (9 convolution layers) need to take 16 times more resources. 
We aim at tackling the aforementioned issue caused by dilated convolutions in this paper. To achieve this, we propose a novel joint upsampling module to replace the time and memory consuming dilated convolutions, namely Joint Pyramid Upsampling (JPU). As a result, our method employs the original FCN as the backbone while applying JPU to upsample the low-resolution final feature map with output stride (OS) 32, resulting in a high-resolution feature map (OS=8). Accordingly, the computation time and memory footprint of the whole segmentation framework is dramatically reduced. Meanwhile, there's no performance loss when replacing the dilated convolutions with the proposed JPU. We attribute this to the ability of JPU to exploit multiscale context across multi-level feature maps. To validate the effectiveness of our method, we first conduct a systematical experiment, showing that the proposed JPU can replace dilated convolutions in several popular approaches without performance loss. We then test the proposed method on several segmentation benchmarks. Results show that our method achieves the state-of-the-art performance while running more than 3 times faster. Concretely, we outperform all the baselines on Pascal Context dataset [23] by a large margin, which achieves the state-ofthe-art performance with mIoU of 53.13%. On ADE20K dataset [40] , we obtain the mIoU of 42.75% with ResNet-50 as the backbone, which sets a new record on the val set. Moreover, our method with ResNet-101 achieves the stateof-the-art performance in the test set of ADE20K dataset. In summary, our contributions are three folds, which are: (1) We propose a computationally efficient joint upsampling module named JPU to replace the time and memory consuming dilated convolutions in the backbone. (2) Based on the proposed JPU, the computation time and memory footprint of the whole segmentation framework can be reduced by a factor of more than 3 and meanwhile achieves better performance. 
(3) Our method achieves the new state-ofthe-art performance in both Pascal Context dataset (mIoU of 53.13%) and ADE20K dataset (mIoU of 42.75% with ResNet-50 as the backbone on the val set and final score of 0.5584 with ResNet-101 on the test set). Related Work","In this section, we first give an overview on methods for semantic segmentation, which can be categorized into two directions. We then introduce some related works on upsampling. Semantic Segmentation","FCNs [22] have achieved huge success in semantic segmentation. Following FCN, there're two prominent directions, namely DilatedFCN and EncoderDecoder. Dilated-FCNs [11, 34, 7, 6, 38, 36, 5] utilize dilated convolutions to keep the receptive field of view and employ a multi-scale context module to process high-level feature maps. Alternatively, EncoderDecoders [24, 28, 18, 1, 26, 12, 33, 37] propose to utilize an encoder to extract multi-level feature maps, which are then combined into the final prediction by a decoder. DilatedFCN In order to capture multi-scale context information on the high-resolution final feature map, PSP-Net [38] performs pooling operations at multiple grid scales while DeepLabV3 [6] employs parallel atrous convolutions with different rates named ASPP. Alternatively, EncNet [36] utilizes the Context Encoding Module to capture global contextual information. Differently, our method proposes a joint upsampling module named JPU to replace the dilated convolutions in the backbone of DilatedFCNs, which can JPU Encoding/PSP/ASPP Head C o n v 5 , 3 2 x C o n v 4 , 1 6 x C o n v 3 , 8 x C o n v 2 , 4 x C o n v 1 , 2 x 8 x 8 x Figure 2 : Framework Overview of Our Method. Our method employs the same backbone as the original FCN. After the backbone, a novel upsampling module named Joint Pyramid Upsampling (JPU) is proposed, which takes the last three feature maps as the inputs and generates a high-resolution feature map. 
A multi-scale/global context module is then employed to produce the final label map. Best viewed in color. reduce computation complexity dramatically without performance loss. EncoderDecoder To gradually recover the spatial information, [28] introduces skip connections to construct U-Net, which combines the encoder features and the corresponding decoder activations. [18] proposes a multipath refinement network, which explicitly exploits all the information available along the down-sampling process. DeepLabV3+ [8] combines the advantages of DilatedFCN and EncoderDecoder, which employs DeepLabV3 as the encoder. Our method is complementary to DeepLabV3+, which can reduce the computation overload of DeepLabV3 without performance loss. Upsampling","In our method, we propose a module to upsample a lowresolution feature map given high-resolution feature maps as guidance, which is closely related to joint upsampling as well as data-dependent upsampling. Joint Upsampling In the literature of image processing, joint upsampling aims at leveraging the guidance image as a prior and transferring the structural details from the guidance image to the target image. [17] constructs a joint filter based on CNNs, which learns to recover the structure details in the guidance image. [31] proposes an end-to-end trainable guided filtering module, which upsamples a lowresolution image conditionally. Our method is related to the aforementioned approaches. However, the proposed JPU is designed for processing feature maps with a large number of channels while [17, 31] are specially designed for pro-cessing 3-channel images, which fail to capture the complex relations in high dimensional feature maps. Besides, the motivation and target of our method is completely different. 
Data-Dependent Upsampling DUpsampling [29] is also related to our method, which takes advantages of the redundancy in the segmentation label space and is able to recover the pixel-wise prediction from low-resolution outputs of CNNs. Compared to our method, DUpsampling has a strong dependency on the label space, which generalizes poorly to a larger or more complex label space. Method","In this section, we first introduce the most popular methods for semantic segmentation, named DilatedFCNs. We then reform the architecture of DilatedFCNs with a novel joint upsampling module, Joint Pyramid Upsampling (JPU). Finally, we discuss the proposed JPU in details, before which joint upsampling, dilated convolution and stride convolution are briefly introduced. DilatedFCN","To exploit Deep CNNs in semantic segmentation, Long et al. [22] transform the CNN designed for image classification into FCN. Taking ResNet-101 as an example, the original CNN contains 5 convolution stages, a global average pooling layer and a linear layer. To construct an FCN, the global average pooling layer and the linear layer are replaced by a convolution layer, which is used to generate the final label map, as shown in Figure 1a . Between each two consecutive convolution stages, stride convolutions and/or spatial pooling layers are employed, resulting in 5 feature maps with gradually reduced spatial resolutions. The spatial resolution of the last feature map in FCN is reduced by a factor of 32, leading to inaccurate predictions about the locations and details. To obtain a final feature map with high resolution, DeepLab [5] removes the downsampling operations before the last two feature maps, as shown in Figure 1c . Besides, the convolution layers inside the last two convolution stages are replaced by dilated convolutions to maintain the receptive field of view, thus named Dilated-FCN. 
As a result, the resolution of the last feature map is reduced by a factor of 8, which reserves more location and detail information. Following DeepLab, [38, 6] propose a multi-scale context module to capture context information from the last feature map, achieving tremendous success in several segmentation benchmarks. The Framework of Our Method","To obtain a high-resolution final feature map, methods in DilatedFCN remove the last two downsampling operations from the original FCN, which bring in heavy computation complexity and memory footprint due to the enlarged feature maps. In this paper, we aim at seeking an alternative way to approximate the final feature map of DilatedFCN without computation and memory overload. Meanwhile, we expect the performance of our method to be as good as that of the original DilatedFCNs. To achieve this, we first put back all the stride convolutions removed by DilatedFCN, while replacing all the dilated convolutions with regular convolution layers. As shown in Figure 2 , the backbone of our method is the same as that of the original FCN, where the spatial resolutions of the five feature maps (Conv1−Conv5) are gradually reduced by a factor of 2. To obtain a feature map similar to the final feature map of DilatedFCN, we propose a novel module named Joint Pyramid Upsampling (JPU), which takes the last three feature maps (Conv3−Conv5) as inputs. Then a multi-scale context module (PSP [38] /ASPP [6] ) or a global context module (Encoding [36] ) is employed to produce the final predictions. Compared to DilatedFCN, our method takes 4 times fewer computation and memory resources in 23 residual blocks (69 layers) and 16 times fewer in 3 blocks (9 layers) when the backbone is ResNet-101. Thus, our method runs much faster than DilatedFCN while consuming less memory. Joint Pyramid Upsampling","The proposed JPU is designed for generating a feature map that approximates the activations of the final feature map from the backbone of DilatedFCN. 
Such a problem can be reformulated into joint upsampling, which is then resolved by a CNN designed for this task. Background","Joint Upsampling Given a low-resolution target image and a high-resolution guidance image, joint upsampling aims at generating a high-resolution target image by transferring details and structures from the guidance image. Generally, the low-resolution target image y l is generated by employing a transformation f (•) on the low-resolution guidance image x l , i.e. y l = f (x l ). Given x l and y l , we are required to obtain a transformationf ( y h =f (x h ), wheref (•) = argmin h(•)∈H ||y l − h(x l )||, (1) where H is a set of all possible transformation functions, and || • || is a pre-defined distance metric. Dilated Convolution Dilated convolution is introduced in DeepLab [5] for obtaining high-resolution feature maps while maintaining the receptive field of view. Figure 3a gives an illustration of the dilated convolution in 1D (dilation rate = 2), which can be divided into the following three steps: (1) split the input feature f in into two groups f 0 in and f 1 in according to the parity of the index, (2) process each feature with the same convolution layer, resulting in f 0 out and f 1 out , and (3) merge the two generated features interlaced to obtain the output feature f out . Stride Convolution Stride convolution is proposed to transform the input feature into an output feature with reduced spatial resolution, which is equivalent to the following two steps as shown in Figure 3b: (1) process the input feature f in with a regular convolution to obtain the intermediate feature f m , and (2) remove the elements with an odd index, resulting in f out . Reformulating into Joint Upsampling","The differences between the backbone of our method and DilatedFCN lie on the last two convolution stages. 
Taking the 4th convolution stage (Conv4) as an example, in Dilat-edFCN, the input feature map is first processed by a regular convolution layer, followed by a series of dilated convolutions (d=2). Differently, our method first processes the input feature map with a stride convolution (s=2), and then employs several regular convolutions to generate the output. Formally, given the input feature map x, the output feature map y d in DilatedFCN is obtained as follows: Fig 3a) , = ! ""# ! $%& ! ""# ! $%& ! ""# ' ! ""# ( ! $%& ' ! $%& ( Split Merge Conv DilatedConv, d=2 y d = x → C r → C d → ...... → C d n = x → C r → SC r M → ...... → SC r M n (Fig 3a) = x → C r → S → C r → ...... → C r n → M = y m → S → C n r → M = {y 0 m , y 1 m } → C n r → M ( while in our method, the output feature map y s is generated as follows: (Fig 3b) . y s = x → C s → C r → ...... → C r n = x → C r → R → C r → ...... → C r n (Fig 3b) = y m → R → C n r = y 0 m → C n r (3) C r , C d , and C s represent a regular/dilated/stride convolution respectively, and C n r is n layers of regular convolutions. S, M and R are split, merge, and reduce operations in Figure 3 , where adjacent S and M operations can be canceled out. Notably, the convolutions in Equations 2 and 3 are in 1D, which is for simplicity. Similar results can be obtained for 2D convolutions. The aforementioned equations show that y s and y d can be obtained with the same function C n r with different inputs: y 0 m and y m , where the former is downsampled from the latter. Thus, given x and y s , the feature map y that ap-proximates y d can be obtained as follows: EQUATION which is the same as the joint upsampling problem defined in Equation 1. Similar conclusions can be easily obtained for the 5th convolution stage (Conv5). Solving with CNNs","Equation 4 is an optimization problem, which takes lots of time to converge through the iterative gradient descent. 
Alternatively, we propose to approximate the optimization process with a CNN module. To achieve this, we first require to generate y m given x, as shown in Equation 4. Then, features from y 0 m and y s need to be gathered for learning the mappingĥ. Finally, a convolution block is required to transform the gathered features into the final prediction y. Following the aforementioned analysis, we design the JPU module as in Figure 4 . Concretely, each input feature map is firstly processed by a regular convolution block (Fig. 4a) , which is designed for (1) generating y m given x, and (2) transforming f m into an embedding space with reduced dimensions. As a result, all the input features are mapped into the same space, which enables a better fusion and reduces the computation complexity. Then, the generated feature maps are upsampled and concatenated, resulting in y c (Fig. 4b) . Four separable convolutions [14, 9] the convolution with dilation rate 1 is employed to capture the relation between y 0 m and the rest part of y m , as shown by the blue box in Figure 5 . Alternatively, the convolutions with dilation rate 2, 4 and 8 are designed for learning the mappingĥ to transform y 0 m into y s , as shown by the green boxes in Figure 5 . Thus, JPU can extract multi-scale context information from multi-level feature maps, which leads to a better performance. This is significantly different from ASPP [6] , which only exploit the information in the last feature map. The extracted features encode the mapping between y 0 m and y s as well as the relation between y 0 m and the rest part of y m . Thus, another regular convolution block is employed, which transforms the features into the final predictions ( Fig. 4c) . Notably, the proposed JPU module solves two closely related joint upsampling problems jointly, which are (1) upsampling Conv4 based on Conv3 (the 4th convolution stage), and (2) upscaling Conv5 with the guidance of the enlarged Conv4 (the 5th convolution stage). 
Experiment","In this section, we first introduce the datase...",Dataset Pascal Context dataset [23] is based o...,To show the effectiveness of the proposed meth...,"Pascal Context In Table 1 , our method employs..."


In [261]:
# Preview the first 10 positive (label == True) rows of the training frame.
# NOTE(review): in the saved output every Context value reads "N o n e" --
# looks like a None was stringified and space-joined upstream; confirm.
train.loc[train["label"] == True].head(10)

Unnamed: 0,label,title,TDM,Context
0,True,1607.07155v1,Face Detection#WIDER Face (Hard)#AP,N o n e
1,True,1607.07155v1,Pedestrian Detection#Caltech#Reasonable Miss Rate,N o n e
12,True,1911.01616v4,Aspect Sentiment Triplet Extraction#SemEval#F1,N o n e
23,True,1412.7259v3,Image Classification#MNIST#Percentage error,N o n e
24,True,1412.7259v3,Image Classification#STL-10#Percentage correct,N o n e
35,True,1704.07156v1,Part-Of-Speech Tagging#Penn Treebank#Accuracy,N o n e
36,True,1704.07156v1,Grammatical Error Detection#FCE#F0.5,N o n e
37,True,1704.07156v1,Grammatical Error Detection#CoNLL-2014 A2#F0.5,N o n e
38,True,1704.07156v1,Grammatical Error Detection#CoNLL-2014 A1#F0.5,N o n e
49,True,2003.12060v1,Few-Shot Image Classification#CUB 200 5-way 1-...,N o n e


In [262]:
# Inspect the rows generated for one paper: the saved output shows its
# positive TDM row followed by negative (label == False) candidates.
train.loc[train["title"] == "1911.08251v2"].head()

Unnamed: 0,label,title,TDM,Context
49978,True,1911.08251v2,Image Classification#STL-10#Percentage correct,N o n e
49979,False,1911.08251v2,3D Absolute Human Pose Estimation#Human3.6M#MRPE,N o n e
49980,False,1911.08251v2,3D Action Recognition#100 sleep nights of 8 ca...,N o n e
49981,False,1911.08251v2,3D Canonical Hand Pose Estimation#Ego3DHands#AUC,N o n e
49982,False,1911.08251v2,3D Canonical Hand Pose Estimation#RHP#AUC,N o n e


In [263]:
# Preview rows whose TDM label is the literal placeholder "unknown".
train.loc[train["TDM"] == "unknown"].head()

Unnamed: 0,label,title,TDM,Context
103,True,2005.03059v3,unknown,N o n e
235,True,2102.02717v3,unknown,N o n e
276,True,2102.05126v1,unknown,N o n e
299,True,2008.06223v2,unknown,N o n e
346,True,2003.06520v1,unknown,N o n e


In [None]:
# Store the length of each row's Context in a new "len" column.
# (The assignment operator was missing here, which made the cell a SyntaxError.)
train["len"] = train["Context"].apply(len)

In [171]:
# Load the fold-1 training split; the TSV has no header row, so column names
# are supplied explicitly.
# NOTE(review): absolute, user-specific path -- this only resolves on the
# original author's machine; consider a configurable data directory.
tdm = pd.read_csv(
    "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/paperwithcode/new/60Neg800unk/twofoldwithunk/fold1/train.tsv",
    sep="\t",
    names=["label", "title", "TDM", "Context"],
)

In [172]:
# First five rows of the loaded split.
tdm.head(n=5)

Unnamed: 0,label,title,TDM,Context
0,True,1810.02575v1.pdf,Semantic Segmentation; Nighttime Driving; mIoU,Dark Model Adaptation: Semantic Image Segmenta...
1,False,1810.02575v1.pdf,Extractive Text Summarization; DebateSum; ROUGE-L,Dark Model Adaptation: Semantic Image Segmenta...
2,False,1810.02575v1.pdf,Action Recognition; Something-Something V1; To...,Dark Model Adaptation: Semantic Image Segmenta...
3,False,1810.02575v1.pdf,Multi-Object Tracking; MOTS20; sMOTSA,Dark Model Adaptation: Semantic Image Segmenta...
4,False,1810.02575v1.pdf,Continuous Control; PyBullet Ant; Return,Dark Model Adaptation: Semantic Image Segmenta...


In [173]:
# Last five rows of the loaded split.
tdm.tail(n=5)

Unnamed: 0,label,title,TDM,Context
256003,False,1307.0414v1.pdf,Skeleton Based Action Recognition; SHREC 2017 ...,Challenges in Representation Learning: A repor...
256004,False,1307.0414v1.pdf,Fake News Detection; FNC-1; Weighted Accuracy,Challenges in Representation Learning: A repor...
256005,False,1307.0414v1.pdf,Multimodal Unsupervised Image-To-Image Transla...,Challenges in Representation Learning: A repor...
256006,False,1307.0414v1.pdf,Graph Classification; PTC; Accuracy,Challenges in Representation Learning: A repor...
256007,False,1307.0414v1.pdf,Pose Estimation; UPenn Action; Mean PCK@0.2,Challenges in Representation Learning: A repor...


In [242]:
# Show the first row recorded for one specific paper.
# NOTE(review): the saved output shows TDM == "unknow" here, while the `train`
# frame inspected earlier uses "unknown" -- confirm which spelling is intended.
tdm.loc[tdm["title"] == "1908.05786v1.pdf"].head(1)

Unnamed: 0,label,title,TDM,Context
610,True,1908.05786v1.pdf,unknow,TASED-Net: Temporally-Aggregating Spatial Enco...


In [148]:
# List TDM labels that occur more than 800 times.
# NOTE(review): this expects `tdm` to have a `count_tdm` column (the saved
# output shows columns tdm/count_tdm), but the `tdm` loaded from train.tsv in
# this notebook has label/title/TDM/Context. Presumably `tdm` was bound to a
# different frame when this cell ran (In [148]); it will fail on a fresh
# Restart & Run All -- confirm and rename one of the two frames.
tdm.loc[tdm["count_tdm"] > 800].head()

Unnamed: 0,tdm,count_tdm
703,Skeleton Based Action Recognition#NTU RGB+D#Ac...,896
704,Skeleton Based Action Recognition#NTU RGB+D#Ac...,912


In [121]:
import re
# Scratch check of the text clean-up regex on a small sample string.
# Earlier, stricter variant kept for reference:
# re.sub(r"[^a-zA-Z0-9?,'’‘´`%]+", '', "Hell\no*%& \n").strip()
# Current variant: drop every run of newlines, then trim surrounding whitespace.
f = re.sub(pattern=r"[\n]+", repl='', string="Hell\no*%& \n").strip()

In [123]:
# Display the string produced by the newline-stripping substitution.
f

'Hello*%&'