# Dataset Creation 

We have the following files:
- `resultsAnnotation.tsv`, 
- `datasetAnnotation.tsv`, 
- `taskAnnotation.tsv`, 
- `paper_links.tsv`, 
- `TDM_taxonomy.tsv`, 
- `TDMs_taxonomy.tsv` 
- `paper_name_taxonomy.tsv` 

They were created mostly from the file `evaluation-tables.json` from [paperswithcode](https://paperswithcode.com/about).

In [17]:
# imports
import ipdb, os, re

from sklearn.model_selection import train_test_split 

In [16]:
def _read_text_lines(file_path):
    """Return the lines of ``file_path``; undecodable bytes are replaced instead of raising."""
    with open(file_path, errors='replace') as handle:
        return handle.read().splitlines()


# Raw lines of each annotation / taxonomy file, one list entry per line.
resultsAnnotation = _read_text_lines("../data/resultsAnnotation.tsv")
datasetAnnotation = _read_text_lines("../data/datasetAnnotation.tsv")
taskAnnotation = _read_text_lines("../data/taskAnnotation.tsv")
TDM_taxonomy = _read_text_lines("../data/TDM_taxonomy.tsv")
paper_name_taxonomy = _read_text_lines("../data/paper_name_taxonomy.tsv")

In [17]:
resultsAnnotation[5]

'1510.05067v4.pdf\tHandwritten Digit Recognition#MNIST#PERCENTAGE ERROR#0.91$Image Classification#STL-10#Percentage correct#57.32$Image Classification#CIFAR-100#Percentage correct#48.75$Image Classification#SVHN#Percentage error#10.16$Image Classification#CIFAR-10#Percentage correct#80.98'

In [18]:
datasetAnnotation[5]

'1510.05067v4.pdf\tMNIST#STL-10#CIFAR-100#SVHN#CIFAR-10'

In [19]:
taskAnnotation[5]

'1510.05067v4.pdf\tHandwritten Digit Recognition'

In [20]:
TDM_taxonomy[9]

'Deblurring#HIDE (trained on GOPRO)#PSNR (sRGB)\t8'

In [21]:
paper_name_taxonomy[5]

'1510.05067v4.pdf\t5'

In [72]:
def _paper_id(filename):
    """Return the base name of ``filename`` with its last dot-suffix removed.

    ``"dir/1510.05067v4.pdf"`` becomes ``"1510.05067v4"``; a name that contains
    no dot yields an empty string.
    """
    return '.'.join(filename.split('/')[-1].split('.')[:-1])


def _read_pairs(path):
    """Return ``(left, right)`` for every line of ``path`` with exactly two tab-separated fields.

    Lines with any other number of fields are skipped. Undecodable bytes are
    replaced instead of raising.
    """
    with open(path, errors='replace') as f:
        rows = [line.split("\t") for line in f.read().splitlines()]
    return [(row[0], row[1]) for row in rows if len(row) == 2]


def _write_split(papers, split_name, out_path, path_parsed_files, paper_TDM,
                 known_tdms, negative_pool, num_negative_instances,
                 allowed_unknown, unknown_count):
    """Write all rows of one split to ``out_path`` and return the updated unknown counter.

    Each row is ``label<TAB>paper_id<TAB>task#dataset#metric<TAB>content``.

    Args:
        papers: file names (relative to ``path_parsed_files``) belonging to the split.
        split_name: name used in the "not annotated" log message.
        out_path: file to (over)write.
        path_parsed_files: directory holding the parsed paper texts.
        paper_TDM: mapping paper id -> ``$``-separated ``task#dataset#metric#score`` entries.
        known_tdms: set of ``task#dataset#metric`` labels kept from the taxonomy.
        negative_pool: sorted list of labels negatives are drawn from.
        num_negative_instances: maximum number of ``false`` rows per paper.
        allowed_unknown: maximum number of ``unknown`` rows across all splits.
        unknown_count: number of ``unknown`` rows written so far.

    Returns:
        The number of ``unknown`` rows written so far, including this split.
    """
    # The file is opened once in "w" mode so stale output from a previous run is replaced.
    with open(out_path, "w", encoding="utf-8") as out:
        for paper in papers:
            paper_id = _paper_id(paper)
            if paper_id not in paper_TDM:
                print(f"Paper {paper_id} from {split_name} not in the resultsAnnotation.tsv file")
                continue

            with open(os.path.join(path_parsed_files, paper), errors='replace') as f:
                content = ' '.join(f.read().splitlines())
            # Tabs would break the TSV layout, so they are collapsed to single spaces.
            content = re.sub(r"[\t]+", ' ', content).strip()

            # Positive rows: each annotated TDM that is in the taxonomy, written once.
            positives = set()
            for contrib in paper_TDM[paper_id].split("$"):
                fields = contrib.split("#")
                if len(fields) != 4:
                    continue
                tdm = "#".join(fields[:3])  # the fourth field (score) is not part of the label
                if tdm in known_tdms and tdm not in positives:
                    positives.add(tdm)
                    out.write(f"true\t{paper_id}\t{tdm}\t{content}\n")

            # Papers with no known TDM get an "unknown" row until the cap is reached.
            if not positives and unknown_count < allowed_unknown:
                out.write(f"true\t{paper_id}\tunknown\t{content}\n")
                unknown_count += 1

            # Negative rows: first labels of the sorted pool, never one of this
            # paper's positives (that would give the same pair both labels).
            # NOTE(review): the selection is deterministic, not random sampling.
            written = 0
            for tdm in negative_pool:
                if written >= num_negative_instances:
                    break
                if tdm in positives:
                    continue
                out.write(f"false\t{paper_id}\t{tdm}\t{content}\n")
                written += 1
    return unknown_count


def create_training_data(path_to_resultsAnnotation, path_to_TDM_taxonomy, path_parsed_files,
                         output_dir, test_set_portion=0.2,
                         leaderboard_threshold=5, num_negative_instances=5, allowed_unknown=10,
                         random_state=None):
    """Build ``train.tsv`` and ``dev.tsv`` from annotated papers and their parsed text.

    The files are written to
    ``{output_dir}{num_negative_instances}Neg{allowed_unknown}Unknown/``
    (``output_dir`` is used as a plain string prefix), replacing any existing
    ``train.tsv`` / ``dev.tsv`` there.

    Args:
        path_to_resultsAnnotation: directory containing ``resultsAnnotation.tsv``
            (``paper<TAB>task#dataset#metric#score$...`` per line).
        path_to_TDM_taxonomy: directory containing ``TDM_taxonomy.tsv``
            (``task#dataset#metric<TAB>count`` per line).
        path_parsed_files: directory with one parsed text file per paper.
        output_dir: prefix of the output directory.
        test_set_portion: share of the papers that goes to ``dev.tsv``.
        leaderboard_threshold: minimum taxonomy count for a TDM label to be kept.
        num_negative_instances: maximum number of ``false`` rows per paper.
        allowed_unknown: maximum number of ``unknown`` rows, counted over
            train and dev together (train is written first).
        random_state: optional seed forwarded to ``train_test_split`` to make
            the split reproducible.

    Raises:
        ValueError: if a taxonomy count is not an integer, or if
            ``train_test_split`` rejects the input (e.g. no parsed files).
    """
    split_dir = f"{output_dir}{num_negative_instances}Neg{allowed_unknown}Unknown/"
    os.makedirs(split_dir, exist_ok=True)

    # paper id -> raw "$"-separated TDMS string; a later duplicate line wins.
    paper_TDM = {
        _paper_id(title): tdms_list
        for title, tdms_list in _read_pairs(f"{path_to_resultsAnnotation}/resultsAnnotation.tsv")
    }

    known_tdms = {
        tdm
        for tdm, count in _read_pairs(f"{path_to_TDM_taxonomy}/TDM_taxonomy.tsv")
        if int(count) >= leaderboard_threshold
    }
    # Only well-formed task#dataset#metric labels can serve as negatives.
    negative_pool = sorted(tdm for tdm in known_tdms if len(tdm.split("#")) == 3)

    # Sorted so that a fixed random_state gives the same split on every machine.
    list_parsed_pdf = sorted(
        name for name in os.listdir(path_parsed_files) if name != '.ipynb_checkpoints'
    )

    # TODO: would a split stratified on the label be worthwhile?
    train, valid = train_test_split(list_parsed_pdf, test_size=test_set_portion,
                                    shuffle=True, random_state=random_state)

    unknown_count = _write_split(train, "train", f"{split_dir}train.tsv", path_parsed_files,
                                 paper_TDM, known_tdms, negative_pool,
                                 num_negative_instances, allowed_unknown, 0)
    _write_split(valid, "validation", f"{split_dir}dev.tsv", path_parsed_files,
                 paper_TDM, known_tdms, negative_pool,
                 num_negative_instances, allowed_unknown, unknown_count)
        

In [73]:
# Absolute, machine-specific input directories; each ends with a trailing slash.
# In the cells visible here only path_grobid_full_txt is passed to
# create_training_data; the two LaTeX-source paths are not referenced.
# NOTE(review): consider deriving these from one configurable base directory.
path_grobid_full_txt = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/paperwithcode/pdf_txt/"
path_latex_source_tex = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/exp/arxiv_src/"
path_latex_source_pandoc_txt = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/exp/arxiv_src_txt/"

In [86]:
# Generate train.tsv / dev.tsv from the files under path_grobid_full_txt.
create_training_data(
    path_to_resultsAnnotation="../data/",
    path_to_TDM_taxonomy="../data/",
    path_parsed_files=path_grobid_full_txt,
    output_dir="../data/",
    leaderboard_threshold=5,
    num_negative_instances=60,
    allowed_unknown=800,
)

## View created data

In [1]:
import pandas as pd

In [2]:
train_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/paperwithcode/new/60Neg800unk/twofoldwithunk/fold1/train.tsv"
valid_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/paperwithcode/new/60Neg800unk/twofoldwithunk/fold1/dev.tsv"

In [4]:
# Both split files are tab-separated and share the same four columns.
split_columns = ["label", "title", "TDM", "Context"]

train = pd.read_csv(train_csv, sep="\t", names=split_columns)
valid = pd.read_csv(valid_csv, sep="\t", names=split_columns)

In [5]:
train.head()

Unnamed: 0,label,title,TDM,Context
0,True,1810.02575v1.pdf,Semantic Segmentation; Nighttime Driving; mIoU,Dark Model Adaptation: Semantic Image Segmenta...
1,False,1810.02575v1.pdf,Extractive Text Summarization; DebateSum; ROUGE-L,Dark Model Adaptation: Semantic Image Segmenta...
2,False,1810.02575v1.pdf,Action Recognition; Something-Something V1; To...,Dark Model Adaptation: Semantic Image Segmenta...
3,False,1810.02575v1.pdf,Multi-Object Tracking; MOTS20; sMOTSA,Dark Model Adaptation: Semantic Image Segmenta...
4,False,1810.02575v1.pdf,Continuous Control; PyBullet Ant; Return,Dark Model Adaptation: Semantic Image Segmenta...


In [6]:
train.tail()

Unnamed: 0,label,title,TDM,Context
256003,False,1307.0414v1.pdf,Skeleton Based Action Recognition; SHREC 2017 ...,Challenges in Representation Learning: A repor...
256004,False,1307.0414v1.pdf,Fake News Detection; FNC-1; Weighted Accuracy,Challenges in Representation Learning: A repor...
256005,False,1307.0414v1.pdf,Multimodal Unsupervised Image-To-Image Transla...,Challenges in Representation Learning: A repor...
256006,False,1307.0414v1.pdf,Graph Classification; PTC; Accuracy,Challenges in Representation Learning: A repor...
256007,False,1307.0414v1.pdf,Pose Estimation; UPenn Action; Mean PCK@0.2,Challenges in Representation Learning: A repor...


In [7]:
valid.head()

Unnamed: 0,label,title,TDM,Context
0,True,1707.03497v2.pdf,Atari Games; Atari 2600 Seaquest; Score,Value Prediction Network This paper proposes a...
1,True,1707.03497v2.pdf,Atari Games; Atari 2600 Amidar; Score,Value Prediction Network This paper proposes a...
2,True,1707.03497v2.pdf,Atari Games; Atari 2600 Krull; Score,Value Prediction Network This paper proposes a...
3,True,1707.03497v2.pdf,Atari Games; Atari 2600 Alien; Score,Value Prediction Network This paper proposes a...
4,True,1707.03497v2.pdf,Atari Games; Atari 2600 Enduro; Score,Value Prediction Network This paper proposes a...


In [8]:
valid.tail()

Unnamed: 0,label,title,TDM,Context
108451,False,1909.09051v1.pdf,Image Classification; EMNIST-Balanced; Accuracy,Self-Supervised Monocular Depth Hints Monocula...
108452,False,1909.09051v1.pdf,Question Answering; TrecQA; MAP,Self-Supervised Monocular Depth Hints Monocula...
108453,False,1909.09051v1.pdf,Text Classification; IMDb; Accuracy (10 classes),Self-Supervised Monocular Depth Hints Monocula...
108454,False,1909.09051v1.pdf,Image-to-Image Translation; CelebA-HQ; LPIPS,Self-Supervised Monocular Depth Hints Monocula...
108455,False,1909.09051v1.pdf,Few-Shot Transfer Learning for Saliency Predic...,Self-Supervised Monocular Depth Hints Monocula...


In [9]:
# Look up the rows of a single paper by title.
# NOTE(review): this returned no rows; the titles shown by train.head() carry a
# ".pdf" suffix, so the literal here may need one to match — confirm.
train[train.title=="1911.08251v2"].head()

Unnamed: 0,label,title,TDM,Context


In [10]:
# Inspect the rows labelled as having no known TDM.
# NOTE(review): this returned no rows; a row displayed further down for the same
# train.tsv carries the label "unknow", so the spelling in this file may differ
# from "unknown" — confirm.
train[train.TDM=="unknown"].head()

Unnamed: 0,label,title,TDM,Context


In [11]:
# train["len"]=train.Context.apply(lambda content: len(content.split()),)

# Convert IBM Data to our Data format

## Train - Test

In [33]:
IBM_train_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/ibm/exp/few-shot-setup/NLP-TDMS/paperVersion/train.tsv"
IBM_test_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/ibm/exp/few-shot-setup/NLP-TDMS/paperVersion/test.tsv"

In [34]:
# The IBM split files are tab-separated with the same four columns.
ibm_columns = ["label", "title", "TDM", "Context"]

train_IBM = pd.read_csv(IBM_train_csv, sep="\t", names=ibm_columns)
test_IBM = pd.read_csv(IBM_test_csv, sep="\t", names=ibm_columns)

In [35]:
train_IBM.head()

Unnamed: 0,label,title,TDM,Context
0,True,D16-1036.pdf,unknow,Multi-view Response Selection for Human-Comput...
1,False,D16-1036.pdf,"question answering, SQuAD, F1",Multi-view Response Selection for Human-Comput...
2,False,D16-1036.pdf,"relation prediction, FB15K-237, H@1",Multi-view Response Selection for Human-Comput...
3,False,D16-1036.pdf,"word sense disambiguation, SemEval 2013, F1",Multi-view Response Selection for Human-Comput...
4,False,D16-1036.pdf,"language modeling, 1B Words / Google Billion W...",Multi-view Response Selection for Human-Comput...


In [36]:
test_IBM.head()

Unnamed: 0,label,title,TDM,Context
0,True,1803.11175.pdf,"sentiment analysis, SUBJ, Accuracy",Universal Sentence Encoder We present models f...
1,True,1803.11175.pdf,"text classification, TREC, Error",Universal Sentence Encoder We present models f...
2,False,1803.11175.pdf,"question answering, SQuAD, F1",Universal Sentence Encoder We present models f...
3,False,1803.11175.pdf,"relation prediction, FB15K-237, H@1",Universal Sentence Encoder We present models f...
4,False,1803.11175.pdf,"word sense disambiguation, SemEval 2013, F1",Universal Sentence Encoder We present models f...


In [37]:
def _commas_to_semicolons(tdm_label):
    """Replace every run of commas in ``tdm_label`` with one ';' and strip outer whitespace."""
    return re.sub(r"[,]+", ';', tdm_label).strip()


# Rewrite the TDM column of both IBM frames in place.
for _ibm_frame in (train_IBM, test_IBM):
    _ibm_frame["TDM"] = _ibm_frame.TDM.apply(_commas_to_semicolons)

In [38]:
train_IBM.head()

Unnamed: 0,label,title,TDM,Context
0,True,D16-1036.pdf,unknow,Multi-view Response Selection for Human-Comput...
1,False,D16-1036.pdf,question answering; SQuAD; F1,Multi-view Response Selection for Human-Comput...
2,False,D16-1036.pdf,relation prediction; FB15K-237; H@1,Multi-view Response Selection for Human-Comput...
3,False,D16-1036.pdf,word sense disambiguation; SemEval 2013; F1,Multi-view Response Selection for Human-Comput...
4,False,D16-1036.pdf,language modeling; 1B Words / Google Billion W...,Multi-view Response Selection for Human-Comput...


In [39]:
train_IBM.tail()

Unnamed: 0,label,title,TDM,Context
13301,False,16165.pdf,dependency parsing; Penn Treebank; LAS,R 3 : Reinforced Ranker-Reader for Open-Domain...
13302,False,16165.pdf,language modeling; Hutter Prize; Number of params,R 3 : Reinforced Ranker-Reader for Open-Domain...
13303,False,16165.pdf,summarization; CNN / Daily Mail (Non-anonymize...,R 3 : Reinforced Ranker-Reader for Open-Domain...
13304,False,16165.pdf,word sense disambiguation; Senseval 3; F1,R 3 : Reinforced Ranker-Reader for Open-Domain...
13305,False,16165.pdf,text classification; TREC; Error,R 3 : Reinforced Ranker-Reader for Open-Domain...


In [40]:
test_IBM.tail()

Unnamed: 0,label,title,TDM,Context
13066,False,C18-1121.pdf,dependency parsing; Penn Treebank; LAS,Ensure the Correctness of the Summary: Incorpo...
13067,False,C18-1121.pdf,language modeling; Hutter Prize; Number of params,Ensure the Correctness of the Summary: Incorpo...
13068,False,C18-1121.pdf,summarization; CNN / Daily Mail (Non-anonymize...,Ensure the Correctness of the Summary: Incorpo...
13069,False,C18-1121.pdf,word sense disambiguation; Senseval 3; F1,Ensure the Correctness of the Summary: Incorpo...
13070,False,C18-1121.pdf,text classification; TREC; Error,Ensure the Correctness of the Summary: Incorpo...


In [45]:
# Persist the rewritten IBM frames as tab-separated files next to the originals.
# header=None / index=False: the frames re-read from these files below start
# directly with a data row, i.e. no header line or index column is written.
train_IBM.to_csv(path_or_buf="/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/ibm/exp/few-shot-setup/NLP-TDMS/paperVersion/train_v2.tsv", 
                 sep="\t", header=None, index=False)

test_IBM.to_csv(path_or_buf="/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/ibm/exp/few-shot-setup/NLP-TDMS/paperVersion/test_v2.tsv", 
                 sep="\t", header=None, index=False)

In [46]:
# Locations of the converted IBM split files.
IBM_train_v2_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/ibm/exp/few-shot-setup/NLP-TDMS/paperVersion/train_v2.tsv"
IBM_test_v2_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/ibm/exp/few-shot-setup/NLP-TDMS/paperVersion/test_v2.tsv"

# Read both files back with the same four-column layout.
ibm_v2_columns = ["label", "title", "TDM", "Context"]
train_v2_IBM, test_v2_IBM = (
    pd.read_csv(tsv_path, sep="\t", names=ibm_v2_columns)
    for tsv_path in (IBM_train_v2_csv, IBM_test_v2_csv)
)

In [47]:
train_v2_IBM.head()

Unnamed: 0,label,title,TDM,Context
0,True,D16-1036.pdf,unknow,Multi-view Response Selection for Human-Comput...
1,False,D16-1036.pdf,question answering; SQuAD; F1,Multi-view Response Selection for Human-Comput...
2,False,D16-1036.pdf,relation prediction; FB15K-237; H@1,Multi-view Response Selection for Human-Comput...
3,False,D16-1036.pdf,word sense disambiguation; SemEval 2013; F1,Multi-view Response Selection for Human-Comput...
4,False,D16-1036.pdf,language modeling; 1B Words / Google Billion W...,Multi-view Response Selection for Human-Comput...


In [48]:
test_v2_IBM.tail()

Unnamed: 0,label,title,TDM,Context
13066,False,C18-1121.pdf,dependency parsing; Penn Treebank; LAS,Ensure the Correctness of the Summary: Incorpo...
13067,False,C18-1121.pdf,language modeling; Hutter Prize; Number of params,Ensure the Correctness of the Summary: Incorpo...
13068,False,C18-1121.pdf,summarization; CNN / Daily Mail (Non-anonymize...,Ensure the Correctness of the Summary: Incorpo...
13069,False,C18-1121.pdf,word sense disambiguation; Senseval 3; F1,Ensure the Correctness of the Summary: Incorpo...
13070,False,C18-1121.pdf,text classification; TREC; Error,Ensure the Correctness of the Summary: Incorpo...


## DatasetAnnotation

In [72]:
datasetAnnotation_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/paperwithcode/annotations/datasetAnnotation.tsv"
resultsAnnotation_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/paperwithcode/annotations/resultsAnnotation.tsv"
taskAnnotation_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/paperwithcode/annotations/taskAnnotation.tsv"

datasetAnnotation_IBM_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/ibm/NLP-TDMS/annotations/datasetAnnotation.tsv"
resultsAnnotation_IBM_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/ibm/NLP-TDMS/annotations/resultsAnnotation.tsv"
taskAnnotation_IBM_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/ibm/NLP-TDMS/annotations/taskAnnotation.tsv"

In [73]:
def _read_annotation(tsv_path, value_column):
    """Load a tab-separated annotation file into a frame with columns ``label`` and ``value_column``."""
    return pd.read_csv(tsv_path, sep="\t", names=["label", value_column])


# NOTE: datasetAnnotation, resultsAnnotation and taskAnnotation were bound to
# lists of raw lines near the top of the notebook; from here on they are DataFrames.
datasetAnnotation = _read_annotation(datasetAnnotation_csv, "datasets")
resultsAnnotation = _read_annotation(resultsAnnotation_csv, "TDMS")
taskAnnotation = _read_annotation(taskAnnotation_csv, "tasks")

datasetAnnotation_IBM = _read_annotation(datasetAnnotation_IBM_csv, "datasets")
resultsAnnotation_IBM = _read_annotation(resultsAnnotation_IBM_csv, "TDMS")
taskAnnotation_IBM = _read_annotation(taskAnnotation_IBM_csv, "tasks")

In [74]:
datasetAnnotation_IBM.head()

Unnamed: 0,label,datasets
0,1705.05952.pdf,UD
1,D18-1205.pdf,CNN / Daily Mail (Non-anonymized version)
2,C18-1121.pdf,Gigaword#DUC 2004 Task 1
3,P18-1063.pdf,CNN / Daily Mail (Anonymized version)
4,5635-grammar-as-a-foreign-language.pdf,Penn Treebank


In [75]:
datasetAnnotation.head()

Unnamed: 0,label,datasets
0,1704.03549v4.pdf,FSNS - Test
1,1712.05404.pdf,FSNS - Test
2,1702.03970v1.pdf,FSNS - Test
3,2104.02324v1.pdf,COCO#PASCAL VOC 07+12#PASCAL VOC 07+12#COCO#PA...
4,2008.12995v3.pdf,BanglaLekha Isolated Dataset#BanglaLekha Isola...


In [76]:
resultsAnnotation.tail()

Unnamed: 0,label,TDMS
5726,2104.01378v1.pdf,Phone-level pronunciation scoring#speechocean7...
5727,2104.10283v1.pdf,Graph Question Answering#GQA#Accuracy#96.30
5728,2104.11980v1.pdf,Trajectory Modeling#NBA SportVU#1x1 NLL#0.472
5729,1704.00077v1.pdf,Video Segmentation#SegTrack v2#Accuracy#86.86
5730,2004.07922v1.pdf,Document Text Classification#Tobacco small-348...


In [71]:
resultsAnnotation_IBM.tail()

Unnamed: 0,label,TDMS
341,1704.08381.pdf,amr_parsing#LDC2015E86#Smatch#62.1
342,1803.09074.pdf,question_answering#RACE#Accuracy on RACE-m#60....
343,1602.02373.pdf,text_classification#AG News#Error#6.57$text_cl...
344,N18-2108.pdf,coreference_resolution#CoNLL 2012#Avg F1#73.0
345,P17-1089.pdf,sql_parsing#ATIS#Question Split#45$sql_parsing...


In [81]:
taskAnnotation.tail()

Unnamed: 0,label,tasks
5724,2104.01378v1.pdf,Phone-level pronunciation scoring
5725,2104.10283v1.pdf,Graph Question Answering
5726,2104.11980v1.pdf,Trajectory Modeling
5727,1704.00077v1.pdf,Video Segmentation
5728,2004.07922v1.pdf,Document Text Classification


In [78]:
taskAnnotation_IBM.tail()

Unnamed: 0,label,tasks
342,1704.08381.pdf,amr_parsing
343,1803.09074.pdf,question_answering
344,1602.02373.pdf,sentiment_analysis
345,N18-2108.pdf,coreference_resolution
346,P17-1089.pdf,sql_parsing


In [171]:
tdm = pd.read_csv(f"/nfs/home/kabenamualus/Research/task-dataset-metric-extraction/data/paperwithcode/new/60Neg800unk/twofoldwithunk/fold1/train.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])

In [172]:
tdm.head()

Unnamed: 0,label,title,TDM,Context
0,True,1810.02575v1.pdf,Semantic Segmentation; Nighttime Driving; mIoU,Dark Model Adaptation: Semantic Image Segmenta...
1,False,1810.02575v1.pdf,Extractive Text Summarization; DebateSum; ROUGE-L,Dark Model Adaptation: Semantic Image Segmenta...
2,False,1810.02575v1.pdf,Action Recognition; Something-Something V1; To...,Dark Model Adaptation: Semantic Image Segmenta...
3,False,1810.02575v1.pdf,Multi-Object Tracking; MOTS20; sMOTSA,Dark Model Adaptation: Semantic Image Segmenta...
4,False,1810.02575v1.pdf,Continuous Control; PyBullet Ant; Return,Dark Model Adaptation: Semantic Image Segmenta...


In [173]:
tdm.tail()

Unnamed: 0,label,title,TDM,Context
256003,False,1307.0414v1.pdf,Skeleton Based Action Recognition; SHREC 2017 ...,Challenges in Representation Learning: A repor...
256004,False,1307.0414v1.pdf,Fake News Detection; FNC-1; Weighted Accuracy,Challenges in Representation Learning: A repor...
256005,False,1307.0414v1.pdf,Multimodal Unsupervised Image-To-Image Transla...,Challenges in Representation Learning: A repor...
256006,False,1307.0414v1.pdf,Graph Classification; PTC; Accuracy,Challenges in Representation Learning: A repor...
256007,False,1307.0414v1.pdf,Pose Estimation; UPenn Action; Mean PCK@0.2,Challenges in Representation Learning: A repor...


In [242]:
tdm[tdm.title =="1908.05786v1.pdf" ].head(1)

Unnamed: 0,label,title,TDM,Context
610,True,1908.05786v1.pdf,unknow,TASED-Net: Temporally-Aggregating Spatial Enco...


In [148]:
# Show the TDM labels whose count exceeds 800.
# NOTE(review): the output displayed for this cell has columns `tdm` and `count_tdm`,
# whereas the `tdm` frame loaded in this section is read with the columns
# label/title/TDM/Context. The cell's execution count (148) is also lower than that
# of the load (171), so it appears to rely on an earlier, different `tdm` frame that
# is not defined in the visible notebook — confirm before re-running top to bottom.
tdm[tdm.count_tdm>800].head()

Unnamed: 0,tdm,count_tdm
703,Skeleton Based Action Recognition#NTU RGB+D#Ac...,896
704,Skeleton Based Action Recognition#NTU RGB+D#Ac...,912


In [121]:
import re
# re.sub(r"[^a-zA-Z0-9?,'’‘´`%]+", '', "Hell\no*%& \n").strip()
f = re.sub(r"[\n]+", '', "Hell\no*%& \n").strip()

In [123]:
f

'Hello*%&'