# Stats in Paper

In [119]:
# imports
import ipdb, os, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

## IBM

In [120]:
# Train / test TSV splits of the IBM NLP-TDMS few-shot setup ("paperVersion").
# NOTE(review): both paths are relative to the user's home directory ("~"), so they
# only resolve on a machine that has this repository checked out under ~/Research.
IBM_train_csv = "~/Research/task-dataset-metric-nli-extraction/data/ibm/exp/few-shot-setup/NLP-TDMS/paperVersion/train.tsv"
IBM_test_csv = "~/Research/task-dataset-metric-nli-extraction/data/ibm/exp/few-shot-setup/NLP-TDMS/paperVersion/test.tsv"

In [4]:
# Load both IBM splits with explicit column names:
# label, paper title (file name), TDM string and paper context.
ibm_columns = ["label", "title", "TDM", "Context"]
train_IBM, test_IBM = (
    pd.read_csv(tsv_path, sep="\t", names=ibm_columns)
    for tsv_path in (IBM_train_csv, IBM_test_csv)
)

In [2]:
def get_stats(path_to_df):
    """Print leaderboard statistics for a labelled TDM dataframe.

    Despite its name, ``path_to_df`` is the dataframe itself, not a path.
    Only rows whose ``label`` is True are taken into account.

    Args:
        path_to_df: DataFrame with at least the columns ``label``, ``title``
            and ``TDM``. ``TDM`` holds either the literal ``unknown`` or a
            ``task; dataset; metric`` string.

    Returns:
        defaultdict mapping each paper title to its number of positive,
        non-"unknown" TDM rows (titles that are absent default to 0).

    Raises:
        ValueError: if a positive TDM string has exactly two ``;``-separated
            fields, i.e. it is neither "unknown"-like nor a full triple.
    """
    # Element-wise comparison on the pandas column; filter once and reuse.
    positives = path_to_df[path_to_df.label == True]  # noqa: E712
    leaderboards = positives[positives.TDM != 'unknown']

    TDM = set()
    Uniq_task = set()
    Uniq_dataset = set()
    Uniq_metric = set()
    unknown_count = 0

    for contrib in positives.TDM.tolist():
        fields = contrib.split(';')

        if len(fields) == 1:
            # No ';' separator at all: counted as an "unknown" annotation.
            unknown_count += 1
            continue

        if len(fields) < 3:
            raise ValueError(
                f"Malformed TDM entry (expected 'task; dataset; metric'): {contrib!r}"
            )

        # Extra ';'-separated fields are ignored: only the first three are used.
        t, d, m = (field.strip() for field in fields[:3])
        TDM.add(f"{t}#{d}#{m}")

        Uniq_task.add(t)
        Uniq_dataset.add(d)
        Uniq_metric.add(m)

    # Number of real (non-"unknown") positive leaderboards per paper.
    avg_tdm_per_paper = defaultdict(int)
    for paper in leaderboards.title.tolist():
        avg_tdm_per_paper[paper] += 1

    per_paper = list(avg_tdm_per_paper.values())
    if per_paper:
        avg_lb = round(np.mean(per_paper), 2)
        max_lb = round(np.max(per_paper), 2)
        min_lb = round(np.min(per_paper), 2)
    else:
        # No leaderboard at all: np.max / np.min would raise on an empty list.
        avg_lb = max_lb = min_lb = 0

    print(f"Number of papers: {len(set(positives.title.tolist()))}")
    print(f"Unknown count: {unknown_count}")
    print(f"Total leaderboards: {len(leaderboards)}")
    print(f"Avg leaderboard per paper: {avg_lb}")
    print(f"Distinc leaderboard: {len(TDM)}")
    print(f"Distinct taks: {len(Uniq_task)}")
    print(f"Distinc datasets: {len(Uniq_dataset)}")
    print(f"Distinc metrics: {len(Uniq_metric)}")
    print(f"Max leaderboard per paper: {max_lb}")
    print(f"Min leaderboard per paper: {min_lb}")

    return avg_tdm_per_paper

### Train

In [6]:
train_IBM.head()

Unnamed: 0,label,title,TDM,Context
0,True,D16-1036.pdf,unknown,Multi-view Response Selection for Human-Comput...
1,False,D16-1036.pdf,question answering; SQuAD; F1,Multi-view Response Selection for Human-Comput...
2,False,D16-1036.pdf,relation prediction; FB15K-237; H@1,Multi-view Response Selection for Human-Comput...
3,False,D16-1036.pdf,word sense disambiguation; SemEval 2013; F1,Multi-view Response Selection for Human-Comput...
4,False,D16-1036.pdf,language modeling; 1B Words / Google Billion W...,Multi-view Response Selection for Human-Comput...


In [7]:
train_IBM.title.nunique()

170

In [8]:
avg_tdm_per_paper = get_stats(train_IBM)

Number of papers: 170
Unknown count: 46
Total leaderboards: 327
Avg leaderboard per paper: 2.64
Distinc leaderboard: 78
Distinct taks: 18
Distinc datasets: 44
Distinc metrics: 31


### Test

In [9]:
test_IBM.head()

Unnamed: 0,label,title,TDM,Context
0,True,1803.11175.pdf,sentiment analysis; SUBJ; Accuracy,Universal Sentence Encoder We present models f...
1,True,1803.11175.pdf,text classification; TREC; Error,Universal Sentence Encoder We present models f...
2,False,1803.11175.pdf,question answering; SQuAD; F1,Universal Sentence Encoder We present models f...
3,False,1803.11175.pdf,relation prediction; FB15K-237; H@1,Universal Sentence Encoder We present models f...
4,False,1803.11175.pdf,word sense disambiguation; SemEval 2013; F1,Universal Sentence Encoder We present models f...


In [10]:
metric = get_stats(test_IBM)

Number of papers: 167
Unknown count: 45
Total leaderboards: 294
Avg leaderboard per paper: 2.41
Distinc leaderboard: 78
Distinct taks: 18
Distinc datasets: 44
Distinc metrics: 31


In [11]:
# Make sure that every leaderboard (TDM label) in test is also present in train.
# The train labels are put in a set once, instead of rebuilding and scanning
# the full list for every test row.
train_tdms = set(train_IBM.TDM.to_list())
count = [tdm for tdm in test_IBM.TDM.to_list() if tdm not in train_tdms]
print(count)

[]


In [12]:
# Reverse check: every leaderboard (TDM label) in train must also occur in test.
# The test labels are put in a set once, instead of rebuilding and scanning
# the full list for every train row.
test_tdms = set(test_IBM.TDM.to_list())
count = []
for tdm in train_IBM.TDM.to_list():
    if tdm not in test_tdms:
        print(tdm)
        count.append(tdm)
print(count)

[]


### Our dataset

In [78]:
# New_train_csv = "~/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_800/twofoldwithunk/fold1/train.tsv"
# New_test_csv = "~/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_800/twofoldwithunk/fold1/dev.tsv"

# Active selection: fold2 train/dev splits of the pwc_ibm_150_5_10_10000 (10Neg10000unk) dataset.
# The commented-out pairs around this one switch to other dataset sizes / folds.
# NOTE(review): absolute paths under one user's NFS home — not portable as written.
New_train_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk/fold2/train.tsv"
New_test_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk/fold2/dev.tsv"

# New_train_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_5000/10Neg5000unk/twofoldwithunk/fold2/train.tsv"
# New_test_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_5000/10Neg5000unk/twofoldwithunk/fold2/dev.tsv"

# New_train_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_1000/10Neg1000unk/twofoldwithunk/fold2/train.tsv"
# New_test_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_1000/10Neg1000unk/twofoldwithunk/fold2/dev.tsv"

# New_train_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_500/10Neg500unk/twofoldwithunk/fold1/train.tsv"
# New_test_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_500/10Neg500unk/twofoldwithunk/fold1/dev.tsv"

# New_train_csv = IBM_train_csv
# New_test_csv = IBM_test_csv

In [91]:
# Load the selected train / dev splits with explicit column names:
# label, paper title (file name), TDM string and paper context.
new_columns = ["label", "title", "TDM", "Context"]
train_New, test_New = (
    pd.read_csv(tsv_path, sep="\t", names=new_columns)
    for tsv_path in (New_train_csv, New_test_csv)
)

In [92]:
len(train_New.drop_duplicates())

50821

In [93]:
len(train_New)

52315

In [94]:
len(train_New[(train_New.TDM=="unknown") & (train_New.label==False)])

8

In [102]:
train_New[ (train_New.label==True)]

Unnamed: 0,label,title,TDM,Context
0,True,1707.03497v2.pdf,Atari Games; Atari 2600 Seaquest; Score,Value Prediction Network This paper proposes a...
1,True,1707.03497v2.pdf,Atari Games; Atari 2600 Amidar; Score,Value Prediction Network This paper proposes a...
2,True,1707.03497v2.pdf,Atari Games; Atari 2600 Krull; Score,Value Prediction Network This paper proposes a...
3,True,1707.03497v2.pdf,Atari Games; Atari 2600 Alien; Score,Value Prediction Network This paper proposes a...
4,True,1707.03497v2.pdf,Atari Games; Atari 2600 Enduro; Score,Value Prediction Network This paper proposes a...
...,...,...,...,...
52290,True,1905.11946v5.pdf,Fine-Grained Image Classification; Birdsnap; A...,EfficientNet: Rethinking Model Scaling for Con...
52291,True,1905.11946v5.pdf,Fine-Grained Image Classification; Food-101; A...,EfficientNet: Rethinking Model Scaling for Con...
52292,True,1905.11946v5.pdf,Fine-Grained Image Classification; Stanford Ca...,EfficientNet: Rethinking Model Scaling for Con...
52293,True,1905.11946v5.pdf,Fine-Grained Image Classification; Oxford-IIIT...,EfficientNet: Rethinking Model Scaling for Con...


In [95]:
train_New[(train_New.TDM=="unknown") & (train_New.label==True)]

Unnamed: 0,label,title,TDM,Context
125,True,1908.05786v1.pdf,unknown,TASED-Net: Temporally-Aggregating Spatial Enco...
176,True,2008.04259.pdf,unknown,A Perceptually-Motivated Approach for Low-Comp...
188,True,2009.04534v2.pdf,unknown,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...
190,True,2009.04534v2.pdf,unknown,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...
202,True,2007.04973v3.pdf,unknown,Contrastive Code Representation Learning Recen...
...,...,...,...,...
52191,True,2101.07172v2.pdf,unknown,HarDNet-MSEG: A Simple Encoder-Decoder Polyp S...
52205,True,1905.00067v3.pdf,unknown,MixHop: Higher-Order Graph Convolutional Archi...
52211,True,1905.00067v3.pdf,unknown,MixHop: Higher-Order Graph Convolutional Archi...
52223,True,1602.01595v4.pdf,unknown,"Many Languages, One Parser We train one multil..."


In [96]:
train_New[(train_New.title=="1602.01595v4.pdf") & (train_New.label==True)]

Unnamed: 0,label,title,TDM,Context
52223,True,1602.01595v4.pdf,unknown,"Many Languages, One Parser We train one multil..."


In [97]:
train_New[(train_New.title=="2009.04534v2.pdf") & (train_New.label==True)]

Unnamed: 0,label,title,TDM,Context
187,True,2009.04534v2.pdf,Language Modelling; WikiText-103; Test perplexity,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...
188,True,2009.04534v2.pdf,unknown,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...
189,True,2009.04534v2.pdf,Language Modelling; Text8; Bit per Character (...,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...
190,True,2009.04534v2.pdf,unknown,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...
191,True,2009.04534v2.pdf,Sentiment Analysis; SST-2 Binary classificatio...,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...


In [98]:
train_New[(train_New.title=="2009.04534v2.pdf") & (train_New.label==True)]

Unnamed: 0,label,title,TDM,Context
187,True,2009.04534v2.pdf,Language Modelling; WikiText-103; Test perplexity,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...
188,True,2009.04534v2.pdf,unknown,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...
189,True,2009.04534v2.pdf,Language Modelling; Text8; Bit per Character (...,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...
190,True,2009.04534v2.pdf,unknown,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...
191,True,2009.04534v2.pdf,Sentiment Analysis; SST-2 Binary classificatio...,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...


In [105]:
train_New[(train_New.title=="2011.14859v2.pdf")]

Unnamed: 0,label,title,TDM,Context


In [27]:
avg_tdm_per_paper = get_stats(train_New)

Number of papers: 3753
Unknown count: 2935
Total leaderboards: 11896
Avg leaderboard per paper: 4.18
Distinc leaderboard: 1810
Distinct taks: 288
Distinc datasets: 909
Distinc metrics: 558
Max leaderboard per paper: 58
Min leaderboard per paper: 1


In [22]:
((677+323)+(697+303))/2

1000.0

In [29]:
((2931+1255)+(2935+1251))/2

4186.0

In [89]:
1205+2981

4186

In [93]:
# Averages of the training statistics over two runs.
# Each row is (label, first value, second value, ndigits); the second values match
# the get_stats(train_New) output printed for fold2 in this notebook.
# NOTE(review): the first values are presumably the fold1 run — confirm.
train_fold_stats = [
    ("Unknown count", 2934, 3028, None),
    ("Total leaderboards", 11690, 11757, None),
    ("leaderboard per paper", 4.13, 4.15, 1),
    ("Distinc leaderboard", 1791, 1820, None),
    ("Distinct taks", 286, 291, None),
    ("Distinc datasets", 905, 912, None),
    ("Distinc metrics", 547, 553, None),
]

print("Train")
print("======")
for stat_label, first_value, second_value, ndigits in train_fold_stats:
    # ndigits=None makes round() return the nearest integer.
    print(f"Avg {stat_label}: {round((first_value + second_value)/2, ndigits)}")

Train
Avg Unknown count: 2981
Avg Total leaderboards: 11724
Avg leaderboard per paper: 4.1
Avg Distinc leaderboard: 1806
Avg Distinct taks: 288
Avg Distinc datasets: 908
Avg Distinc metrics: 550


In [28]:
avg_tdm_per_paper = get_stats(test_New)

Number of papers: 1608
Unknown count: 1251
Total leaderboards: 4888
Avg leaderboard per paper: 4.02
Distinc leaderboard: 1582
Distinct taks: 252
Distinc datasets: 805
Distinc metrics: 459
Max leaderboard per paper: 58
Min leaderboard per paper: 1


In [88]:
# Averages of the test (dev) statistics over two runs.
# Each row is (label, first value, second value, ndigits); the second values match
# the get_stats(test_New) output printed for fold2 in this notebook.
# NOTE(review): the first values are presumably the fold1 run — confirm.
test_fold_stats = [
    ("Unknown count", 1252, 1158, None),
    ("Total leaderboards", 5094, 5027, None),
    ("leaderboard per paper", 4.14, 4.1, 1),
    ("Distinc leaderboard", 1556, 1541, None),
    ("Distinct taks", 254, 250, None),
    ("Distinc datasets", 806, 790, None),
    ("Distinc metrics", 472, 466, None),
]

print("Test")
print("======")
for stat_label, first_value, second_value, ndigits in test_fold_stats:
    # ndigits=None makes round() return the nearest integer.
    print(f"Avg {stat_label}: {round((first_value + second_value)/2, ndigits)}")

Test
Avg Unknown count: 1205
Avg Total leaderboards: 5060
Avg leaderboard per paper: 4.1
Avg Distinc leaderboard: 1548
Avg Distinct taks: 252
Avg Distinc datasets: 798
Avg Distinc metrics: 469


In [17]:
# Every leaderboard (TDM label) in the dev split must also occur in the train split.
# The train labels are put in a set once, instead of rebuilding and scanning
# the full list for every dev row.
train_tdms = set(train_New.TDM.to_list())
count = [tdm for tdm in test_New.TDM.to_list() if tdm not in train_tdms]
print(count)

[]


In [18]:
# Reverse check: number of train rows whose leaderboard (TDM label) never occurs in dev.
# The dev labels are put in a set once, instead of rebuilding and scanning
# the full list for every train row.
test_tdms = set(test_New.TDM.to_list())
count = [tdm for tdm in train_New.TDM.to_list() if tdm not in test_tdms]
print(len(count))

0


In [52]:
# New_train_csv = "~/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_800/twofoldwithunk/fold1/train.tsv"
# New_test_csv = "~/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_800/twofoldwithunk/fold1/dev.tsv"

# fold2 train/dev splits of the pwc_ibm_150_5_10_10000 (10Neg10000unk) dataset.
# NOTE(review): absolute paths under one user's NFS home — not portable as written.
New_train_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk/fold2/train.tsv"
New_test_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk/fold2/dev.tsv"



In [53]:
# Load the selected train / dev splits with explicit column names:
# label, paper title (file name), TDM string and paper context.
new_columns = ["label", "title", "TDM", "Context"]
train_New, test_New = (
    pd.read_csv(tsv_path, sep="\t", names=new_columns)
    for tsv_path in (New_train_csv, New_test_csv)
)

In [54]:
avg_tdm_per_paper = get_stats(train_New)

Number of papers: 3753
Unknown count: 3028
Total leaderboards: 11757
Avg leaderboard per paper: 4.15
Distinc leaderboard: 1820
Distinct taks: 291
Distinc datasets: 912
Distinc metrics: 553
Max leaderboard per paper: 58
Min leaderboard per paper: 1


In [55]:
avg_tdm_per_paper = get_stats(test_New)

Number of papers: 1608
Unknown count: 1158
Total leaderboards: 5027
Avg leaderboard per paper: 4.1
Distinc leaderboard: 1541
Distinct taks: 250
Distinc datasets: 790
Distinc metrics: 466
Max leaderboard per paper: 58
Min leaderboard per paper: 1


## Remove "unknown" labels for papers with leaderboards

In [124]:
# Fold 1 of the pwc_ibm_150_5_10_10000 (10Neg10000unk) dataset: train and dev TSV splits.
New_train_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk/fold1/train.tsv"
New_test_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk/fold1/dev.tsv"

# Both splits are read with explicit column names:
# label, paper title (file name), TDM string and paper context.
fold_columns = ["label", "title", "TDM", "Context"]
train_New, test_New = (
    pd.read_csv(tsv_path, sep="\t", names=fold_columns)
    for tsv_path in (New_train_csv, New_test_csv)
)

In [125]:
len(train_New)

52154

In [126]:
# Drop exact duplicate rows in place, then display the remaining row count.
train_New.drop_duplicates(inplace=True)
len(train_New)

50757

In [128]:
train_New[(train_New.label==True) & (train_New.TDM=="unknown")]

Unnamed: 0,label,title,TDM,Context
104,True,1903.12290v2.pdf,unknown,Revisiting Local Descriptor based Image-to-Cla...
135,True,1806.05228v2.pdf,unknown,3D-CODED : 3D Correspondences by Deep Deformat...
157,True,1908.05786v1.pdf,unknown,TASED-Net: Temporally-Aggregating Spatial Enco...
210,True,2008.04259.pdf,unknown,A Perceptually-Motivated Approach for Low-Comp...
222,True,2009.04534v2.pdf,unknown,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...
...,...,...,...,...
51990,True,1912.06112v2.pdf,unknown,IEEE TRANSACTIONS ON IMAGE PROCESSING 1 Unifie...
52020,True,1808.08703v3.pdf,unknown,Generating Text through Adversarial Training u...
52045,True,1602.01595v4.pdf,unknown,"Many Languages, One Parser We train one multil..."
52068,True,2006.11275v2.pdf,unknown,Center-based 3D Object Detection and Tracking ...


In [132]:
train_New[(train_New.label==True) & (train_New.title=="1903.12290v2.pdf")]

Unnamed: 0,label,title,TDM,Context
103,True,1903.12290v2.pdf,Few-Shot Image Classification; Stanford Cars 5...,Revisiting Local Descriptor based Image-to-Cla...
104,True,1903.12290v2.pdf,unknown,Revisiting Local Descriptor based Image-to-Cla...
105,True,1903.12290v2.pdf,Few-Shot Image Classification; CUB 200 5-way 1...,Revisiting Local Descriptor based Image-to-Cla...
106,True,1903.12290v2.pdf,Few-Shot Image Classification; Stanford Cars 5...,Revisiting Local Descriptor based Image-to-Cla...
107,True,1903.12290v2.pdf,Few-Shot Image Classification; CUB 200 5-way 5...,Revisiting Local Descriptor based Image-to-Cla...
108,True,1903.12290v2.pdf,Few-Shot Image Classification; Mini-Imagenet 5...,Revisiting Local Descriptor based Image-to-Cla...
109,True,1903.12290v2.pdf,Few-Shot Image Classification; Stanford Dogs 5...,Revisiting Local Descriptor based Image-to-Cla...
110,True,1903.12290v2.pdf,Few-Shot Image Classification; Mini-Imagenet 5...,Revisiting Local Descriptor based Image-to-Cla...


In [133]:
train_New[(train_New.label==True) & (train_New.title=="1806.05228v2.pdf")]

Unnamed: 0,label,title,TDM,Context
135,True,1806.05228v2.pdf,unknown,3D-CODED : 3D Correspondences by Deep Deformat...


In [138]:
train_New[(train_New.label==True) & (train_New.title=="1903.12290v2.pdf") & (train_New.TDM=="unknown")].index

Int64Index([104], dtype='int64')

In [141]:
# Drop the redundant "unknown" positive of every paper that also has at least one
# real leaderboard. Papers whose only positive labels are "unknown" are left
# untouched, so no paper ends up without any positive label.
# The masks are computed once instead of re-filtering the frame for each paper.
is_positive = train_New.label == True  # noqa: E712 (element-wise comparison)
is_unknown = train_New.TDM == "unknown"
papers_with_leaderboard = set(train_New.loc[is_positive & ~is_unknown, "title"])
redundant_unknown = is_positive & is_unknown & train_New.title.isin(papers_with_leaderboard)
# Same in-place, label-based drop as before, issued as a single call.
train_New.drop(train_New.index[redundant_unknown], inplace=True)

In [142]:
len(train_New)

50143

In [143]:
train_New[(train_New.label==True) & (train_New.title=="1903.12290v2.pdf")]

Unnamed: 0,label,title,TDM,Context
103,True,1903.12290v2.pdf,Few-Shot Image Classification; Stanford Cars 5...,Revisiting Local Descriptor based Image-to-Cla...
105,True,1903.12290v2.pdf,Few-Shot Image Classification; CUB 200 5-way 1...,Revisiting Local Descriptor based Image-to-Cla...
106,True,1903.12290v2.pdf,Few-Shot Image Classification; Stanford Cars 5...,Revisiting Local Descriptor based Image-to-Cla...
107,True,1903.12290v2.pdf,Few-Shot Image Classification; CUB 200 5-way 5...,Revisiting Local Descriptor based Image-to-Cla...
108,True,1903.12290v2.pdf,Few-Shot Image Classification; Mini-Imagenet 5...,Revisiting Local Descriptor based Image-to-Cla...
109,True,1903.12290v2.pdf,Few-Shot Image Classification; Stanford Dogs 5...,Revisiting Local Descriptor based Image-to-Cla...
110,True,1903.12290v2.pdf,Few-Shot Image Classification; Mini-Imagenet 5...,Revisiting Local Descriptor based Image-to-Cla...


In [144]:
train_New[(train_New.label==True) & (train_New.TDM=="unknown")]

Unnamed: 0,label,title,TDM,Context
135,True,1806.05228v2.pdf,unknown,3D-CODED : 3D Correspondences by Deep Deformat...
157,True,1908.05786v1.pdf,unknown,TASED-Net: Temporally-Aggregating Spatial Enco...
210,True,2008.04259.pdf,unknown,A Perceptually-Motivated Approach for Low-Comp...
250,True,2007.04973v3.pdf,unknown,Contrastive Code Representation Learning Recen...
261,True,1806.11538v2.pdf,unknown,Factorizable Net: An Efficient Subgraph-based ...
...,...,...,...,...
51838,True,2001.10692v1.pdf,unknown,ImVoteNet: Boosting 3D Object Detection in Poi...
51916,True,1605.06240v3.pdf,unknown,FPNN: Field Probing Neural Networks for 3D Dat...
52020,True,1808.08703v3.pdf,unknown,Generating Text through Adversarial Training u...
52045,True,1602.01595v4.pdf,unknown,"Many Languages, One Parser We train one multil..."


In [145]:
train_New[(train_New.label==True) & (train_New.title=="1411.1091v1.pdf")]

Unnamed: 0,label,title,TDM,Context
52121,True,1411.1091v1.pdf,unknown,Do Convnets Learn Correspondence? Convolutiona...


In [152]:
# Destination directory for the cleaned fold1 splits.
output = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000_correct/10Neg10000unk/twofoldwithunk/fold1/"
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the creation, and the cell stays safe to re-run.
os.makedirs(output, exist_ok=True)

In [153]:
# Write the cleaned training split as a tab-separated file without header row or index column.
train_tsv_path = output + "train.tsv"
train_New.to_csv(train_tsv_path, sep="\t", index=False, header=False)

## Test

In [146]:
len(test_New)

22426

In [147]:
# Drop exact duplicate rows in place, then display the remaining row count.
test_New.drop_duplicates(inplace=True)
len(test_New)

21821

In [148]:
# Drop the redundant "unknown" positive of every paper that also has at least one
# real leaderboard. Papers whose only positive labels are "unknown" are left
# untouched, so no paper ends up without any positive label.
# The masks are computed once instead of re-filtering the frame for each paper.
is_positive = test_New.label == True  # noqa: E712 (element-wise comparison)
is_unknown = test_New.TDM == "unknown"
papers_with_leaderboard = set(test_New.loc[is_positive & ~is_unknown, "title"])
redundant_unknown = is_positive & is_unknown & test_New.title.isin(papers_with_leaderboard)
# Same in-place, label-based drop as before, issued as a single call.
test_New.drop(test_New.index[redundant_unknown], inplace=True)

In [150]:
len(test_New)

21552

In [154]:
# Write the cleaned dev split as a tab-separated file without header row or index column.
dev_tsv_path = output + "dev.tsv"
test_New.to_csv(dev_tsv_path, sep="\t", index=False, header=False)

# Fold 2

In [155]:
# Fold 2 of the pwc_ibm_150_5_10_10000 (10Neg10000unk) dataset: train and dev TSV splits.
New_train_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk/fold2/train.tsv"
New_test_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk/fold2/dev.tsv"

# Both splits are read with explicit column names:
# label, paper title (file name), TDM string and paper context.
fold_columns = ["label", "title", "TDM", "Context"]
train_New, test_New = (
    pd.read_csv(tsv_path, sep="\t", names=fold_columns)
    for tsv_path in (New_train_csv, New_test_csv)
)

In [156]:
len(train_New)

52315

In [157]:
# Drop exact duplicate rows in place, then display the remaining row count.
train_New.drop_duplicates(inplace=True)
len(train_New)

50821

In [158]:
train_New[(train_New.label==True) & (train_New.TDM=="unknown")]

Unnamed: 0,label,title,TDM,Context
125,True,1908.05786v1.pdf,unknown,TASED-Net: Temporally-Aggregating Spatial Enco...
176,True,2008.04259.pdf,unknown,A Perceptually-Motivated Approach for Low-Comp...
188,True,2009.04534v2.pdf,unknown,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...
202,True,2007.04973v3.pdf,unknown,Contrastive Code Representation Learning Recen...
234,True,1806.11538v2.pdf,unknown,Factorizable Net: An Efficient Subgraph-based ...
...,...,...,...,...
52174,True,2007.08176v2.pdf,unknown,CSI: Novelty Detection via Contrastive Learnin...
52188,True,2101.07172v2.pdf,unknown,HarDNet-MSEG: A Simple Encoder-Decoder Polyp S...
52205,True,1905.00067v3.pdf,unknown,MixHop: Higher-Order Graph Convolutional Archi...
52223,True,1602.01595v4.pdf,unknown,"Many Languages, One Parser We train one multil..."


In [161]:
train_New[(train_New.label==True) & (train_New.title=="2009.04534v2.pdf")]

Unnamed: 0,label,title,TDM,Context
187,True,2009.04534v2.pdf,Language Modelling; WikiText-103; Test perplexity,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...
188,True,2009.04534v2.pdf,unknown,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...
189,True,2009.04534v2.pdf,Language Modelling; Text8; Bit per Character (...,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...
191,True,2009.04534v2.pdf,Sentiment Analysis; SST-2 Binary classificatio...,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...


In [162]:
train_New[(train_New.label==True) & (train_New.title=="1411.1091v1.pdf")]

Unnamed: 0,label,title,TDM,Context
52259,True,1411.1091v1.pdf,unknown,Do Convnets Learn Correspondence? Convolutiona...


In [163]:
# Drop the redundant "unknown" positive of every paper that also has at least one
# real leaderboard. Papers whose only positive labels are "unknown" are left
# untouched, so no paper ends up without any positive label.
# The masks are computed once instead of re-filtering the frame for each paper.
is_positive = train_New.label == True  # noqa: E712 (element-wise comparison)
is_unknown = train_New.TDM == "unknown"
papers_with_leaderboard = set(train_New.loc[is_positive & ~is_unknown, "title"])
redundant_unknown = is_positive & is_unknown & train_New.title.isin(papers_with_leaderboard)
# Same in-place, label-based drop as before, issued as a single call.
train_New.drop(train_New.index[redundant_unknown], inplace=True)

In [164]:
len(train_New)

50207

In [165]:
train_New[(train_New.label==True) & (train_New.title=="2009.04534v2.pdf")]

Unnamed: 0,label,title,TDM,Context
187,True,2009.04534v2.pdf,Language Modelling; WikiText-103; Test perplexity,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...
189,True,2009.04534v2.pdf,Language Modelling; Text8; Bit per Character (...,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...
191,True,2009.04534v2.pdf,Sentiment Analysis; SST-2 Binary classificatio...,PAY ATTENTION WHEN REQUIRED A PREPRINT Transfo...


In [166]:
train_New[(train_New.label==True) & (train_New.TDM=="unknown")]

Unnamed: 0,label,title,TDM,Context
125,True,1908.05786v1.pdf,unknown,TASED-Net: Temporally-Aggregating Spatial Enco...
176,True,2008.04259.pdf,unknown,A Perceptually-Motivated Approach for Low-Comp...
202,True,2007.04973v3.pdf,unknown,Contrastive Code Representation Learning Recen...
234,True,1806.11538v2.pdf,unknown,Factorizable Net: An Efficient Subgraph-based ...
322,True,1908.02262v1.pdf,unknown,Predicting Prosodic Prominence from Text with ...
...,...,...,...,...
51902,True,1611.02200v1.pdf,unknown,UNSUPERVISED CROSS-DOMAIN IMAGE GENERATION We ...
51913,True,2010.05171v1.pdf,unknown,FAIRSEQ S2T: Fast Speech-to-Text Modeling with...
51973,True,2001.10692v1.pdf,unknown,ImVoteNet: Boosting 3D Object Detection in Poi...
52223,True,1602.01595v4.pdf,unknown,"Many Languages, One Parser We train one multil..."


In [167]:
train_New[(train_New.label==True) & (train_New.title=="1908.02262v1.pdf")]

Unnamed: 0,label,title,TDM,Context
322,True,1908.02262v1.pdf,unknown,Predicting Prosodic Prominence from Text with ...


In [168]:
# Destination directory for the cleaned fold2 splits.
output = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000_correct/10Neg10000unk/twofoldwithunk/fold2/"
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the creation, and the cell stays safe to re-run.
os.makedirs(output, exist_ok=True)

In [169]:
# Write the cleaned training split as a tab-separated file without header row or index column.
train_tsv_path = output + "train.tsv"
train_New.to_csv(train_tsv_path, sep="\t", index=False, header=False)

## Test

In [170]:
len(test_New)

22265

In [171]:
# Drop exact duplicate rows in place, then display the remaining row count.
test_New.drop_duplicates(inplace=True)
len(test_New)

21757

In [172]:
# Drop the redundant "unknown" positive of every paper that also has at least one
# real leaderboard. Papers whose only positive labels are "unknown" are left
# untouched, so no paper ends up without any positive label.
# The masks are computed once instead of re-filtering the frame for each paper.
is_positive = test_New.label == True  # noqa: E712 (element-wise comparison)
is_unknown = test_New.TDM == "unknown"
papers_with_leaderboard = set(test_New.loc[is_positive & ~is_unknown, "title"])
redundant_unknown = is_positive & is_unknown & test_New.title.isin(papers_with_leaderboard)
# Same in-place, label-based drop as before, issued as a single call.
test_New.drop(test_New.index[redundant_unknown], inplace=True)

In [173]:
len(test_New)

21488

In [174]:
# Write the cleaned dev split as a tab-separated file without header row or index column.
dev_tsv_path = output + "dev.tsv"
test_New.to_csv(dev_tsv_path, sep="\t", index=False, header=False)