# Stats in Paper

In [1]:
# imports
import ipdb, os, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

### Selected Dataset

In [2]:
New_train_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk/fold1/train.tsv"
New_test_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk/fold1/dev.tsv"

New_trainOutput_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/trainOutput.tsv"

IBM_train_csv = "~/Research/task-dataset-metric-nli-extraction/data/ibm/exp/few-shot-setup/NLP-TDMS/paperVersion/train.tsv"
IBM_test_csv = "~/Research/task-dataset-metric-nli-extraction/data/ibm/exp/few-shot-setup/NLP-TDMS/paperVersion/test.tsv"

Old_train_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/other/pwc_ibm_150_5_10_800/twofoldwithunk/fold1/train.tsv"
Old_test_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/other/pwc_ibm_150_5_10_800/twofoldwithunk/fold1/dev.tsv"

IBM_train_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/ibm/exp/zero-shot-setup/NLP-TDMS/train.tsv"
IBM_test_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/ibm/exp/zero-shot-setup/NLP-TDMS/test.tsv"

# New_train_csv = IBM_train_csv
# New_test_csv = IBM_test_csv

# New_train_csv =Old_train_csv
# New_test_csv = Old_test_csv

In [3]:
train_New = pd.read_csv(New_train_csv, 
                    sep="\t", names=["label", "title", "TDM", "Context"])

test_New = pd.read_csv(New_test_csv, 
                    sep="\t", names=["label", "title", "TDM", "Context"])

trainOutput_New = pd.read_csv(New_trainOutput_csv, 
                    sep="\t", names=["label", "title", "TDM", "Context"])

In [4]:
train_New.head()

Unnamed: 0,label,title,TDM,Context
0,True,1707.03497v2.pdf,Atari Games; Atari 2600 Seaquest; Score,Value Prediction Network This paper proposes a...
1,True,1707.03497v2.pdf,Atari Games; Atari 2600 Amidar; Score,Value Prediction Network This paper proposes a...
2,True,1707.03497v2.pdf,Atari Games; Atari 2600 Krull; Score,Value Prediction Network This paper proposes a...
3,True,1707.03497v2.pdf,Atari Games; Atari 2600 Alien; Score,Value Prediction Network This paper proposes a...
4,True,1707.03497v2.pdf,Atari Games; Atari 2600 Enduro; Score,Value Prediction Network This paper proposes a...


In [5]:
train_unique_tdm = set(train_New[train_New.label==True].TDM.to_list())

In [6]:
test_unique_tdm = set(test_New[test_New.label==True].TDM.to_list())

In [7]:
print(f"unique TDM train: {len(train_unique_tdm)}")
print(f"unique TDM test: {len(test_unique_tdm)}")

unique TDM train: 1792
unique TDM test: 1557


In [8]:
def unique_tdm_check(left, right, mode="Train"):
    count = 0
    count_set = set()
    for tdm in left:
        if tdm not in right:
            count_set.add(tdm)
            count  += 1
    print(f"Missing TDM {mode}: {count}")
    return count_set

## Create Zero-shoot

In [9]:
def create_zero_shoot(train, test, output_path="/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_500/10Neg500unk_shot/zero_shot_twofoldwithunk/fold1/"):
    
    if not os.path.exists(output_path):
        os.makedirs(output_path)
        
    unique_sets_train = set(train[train.label==True].TDM.to_list())
    unique_sets_test = set(test[test.label==True].TDM.to_list())
        
    print(f"Len unique TDM Train: {len(unique_sets_train)}.")
    print(f"Len unique TDM Test: {len(unique_sets_test)}.")
    
    print(f"Size Train df before: {len(train)}.")
    print(f"Size Train df before: {len(test)}.")
    
    for label in unique_sets_train:
        index_to_drop = test[(test.TDM ==label) & (test.label==True)].index
        test.drop(index_to_drop, axis=0, inplace=True)
        
    for label in unique_sets_test:
        index_to_drop = train[(train.TDM ==label) & (train.label==True)].index
        train.drop(index_to_drop, axis=0, inplace=True)
        
    train.to_csv(f"{output_path}train.tsv", 
                              header=False, index=False, sep="\t")    
    test.to_csv(f"{output_path}dev.tsv", 
                              header=False, index=False, sep="\t")
    
    unique_sets_train = set(train[train.label==True].TDM.to_list())
    unique_sets_test = set(test[test.label==True].TDM.to_list())
    
    print(f"Len unique TDM Train: {len(unique_sets_train)}.")
    print(f"Len unique TDM Test: {len(unique_sets_test)}.")
    
    print(f"Size Train df After: {len(train)}.")
    print(f"Size Test df After: {len(test)}.")
    
    print("Done.")

In [10]:
# def create_zero_shoot(trainOutput_New, split=0.8, output_path="/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_500/10Neg500unk_shot/zero_shot_twofoldwithunk/fold1/"):
    
#     if not os.path.exists(output_path):
#         os.makedirs(output_path)
        
#     unique_sets = set(trainOutput_New[trainOutput_New.label==True].TDM.to_list())
#     train_split_tdm, test_split_tdm = train_test_split(list(unique_sets), train_size=split)
    
#     print(f"Len unique TDM {len(unique_sets)}.")
    
#     Zero_shoot_train_New = pd.DataFrame().reindex_like(trainOutput_New).dropna()
#     Zero_shoot_test_New = pd.DataFrame().reindex_like(trainOutput_New).dropna()
    
#     for label in train_split_tdm:
#         Zero_shoot_train_New = Zero_shoot_train_New.merge(trainOutput_New[(trainOutput_New.TDM==label) & (trainOutput_New.TDM==label)], how='outer')
        
#     for label in test_split_tdm:
#         Zero_shoot_test_New = Zero_shoot_test_New.merge(trainOutput_New[trainOutput_New.TDM==label], how='outer')
        
#     Zero_shoot_train_New.to_csv(f"{output_path}train.tsv", 
#                               header=False, index=False, sep="\t")
    
#     print(f"Len train dataset {len(Zero_shoot_train_New)}.")
    
#     Zero_shoot_test_New.to_csv(f"{output_path}dev.tsv", 
#                               header=False, index=False, sep="\t")
    
#     print(f"Len test dataset {len(Zero_shoot_test_New)}.")
    
#     print("Done.")

In [11]:
output = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/zero_shot_2_twofoldwithunk/fold1/"

In [12]:
# create_zero_shoot(trainOutput_New=trainOutput_New, split=0.8, 
#                   output_path=output)

In [13]:
create_zero_shoot(train=train_New, test=test_New, 
                  output_path=output)

Len unique TDM Train: 1792.
Len unique TDM Test: 1557.
Size Train df before: 52154.
Size Train df before: 22426.
Len unique TDM Train: 294.
Len unique TDM Test: 59.
Size Train df After: 38608.
Size Test df After: 16165.
Done.


In [14]:
tdm_in_test_not_train = unique_tdm_check(left=New_test_csv, right=New_train_csv, mode="Test")

Missing TDM Test: 0


In [15]:
tdm_in_train_not_test = unique_tdm_check(left=New_train_csv, right=New_test_csv, mode="Train")

Missing TDM Train: 0


In [16]:
New_train_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk/fold2/train.tsv"
New_test_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk/fold2/dev.tsv"

train_New = pd.read_csv(New_train_csv, 
                    sep="\t", names=["label", "title", "TDM", "Context"])

test_New = pd.read_csv(New_test_csv, 
                    sep="\t", names=["label", "title", "TDM", "Context"])

In [17]:
output = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/zero_shot_2_twofoldwithunk/fold2/"

In [18]:
# create_zero_shoot(trainOutput_New=trainOutput_New, split=0.8, 
#                   output_path=output)
create_zero_shoot(train=train_New, test=test_New, 
                  output_path=output)

Len unique TDM Train: 1821.
Len unique TDM Test: 1542.
Size Train df before: 52315.
Size Train df before: 22265.
Len unique TDM Train: 309.
Len unique TDM Test: 30.
Size Train df After: 38649.
Size Test df After: 16131.
Done.


In [19]:
New_train_csv = f"{output}train.tsv"
New_test_csv = f"{output}dev.tsv"

In [20]:
tdm_in_test_not_train = unique_tdm_check(left=New_test_csv, right=New_train_csv, mode="Test")

Missing TDM Test: 0


In [21]:
tdm_in_train_not_test = unique_tdm_check(left=New_train_csv, right=New_test_csv, mode="Train")

Missing TDM Train: 0


## Enforce seen TDM in both train and test

In [101]:
print(f"unique TDM train: {len(train_unique_tdm)}")
print(f"unique TDM test: {len(test_unique_tdm)}")

unique TDM train: 96
unique TDM test: 63


In [97]:
tdm_in_test_not_train = unique_tdm_check(left=test_unique_tdm, right=train_unique_tdm, mode="Test")

Missing TDM Test: 63


In [98]:
tdm_in_train_not_test = unique_tdm_check(left=train_unique_tdm, right=test_unique_tdm, mode="Train")

Missing TDM Train: 96


In [99]:
len(train_New)

20160

In [100]:
len(test_New)

17388

In [111]:
test_New[(test_New.TDM=="Fact-based Text Editing; WebEdit; ADD") & (test_New.label==True)]

Unnamed: 0,label,title,TDM,Context
14199,True,2007.00916v1.pdf,Fact-based Text Editing; WebEdit; ADD,Fact-based Text Editing We propose a novel tex...


In [112]:
train_New[(train_New.TDM=="Fact-based Text Editing; WebEdit; ADD") & (train_New.label==True)]

Unnamed: 0,label,title,TDM,Context
