In [1]:
# !pip install --quiet  datasets #to access squad dataset
# !pip install --quiet pyarrow   #to deal with parquet files for saving dataset if required
# !pip install --quiet  tqdm     #for progress bars
# !pip install --quiet transformers # for t5 model
# !pip install --quiet tokenizers  #tokenizers from HuggingFace
# !pip install --quiet sentencepiece #subword tokenizer used by T5
# !pip install --quiet pytorch-lightning # pytorch wrapper 
# !pip install --quiet torchtext # text utilities

# Fetching Datasets

In [2]:
#imports
import pandas as pd
import torch
from tqdm import tqdm
from datasets import DatasetDict, Dataset, load_from_disk
# from torch.utils.data import Dataset, DataLoader
from pprint import pprint
import copy
import numpy as np
from collections import defaultdict
import ipdb

In [3]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = "cpu"
device

'cpu'

In [4]:
# Alternative source directory, kept for reference:
# path_to_source = f"/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_full_5_10_10000_clone_latex_compare/10Neg10000unk/twofoldwithunk"
path_to_csv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk"

fold1 = "fold1"
fold2 = "fold2"


def _read_split(fold, split):
    """Read `<path_to_csv>/<fold>/<split>.tsv` (tab-separated, no header row)."""
    return pd.read_csv(
        f"{path_to_csv}/{fold}/{split}.tsv",
        sep="\t",
        names=["label", "title", "TDM", "Context"],
    )


# One train/dev pair per fold.
train_f1_pd = _read_split(fold1, "train")
dev_f1_pd = _read_split(fold1, "dev")

train_f2_pd = _read_split(fold2, "train")
dev_f2_pd = _read_split(fold2, "dev")

In [5]:
# Alternative input (the "full" DocTAET file), kept for reference:
# no_leaderboard_pd = pd.read_csv(f"/nfs/home/kabenamualus/Research/T5-Leaderboard-QA/data_proccess/arxiv_no_leaderboard_links_pdf_short/DocTAET_full.tsv", 
#                     sep="\t", names=["title", "Context"])

# Header-less, tab-separated file with one (title, Context) pair per line.
no_leaderboard_tsv = "/nfs/home/kabenamualus/Research/T5-Leaderboard-QA/data_proccess/arxiv_no_leaderboard_links_pdf_short/DocTAET_150.tsv"
no_leaderboard_pd = pd.read_csv(
    no_leaderboard_tsv,
    sep="\t",
    names=["title", "Context"],
)

no_leaderboard_pd.describe()

Unnamed: 0,title,Context
count,4369,4369
unique,4369,4365
top,0912.4438.pdf,! !
freq,1,2


In [6]:
# Annotation file: one row per paper, with all of its entries packed into the
# single "TDMSs" string. Missing cells become the literal marker "NAN" so that
# later cells can test for it with a plain string comparison.
results_annotation_tsv = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/annotations_final/resultsAnnotation.tsv"
resultsAnnotation_pd = (
    pd.read_csv(results_annotation_tsv, sep="\t", names=["Title", "TDMSs"])
    .fillna("NAN")
)
resultsAnnotation_pd

Unnamed: 0,Title,TDMSs
0,1704.03549v4.pdf,Optical Character Recognition#FSNS - Test#Sequ...
1,1712.05404.pdf,Optical Character Recognition#FSNS - Test#Sequ...
2,1702.03970v1.pdf,Optical Character Recognition#FSNS - Test#Sequ...
3,2104.02324v1.pdf,"Active Object Detection#COCO#AP#(7.3, 13.8, 16..."
4,2008.12995v3.pdf,Handwriting Recognition#BanglaLekha Isolated D...
...,...,...
5724,2104.01378v1.pdf,Phone-level pronunciation scoring#speechocean7...
5725,2104.10283v1.pdf,Graph Question Answering#GQA#Accuracy#96.30
5726,2104.11980v1.pdf,Trajectory Modeling#NBA SportVU#1x1 NLL#0.472
5727,1704.00077v1.pdf,Video Segmentation#SegTrack v2#Accuracy#86.86


In [7]:
"""
This takes care of papers that appear on more than one leaderboard.
"""
records = resultsAnnotation_pd.to_dict("records")

# Paper title -> list of {"LEADERBOARD": {...}} entries, one per annotated triple.
title_to_tdms_dict = defaultdict(list)

for annotation in tqdm(records, total=len(records)):
    packed_entries = annotation['TDMSs']
    if packed_entries == 'NAN':
        # Marker for a missing annotation: nothing to record for this paper.
        continue

    # Entries are separated by "$"; the four fields of an entry by "#".
    for entry in packed_entries.split("$"):
        fields = entry.split("#")
        if len(fields) != 4:
            # Malformed entry (wrong number of "#"-separated fields): skip it.
            continue
        task, dataset, metric, _score = fields
        # The fourth field (score) is deliberately left out of the stored entry.
        leaderboard_entry = {
            "LEADERBOARD": {
                "Task": task,
                "Dataset": dataset,
                "Metric": metric,
            }
        }
        title_to_tdms_dict[annotation['Title']].append(leaderboard_entry)

100%|██████████| 5729/5729 [00:00<00:00, 124102.22it/s]


In [8]:
# Number of paper titles that currently have an entry list in the mapping.
len(title_to_tdms_dict)

5725

In [9]:
# Negative instances are not needed here. A paper with several leaderboards
# still contributes several ("duplicate") rows after this filter.
def _keep_positive(frame):
    """Return only the rows of `frame` whose `label` equals True."""
    return frame[frame.label == True]


train_f1_pd = _keep_positive(train_f1_pd)
dev_f1_pd = _keep_positive(dev_f1_pd)
train_f2_pd = _keep_positive(train_f2_pd)
dev_f2_pd = _keep_positive(dev_f2_pd)

# Summarise each filtered split under its own heading.
for split_name, split_frame in (
    ("train_f1_pd", train_f1_pd),
    ("dev_f1_pd", dev_f1_pd),
    ("train_f2_pd", train_f2_pd),
    ("dev_f2_pd", dev_f2_pd),
):
    print(split_name)
    display(split_frame.describe())

train_f1_pd


Unnamed: 0,label,title,TDM,Context
count,12613,12613,12613,12613
unique,1,3753,1792,3747
top,True,1803.00933v1.pdf,unknown,IMPALA: Scalable Distributed Deep-RL with Impo...
freq,12613,58,923,58


dev_f1_pd


Unnamed: 0,label,title,TDM,Context
count,5472,5472,5472,5472
unique,1,1608,1557,1606
top,True,1911.08265v2.pdf,unknown,"Mastering Atari, Go, Chess and Shogi by Planni..."
freq,5472,58,378,58


train_f2_pd


Unnamed: 0,label,title,TDM,Context
count,12677,12677,12677,12677
unique,1,3753,1821,3749
top,True,1911.08265v2.pdf,unknown,"Mastering Atari, Go, Chess and Shogi by Planni..."
freq,12677,58,920,58


dev_f2_pd


Unnamed: 0,label,title,TDM,Context
count,5408,5408,5408,5408
unique,1,1608,1542,1608
top,True,1802.01561v3.pdf,unknown,IMPALA: Scalable Distributed Deep-RL with Impo...
freq,5408,58,381,58


In [10]:
# Row-wise view of each filtered split: one dict per TSV row.
records_train_f1, records_dev_f1, records_train_f2, records_dev_f2 = (
    split_frame.to_dict("records")
    for split_frame in (train_f1_pd, dev_f1_pd, train_f2_pd, dev_f2_pd)
)

In [11]:
# Everything in the first training row's file name that precedes ".pdf".
first_title = records_train_f1[0]["title"]
title_id = first_title.partition(".pdf")[0]
title_id

'1707.03497v2'

In [12]:
# Per split: paper title -> text that will be used as that paper's context.
title_to_content = {
    "train_f1": {},
    "dev_f1": {},
    "train_f2": {},
    "dev_f2": {},
}

# Directory holding one "<paper id>.txt" full-text file per paper.
arxiv_leaderboard_full_txt = "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_leaderboard_full_txt"


def _collect_leaderboard_contexts(rows, full_text_dir, contexts, min_words=100):
    """Fill `contexts` with one context per distinct title found in `rows`.

    For every title not yet in `contexts`, the file
    `<full_text_dir>/<title up to ".pdf">.txt` is read. The full text is used
    when it has at least `min_words` whitespace-separated tokens; otherwise
    the row's own `Context` value is used instead.

    Params:
        rows: list of dicts with at least the keys 'title' and 'Context'.
        full_text_dir: directory containing the full-text files.
        contexts: dict updated in place (title -> context).
        min_words: minimum token count for a full text to be accepted.

    Returns:
        Number of distinct titles whose full-text file could not be read.
    """
    missed = 0
    for row in tqdm(rows, total=len(rows)):
        title = row['title']
        if title in contexts:
            # Several rows share a title (one per leaderboard); read it once.
            continue

        paper_id = title.split(".pdf")[0]
        try:
            with open(f'{full_text_dir}/{paper_id}.txt', 'r') as file:
                full_text = file.read()
        except (OSError, UnicodeDecodeError):
            # Missing or unreadable file: count it and fall back to the short
            # context. Other exceptions (e.g. KeyboardInterrupt) propagate.
            full_text = ""
            missed += 1

        contexts[title] = row['Context'] if len(full_text.split()) < min_words else full_text
    return missed


# NOTE: `missed` counts distinct papers, while the denominator printed below is
# the number of rows (a paper may occupy several rows).
missed = _collect_leaderboard_contexts(
    records_train_f1, arxiv_leaderboard_full_txt, title_to_content["train_f1"])
print(f"train_f1 missed long context: {missed}/{len(records_train_f1)}\n")

missed = _collect_leaderboard_contexts(
    records_dev_f1, arxiv_leaderboard_full_txt, title_to_content["dev_f1"])
print(f"dev_f1 missed long context: {missed}/{len(records_dev_f1)}\n")

missed = _collect_leaderboard_contexts(
    records_train_f2, arxiv_leaderboard_full_txt, title_to_content["train_f2"])
print(f"train_f2 missed long context: {missed}/{len(records_train_f2)}\n")

missed = _collect_leaderboard_contexts(
    records_dev_f2, arxiv_leaderboard_full_txt, title_to_content["dev_f2"])
# This line used to be labelled "train_f2" although it reports the dev_f2 split.
print(f"dev_f2 missed long context: {missed}/{len(records_dev_f2)}")

  0%|          | 18/12613 [00:00<01:28, 142.10it/s]

100%|██████████| 12613/12613 [03:17<00:00, 63.83it/s] 


train_f1 missed long context: 938/12613



100%|██████████| 5472/5472 [01:05<00:00, 83.17it/s] 


dev_f1 missed long context: 416/5472



100%|██████████| 12677/12677 [00:03<00:00, 3987.61it/s]


train_f2 missed long context: 939/12677



100%|██████████| 5408/5408 [00:01<00:00, 4311.83it/s]

train_f2 missed long context: 415/5408





In [13]:
def _half_of_distinct_titles(frame):
    """Return 50% of the number of distinct titles in `frame`, truncated to an int."""
    return int(len(frame.title.unique()) * 50 / 100)


# How many extra papers each split is allowed to receive in the next step.
no_leaderboard_pourcentage_train_f1 = _half_of_distinct_titles(train_f1_pd)
no_leaderboard_pourcentage_dev_f1 = _half_of_distinct_titles(dev_f1_pd)
no_leaderboard_pourcentage_train_f2 = _half_of_distinct_titles(train_f2_pd)
no_leaderboard_pourcentage_dev_f2 = _half_of_distinct_titles(dev_f2_pd)

for budget_name, budget in (
    ("no_leaderboard_pourcentage_train_f1", no_leaderboard_pourcentage_train_f1),
    ("no_leaderboard_pourcentage_dev_f1", no_leaderboard_pourcentage_dev_f1),
    ("no_leaderboard_pourcentage_train_f2", no_leaderboard_pourcentage_train_f2),
    ("no_leaderboard_pourcentage_dev_f2", no_leaderboard_pourcentage_dev_f2),
):
    print(f"{budget_name}: {budget}")

no_leaderboard_pourcentage_train_f1: 1876
no_leaderboard_pourcentage_dev_f1: 804
no_leaderboard_pourcentage_train_f2: 1876
no_leaderboard_pourcentage_dev_f2: 804


In [14]:
# no_leaderboard_pourcentage = int(len(train_pd.title.unique())*50/100)
# no_leaderboard_pourcentage

In [15]:
records = no_leaderboard_pd.to_dict("records")

# Directory holding one "<paper id>.txt" full-text file per no-leaderboard paper.
arxiv_no_leaderboard_full_txt = "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_no_leaderboard_full_txt"


def _take_no_leaderboard_papers(rows, full_text_dir, contexts, limit,
                                exclude=(), min_words=100):
    """Add up to `limit` papers from `rows` to `contexts`, in file order.

    Titles listed in `exclude` are skipped. For each selected paper the file
    `<full_text_dir>/<title up to ".pdf">.txt` is read; the full text is used
    when it has at least `min_words` whitespace-separated tokens, otherwise
    the row's own `Context` value is used.

    Params:
        rows: list of dicts with at least the keys 'title' and 'Context'.
        full_text_dir: directory containing the full-text files.
        contexts: dict updated in place (title -> context).
        limit: maximum number of papers to select.
        exclude: titles that must not be selected (e.g. the train selection
            when filling the dev split of the same fold).
        min_words: minimum token count for a full text to be accepted.

    Returns:
        (selected_titles, missed): the titles added, in selection order, and
        how many of them had no readable full-text file.
    """
    excluded = set(exclude)  # constant-time membership instead of a list scan
    selected_titles = []
    missed = 0
    for row in tqdm(rows, total=len(rows)):
        title = row['title']
        if title in excluded:
            continue

        if len(selected_titles) >= limit:
            break

        paper_id = title.split(".pdf")[0]
        try:
            with open(f'{full_text_dir}/{paper_id}.txt', 'r') as file:
                full_text = file.read()
        except (OSError, UnicodeDecodeError):
            # Missing or unreadable file: count it and fall back to the short
            # context. Other exceptions (e.g. KeyboardInterrupt) propagate.
            full_text = ""
            missed += 1

        contexts[title] = row['Context'] if len(full_text.split()) < min_words else full_text
        selected_titles.append(title)
    return selected_titles, missed


# Fold 1: train takes the first papers, dev takes the next ones not used by train.
no_lead_papers_train_f1, missed = _take_no_leaderboard_papers(
    records, arxiv_no_leaderboard_full_txt, title_to_content["train_f1"],
    no_leaderboard_pourcentage_train_f1)
print(f"train_f1 missed long context: {missed}/{len(no_lead_papers_train_f1)}\n")

no_lead_papers_dev_f1, missed = _take_no_leaderboard_papers(
    records, arxiv_no_leaderboard_full_txt, title_to_content["dev_f1"],
    no_leaderboard_pourcentage_dev_f1, exclude=no_lead_papers_train_f1)
print(f"dev_f1 missed long context: {missed}/{len(no_lead_papers_dev_f1)}\n")

# Fold 2: selection restarts from the top of the file with nothing excluded,
# so it picks the same leading papers as fold 1 whenever the budgets match.
no_lead_papers_train_f2, missed = _take_no_leaderboard_papers(
    records, arxiv_no_leaderboard_full_txt, title_to_content["train_f2"],
    no_leaderboard_pourcentage_train_f2)
print(f"train_f2 missed long context: {missed}/{len(no_lead_papers_train_f2)}\n")

no_lead_papers_dev_f2, missed = _take_no_leaderboard_papers(
    records, arxiv_no_leaderboard_full_txt, title_to_content["dev_f2"],
    no_leaderboard_pourcentage_dev_f2, exclude=no_lead_papers_train_f2)
# This line used to be labelled "train_f2" although it reports the dev_f2 split.
print(f"dev_f2 missed long context: {missed}/{len(no_lead_papers_dev_f2)}\n")

  0%|          | 10/4369 [00:00<02:08, 34.03it/s]

 43%|████▎     | 1876/4369 [01:01<01:22, 30.32it/s]


train_f1 missed long context: 171/1876



 61%|██████▏   | 2680/4369 [00:33<00:21, 79.84it/s]   


dev_f1 missed long context: 55/804



 43%|████▎     | 1876/4369 [00:01<00:01, 1247.69it/s]


train_f2 missed long context: 171/1876



 61%|██████▏   | 2680/4369 [00:00<00:00, 3517.32it/s] 

train_f2 missed long context: 55/804






In [16]:
# train_f1_pd["Lenght context"] = train_f1_pd.Context.apply(lambda x: len(x.split()))
# dev_f1_pd["Lenght context"] = dev_f1_pd.Context.apply(lambda x: len(x.split()))
# train_f2_pd["Lenght context"] = train_f2_pd.Context.apply(lambda x: len(x.split()))
# dev_f2_pd["Lenght context"] = dev_f2_pd.Context.apply(lambda x: len(x.split()))

In [17]:
# train_pd[train_pd["Lenght context"] < 400]

In [18]:
# train_pd = train_pd[train_pd["Lenght context"] < 400]

In [19]:
# print("train_f1_pd describe: ")
# display(train_f1_pd.describe())
# print("dev_f1_pd describe: ")
# display(dev_f1_pd.describe())

# print("train_f2_pd describe: ")
# display(train_f2_pd.describe())
# print("dev_f2_pd describe: ")
# display(dev_f2_pd.describe())

In [20]:
def _assemble_fold_frame(contents, no_lead_titles, tdms_by_title):
    """Build the (Title, TDMSs, Context, lengths) frame for one split.

    Params:
        contents: dict mapping paper title -> context text for the split.
        no_lead_titles: titles that were added as no-leaderboard papers.
        tdms_by_title: mapping paper title -> list of leaderboard entries.

    Returns:
        DataFrame with the columns 'Title', 'TDMSs', 'Context',
        'Lenght Context' and 'Lenght TDMSs' (spelling kept as-is because the
        column names are part of the produced data).

    A paper is dropped when its context has fewer than 10 characters, or when
    it is not a no-leaderboard paper and has no leaderboard entries. Papers
    without any entry in `tdms_by_title` get the answer "unanswerable".
    """
    no_lead = set(no_lead_titles)  # constant-time membership instead of a list scan
    rows = []
    for title, context in tqdm(contents.items(), total=len(contents)):
        # NOTE(review): this compares the number of characters, not words -- confirm intent.
        if len(context) < 10:
            continue

        # `.get` on purpose: indexing a defaultdict would insert an empty list
        # for unknown titles and silently change later membership tests.
        tdms = tdms_by_title.get(title)
        if title not in no_lead and not tdms:
            continue

        answer = "unanswerable" if tdms is None else tdms
        rows.append(
            {
                'Title': title,
                'TDMSs': answer,
                'Context': context,
                'Lenght Context': len(context.split()),
                'Lenght TDMSs': len(str(answer).split()),
            }
        )

    # Build the frame once; concatenating inside the loop copies every
    # previously collected row on each iteration.
    return pd.DataFrame(
        rows,
        columns=['Title', 'TDMSs', 'Context', 'Lenght Context', 'Lenght TDMSs'],
    )


df_train_f1 = _assemble_fold_frame(
    title_to_content["train_f1"], no_lead_papers_train_f1, title_to_tdms_dict)
print("df_train_f1 describe: ")
display(df_train_f1.describe())

df_dev_f1 = _assemble_fold_frame(
    title_to_content["dev_f1"], no_lead_papers_dev_f1, title_to_tdms_dict)
print("df_dev_f1 describe: ")
display(df_dev_f1.describe())

df_train_f2 = _assemble_fold_frame(
    title_to_content["train_f2"], no_lead_papers_train_f2, title_to_tdms_dict)
print("df_train_f2 describe: ")
display(df_train_f2.describe())

df_dev_f2 = _assemble_fold_frame(
    title_to_content["dev_f2"], no_lead_papers_dev_f2, title_to_tdms_dict)
print("df_dev_f2 describe: ")
display(df_dev_f2.describe())

  6%|▌         | 311/5629 [00:00<00:03, 1558.72it/s]

100%|██████████| 5629/5629 [00:03<00:00, 1485.36it/s]

df_train_f1 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,5512.0,5512.0
mean,3769.126814,38.429064
std,3641.882691,85.362865
min,3.0,1.0
25%,354.0,1.0
50%,3930.0,16.0
75%,5700.0,44.0
max,47576.0,2455.0


100%|██████████| 2412/2412 [00:01<00:00, 1617.35it/s]

df_dev_f1 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,2353.0,2353.0
mean,3821.762431,38.00765
std,3472.276893,71.784381
min,6.0,1.0
25%,360.0,1.0
50%,4122.0,16.0
75%,5758.0,43.0
max,36493.0,1530.0


100%|██████████| 5629/5629 [00:03<00:00, 1510.35it/s]

df_train_f2 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,5513.0,5513.0
mean,3755.273173,38.690731
std,3561.476945,83.986774
min,3.0,1.0
25%,344.0,1.0
50%,3981.0,16.0
75%,5712.0,44.0
max,41780.0,2455.0


100%|██████████| 2412/2412 [00:01<00:00, 1453.59it/s]

df_dev_f2 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,2352.0,2352.0
mean,3854.257228,37.394133
std,3661.857514,75.481169
min,6.0,1.0
25%,379.0,1.0
50%,4074.0,16.0
75%,5722.5,45.0
max,47576.0,1537.0


In [21]:
# Instruction wordings, listed for inspection (the bare `template` at the end
# of the cell displays the list).
# NOTE(review): no later cell visible in this notebook reads `template` --
# confirm whether it is still needed.
# NOTE(review): several entries open a "(" that is never closed (the
# "otherwise reply", "else, reply", "if impossible, reply" and both
# "If it cannot be answered" entries) -- confirm whether that is intended.
template = ['Please answer a question about this article. If the question is unanswerable, say \"unanswerable\"',
'Read this and answer the question. If the question is unanswerable, say \"unanswerable\".',
'If the question is unanswerable, say \"unanswerable\"',
'Try to answer this question if possible (otherwise reply \"unanswerable\"',
'If it is possible to answer this question, answer it for me (else, reply \"unanswerable\"',
'Answer this question, if possible (if impossible, reply \"unanswerable\"',
'Read this: What is the answer? (If it cannot be answered, return \"unanswerable\"',
'Read this: Now answer this question, if there is an answer (If it cannot be answered, return \"unanswerable\"',
'Answer based on context:',
'Answer this question based on the article:',
# ("{context}\n\n{question}", "{answer}"),
'Answer this question:',
'Read this article and answer this question',
'Based on the above article, answer a question.',
# 'Context: {context}\n\nQuestion: {question}\n\nAnswer:", "{answer}"),
]

template

['Please answer a question about this article. If the question is unanswerable, say "unanswerable"',
 'Read this and answer the question. If the question is unanswerable, say "unanswerable".',
 'If the question is unanswerable, say "unanswerable"',
 'Try to answer this question if possible (otherwise reply "unanswerable"',
 'If it is possible to answer this question, answer it for me (else, reply "unanswerable"',
 'Answer this question, if possible (if impossible, reply "unanswerable"',
 'Read this: What is the answer? (If it cannot be answered, return "unanswerable"',
 'Read this: Now answer this question, if there is an answer (If it cannot be answered, return "unanswerable"',
 'Answer based on context:',
 'Answer this question based on the article:',
 'Answer this question:',
 'Read this article and answer this question',
 'Based on the above article, answer a question.']

In [22]:
# df.head(2)

In [23]:
def create_pandas_dataset_from_pandas(df,
                          answer_threshold=7,
                          verbose = False):

  ''' Expand every row of `df` into one (prompt, answer) pair per prompt template.

  Params:
        df: DataFrame with a "Context" column and a "TDMSs" column.
        answer_threshold: unused; accepted only so existing calls keep working.
        verbose: unused; the same DataFrame is returned either way.

  Returns:
        DataFrame with the string columns 'prompt' and 'answer', holding 15
        rows per input row and question, labelled 0..N-1 in generation order.
        The answer is `str()` of the row's "TDMSs" value.
  '''
  # Each entry is one question asked about every article; "a_key" names the
  # column of `df` that holds the expected answer.
  q_types = [
    {"q": "What are the values for the following properties to construct a Leaderboard for the model introduced in this article: task, dataset, and metric?", "a_key": "TDMSs"},
    ]

  prompts = []
  answers = []

  records = df.to_dict("records")
  for row in tqdm(records, total = len(records)):
      context = row["Context"]
      for q_type in q_types:
        question = q_type["q"]
        answer = str(row[q_type["a_key"]])

        row_prompts = [
          # Squad_v2
          f'{context}\n\nPlease answer a question about this article. If the question is unanswerable, say "unanswerable". {question}',
          f'Read this and answer the question. If the question is unanswerable, say "unanswerable".\n\n{context}\n\n{question}',
          # Fix: this template used to end without its closing parenthesis.
          f'{context}\n{question} (If the question is unanswerable, say "unanswerable")',
          f'{context}\nTry to answer this question if possible (otherwise reply "unanswerable"): {question}',
          f'{context}\nIf it is possible to answer this question, answer it for me (else, reply "unanswerable"): {question}',
          f'{context}\n\nAnswer this question, if possible (if impossible, reply "unanswerable"): {question}',
          f'Read this: {context}\n\n{question}\nWhat is the answer? (If it cannot be answered, return "unanswerable")',
          f'Read this: {context}\nNow answer this question, if there is an answer (If it cannot be answered, return "unanswerable"): {question}',
          # Drop
          f'Answer based on context:\n\n{context}\n\n{question}',
          f'{context}\n\nAnswer this question based on the article: {question}',
          f'{context}\n\n{question}',
          f'{context}\nAnswer this question: {question}',
          f'Read this article and answer this question {context}\n{question}',
          f'{context}\n\nBased on the above article, answer a question. {question}',
          f'Context: {context}\n\nQuestion: {question}\n\nAnswer:',
        ]
        prompts.extend(row_prompts)
        answers.extend([answer] * len(row_prompts))

  # Build the frame once: assigning `result_df.loc[k]` for each new row
  # re-allocates the frame every time and made this step take minutes.
  # Explicit integer labels keep the 0..N-1 row labels the row-by-row version
  # assigned. NOTE(review): a plain default index may serialise differently in
  # the parquet files written later -- confirm before simplifying.
  result_df = pd.DataFrame(
      {'prompt': prompts, 'answer': answers},
      index=list(range(len(prompts))),
      columns=['prompt', 'answer'],
  )
  return result_df

In [24]:
# Expand every split into its templated (prompt, answer) frame, in the order
# train/dev of fold 1 followed by train/dev of fold 2.
(
    df_train_f1_all_templates,
    df_dev_f1_all_templates,
    df_train_f2_all_templates,
    df_dev_f2_all_templates,
) = (
    create_pandas_dataset_from_pandas(split_frame)
    for split_frame in (df_train_f1, df_dev_f1, df_train_f2, df_dev_f2)
)

  0%|          | 27/5512 [00:00<00:40, 134.38it/s]

100%|██████████| 5512/5512 [05:45<00:00, 15.95it/s]
100%|██████████| 2353/2353 [00:38<00:00, 60.68it/s]
100%|██████████| 5513/5513 [06:04<00:00, 15.13it/s]
100%|██████████| 2352/2352 [00:42<00:00, 54.99it/s]


In [25]:
# Summary (count / unique / top / freq) of the templated fold-1 training frame.
df_train_f1_all_templates.describe()

Unnamed: 0,prompt,answer
count,82680,82680
unique,82335,2821
top,Title\tLaTeX Author Guidelines for CVPR Procee...,unanswerable
freq,5,28080


In [26]:
# Print a heading and the summary table for each templated split.
for frame_name, templated_frame in (
    ("df_train_f1_all_templates", df_train_f1_all_templates),
    ("df_dev_f1_all_templates", df_dev_f1_all_templates),
    ("df_train_f2_all_templates", df_train_f2_all_templates),
    ("df_dev_f2_all_templates", df_dev_f2_all_templates),
):
    print(f"{frame_name} describe: ")
    display(templated_frame.describe())

df_train_f1_all_templates describe: 


Unnamed: 0,prompt,answer
count,82680,82680
unique,82335,2821
top,Title\tLaTeX Author Guidelines for CVPR Procee...,unanswerable
freq,5,28080


df_dev_f1_all_templates describe: 


Unnamed: 0,prompt,answer
count,35295,35295
unique,35145,1338
top,Read this: Title\tLaTeX Author Guidelines for ...,unanswerable
freq,5,12060


df_train_f2_all_templates describe: 


Unnamed: 0,prompt,answer
count,82695,82695
unique,82260,2849
top,Title\tLaTeX Author Guidelines for CVPR Procee...,unanswerable
freq,6,28080


df_dev_f2_all_templates describe: 


Unnamed: 0,prompt,answer
count,35280,35280
unique,35205,1326
top,Title\tBare Advanced Demo of IEEEtran.cls for\...,unanswerable
freq,2,12060


In [27]:
# Count the Python types found in the 'answer' column of the fold-1 training frame.
df_train_f1_all_templates['answer'].apply(type).value_counts()


answer
<class 'str'>    82680
Name: count, dtype: int64

In [28]:
# Count the Python types found in the 'answer' column of the fold-1 dev frame.
df_dev_f1_all_templates['answer'].apply(type).value_counts()


answer
<class 'str'>    35295
Name: count, dtype: int64

In [31]:
# Show the prompt stored under row label 10 of the fold-1 training frame.
str(df_train_f1_all_templates.at[10, 'prompt'])

'Title\tValue Prediction Network\n\nAbstract:\tThis paper proposes a novel deep reinforcement learning (RL) architecture, called Value Prediction Network (VPN), which integrates model-free and model-based RL methods into a single neural network. In contrast to typical model-based RL methods, VPN learns a dynamics model whose abstract states are trained to make option-conditional predictions of future values (discounted sum of rewards) rather than of future observations. Our experimental results show that VPN has several advantages over both model-free and model-based baselines in a stochastic environment where careful planning is required but building an accurate observation-prediction model is difficult. Furthermore, VPN outperforms Deep Q-Network (DQN) on several Atari games even with short-lookahead planning, demonstrating its potential as a new way of learning a good state representation.\n\nIntroduction\n\nModel-based reinforcement learning (RL) approaches attempt to learn a model

In [34]:
# Show the answer stored under row label 100 of the fold-1 training frame.
str(df_train_f1_all_templates.at[100, 'answer'])

"[{'LEADERBOARD': {'Task': 'Few-Shot Image Classification', 'Dataset': 'Stanford Cars 5-way (5-shot)', 'Metric': 'Accuracy'}}, {'LEADERBOARD': {'Task': 'Few-Shot Image Classification', 'Dataset': 'Stanford Dogs 5-way (1-shot)', 'Metric': 'Accuracy'}}, {'LEADERBOARD': {'Task': 'Few-Shot Image Classification', 'Dataset': 'CUB 200 5-way 1-shot', 'Metric': 'Accuracy'}}, {'LEADERBOARD': {'Task': 'Few-Shot Image Classification', 'Dataset': 'Stanford Cars 5-way (1-shot)', 'Metric': 'Accuracy'}}, {'LEADERBOARD': {'Task': 'Few-Shot Image Classification', 'Dataset': 'CUB 200 5-way 5-shot', 'Metric': 'Accuracy'}}, {'LEADERBOARD': {'Task': 'Few-Shot Image Classification', 'Dataset': 'Mini-Imagenet 5-way (1-shot)', 'Metric': 'Accuracy'}}, {'LEADERBOARD': {'Task': 'Few-Shot Image Classification', 'Dataset': 'Stanford Dogs 5-way (5-shot)', 'Metric': 'Accuracy'}}, {'LEADERBOARD': {'Task': 'Few-Shot Image Classification', 'Dataset': 'Mini-Imagenet 5-way (5-shot)', 'Metric': 'Accuracy'}}]"

In [35]:
# Write each templated split to its own parquet file under ../data.
for templated_frame, parquet_path in (
    (df_train_f1_all_templates, '../data/df_train_tdm_long_f1_all_templates.parquet'),
    (df_dev_f1_all_templates, '../data/df_dev_tdm_long_f1_all_templates.parquet'),
    (df_train_f2_all_templates, '../data/df_train_tdm_long_f2_all_templates.parquet'),
    (df_dev_f2_all_templates, '../data/df_dev_tdm_long_f2_all_templates.parquet'),
):
    templated_frame.to_parquet(parquet_path)

In [36]:
# Disabled: reload the per-fold template frames from parquet instead of
# recomputing them.
# NOTE(review): these paths do not contain the `tdm_long` infix used by the
# `to_parquet` calls in the previous cell, so they point at different files;
# update them before re-enabling.
# df_train_f1_all_templates = pd.read_parquet('../data/df_train_f1_all_templates.parquet')
# df_dev_f1_all_templates = pd.read_parquet('../data/df_dev_f1_all_templates.parquet')
# df_train_f2_all_templates = pd.read_parquet('../data/df_train_f2_all_templates.parquet')
# df_dev_f2_all_templates = pd.read_parquet('../data/df_dev_f2_all_templates.parquet')

In [37]:
# Build a two-level DatasetDict (fold -> split -> Dataset) straight from the
# parquet files written earlier, so this cell does not depend on the in-memory
# DataFrames. (Earlier experiments built the same structure with
# `Dataset.from_pandas` on the frames, or as two separate per-fold DatasetDicts.)
_fold_parquet_files = {
    'fold1': {
        "train": '../data/df_train_tdm_long_f1_all_templates.parquet',
        "validation": '../data/df_dev_tdm_long_f1_all_templates.parquet',
    },
    'fold2': {
        "train": '../data/df_train_tdm_long_f2_all_templates.parquet',
        "validation": '../data/df_dev_tdm_long_f2_all_templates.parquet',
    },
}

# Files are read in declaration order: fold1 train/validation, then fold2.
dataset = DatasetDict({
    fold_name: DatasetDict({
        split_name: Dataset.from_parquet(parquet_path)
        for split_name, parquet_path in split_files.items()
    })
    for fold_name, split_files in _fold_parquet_files.items()
})

# Show the features and row counts of every fold/split.
print(dataset)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 82680
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 35295
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 82695
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 35280
        })
    })
})


In [38]:
# Sanity check: confirm the container built above is a `DatasetDict`.
type(dataset)

datasets.dataset_dict.DatasetDict

In [39]:
# Write the nested fold/split DatasetDict to disk. The next cell reloads the
# `fold1` and `fold2` sub-directories of this same path individually.
dataset.save_to_disk("../data/LLLM_LONG_TDM_ALL_TEMPLATE")

Saving the dataset (0/5 shards):   0%|          | 0/82680 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/35295 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/82695 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/35280 [00:00<?, ? examples/s]

In [40]:
# Directory the nested DatasetDict was saved to; each fold lives in its own
# sub-directory and is reloaded as an independent DatasetDict.
root_directory = "../data/LLLM_LONG_TDM_ALL_TEMPLATE"

dataset_fold1, dataset_fold2 = (
    DatasetDict.load_from_disk(f"{root_directory}/fold1"),
    DatasetDict.load_from_disk(f"{root_directory}/fold2"),
)

In [41]:
# Peek at the first training example of the reloaded fold-1 dataset to verify
# the save/load round trip.
dataset_fold1['train'][0]

{'prompt': 'Title\tValue Prediction Network\n\nAbstract:\tThis paper proposes a novel deep reinforcement learning (RL) architecture, called Value Prediction Network (VPN), which integrates model-free and model-based RL methods into a single neural network. In contrast to typical model-based RL methods, VPN learns a dynamics model whose abstract states are trained to make option-conditional predictions of future values (discounted sum of rewards) rather than of future observations. Our experimental results show that VPN has several advantages over both model-free and model-based baselines in a stochastic environment where careful planning is required but building an accurate observation-prediction model is difficult. Furthermore, VPN outperforms Deep Q-Network (DQN) on several Atari games even with short-lookahead planning, demonstrating its potential as a new way of learning a good state representation.\n\nIntroduction\n\nModel-based reinforcement learning (RL) approaches attempt to le