In [1]:
# !pip install --quiet  datasets #to access squad dataset
# !pip install --quiet pyarrow   #to deal with parquet files for saving dataset if required
# !pip install --quiet  tqdm     #for progress bars
# !pip install --quiet transformers # for t5 model
# !pip install --quiet tokenizers  #tokenizers from HuggingFace
# !pip install --quiet sentencepiece #subword tokenizer used by T5
# !pip install --quiet pytorch-lightning # pytorch wrapper 
# !pip install --quiet torchtext # text utilities

# Fetching Datasets

In [2]:
#imports
import pandas as pd
import torch
from tqdm import tqdm
from datasets import DatasetDict, Dataset, load_from_disk
# from torch.utils.data import Dataset, DataLoader
from pprint import pprint
import copy
import numpy as np
from collections import defaultdict
import ipdb

In [3]:
device  = 'cuda' if torch.cuda.is_available() else "cpu"
device

'cuda'

In [4]:
# path_to_source = f"/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_full_5_10_10000_clone_latex_compare/10Neg10000unk/twofoldwithunk"
path_to_csv = f"/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk"

fold1 = "fold1"
train_f1_pd = pd.read_csv(f"{path_to_csv}/{fold1}/train.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])
dev_f1_pd = pd.read_csv(f"{path_to_csv}/{fold1}/dev.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])

fold2 = "fold2"
train_f2_pd = pd.read_csv(f"{path_to_csv}/{fold2}/train.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])
dev_f2_pd = pd.read_csv(f"{path_to_csv}/{fold2}/dev.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])

In [5]:
# no_leaderboard_pd = pd.read_csv(f"/nfs/home/kabenamualus/Research/T5-Leaderboard-QA/data_proccess/arxiv_no_leaderboard_links_pdf_short/DocTAET_full.tsv", 
#                     sep="\t", names=["title", "Context"])

no_leaderboard_pd = pd.read_csv(f"/nfs/home/kabenamualus/Research/T5-Leaderboard-QA/data_proccess/arxiv_no_leaderboard_links_pdf_short/DocTAET_150.tsv", 
                    sep="\t", names=["title", "Context"])

no_leaderboard_pd.describe()

Unnamed: 0,title,Context
count,4369,4369
unique,4369,4365
top,0912.4438.pdf,! !
freq,1,2


In [6]:
resultsAnnotation_pd = pd.read_csv(f"/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/annotations_final/resultsAnnotation.tsv",
                                   sep="\t", names=["Title", "TDMSs"])
resultsAnnotation_pd = resultsAnnotation_pd.fillna("NAN")
resultsAnnotation_pd

Unnamed: 0,Title,TDMSs
0,1704.03549v4.pdf,Optical Character Recognition#FSNS - Test#Sequ...
1,1712.05404.pdf,Optical Character Recognition#FSNS - Test#Sequ...
2,1702.03970v1.pdf,Optical Character Recognition#FSNS - Test#Sequ...
3,2104.02324v1.pdf,"Active Object Detection#COCO#AP#(7.3, 13.8, 16..."
4,2008.12995v3.pdf,Handwriting Recognition#BanglaLekha Isolated D...
...,...,...
5724,2104.01378v1.pdf,Phone-level pronunciation scoring#speechocean7...
5725,2104.10283v1.pdf,Graph Question Answering#GQA#Accuracy#96.30
5726,2104.11980v1.pdf,Trajectory Modeling#NBA SportVU#1x1 NLL#0.472
5727,1704.00077v1.pdf,Video Segmentation#SegTrack v2#Accuracy#86.86


In [7]:
"""
This will take care of papers with more than one learderboard 
"""
records = resultsAnnotation_pd.to_dict("records")

title_to_tdms_dict = defaultdict(
    lambda : 
        list()
    )

for i, row in tqdm(enumerate(records), total = len(records)):
    if row['TDMSs'] == 'NAN':
        continue

    for tdms in row['TDMSs'].split("$"):
        if len(tdms.split("#")) != 4:
            # ipdb.set_trace()
            continue 
        t, d, m, s = tdms.split("#")
        title_to_tdms_dict[row['Title']].append(
            {
                "LEADERBOARD": {
                    "Task": t,
                    "Dataset": d,
                    "Metric": m,
                    "Score": s,
                }
            }            
        )

100%|██████████| 5729/5729 [00:00<00:00, 30819.43it/s]


In [8]:
len(title_to_tdms_dict)

5725

In [9]:
# title_to_tdms_dict

In [10]:
# No need for negative instances, but will still have 'duplicate' for paper with more than one leaderboard
train_f1_pd = train_f1_pd[train_f1_pd.label==True]
print("train_f1_pd")
display(train_f1_pd.describe())

dev_f1_pd = dev_f1_pd[dev_f1_pd.label==True]
print("dev_f1_pd")
display(dev_f1_pd.describe())

train_f2_pd = train_f2_pd[train_f2_pd.label==True]
print("train_f2_pd")
display(train_f2_pd.describe())

dev_f2_pd = dev_f2_pd[dev_f2_pd.label==True]
print("dev_f2_pd")
display(dev_f2_pd.describe())

train_f1_pd


Unnamed: 0,label,title,TDM,Context
count,12613,12613,12613,12613
unique,1,3753,1792,3747
top,True,1803.00933v1.pdf,unknown,IMPALA: Scalable Distributed Deep-RL with Impo...
freq,12613,58,923,58


dev_f1_pd


Unnamed: 0,label,title,TDM,Context
count,5472,5472,5472,5472
unique,1,1608,1557,1606
top,True,1911.08265v2.pdf,unknown,"Mastering Atari, Go, Chess and Shogi by Planni..."
freq,5472,58,378,58


train_f2_pd


Unnamed: 0,label,title,TDM,Context
count,12677,12677,12677,12677
unique,1,3753,1821,3749
top,True,1911.08265v2.pdf,unknown,"Mastering Atari, Go, Chess and Shogi by Planni..."
freq,12677,58,920,58


dev_f2_pd


Unnamed: 0,label,title,TDM,Context
count,5408,5408,5408,5408
unique,1,1608,1542,1608
top,True,1802.01561v3.pdf,unknown,IMPALA: Scalable Distributed Deep-RL with Impo...
freq,5408,58,381,58


In [11]:
# len(train_pd.title.unique())
records_train_f1 = train_f1_pd.to_dict("records")
records_dev_f1 = dev_f1_pd.to_dict("records")
records_train_f2 = train_f2_pd.to_dict("records")
records_dev_f2 = dev_f2_pd.to_dict("records")

In [12]:
title_id = records_train_f1[0]["title"].split(".pdf")[0]
title_id

'1707.03497v2'

In [13]:
with open(f'/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_no_leaderboard_txt_summarised/0704.0062_summarised.txt', 'r') as file:
    # Read the file
    data = file.read()

len(data.split())

# data

2

In [14]:
# title_to_tdms_dict = defaultdict(lambda : defaultdict(lambda : str("| ")))
title_to_content = {
    "train_f1":{},
    "dev_f1":{},
    "train_f2":{},
    "dev_f2":{},
    }

# arxiv_leaderboard_full_txt = "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_leaderboard_full_txt"
arxiv_leaderboard_full_txt = "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_txt_summarised"

missed = 0
for i, row in tqdm(enumerate(records_train_f1), total = len(records_train_f1)):
    title_id = row['title'].split(".pdf")[0]
    
    if row['title'] in title_to_content["train_f1"]:
        # ipdb.set_trace()
        continue 
    else:
        try:
            with open(f'{arxiv_leaderboard_full_txt}/{title_id}_summarised.txt', 'r') as file:
                # Read the file
                data = file.read()

        except Exception as e:
            # Code that runs if the exception occurs
            # except Exception as e:
            # print(f"Error on file {row['title']}")
            data = "" 
            missed += 1
            # ipdb.set_trace()
            continue

        if len(data.split()) < 10:
            # ipdb.set_trace()
            continue
        else:
            title_to_content["train_f1"][row['title']] = data
            
        # title_to_content["train_f1"][row['title']] = row['Context'] if len(data.split()) < 100 else data
        # title_to_content["train_f1"][row['title']] = row['Context']

print(f"train_f1 missed long context: {missed}/{len(records_train_f1)}\n")

missed = 0
for i, row in tqdm(enumerate(records_dev_f1), total = len(records_dev_f1)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in title_to_content["dev_f1"]:
        continue 
    else:
        try:
            with open(f'{arxiv_leaderboard_full_txt}/{title_id}_summarised.txt', 'r') as file:
                # Read the file
                data = file.read()
                
        except Exception as e:
            # except Exception as e:
            # print(f"Error on file {row['title']}")
            data = ""
            missed += 1
            continue 

        if len(data.split()) < 10:
            continue
        else:
            title_to_content["dev_f1"][row['title']] = data
            
        # title_to_content["dev_f1"][row['title']] = row['Context'] if len(data.split()) < 100 else data

print(f"dev_f1 missed long context: {missed}/{len(records_dev_f1)}\n")       
        
missed = 0        
for i, row in tqdm(enumerate(records_train_f2), total = len(records_train_f2)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in title_to_content["train_f2"]:
        continue 
    else:
        try:
            with open(f'{arxiv_leaderboard_full_txt}/{title_id}_summarised.txt', 'r') as file:
                # Read the file
                data = file.read()
                
        except Exception as e:
            # except Exception as e:
            # print(f"Error on file {row['title']}")
            data = ""
            missed += 1
            continue

        if len(data.split()) < 10:
            continue
        else:
            title_to_content["train_f2"][row['title']] = data
            
        # title_to_content["train_f2"][row['title']] = row['Context'] if len(data.split()) < 100 else data
           
print(f"train_f2 missed long context: {missed}/{len(records_train_f2)}\n")       

missed = 0
for i, row in tqdm(enumerate(records_dev_f2), total = len(records_dev_f2)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in title_to_content["dev_f2"]:
        continue 
    else:
        try:
            with open(f'{arxiv_leaderboard_full_txt}/{title_id}_summarised.txt', 'r') as file:
                # Read the file
                data = file.read()
                
        except Exception as e:
            # except Exception as e:
            # print(f"Error on file {row['title']}")
            data = ""
            missed += 1
            continue

        if len(data.split()) < 10:
            continue
        else:
            title_to_content["dev_f2"][row['title']] = data
            
        # title_to_content["dev_f2"][row['title']] = row['Context'] if len(data.split()) < 100 else data
        # title_to_content["dev_f2"][row['title']] = row['Context']

print(f"train_f2 missed long context: {missed}/{len(records_dev_f2)}")       

100%|██████████| 12613/12613 [00:00<00:00, 17311.10it/s]


train_f1 missed long context: 4063/12613



100%|██████████| 5472/5472 [00:00<00:00, 6470.28it/s]


dev_f1 missed long context: 1886/5472



100%|██████████| 12677/12677 [00:01<00:00, 8296.62it/s]


train_f2 missed long context: 4142/12677



100%|██████████| 5408/5408 [00:00<00:00, 18321.46it/s]

train_f2 missed long context: 1807/5408





In [15]:
len(title_to_content["train_f1"])

2514

In [16]:
len(title_to_content["dev_f1"])

1057

In [17]:
ratio = 15/100
no_leaderboard_pourcentage_train_f1 = int(len(train_f1_pd.title.unique())*ratio)
no_leaderboard_pourcentage_dev_f1 = int(len(dev_f1_pd.title.unique())*ratio)
no_leaderboard_pourcentage_train_f2 = int(len(train_f2_pd.title.unique())*ratio)
no_leaderboard_pourcentage_dev_f2 = int(len(dev_f2_pd.title.unique())*ratio)

print(f"no_leaderboard_pourcentage_train_f1: {no_leaderboard_pourcentage_train_f1}")
print(f"no_leaderboard_pourcentage_dev_f1: {no_leaderboard_pourcentage_dev_f1}")
print(f"no_leaderboard_pourcentage_train_f2: {no_leaderboard_pourcentage_train_f2}")
print(f"no_leaderboard_pourcentage_dev_f2: {no_leaderboard_pourcentage_dev_f2}")

no_leaderboard_pourcentage_train_f1: 562
no_leaderboard_pourcentage_dev_f1: 241
no_leaderboard_pourcentage_train_f2: 562
no_leaderboard_pourcentage_dev_f2: 241


In [18]:
# no_leaderboard_pourcentage = int(len(train_pd.title.unique())*50/100)
# no_leaderboard_pourcentage

In [19]:
records = no_leaderboard_pd.to_dict("records")

# For train only F1
# already_seen = no_lead_papers_train_f1

already_seen = []
no_lead_papers_train_f1 = []
i = 0
# count_i = 0
# arxiv_no_leaderboard_full_txt = "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_no_leaderboard_full_txt"
arxiv_no_leaderboard_full_txt = "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_no_leaderboard_txt_summarised"

missed = 0
for _, row in tqdm(enumerate(records), total = len(records)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in already_seen:
        continue 
        
    if i >= no_leaderboard_pourcentage_train_f1:
        break 
    
    try:
        with open(f'{arxiv_no_leaderboard_full_txt}/{title_id}_summarised.txt', 'r') as file:
            # Read the file
            data = file.read()
            
    except Exception as e:
        # print(f"Error on file {row['title']}")
        data = ""
        missed += 1
        continue

    if len(data.split()) < 10:
        continue
    else:
        # count_i += 1
        title_to_content["train_f1"][row['title']] = data
        no_lead_papers_train_f1.append(row['title'])
        already_seen.append(row['title'])
        i += 1
        
    # title_to_content["train_f1"][row['title']] = row['Context'] if len(data.split()) < 100 else data
    # no_lead_papers_train_f1.append(row['title'])
    # i += 1

print(f"train_f1 missed long context: {missed}/{len(records_train_f1)}\n")

    
# no_lead_papers_dev_f1 = no_lead_papers_train_f1
# already_seen = no_lead_papers_dev_f1
no_lead_papers_dev_f1 = []
i = 0
missed = 0
for _, row in tqdm(enumerate(records), total = len(records)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in already_seen:
        continue 
        
    if i >= no_leaderboard_pourcentage_dev_f1:
        break 
     
    try:
        with open(f'{arxiv_no_leaderboard_full_txt}/{title_id}_summarised.txt', 'r') as file:
            # Read the file
            data = file.read()
            
    except Exception as e:
        # print(f"Error on file {row['title']}")
        data = ""
        missed += 1
        continue

    if len(data.split()) < 10:
        continue
    else:
        title_to_content["dev_f1"][row['title']] = data
        no_lead_papers_dev_f1.append(row['title'])
        already_seen.append(row['title'])
        i += 1
           
    # title_to_content["dev_f1"][row['title']] = row['Context'] if len(data.split()) < 100 else data
    # no_lead_papers_dev_f1.append(row['title'])  
    # i += 1

print(f"dev_f1 missed long context: {missed}/{len(records_dev_f1)}\n")       

    
# For train only F2
# no_lead_papers_train_f2 = []

# already_seen = no_lead_papers_train_f2
no_lead_papers_train_f2 = []
j = 0
missed = 0
for _, row in tqdm(enumerate(records), total = len(records)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in already_seen:
        continue 
        
    if j >= no_leaderboard_pourcentage_train_f2:
        break 
    
    try:
        with open(f'{arxiv_no_leaderboard_full_txt}/{title_id}_summarised.txt', 'r') as file:
            # Read the file
            data = file.read()
            
    except Exception as e:
        # print(f"Error on file {row['title']}")
        data = ""
        missed += 1
        continue

    if len(data.split()) < 10:
        continue
    else:
        title_to_content["train_f2"][row['title']] = data
        no_lead_papers_train_f2.append(row['title'])
        already_seen.append(row['title'])
        j += 1
        
    # title_to_content["train_f2"][row['title']] = row['Context'] if len(data.split()) < 100 else data
    # no_lead_papers_train_f2.append(row['title'])
    # j += 1

print(f"train_f2 missed long context: {missed}/{len(records_train_f2)}\n")       
    
# no_lead_papers_dev_f2 = no_lead_papers_train_f2
# already_seen = no_lead_papers_dev_f2
no_lead_papers_dev_f2 = []
j = 0
missed = 0
for _, row in tqdm(enumerate(records), total = len(records)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in already_seen:
        continue 
        
    if j >= no_leaderboard_pourcentage_dev_f2:
        break 
        
    try:
        with open(f'{arxiv_no_leaderboard_full_txt}/{title_id}_summarised.txt', 'r') as file:
            # Read the file
            data = file.read()
            
    except Exception as e:
        # print(f"Error on file {row['title']}")
        data = ""
        missed += 1
        continue

    if len(data.split()) < 10:
        continue
    else:
        title_to_content["dev_f2"][row['title']] = data
        no_lead_papers_dev_f2.append(row['title'])
        already_seen.append(row['title'])
        j += 1
        
    # title_to_content["dev_f2"][row['title']] = row['Context'] if len(data.split()) < 100 else data
    # no_lead_papers_dev_f2.append(row['title'])
    # j += 1

print(f"train_f2 missed long context: {missed}/{len(records_dev_f2)}")       

 34%|███▍      | 1485/4369 [00:01<00:02, 1367.36it/s]


train_f1 missed long context: 99/12613



 49%|████▊     | 2121/4369 [00:01<00:01, 2063.14it/s]


dev_f1 missed long context: 144/5472



 79%|███████▉  | 3468/4369 [00:01<00:00, 1856.64it/s]


train_f2 missed long context: 222/12677



 92%|█████████▏| 4004/4369 [00:01<00:00, 2267.51it/s]

train_f2 missed long context: 257/5408





In [20]:
no_lead_papers_train_f1[:5]

['0912.4438.pdf',
 '0801.3581.pdf',
 '0706.0014.pdf',
 '0909.4094.pdf',
 '0812.0581.pdf']

In [21]:
no_lead_papers_train_f2[:5]

['0907.2076.pdf',
 '0707.1515.pdf',
 '0905.3432.pdf',
 '0901.4747.pdf',
 '0806.0081.pdf']

In [22]:
no_lead_papers_dev_f1[:5]

['0908.1805.pdf',
 '1003.5783.pdf',
 '0901.0911.pdf',
 '0812.2423.pdf',
 '1005.0600.pdf']

In [26]:
train_f1_pd["Lenght context"] = train_f1_pd.Context.apply(lambda x: len(x.split()))
dev_f1_pd["Lenght context"] = dev_f1_pd.Context.apply(lambda x: len(x.split()))
train_f2_pd["Lenght context"] = train_f2_pd.Context.apply(lambda x: len(x.split()))
dev_f2_pd["Lenght context"] = dev_f2_pd.Context.apply(lambda x: len(x.split()))

In [27]:
# train_pd[train_pd["Lenght context"] < 400]

In [28]:
# train_pd = train_pd[train_pd["Lenght context"] < 400]

In [29]:
print("train_f1_pd describe: ")
display(train_f1_pd.describe())
print("dev_f1_pd describe: ")
display(dev_f1_pd.describe())

print("train_f2_pd describe: ")
display(train_f2_pd.describe())
print("dev_f2_pd describe: ")
display(dev_f2_pd.describe())

train_f1_pd describe: 


Unnamed: 0,Lenght context
count,12613.0
mean,381.659558
std,120.5737
min,5.0
25%,310.0
50%,379.0
75%,448.0
max,2161.0


dev_f1_pd describe: 


Unnamed: 0,Lenght context
count,5472.0
mean,383.148757
std,114.608898
min,31.0
25%,316.0
50%,374.0
75%,448.0
max,1750.0


train_f2_pd describe: 


Unnamed: 0,Lenght context
count,12677.0
mean,384.411927
std,122.812891
min,10.0
25%,315.0
50%,381.0
75%,450.0
max,2161.0


dev_f2_pd describe: 


Unnamed: 0,Lenght context
count,5408.0
mean,376.714497
std,108.632246
min,5.0
25%,307.0
50%,372.0
75%,442.0
max,1481.0


In [30]:
df_train_f1 = pd.DataFrame(columns = ["Title", "TDMSs", "Context"])
for i, title in tqdm(enumerate(title_to_content["train_f1"].keys()), total = len(title_to_content["train_f1"].keys())):
    
    if (len(title_to_content["train_f1"][title]) < 10):
        continue 
    
    if (title not in no_lead_papers_train_f1) :
        if (title_to_tdms_dict[title] == []):
             continue

    df_train_f1 = pd.concat([df_train_f1, pd.DataFrame.from_records(
        [
            {
                'Title' : title, 
                'TDMSs' : title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable",
                'Context' : title_to_content["train_f1"][title],
                'Lenght Context': len(title_to_content["train_f1"][title].split()),
                'Lenght TDMSs': len(str(title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable").split())
            }
        ])], ignore_index = True)
print("df_train_f1 describe: ")
display(df_train_f1.describe())  

df_dev_f1 = pd.DataFrame(columns = ["Title", "TDMSs", "Context"])  
for i, title in tqdm(enumerate(title_to_content["dev_f1"].keys()), total = len(title_to_content["dev_f1"].keys())):
    
    if (len(title_to_content["dev_f1"][title]) < 10):
        continue 
    
    if (title not in no_lead_papers_dev_f1) :
        if (title_to_tdms_dict[title] == []):
             continue

    df_dev_f1 = pd.concat([df_dev_f1, pd.DataFrame.from_records(
        [
            {
                'Title' : title, 
                'TDMSs' : title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable",
                'Context' : title_to_content["dev_f1"][title],
                'Lenght Context': len(title_to_content["dev_f1"][title].split()),
                'Lenght TDMSs': len(str(title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable").split())
            }
        ])], ignore_index = True)
print("df_dev_f1 describe: ")
display(df_dev_f1.describe())  

df_train_f2 = pd.DataFrame(columns = ["Title", "TDMSs", "Context"])
for i, title in tqdm(enumerate(title_to_content["train_f2"].keys()), total = len(title_to_content["train_f2"].keys())):
    
    if (len(title_to_content["train_f2"][title]) < 10):
        continue 
    
    if (title not in no_lead_papers_train_f2) :
        if (title_to_tdms_dict[title] == []):
             continue

    df_train_f2 = pd.concat([df_train_f2, pd.DataFrame.from_records(
        [
            {
                'Title' : title, 
                'TDMSs' : title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable",
                'Context' : title_to_content["train_f2"][title],
                'Lenght Context': len(title_to_content["train_f2"][title].split()),
                'Lenght TDMSs': len(str(title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable").split())
            }
        ])], ignore_index = True)
print("df_train_f2 describe: ")
display(df_train_f2.describe())  
 
df_dev_f2 = pd.DataFrame(columns = ["Title", "TDMSs", "Context"])  
for i, title in tqdm(enumerate(title_to_content["dev_f2"].keys()), total = len(title_to_content["dev_f2"].keys())):
    
    if (len(title_to_content["dev_f2"][title]) < 10):
        continue 
    
    if (title not in no_lead_papers_dev_f2) :
        if (title_to_tdms_dict[title] == []):
             continue

    df_dev_f2 = pd.concat([df_dev_f2, pd.DataFrame.from_records(
        [
            {
                'Title' : title, 
                'TDMSs' : title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable",
                'Context' : title_to_content["dev_f2"][title],
                'Lenght Context': len(title_to_content["dev_f2"][title].split()),
                'Lenght TDMSs': len(str(title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable").split())
            }
        ])], ignore_index = True)
print("df_dev_f2 describe: ")
display(df_dev_f2.describe())  

100%|██████████| 3076/3076 [00:01<00:00, 1758.29it/s]

df_train_f1 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,3075.0,3075.0
mean,1632.739187,55.449756
std,1149.200513,104.680918
min,10.0,1.0
25%,747.5,11.0
50%,1644.0,27.0
75%,2251.0,64.0
max,16453.0,2561.0


100%|██████████| 1298/1298 [00:00<00:00, 1933.12it/s]

df_dev_f1 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,1298.0,1298.0
mean,1604.204931,54.721109
std,1097.751424,94.97897
min,11.0,1.0
25%,763.0,11.0
50%,1618.0,26.0
75%,2211.75,64.0
max,10552.0,1870.0


100%|██████████| 3083/3083 [00:01<00:00, 1770.92it/s]

df_train_f2 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,3082.0,3082.0
mean,1629.446788,55.408501
std,1124.350335,100.978942
min,10.0,1.0
25%,775.5,11.0
50%,1642.0,27.0
75%,2242.5,65.0
max,15714.0,2561.0


100%|██████████| 1291/1291 [00:00<00:00, 1665.79it/s]

df_dev_f2 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,1291.0,1291.0
mean,1616.931061,54.815647
std,1114.1749,104.062497
min,11.0,1.0
25%,691.0,11.0
50%,1644.0,28.0
75%,2237.0,63.0
max,15834.0,1829.0


In [31]:
df_dev_f2.head()

Unnamed: 0,Title,TDMSs,Context,Lenght Context,Lenght TDMSs
0,2006.10721v2.pdf,[{'LEADERBOARD': {'Task': 'Visual Object Track...,Title:\tOcean: Object-aware Anchor-free Tracki...,2986.0,53.0
1,2105.02209v1.pdf,"[{'LEADERBOARD': {'Task': 'Image Relighting', ...",Title:\t Physically Inspired Dense Fusion Netw...,1024.0,60.0
2,1906.09826v1.pdf,[{'LEADERBOARD': {'Task': 'Semantic Segmentati...,Title:\tESNet: An Efficient Symmetric Network ...,1104.0,51.0
3,1902.09314v2.pdf,[{'LEADERBOARD': {'Task': 'Sentiment Analysis'...,Title:\tAttentional Encoder Network for Target...,1330.0,201.0
4,1901.00392v2.pdf,[{'LEADERBOARD': {'Task': 'Person Re-Identific...,Title:\tAttribute-Aware Attention Model for Fi...,2024.0,42.0


In [34]:
df_dev_f2.loc[0]["Context"]

'Title:\tOcean: Object-aware Anchor-free Tracking\n\nAbstract:\tAnchor-based Siamese trackers have achieved remarkable advancements in accuracy, yet the further improvement is restricted by the lagged tracking robustness. We find the underlying reason is that the regression network in anchor-based methods is only trained on the positive anchor boxes (i.e.., IoU\u2004≥\u20040.6). This mechanism makes it difficult to refine the anchors whose overlap with the target objects are small. In this paper, we propose a novel object-aware anchor-free network to address this issue. First, instead of refining the reference anchor boxes, we directly predict the position and scale of target objects in an anchor-free fashion. Since each pixel in groundtruth boxes is well trained, the tracker is capable of rectifying inexact predictions of target objects during inference. Second, we introduce a feature alignment module to learn an object-aware feature from predicted bounding boxes. The object-aware fea

In [35]:
df_dev_f2.loc[0]["TDMSs"]

[{'LEADERBOARD': {'Task': 'Visual Object Tracking',
   'Dataset': 'VOT2019',
   'Metric': 'Expected Average Overlap (EAO)',
   'Score': '0.327'}},
 {'LEADERBOARD': {'Task': 'Visual Object Tracking',
   'Dataset': 'VOT2018',
   'Metric': 'Expected Average Overlap (EAO)',
   'Score': '0.467'}},
 {'LEADERBOARD': {'Task': 'Visual Object Tracking',
   'Dataset': 'GOT-10k',
   'Metric': 'Average Overlap',
   'Score': '61.1'}},
 {'LEADERBOARD': {'Task': 'Visual Object Tracking',
   'Dataset': 'GOT-10k',
   'Metric': 'Success Rate 0.5',
   'Score': '72.1'}}]

In [36]:
template = ['Please answer a question about this article. If the question is unanswerable, say \"unanswerable\"',
'Read this and answer the question. If the question is unanswerable, say \"unanswerable\".',
'If the question is unanswerable, say \"unanswerable\"',
'Try to answer this question if possible (otherwise reply \"unanswerable\"',
'If it is possible to answer this question, answer it for me (else, reply \"unanswerable\"',
'Answer this question, if possible (if impossible, reply \"unanswerable\"',
'Read this: What is the answer? (If it cannot be answered, return \"unanswerable\"',
'Read this: Now answer this question, if there is an answer (If it cannot be answered, return \"unanswerable\"',
'Answer based on context:',
'Answer this question based on the article:',
# ("{context}\n\n{question}", "{answer}"),
'Answer this question:',
'Read this article and answer this question',
'Based on the above article, answer a question.',
# 'Context: {context}\n\nQuestion: {question}\n\nAnswer:", "{answer}"),
]

template

['Please answer a question about this article. If the question is unanswerable, say "unanswerable"',
 'Read this and answer the question. If the question is unanswerable, say "unanswerable".',
 'If the question is unanswerable, say "unanswerable"',
 'Try to answer this question if possible (otherwise reply "unanswerable"',
 'If it is possible to answer this question, answer it for me (else, reply "unanswerable"',
 'Answer this question, if possible (if impossible, reply "unanswerable"',
 'Read this: What is the answer? (If it cannot be answered, return "unanswerable"',
 'Read this: Now answer this question, if there is an answer (If it cannot be answered, return "unanswerable"',
 'Answer based on context:',
 'Answer this question based on the article:',
 'Answer this question:',
 'Read this article and answer this question',
 'Based on the above article, answer a question.']

In [37]:
# df.head(2)

In [38]:
def create_pandas_dataset_from_pandas(df,
                                      squad_1 = False,
                                      squad_2 = False,
                                      squad_3 = False,
                                      squad_4 = False,
                                      squad_5 = False,
                                      squad_6 = False,
                                      squad_7 = False,
                                      squad_8 = False,
                                      drop_1 = False,
                                      drop_2 = False,
                                      drop_3 = False,
                                      drop_4 = False,
                                      drop_5 = False,
                                      drop_6 = False,
                                      drop_7 = False
                         ):

  ''' Create a Pandas Dataframe from pandas.
  Params:
        answer_threshold: Only consider those Question Answer pairs where the Answer is short.
  '''
  count_index = 0
  result_df  = pd.DataFrame(columns = ['prompt', 'answer'])   
  # q_type_1 = "Which Tasks are addressed in this article"
  # q_type_2 = "Which Datasets are addressed in this article"
  # q_type_3 = "Which Metrics are addressed in this article"
  # q_type_4 = "Which Tasks, Datasets, Metrics are addressed in this article"
  # q_type_5 = "Which Tasks, Datasets, Metrics and Scores are addressed in this article" 
  
  # q_types = [
  #   {"q": "Which Tasks are addressed in this article", "a_key": "Tasks"}, 
  #   {"q": "Which Datasets are addressed in this article", "a_key": "Datasets"}, 
  #   {"q": "Which Metrics are addressed in this article", "a_key": "Metrics"},
  #   {"q": "Which Tasks, Datasets, Metrics are addressed in this article", "a_key": "TDMs"},
  #   {"q": "Which Tasks, Datasets, Metrics and Scores are addressed in this article", "a_key": "TDMSs"}
  #   ]
  
  q_types = [
    # {"q": "What are the values for the following properties to construct a Leaderboard for the model introduced in this article: task, dataset, and metric?", "a_key": "TDMSs"},
    {"q": "What are the values for the following properties to construct a Leaderboard for the model introduced in this article: task, dataset, metric, and score?", "a_key": "TDMSs"},
    ]
  
  records = df.to_dict("records")
  # db_dict = defaultdict(lambda : list())
  for i, row in tqdm(enumerate(records), total = len(records)):        
      for q_type in q_types:
        
        # Squad_v2 
        if squad_1:
          result_df.loc[count_index] = [f'{row["Context"]}\n\nPlease answer a question about this article. If the question is unanswerable, say \"unanswerable\". {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_2:
          result_df.loc[count_index] = [f'Read this and answer the question. If the question is unanswerable, say \"unanswerable\".\n\n{row["Context"]}\n\n{q_type["q"]}'
  ] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_3:
          result_df.loc[count_index] = [f'{row["Context"]}\n{q_type["q"]} (If the question is unanswerable, say \"unanswerable\"'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_4:
          result_df.loc[count_index] = [f'{row["Context"]}\nTry to answer this question if possible (otherwise reply \"unanswerable\"): {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_5:
          result_df.loc[count_index] = [f'{row["Context"]}\nIf it is possible to answer this question, answer it for me (else, reply \"unanswerable\"): {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_6:
          result_df.loc[count_index] = [f'{row["Context"]}\n\nAnswer this question, if possible (if impossible, reply \"unanswerable\"): {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_7:
          result_df.loc[count_index] = [f'Read this: {row["Context"]}\n\n{q_type["q"]}\nWhat is the answer? (If it cannot be answered, return \"unanswerable\")'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_8:
          result_df.loc[count_index] = [f'Read this: {row["Context"]}\nNow answer this question, if there is an answer (If it cannot be answered, return \"unanswerable\"): {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        
        # Drop
        if drop_1:
          result_df.loc[count_index] = [f'Answer based on context:\n\n{row["Context"]}\n\n{q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_2:
          result_df.loc[count_index] = [f'{row["Context"]}\n\nAnswer this question based on the article: {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_3:
          result_df.loc[count_index] = [f'{row["Context"]}\n\n{q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_4:
          result_df.loc[count_index] = [f'{row["Context"]}\nAnswer this question: {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_5:
          result_df.loc[count_index] = [f'Read this article and answer this question {row["Context"]}\n{q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_6:
          result_df.loc[count_index] = [f'{row["Context"]}\n\nBased on the above article, answer a question. {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_7:
          result_df.loc[count_index] = [f'Context: {row["Context"]}\n\nQuestion: {q_type["q"]}\n\nAnswer:'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
         
  return result_df

In [39]:
df_train_f1_all_templates = create_pandas_dataset_from_pandas(df_train_f1,
                                                                squad_1 = True,
                                                                squad_2 = True,
                                                                squad_3 = True,
                                                                squad_4 = True,
                                                                squad_5 = True,
                                                                squad_6 = True,
                                                                squad_7 = True,
                                                                squad_8 = True,
                                                                drop_1 = True,
                                                                drop_2 = True,
                                                                drop_3 = True,
                                                                drop_4 = True,
                                                                drop_5 = True,
                                                                drop_6 = True,
                                                                drop_7 = True
                                                              ) 
df_dev_f1_all_templates = create_pandas_dataset_from_pandas(df_dev_f1,
                                                            squad_1 = True,
                                                            squad_2 = True,
                                                            squad_3 = True,
                                                            squad_4 = True,
                                                            squad_5 = True,
                                                            squad_6 = True,
                                                            squad_7 = True,
                                                            squad_8 = True,
                                                            drop_1 = True,
                                                            drop_2 = True,
                                                            drop_3 = True,
                                                            drop_4 = True,
                                                            drop_5 = True,
                                                            drop_6 = True,
                                                            drop_7 = True
                                                            ) 
df_train_f2_all_templates = create_pandas_dataset_from_pandas(df_train_f2,
                                                            squad_1 = True,
                                                            squad_2 = True,
                                                            squad_3 = True,
                                                            squad_4 = True,
                                                            squad_5 = True,
                                                            squad_6 = True,
                                                            squad_7 = True,
                                                            squad_8 = True,
                                                            drop_1 = True,
                                                            drop_2 = True,
                                                            drop_3 = True,
                                                            drop_4 = True,
                                                            drop_5 = True,
                                                            drop_6 = True,
                                                            drop_7 = True
                                                              ) 
df_dev_f2_all_templates = create_pandas_dataset_from_pandas(df_dev_f2,
                                                            squad_1 = True,
                                                            squad_2 = True,
                                                            squad_3 = True,
                                                            squad_4 = True,
                                                            squad_5 = True,
                                                            squad_6 = True,
                                                            squad_7 = True,
                                                            squad_8 = True,
                                                            drop_1 = True,
                                                            drop_2 = True,
                                                            drop_3 = True,
                                                            drop_4 = True,
                                                            drop_5 = True,
                                                            drop_6 = True,
                                                            drop_7 = True 
                                                            ) 

print("df_train_f1_all_templates describe: ")
display(df_train_f1_all_templates.describe())
print("df_dev_f1_all_templates describe: ")
display(df_dev_f1_all_templates.describe())

print("df_train_f2_all_templates describe: ")
display(df_train_f2_all_templates.describe())
print("df_dev_f2_all_templates describe: ")
display(df_dev_f2_all_templates.describe())

100%|██████████| 3075/3075 [01:04<00:00, 47.60it/s]
100%|██████████| 1298/1298 [00:16<00:00, 80.45it/s]
100%|██████████| 3082/3082 [01:05<00:00, 47.16it/s]
100%|██████████| 1291/1291 [00:15<00:00, 81.23it/s]


df_train_f1_all_templates describe: 


Unnamed: 0,prompt,answer
count,46125,46125
unique,46035,2510
top,Title:\t\n\nAbstract:\tThis document provides ...,unanswerable
freq,2,8430


df_dev_f1_all_templates describe: 


Unnamed: 0,prompt,answer
count,19470,19470
unique,19470,1057
top,Title:\t Dark Model Adaptation: Semantic Image...,unanswerable
freq,1,3615


df_train_f2_all_templates describe: 


Unnamed: 0,prompt,answer
count,46230,46230
unique,46140,2517
top,Title:\tBare Demo of IEEEtran.cls\nfor IEEE Co...,unanswerable
freq,2,8430


df_dev_f2_all_templates describe: 


Unnamed: 0,prompt,answer
count,19365,19365
unique,19350,1051
top,Read this: Title:\tTFNet: Multi-Semantic Featu...,unanswerable
freq,2,3615


In [40]:
df_train_f1_all_templates.describe()

Unnamed: 0,prompt,answer
count,46125,46125
unique,46035,2510
top,Title:\t\n\nAbstract:\tThis document provides ...,unanswerable
freq,2,8430


In [41]:
df_train_f1_all_templates['answer'].apply(type).value_counts()


answer
<class 'str'>    46125
Name: count, dtype: int64

In [42]:
df_dev_f1_all_templates['answer'].apply(type).value_counts()

answer
<class 'str'>    19470
Name: count, dtype: int64

In [43]:
str(df_dev_f1_all_templates.at[5, 'answer'])

"[{'LEADERBOARD': {'Task': 'Semantic Segmentation', 'Dataset': 'Nighttime Driving', 'Metric': 'mIoU', 'Score': '36.1'}}]"

In [44]:
df_train_f1_all_templates.to_parquet('../data/df_train_tdms_long_summarized_f1_all_templates.parquet')
df_dev_f1_all_templates.to_parquet('../data/df_dev_tdms_long_summarized_f1_all_templates.parquet')
df_train_f2_all_templates.to_parquet('../data/df_train_tdms_long_summarized_f2_all_templates.parquet')
df_dev_f2_all_templates.to_parquet('../data/df_dev_tdms_long_summarized_f2_all_templates.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f1_all_templates.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f1_all_templates.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f2_all_templates.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f2_all_templates.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_LONG_SUMMARIZED_TDMS_ALL_TEMPLATE")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 46125
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 19470
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 46230
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 19365
        })
    })
})


Saving the dataset (0/2 shards):   0%|          | 0/46125 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19470 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/46230 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19365 [00:00<?, ? examples/s]

In [45]:
# df_train_f1_all_templates = pd.read_parquet('../data/df_train_f1_all_templates.parquet')
# df_dev_f1_all_templates = pd.read_parquet('../data/df_dev_f1_all_templates.parquet')
# df_train_f2_all_templates = pd.read_parquet('../data/df_train_f2_all_templates.parquet')
# df_dev_f2_all_templates = pd.read_parquet('../data/df_dev_f2_all_templates.parquet')

# Specific Template

In [46]:
df_train_f1_squad_1 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_1 = True,
                                                        ) 
df_dev_f1_squad_1 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_1 = True,
                                                        ) 
df_train_f2_squad_1 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_1 = True,
                                                        ) 
df_dev_f2_squad_1 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_1 = True,
                                                        ) 

print("df_train_f1_squad_1 describe: ")
display(df_train_f1_squad_1.describe())
print("df_dev_f1_squad_1 describe: ")
display(df_dev_f1_squad_1.describe())

print("df_train_f2_squad_1 describe: ")
display(df_train_f2_squad_1.describe())
print("df_dev_f2_squad_1 describe: ")
display(df_dev_f2_squad_1.describe())

df_train_f1_squad_1.to_parquet('../data/df_train_tdms_long_summarized_f1_squad_1.parquet')
df_dev_f1_squad_1.to_parquet('../data/df_dev_tdms_long_summarized_f1_squad_1.parquet')
df_train_f2_squad_1.to_parquet('../data/df_train_tdms_long_summarized_f2_squad_1.parquet')
df_dev_f2_squad_1.to_parquet('../data/df_dev_tdms_long_summarized_f2_squad_1.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f1_squad_1.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f1_squad_1.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f2_squad_1.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f2_squad_1.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_LONG_SUMMARIZED_TDMS_SQUAD_1")

100%|██████████| 3075/3075 [00:01<00:00, 1560.66it/s]
100%|██████████| 1298/1298 [00:00<00:00, 1627.62it/s]
100%|██████████| 3082/3082 [00:01<00:00, 1566.52it/s]
100%|██████████| 1291/1291 [00:00<00:00, 1634.03it/s]


df_train_f1_squad_1 describe: 


Unnamed: 0,prompt,answer
count,3075,3075
unique,3069,2510
top,Title:\tIJCAI–19 Example on typesetting multip...,unanswerable
freq,2,562


df_dev_f1_squad_1 describe: 


Unnamed: 0,prompt,answer
count,1298,1298
unique,1298,1057
top,Title:\t Dark Model Adaptation: Semantic Image...,unanswerable
freq,1,241


df_train_f2_squad_1 describe: 


Unnamed: 0,prompt,answer
count,3082,3082
unique,3076,2517
top,Title:\tBare Demo of IEEEtran.cls\nfor IEEE Co...,unanswerable
freq,2,562


df_dev_f2_squad_1 describe: 


Unnamed: 0,prompt,answer
count,1291,1291
unique,1290,1051
top,Title:\tTFNet: Multi-Semantic Feature Interact...,unanswerable
freq,2,241


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3075
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1298
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3082
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1291
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/3075 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1298 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3082 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1291 [00:00<?, ? examples/s]

In [47]:
df_train_f1_squad_2 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_2 = True,
                                                        ) 
df_dev_f1_squad_2 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_2 = True,
                                                        ) 
df_train_f2_squad_2 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_2 = True,
                                                        ) 
df_dev_f2_squad_2 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_2 = True,
                                                        ) 

print("df_train_f1_squad_2 describe: ")
display(df_train_f1_squad_2.describe())
print("df_dev_f1_squad_2 describe: ")
display(df_dev_f1_squad_2.describe())

print("df_train_f2_squad_2 describe: ")
display(df_train_f2_squad_2.describe())
print("df_dev_f2_squad_2 describe: ")
display(df_dev_f2_squad_2.describe())

df_train_f1_squad_2.to_parquet('../data/df_train_tdms_long_summarized_f1_squad_2.parquet')
df_dev_f1_squad_2.to_parquet('../data/df_dev_tdms_long_summarized_f1_squad_2.parquet')
df_train_f2_squad_2.to_parquet('../data/df_train_tdms_long_summarized_f2_squad_2.parquet')
df_dev_f2_squad_2.to_parquet('../data/df_dev_tdms_long_summarized_f2_squad_2.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f1_squad_2.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f1_squad_2.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f2_squad_2.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f2_squad_2.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_LONG_SUMMARIZED_TDMS_SQUAD_2")

100%|██████████| 3075/3075 [00:01<00:00, 1602.25it/s]
100%|██████████| 1298/1298 [00:00<00:00, 1632.93it/s]
100%|██████████| 3082/3082 [00:01<00:00, 1574.88it/s]
100%|██████████| 1291/1291 [00:00<00:00, 1668.14it/s]


df_train_f1_squad_2 describe: 


Unnamed: 0,prompt,answer
count,3075,3075
unique,3069,2510
top,Read this and answer the question. If the ques...,unanswerable
freq,2,562


df_dev_f1_squad_2 describe: 


Unnamed: 0,prompt,answer
count,1298,1298
unique,1298,1057
top,Read this and answer the question. If the ques...,unanswerable
freq,1,241


df_train_f2_squad_2 describe: 


Unnamed: 0,prompt,answer
count,3082,3082
unique,3076,2517
top,Read this and answer the question. If the ques...,unanswerable
freq,2,562


df_dev_f2_squad_2 describe: 


Unnamed: 0,prompt,answer
count,1291,1291
unique,1290,1051
top,Read this and answer the question. If the ques...,unanswerable
freq,2,241


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3075
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1298
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3082
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1291
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/3075 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1298 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3082 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1291 [00:00<?, ? examples/s]

In [48]:
df_train_f1_squad_3 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_3 = True,
                                                        ) 
df_dev_f1_squad_3 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_3 = True,
                                                        ) 
df_train_f2_squad_3 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_3 = True,
                                                        ) 
df_dev_f2_squad_3 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_3 = True,
                                                        ) 

print("df_train_f1_squad_3 describe: ")
display(df_train_f1_squad_3.describe())
print("df_dev_f1_squad_3 describe: ")
display(df_dev_f1_squad_3.describe())

print("df_train_f2_squad_3 describe: ")
display(df_train_f2_squad_3.describe())
print("df_dev_f2_squad_3 describe: ")
display(df_dev_f2_squad_3.describe())

df_train_f1_squad_3.to_parquet('../data/df_train_tdms_long_summarized_f1_squad_3.parquet')
df_dev_f1_squad_3.to_parquet('../data/df_dev_tdms_long_summarized_f1_squad_3.parquet')
df_train_f2_squad_3.to_parquet('../data/df_train_tdms_long_summarized_f2_squad_3.parquet')
df_dev_f2_squad_3.to_parquet('../data/df_dev_tdms_long_summarized_f2_squad_3.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f1_squad_3.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f1_squad_3.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f2_squad_3.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f2_squad_3.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_LONG_SUMMARIZED_TDMS_SQUAD_3")

100%|██████████| 3075/3075 [00:01<00:00, 1607.40it/s]
100%|██████████| 1298/1298 [00:00<00:00, 1696.42it/s]
100%|██████████| 3082/3082 [00:01<00:00, 1580.41it/s]
100%|██████████| 1291/1291 [00:00<00:00, 1648.32it/s]


df_train_f1_squad_3 describe: 


Unnamed: 0,prompt,answer
count,3075,3075
unique,3069,2510
top,Title:\tIJCAI–19 Example on typesetting multip...,unanswerable
freq,2,562


df_dev_f1_squad_3 describe: 


Unnamed: 0,prompt,answer
count,1298,1298
unique,1298,1057
top,Title:\t Dark Model Adaptation: Semantic Image...,unanswerable
freq,1,241


df_train_f2_squad_3 describe: 


Unnamed: 0,prompt,answer
count,3082,3082
unique,3076,2517
top,Title:\tBare Demo of IEEEtran.cls\nfor IEEE Co...,unanswerable
freq,2,562


df_dev_f2_squad_3 describe: 


Unnamed: 0,prompt,answer
count,1291,1291
unique,1290,1051
top,Title:\tTFNet: Multi-Semantic Feature Interact...,unanswerable
freq,2,241


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3075
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1298
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3082
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1291
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/3075 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1298 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3082 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1291 [00:00<?, ? examples/s]

In [49]:
df_train_f1_squad_4 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_4 = True,
                                                        ) 
df_dev_f1_squad_4 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_4 = True,
                                                        ) 
df_train_f2_squad_4 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_4 = True,
                                                        ) 
df_dev_f2_squad_4 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_4 = True,
                                                        ) 

print("df_train_f1_squad_4 describe: ")
display(df_train_f1_squad_4.describe())
print("df_dev_f1_squad_4 describe: ")
display(df_dev_f1_squad_4.describe())

print("df_train_f2_squad_4 describe: ")
display(df_train_f2_squad_4.describe())
print("df_dev_f2_squad_4 describe: ")
display(df_dev_f2_squad_4.describe())

df_train_f1_squad_4.to_parquet('../data/df_train_tdms_long_summarized_f1_squad_4.parquet')
df_dev_f1_squad_4.to_parquet('../data/df_dev_tdms_long_summarized_f1_squad_4.parquet')
df_train_f2_squad_4.to_parquet('../data/df_train_tdms_long_summarized_f2_squad_4.parquet')
df_dev_f2_squad_4.to_parquet('../data/df_dev_tdms_long_summarized_f2_squad_4.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f1_squad_4.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f1_squad_4.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f2_squad_4.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f2_squad_4.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_LONG_SUMMARIZED_TDMS_SQUAD_4")

100%|██████████| 3075/3075 [00:01<00:00, 1599.80it/s]
100%|██████████| 1298/1298 [00:00<00:00, 1654.68it/s]
100%|██████████| 3082/3082 [00:01<00:00, 1578.76it/s]
100%|██████████| 1291/1291 [00:00<00:00, 1635.04it/s]


df_train_f1_squad_4 describe: 


Unnamed: 0,prompt,answer
count,3075,3075
unique,3069,2510
top,Title:\tIJCAI–19 Example on typesetting multip...,unanswerable
freq,2,562


df_dev_f1_squad_4 describe: 


Unnamed: 0,prompt,answer
count,1298,1298
unique,1298,1057
top,Title:\t Dark Model Adaptation: Semantic Image...,unanswerable
freq,1,241


df_train_f2_squad_4 describe: 


Unnamed: 0,prompt,answer
count,3082,3082
unique,3076,2517
top,Title:\tBare Demo of IEEEtran.cls\nfor IEEE Co...,unanswerable
freq,2,562


df_dev_f2_squad_4 describe: 


Unnamed: 0,prompt,answer
count,1291,1291
unique,1290,1051
top,Title:\tTFNet: Multi-Semantic Feature Interact...,unanswerable
freq,2,241


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3075
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1298
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3082
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1291
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/3075 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1298 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3082 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1291 [00:00<?, ? examples/s]

In [50]:
df_train_f1_squad_5 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_5 = True,
                                                        ) 
df_dev_f1_squad_5 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_5 = True,
                                                        ) 
df_train_f2_squad_5 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_5 = True,
                                                        ) 
df_dev_f2_squad_5 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_5 = True,
                                                        ) 

print("df_train_f1_squad_5 describe: ")
display(df_train_f1_squad_5.describe())
print("df_dev_f1_squad_5 describe: ")
display(df_dev_f1_squad_5.describe())

print("df_train_f2_squad_5 describe: ")
display(df_train_f2_squad_5.describe())
print("df_dev_f2_squad_5 describe: ")
display(df_dev_f2_squad_5.describe())

df_train_f1_squad_5.to_parquet('../data/df_train_tdms_long_summarized_f1_squad_5.parquet')
df_dev_f1_squad_5.to_parquet('../data/df_dev_tdms_long_summarized_f1_squad_5.parquet')
df_train_f2_squad_5.to_parquet('../data/df_train_tdms_long_summarized_f2_squad_5.parquet')
df_dev_f2_squad_5.to_parquet('../data/df_dev_tdms_long_summarized_f2_squad_5.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f1_squad_5.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f1_squad_5.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f2_squad_5.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f2_squad_5.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_LONG_SUMMARIZED_TDMS_SQUAD_5")

100%|██████████| 3075/3075 [00:01<00:00, 1577.98it/s]
100%|██████████| 1298/1298 [00:00<00:00, 1635.88it/s]
100%|██████████| 3082/3082 [00:01<00:00, 1588.75it/s]
100%|██████████| 1291/1291 [00:00<00:00, 1676.86it/s]


df_train_f1_squad_5 describe: 


Unnamed: 0,prompt,answer
count,3075,3075
unique,3069,2510
top,Title:\tIJCAI–19 Example on typesetting multip...,unanswerable
freq,2,562


df_dev_f1_squad_5 describe: 


Unnamed: 0,prompt,answer
count,1298,1298
unique,1298,1057
top,Title:\t Dark Model Adaptation: Semantic Image...,unanswerable
freq,1,241


df_train_f2_squad_5 describe: 


Unnamed: 0,prompt,answer
count,3082,3082
unique,3076,2517
top,Title:\tBare Demo of IEEEtran.cls\nfor IEEE Co...,unanswerable
freq,2,562


df_dev_f2_squad_5 describe: 


Unnamed: 0,prompt,answer
count,1291,1291
unique,1290,1051
top,Title:\tTFNet: Multi-Semantic Feature Interact...,unanswerable
freq,2,241


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3075
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1298
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3082
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1291
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/3075 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1298 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3082 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1291 [00:00<?, ? examples/s]

In [51]:
df_train_f1_squad_6 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_6 = True,
                                                        ) 
df_dev_f1_squad_6 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_6 = True,
                                                        ) 
df_train_f2_squad_6 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_6 = True,
                                                        ) 
df_dev_f2_squad_6 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_6 = True,
                                                        ) 

print("df_train_f1_squad_6 describe: ")
display(df_train_f1_squad_6.describe())
print("df_dev_f1_squad_6 describe: ")
display(df_dev_f1_squad_6.describe())

print("df_train_f2_squad_6 describe: ")
display(df_train_f2_squad_6.describe())
print("df_dev_f2_squad_6 describe: ")
display(df_dev_f2_squad_6.describe())

df_train_f1_squad_6.to_parquet('../data/df_train_tdms_long_summarized_f1_squad_6.parquet')
df_dev_f1_squad_6.to_parquet('../data/df_dev_tdms_long_summarized_f1_squad_6.parquet')
df_train_f2_squad_6.to_parquet('../data/df_train_tdms_long_summarized_f2_squad_6.parquet')
df_dev_f2_squad_6.to_parquet('../data/df_dev_tdms_long_summarized_f2_squad_6.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f1_squad_6.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f1_squad_6.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f2_squad_6.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f2_squad_6.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_LONG_SUMMARIZED_TDMS_SQUAD_6")

100%|██████████| 3075/3075 [00:01<00:00, 1593.57it/s]
100%|██████████| 1298/1298 [00:00<00:00, 1637.86it/s]
100%|██████████| 3082/3082 [00:01<00:00, 1568.97it/s]
100%|██████████| 1291/1291 [00:00<00:00, 1643.80it/s]


df_train_f1_squad_6 describe: 


Unnamed: 0,prompt,answer
count,3075,3075
unique,3069,2510
top,Title:\tIJCAI–19 Example on typesetting multip...,unanswerable
freq,2,562


df_dev_f1_squad_6 describe: 


Unnamed: 0,prompt,answer
count,1298,1298
unique,1298,1057
top,Title:\t Dark Model Adaptation: Semantic Image...,unanswerable
freq,1,241


df_train_f2_squad_6 describe: 


Unnamed: 0,prompt,answer
count,3082,3082
unique,3076,2517
top,Title:\tBare Demo of IEEEtran.cls\nfor IEEE Co...,unanswerable
freq,2,562


df_dev_f2_squad_6 describe: 


Unnamed: 0,prompt,answer
count,1291,1291
unique,1290,1051
top,Title:\tTFNet: Multi-Semantic Feature Interact...,unanswerable
freq,2,241


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3075
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1298
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3082
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1291
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/3075 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1298 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3082 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1291 [00:00<?, ? examples/s]

In [52]:
df_train_f1_squad_7 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_7 = True,
                                                        ) 
df_dev_f1_squad_7 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_7 = True,
                                                        ) 
df_train_f2_squad_7 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_7 = True,
                                                        ) 
df_dev_f2_squad_7 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_7 = True,
                                                        ) 

print("df_train_f1_squad_7 describe: ")
display(df_train_f1_squad_7.describe())
print("df_dev_f1_squad_7 describe: ")
display(df_dev_f1_squad_7.describe())

print("df_train_f2_squad_7 describe: ")
display(df_train_f2_squad_7.describe())
print("df_dev_f2_squad_7 describe: ")
display(df_dev_f2_squad_7.describe())

df_train_f1_squad_7.to_parquet('../data/df_train_tdms_long_summarized_f1_squad_7.parquet')
df_dev_f1_squad_7.to_parquet('../data/df_dev_tdms_long_summarized_f1_squad_7.parquet')
df_train_f2_squad_7.to_parquet('../data/df_train_tdms_long_summarized_f2_squad_7.parquet')
df_dev_f2_squad_7.to_parquet('../data/df_dev_tdms_long_summarized_f2_squad_7.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f1_squad_7.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f1_squad_7.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f2_squad_7.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f2_squad_7.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_LONG_SUMMARIZED_TDMS_SQUAD_7")

100%|██████████| 3075/3075 [00:01<00:00, 1611.16it/s]
100%|██████████| 1298/1298 [00:00<00:00, 1626.39it/s]
100%|██████████| 3082/3082 [00:01<00:00, 1574.99it/s]
100%|██████████| 1291/1291 [00:00<00:00, 1644.23it/s]


df_train_f1_squad_7 describe: 


Unnamed: 0,prompt,answer
count,3075,3075
unique,3069,2510
top,Read this: Title:\tIJCAI–19 Example on typeset...,unanswerable
freq,2,562


df_dev_f1_squad_7 describe: 


Unnamed: 0,prompt,answer
count,1298,1298
unique,1298,1057
top,Read this: Title:\t Dark Model Adaptation: Sem...,unanswerable
freq,1,241


df_train_f2_squad_7 describe: 


Unnamed: 0,prompt,answer
count,3082,3082
unique,3076,2517
top,Read this: Title:\tBare Demo of IEEEtran.cls\n...,unanswerable
freq,2,562


df_dev_f2_squad_7 describe: 


Unnamed: 0,prompt,answer
count,1291,1291
unique,1290,1051
top,Read this: Title:\tTFNet: Multi-Semantic Featu...,unanswerable
freq,2,241


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3075
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1298
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3082
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1291
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/3075 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1298 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3082 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1291 [00:00<?, ? examples/s]

In [53]:
df_train_f1_squad_8 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_8 = True,
                                                        ) 
df_dev_f1_squad_8 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_8 = True,
                                                        ) 
df_train_f2_squad_8 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_8 = True,
                                                        ) 
df_dev_f2_squad_8 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_8 = True,
                                                        ) 

print("df_train_f1_squad_8 describe: ")
display(df_train_f1_squad_8.describe())
print("df_dev_f1_squad_8 describe: ")
display(df_dev_f1_squad_8.describe())

print("df_train_f2_squad_8 describe: ")
display(df_train_f2_squad_8.describe())
print("df_dev_f2_squad_8 describe: ")
display(df_dev_f2_squad_8.describe())

df_train_f1_squad_8.to_parquet('../data/df_train_tdms_long_summarized_f1_squad_8.parquet')
df_dev_f1_squad_8.to_parquet('../data/df_dev_tdms_long_summarized_f1_squad_8.parquet')
df_train_f2_squad_8.to_parquet('../data/df_train_tdms_long_summarized_f2_squad_8.parquet')
df_dev_f2_squad_8.to_parquet('../data/df_dev_tdms_long_summarized_f2_squad_8.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f1_squad_8.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f1_squad_8.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f2_squad_8.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f2_squad_8.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_LONG_SUMMARIZED_TDMS_SQUAD_8")

100%|██████████| 3075/3075 [00:01<00:00, 1579.36it/s]
100%|██████████| 1298/1298 [00:00<00:00, 1648.85it/s]
100%|██████████| 3082/3082 [00:01<00:00, 1578.41it/s]
100%|██████████| 1291/1291 [00:00<00:00, 1648.28it/s]


df_train_f1_squad_8 describe: 


Unnamed: 0,prompt,answer
count,3075,3075
unique,3069,2510
top,Read this: Title:\tIJCAI–19 Example on typeset...,unanswerable
freq,2,562


df_dev_f1_squad_8 describe: 


Unnamed: 0,prompt,answer
count,1298,1298
unique,1298,1057
top,Read this: Title:\t Dark Model Adaptation: Sem...,unanswerable
freq,1,241


df_train_f2_squad_8 describe: 


Unnamed: 0,prompt,answer
count,3082,3082
unique,3076,2517
top,Read this: Title:\tBare Demo of IEEEtran.cls\n...,unanswerable
freq,2,562


df_dev_f2_squad_8 describe: 


Unnamed: 0,prompt,answer
count,1291,1291
unique,1290,1051
top,Read this: Title:\tTFNet: Multi-Semantic Featu...,unanswerable
freq,2,241


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3075
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1298
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3082
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1291
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/3075 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1298 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3082 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1291 [00:00<?, ? examples/s]

In [54]:
df_train_f1_drop_1 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_1 = True,
                                                        ) 
df_dev_f1_drop_1 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_1 = True,
                                                        ) 
df_train_f2_drop_1 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_1 = True,
                                                        ) 
df_dev_f2_drop_1 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_1 = True,
                                                        ) 

print("df_train_f1_drop_1 describe: ")
display(df_train_f1_drop_1.describe())
print("df_dev_f1_drop_1 describe: ")
display(df_dev_f1_drop_1.describe())

print("df_train_f2_drop_1 describe: ")
display(df_train_f2_drop_1.describe())
print("df_dev_f2_drop_1 describe: ")
display(df_dev_f2_drop_1.describe())

df_train_f1_drop_1.to_parquet('../data/df_train_tdms_long_summarized_f1_drop_1.parquet')
df_dev_f1_drop_1.to_parquet('../data/df_dev_tdms_long_summarized_f1_drop_1.parquet')
df_train_f2_drop_1.to_parquet('../data/df_train_tdms_long_summarized_f2_drop_1.parquet')
df_dev_f2_drop_1.to_parquet('../data/df_dev_tdms_long_summarized_f2_drop_1.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f1_drop_1.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f1_drop_1.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f2_drop_1.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f2_drop_1.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_LONG_SUMMARIZED_TDMS_DROP_1")

100%|██████████| 3075/3075 [00:01<00:00, 1589.31it/s]
100%|██████████| 1298/1298 [00:00<00:00, 1644.46it/s]
100%|██████████| 3082/3082 [00:01<00:00, 1561.20it/s]
100%|██████████| 1291/1291 [00:00<00:00, 1671.23it/s]


df_train_f1_drop_1 describe: 


Unnamed: 0,prompt,answer
count,3075,3075
unique,3069,2510
top,Answer based on context:\n\nTitle:\tIJCAI–19 E...,unanswerable
freq,2,562


df_dev_f1_drop_1 describe: 


Unnamed: 0,prompt,answer
count,1298,1298
unique,1298,1057
top,Answer based on context:\n\nTitle:\t Dark Mode...,unanswerable
freq,1,241


df_train_f2_drop_1 describe: 


Unnamed: 0,prompt,answer
count,3082,3082
unique,3076,2517
top,Answer based on context:\n\nTitle:\tBare Demo ...,unanswerable
freq,2,562


df_dev_f2_drop_1 describe: 


Unnamed: 0,prompt,answer
count,1291,1291
unique,1290,1051
top,Answer based on context:\n\nTitle:\tTFNet: Mul...,unanswerable
freq,2,241


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3075
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1298
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3082
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1291
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/3075 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1298 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3082 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1291 [00:00<?, ? examples/s]

In [55]:
df_train_f1_drop_2 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_2 = True,
                                                        ) 
df_dev_f1_drop_2 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_2 = True,
                                                        ) 
df_train_f2_drop_2 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_2 = True,
                                                        ) 
df_dev_f2_drop_2 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_2 = True,
                                                        ) 

print("df_train_f1_drop_2 describe: ")
display(df_train_f1_drop_2.describe())
print("df_dev_f1_drop_2 describe: ")
display(df_dev_f1_drop_2.describe())

print("df_train_f2_drop_2 describe: ")
display(df_train_f2_drop_2.describe())
print("df_dev_f2_drop_2 describe: ")
display(df_dev_f2_drop_2.describe())

df_train_f1_drop_2.to_parquet('../data/df_train_tdms_long_summarized_f1_drop_2.parquet')
df_dev_f1_drop_2.to_parquet('../data/df_dev_tdms_long_summarized_f1_drop_2.parquet')
df_train_f2_drop_2.to_parquet('../data/df_train_tdms_long_summarized_f2_drop_2.parquet')
df_dev_f2_drop_2.to_parquet('../data/df_dev_tdms_long_summarized_f2_drop_2.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f1_drop_2.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f1_drop_2.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f2_drop_2.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f2_drop_2.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_LONG_SUMMARIZED_TDMS_DROP_2")

100%|██████████| 3075/3075 [00:01<00:00, 1590.24it/s]
100%|██████████| 1298/1298 [00:00<00:00, 1630.88it/s]
100%|██████████| 3082/3082 [00:01<00:00, 1582.62it/s]
100%|██████████| 1291/1291 [00:00<00:00, 1676.76it/s]


df_train_f1_drop_2 describe: 


Unnamed: 0,prompt,answer
count,3075,3075
unique,3069,2510
top,Title:\tIJCAI–19 Example on typesetting multip...,unanswerable
freq,2,562


df_dev_f1_drop_2 describe: 


Unnamed: 0,prompt,answer
count,1298,1298
unique,1298,1057
top,Title:\t Dark Model Adaptation: Semantic Image...,unanswerable
freq,1,241


df_train_f2_drop_2 describe: 


Unnamed: 0,prompt,answer
count,3082,3082
unique,3076,2517
top,Title:\tBare Demo of IEEEtran.cls\nfor IEEE Co...,unanswerable
freq,2,562


df_dev_f2_drop_2 describe: 


Unnamed: 0,prompt,answer
count,1291,1291
unique,1290,1051
top,Title:\tTFNet: Multi-Semantic Feature Interact...,unanswerable
freq,2,241


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3075
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1298
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3082
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1291
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/3075 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1298 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3082 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1291 [00:00<?, ? examples/s]

In [56]:
df_train_f1_drop_3 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_3 = True,
                                                        ) 
df_dev_f1_drop_3 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_3 = True,
                                                        ) 
df_train_f2_drop_3 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_3 = True,
                                                        ) 
df_dev_f2_drop_3 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_3 = True,
                                                        ) 

print("df_train_f1_drop_3 describe: ")
display(df_train_f1_drop_3.describe())
print("df_dev_f1_drop_3 describe: ")
display(df_dev_f1_drop_3.describe())

print("df_train_f2_drop_3 describe: ")
display(df_train_f2_drop_3.describe())
print("df_dev_f2_drop_3 describe: ")
display(df_dev_f2_drop_3.describe())

df_train_f1_drop_3.to_parquet('../data/df_train_tdms_long_summarized_f1_drop_3.parquet')
df_dev_f1_drop_3.to_parquet('../data/df_dev_tdms_long_summarized_f1_drop_3.parquet')
df_train_f2_drop_3.to_parquet('../data/df_train_tdms_long_summarized_f2_drop_3.parquet')
df_dev_f2_drop_3.to_parquet('../data/df_dev_tdms_long_summarized_f2_drop_3.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f1_drop_3.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f1_drop_3.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f2_drop_3.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f2_drop_3.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_LONG_SUMMARIZED_TDMS_DROP_3")

100%|██████████| 3075/3075 [00:01<00:00, 1574.77it/s]
100%|██████████| 1298/1298 [00:00<00:00, 1651.95it/s]
100%|██████████| 3082/3082 [00:01<00:00, 1585.46it/s]
100%|██████████| 1291/1291 [00:00<00:00, 1644.63it/s]


df_train_f1_drop_3 describe: 


Unnamed: 0,prompt,answer
count,3075,3075
unique,3069,2510
top,Title:\tIJCAI–19 Example on typesetting multip...,unanswerable
freq,2,562


df_dev_f1_drop_3 describe: 


Unnamed: 0,prompt,answer
count,1298,1298
unique,1298,1057
top,Title:\t Dark Model Adaptation: Semantic Image...,unanswerable
freq,1,241


df_train_f2_drop_3 describe: 


Unnamed: 0,prompt,answer
count,3082,3082
unique,3076,2517
top,Title:\tBare Demo of IEEEtran.cls\nfor IEEE Co...,unanswerable
freq,2,562


df_dev_f2_drop_3 describe: 


Unnamed: 0,prompt,answer
count,1291,1291
unique,1290,1051
top,Title:\tTFNet: Multi-Semantic Feature Interact...,unanswerable
freq,2,241


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3075
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1298
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3082
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1291
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/3075 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1298 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3082 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1291 [00:00<?, ? examples/s]

In [57]:
df_train_f1_drop_4 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_4 = True,
                                                        ) 
df_dev_f1_drop_4 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_4 = True,
                                                        ) 
df_train_f2_drop_4 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_4 = True,
                                                        ) 
df_dev_f2_drop_4 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_4 = True,
                                                        ) 

print("df_train_f1_drop_4 describe: ")
display(df_train_f1_drop_4.describe())
print("df_dev_f1_drop_4 describe: ")
display(df_dev_f1_drop_4.describe())

print("df_train_f2_drop_4 describe: ")
display(df_train_f2_drop_4.describe())
print("df_dev_f2_drop_4 describe: ")
display(df_dev_f2_drop_4.describe())

df_train_f1_drop_4.to_parquet('../data/df_train_tdms_long_summarized_f1_drop_4.parquet')
df_dev_f1_drop_4.to_parquet('../data/df_dev_tdms_long_summarized_f1_drop_4.parquet')
df_train_f2_drop_4.to_parquet('../data/df_train_tdms_long_summarized_f2_drop_4.parquet')
df_dev_f2_drop_4.to_parquet('../data/df_dev_tdms_long_summarized_f2_drop_4.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f1_drop_4.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f1_drop_4.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f2_drop_4.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f2_drop_4.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_LONG_SUMMARIZED_TDMS_DROP_4")

100%|██████████| 3075/3075 [00:01<00:00, 1592.34it/s]
100%|██████████| 1298/1298 [00:00<00:00, 1667.12it/s]
100%|██████████| 3082/3082 [00:01<00:00, 1599.26it/s]
100%|██████████| 1291/1291 [00:00<00:00, 1621.11it/s]


df_train_f1_drop_4 describe: 


Unnamed: 0,prompt,answer
count,3075,3075
unique,3069,2510
top,Title:\tIJCAI–19 Example on typesetting multip...,unanswerable
freq,2,562


df_dev_f1_drop_4 describe: 


Unnamed: 0,prompt,answer
count,1298,1298
unique,1298,1057
top,Title:\t Dark Model Adaptation: Semantic Image...,unanswerable
freq,1,241


df_train_f2_drop_4 describe: 


Unnamed: 0,prompt,answer
count,3082,3082
unique,3076,2517
top,Title:\tBare Demo of IEEEtran.cls\nfor IEEE Co...,unanswerable
freq,2,562


df_dev_f2_drop_4 describe: 


Unnamed: 0,prompt,answer
count,1291,1291
unique,1290,1051
top,Title:\tTFNet: Multi-Semantic Feature Interact...,unanswerable
freq,2,241


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3075
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1298
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3082
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1291
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/3075 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1298 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3082 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1291 [00:00<?, ? examples/s]

In [58]:
df_train_f1_drop_5 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_5 = True,
                                                        ) 
df_dev_f1_drop_5 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_5 = True,
                                                        ) 
df_train_f2_drop_5 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_5 = True,
                                                        ) 
df_dev_f2_drop_5 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_5 = True,
                                                        ) 

print("df_train_f1_drop_5 describe: ")
display(df_train_f1_drop_5.describe())
print("df_dev_f1_drop_5 describe: ")
display(df_dev_f1_drop_5.describe())

print("df_train_f2_drop_5 describe: ")
display(df_train_f2_drop_5.describe())
print("df_dev_f2_drop_5 describe: ")
display(df_dev_f2_drop_5.describe())

df_train_f1_drop_5.to_parquet('../data/df_train_tdms_long_summarized_f1_drop_5.parquet')
df_dev_f1_drop_5.to_parquet('../data/df_dev_tdms_long_summarized_f1_drop_5.parquet')
df_train_f2_drop_5.to_parquet('../data/df_train_tdms_long_summarized_f2_drop_5.parquet')
df_dev_f2_drop_5.to_parquet('../data/df_dev_tdms_long_summarized_f2_drop_5.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f1_drop_5.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f1_drop_5.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f2_drop_5.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f2_drop_5.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_LONG_SUMMARIZED_TDMS_DROP_5")

100%|██████████| 3075/3075 [00:01<00:00, 1606.28it/s]
100%|██████████| 1298/1298 [00:00<00:00, 1640.91it/s]
100%|██████████| 3082/3082 [00:01<00:00, 1580.64it/s]
100%|██████████| 1291/1291 [00:00<00:00, 1631.33it/s]


df_train_f1_drop_5 describe: 


Unnamed: 0,prompt,answer
count,3075,3075
unique,3069,2510
top,Read this article and answer this question Tit...,unanswerable
freq,2,562


df_dev_f1_drop_5 describe: 


Unnamed: 0,prompt,answer
count,1298,1298
unique,1298,1057
top,Read this article and answer this question Tit...,unanswerable
freq,1,241


df_train_f2_drop_5 describe: 


Unnamed: 0,prompt,answer
count,3082,3082
unique,3076,2517
top,Read this article and answer this question Tit...,unanswerable
freq,2,562


df_dev_f2_drop_5 describe: 


Unnamed: 0,prompt,answer
count,1291,1291
unique,1290,1051
top,Read this article and answer this question Tit...,unanswerable
freq,2,241


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3075
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1298
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3082
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1291
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/3075 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1298 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3082 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1291 [00:00<?, ? examples/s]

In [59]:
df_train_f1_drop_6 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_6 = True,
                                                        ) 
df_dev_f1_drop_6 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_6 = True,
                                                        ) 
df_train_f2_drop_6 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_6 = True,
                                                        ) 
df_dev_f2_drop_6 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_6 = True,
                                                        ) 

print("df_train_f1_drop_6 describe: ")
display(df_train_f1_drop_6.describe())
print("df_dev_f1_drop_6 describe: ")
display(df_dev_f1_drop_6.describe())

print("df_train_f2_drop_6 describe: ")
display(df_train_f2_drop_6.describe())
print("df_dev_f2_drop_6 describe: ")
display(df_dev_f2_drop_6.describe())

df_train_f1_drop_6.to_parquet('../data/df_train_tdms_long_summarized_f1_drop_6.parquet')
df_dev_f1_drop_6.to_parquet('../data/df_dev_tdms_long_summarized_f1_drop_6.parquet')
df_train_f2_drop_6.to_parquet('../data/df_train_tdms_long_summarized_f2_drop_6.parquet')
df_dev_f2_drop_6.to_parquet('../data/df_dev_tdms_long_summarized_f2_drop_6.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f1_drop_6.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f1_drop_6.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f2_drop_6.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f2_drop_6.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_LONG_SUMMARIZED_TDMS_DROP_6")

100%|██████████| 3075/3075 [00:01<00:00, 1581.78it/s]
100%|██████████| 1298/1298 [00:00<00:00, 1699.56it/s]
100%|██████████| 3082/3082 [00:01<00:00, 1607.40it/s]
100%|██████████| 1291/1291 [00:00<00:00, 1650.61it/s]


df_train_f1_drop_6 describe: 


Unnamed: 0,prompt,answer
count,3075,3075
unique,3069,2510
top,Title:\tIJCAI–19 Example on typesetting multip...,unanswerable
freq,2,562


df_dev_f1_drop_6 describe: 


Unnamed: 0,prompt,answer
count,1298,1298
unique,1298,1057
top,Title:\t Dark Model Adaptation: Semantic Image...,unanswerable
freq,1,241


df_train_f2_drop_6 describe: 


Unnamed: 0,prompt,answer
count,3082,3082
unique,3076,2517
top,Title:\tBare Demo of IEEEtran.cls\nfor IEEE Co...,unanswerable
freq,2,562


df_dev_f2_drop_6 describe: 


Unnamed: 0,prompt,answer
count,1291,1291
unique,1290,1051
top,Title:\tTFNet: Multi-Semantic Feature Interact...,unanswerable
freq,2,241


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3075
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1298
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3082
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1291
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/3075 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1298 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3082 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1291 [00:00<?, ? examples/s]

In [60]:
df_train_f1_drop_7 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_7 = True,
                                                        ) 
df_dev_f1_drop_7 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_7 = True,
                                                        ) 
df_train_f2_drop_7 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_7 = True,
                                                        ) 
df_dev_f2_drop_7 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_7 = True,
                                                        ) 

print("df_train_f1_drop_7 describe: ")
display(df_train_f1_drop_7.describe())
print("df_dev_f1_drop_7 describe: ")
display(df_dev_f1_drop_7.describe())

print("df_train_f2_drop_7 describe: ")
display(df_train_f2_drop_7.describe())
print("df_dev_f2_drop_7 describe: ")
display(df_dev_f2_drop_7.describe())

df_train_f1_drop_7.to_parquet('../data/df_train_tdms_long_summarized_f1_drop_7.parquet')
df_dev_f1_drop_7.to_parquet('../data/df_dev_tdms_long_summarized_f1_drop_7.parquet')
df_train_f2_drop_7.to_parquet('../data/df_train_tdms_long_summarized_f2_drop_7.parquet')
df_dev_f2_drop_7.to_parquet('../data/df_dev_tdms_long_summarized_f2_drop_7.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f1_drop_7.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f1_drop_7.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f2_drop_7.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f2_drop_7.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_LONG_SUMMARIZED_TDMS_DROP_7")

100%|██████████| 3075/3075 [00:01<00:00, 1582.36it/s]
100%|██████████| 1298/1298 [00:00<00:00, 1615.81it/s]
100%|██████████| 3082/3082 [00:01<00:00, 1589.28it/s]
100%|██████████| 1291/1291 [00:00<00:00, 1667.31it/s]


df_train_f1_drop_7 describe: 


Unnamed: 0,prompt,answer
count,3075,3075
unique,3069,2510
top,Context: Title:\tIJCAI–19 Example on typesetti...,unanswerable
freq,2,562


df_dev_f1_drop_7 describe: 


Unnamed: 0,prompt,answer
count,1298,1298
unique,1298,1057
top,Context: Title:\t Dark Model Adaptation: Seman...,unanswerable
freq,1,241


df_train_f2_drop_7 describe: 


Unnamed: 0,prompt,answer
count,3082,3082
unique,3076,2517
top,Context: Title:\tBare Demo of IEEEtran.cls\nfo...,unanswerable
freq,2,562


df_dev_f2_drop_7 describe: 


Unnamed: 0,prompt,answer
count,1291,1291
unique,1290,1051
top,Context: Title:\tTFNet: Multi-Semantic Feature...,unanswerable
freq,2,241


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3075
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1298
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 3082
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 1291
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/3075 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1298 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3082 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1291 [00:00<?, ? examples/s]

In [61]:
type(dataset)

datasets.dataset_dict.DatasetDict

In [62]:
# root_directory = "../data/LLLM_LONG_SUMMARIZED_TDMS_ALL_TEMPLATE"
root_directory = "../data/LLLM_LONG_SUMMARIZED_TDMS_DROP_7"


# reloaded_encoded_dataset = datasets.load_from_disk("../data/dataset/LLLM_TDMS_ALL_TEMPLATE")
# reloaded_encoded_dataset = DatasetDict.load_from_disk("../data/LLLM_TDMS_ALL_TEMPLATE")

dataset_fold1 = DatasetDict.load_from_disk(f"{root_directory}/fold1")
dataset_fold2 = DatasetDict.load_from_disk(f"{root_directory}/fold2")

In [63]:
dataset_fold1['train'][0]

{'prompt': 'Context: Title:\tValue Prediction Network\n\nAbstract:\tThis paper proposes a novel deep reinforcement learning (RL) architecture, called Value Prediction Network (VPN), which integrates model-free and model-based RL methods into a single neural network. In contrast to typical model-based RL methods, VPN learns a dynamics model whose abstract states are trained to make option-conditional predictions of future values (discounted sum of rewards) rather than of future observations. Our experimental results show that VPN has several advantages over both model-free and model-based baselines in a stochastic environment where careful planning is required but building an accurate observation-prediction model is difficult. Furthermore, VPN outperforms Deep Q-Network (DQN) on several Atari games even with short-lookahead planning, demonstrating its potential as a new way of learning a good state representation.\n\nExperiments\n\nOur experiments investigated the following questions: 1

In [21]:
template = ['Please answer a question about this article. If the question is unanswerable, say \"unanswerable\"',
'Read this and answer the question. If the question is unanswerable, say \"unanswerable\".',
'If the question is unanswarable, say \"unanswerable\"',
'Try to answer this question if possible (otherwise reply \"unanswerable\"',
'If it is possible to answer this question, answer it for me (else, reply \"unanswerable\"',
'Answer this question, if possible (if impossible, reply \"unanswerable\"',
'Read this: What is the answer? (If it cannot be answered, return \"unanswerable\"',
'Read this: Now answer this question, if there is an answer (If it cannot be answered, return \"unanswerable\"',
'Answer based on context:',
'Answer this question based on theaarticle:',
# ("{context}\n\n{question}", "{answer}"),
'Answer this question:',
'Read this article and answer this question',
'Based on the above article, answer a question.',
# 'Context: {context}\n\nQuestion: {question}\n\nAnswer:", "{answer}"),
]

template

['Please answer a question about this article. If the question is unanswerable, say "unanswerable"',
 'Read this and answer the question. If the question is unanswerable, say "unanswerable".',
 'If the question is unanswerable, say "unanswerable"',
 'Try to answer this question if possible (otherwise reply "unanswerable"',
 'If it is possible to answer this question, answer it for me (else, reply "unanswerable"',
 'Answer this question, if possible (if impossible, reply "unanswerable"',
 'Read this: What is the answer? (If it cannot be answered, return "unanswerable"',
 'Read this: Now answer this question, if there is an answer (If it cannot be answered, return "unanswerable"',
 'Answer based on context:',
 'Answer this question based on the article:',
 'Answer this question:',
 'Read this article and answer this question',
 'Based on the above article, answer a question.']

In [22]:
# df.head(2)

In [23]:
def create_pandas_dataset_from_pandas(df,
                          answer_threshold=7,
                          verbose = False):

  ''' Create a Pandas Dataframe from pandas.
  Params:
        answer_threshold: Only consider those Question Answer pairs where the Answer is short.
  '''
  count_index = 0
  result_df  = pd.DataFrame(columns = ['prompt', 'answer'])   
  # q_type_1 = "Which Tasks are addressed in this article"
  # q_type_2 = "Which Datasets are addressed in this article"
  # q_type_3 = "Which Metrics are addressed in this article"
  # q_type_4 = "Which Tasks, Datasets, Metrics are addressed in this article"
  # q_type_5 = "Which Tasks, Datasets, Metrics and Scores are addressed in this article" 
  
  # q_types = [
  #   {"q": "Which Tasks are addressed in this article", "a_key": "Tasks"}, 
  #   {"q": "Which Datasets are addressed in this article", "a_key": "Datasets"}, 
  #   {"q": "Which Metrics are addressed in this article", "a_key": "Metrics"},
  #   {"q": "Which Tasks, Datasets, Metrics are addressed in this article", "a_key": "TDMs"},
  #   {"q": "Which Tasks, Datasets, Metrics and Scores are addressed in this article", "a_key": "TDMSs"}
  #   ]
  
  q_types = [
    # {"q": "What are the values for the following properties to construct a Leaderboard for the model introduced in this article: task, dataset, and metric?", "a_key": "TDMSs"},
    {"q": "What are the values for the following properties to construct a Leaderboard for the model introduced in this article: task, dataset, metric, and score?", "a_key": "TDMSs"},
    ]
  
  records = df.to_dict("records")
  # db_dict = defaultdict(lambda : list())
  for i, row in tqdm(enumerate(records), total = len(records)):        
      for q_type in q_types:
        
        # Squad_v2
        result_df.loc[count_index] = [f'{row["Context"]}\n\nPlease answer a question about this article. If the question is unanswerable, say \"unanswerable\". {q_type["q"]}'] \
          + [str(row[q_type["a_key"]])] 
        count_index += 1
        result_df.loc[count_index] = [f'Read this and answer the question. If the question is unanswerable, say \"unanswerable\".\n\n{row["Context"]}\n\n{q_type["q"]}'
] \
          + [str(row[q_type["a_key"]])] 
        count_index += 1
        result_df.loc[count_index] = [f'{row["Context"]}\n{q_type["q"]} (If the question is unanswerable, say \"unanswerable\"'] \
          + [str(row[q_type["a_key"]])] 
        count_index += 1
        result_df.loc[count_index] = [f'{row["Context"]}\nTry to answer this question if possible (otherwise reply \"unanswerable\"): {q_type["q"]}'] \
          + [str(row[q_type["a_key"]])] 
        count_index += 1
        result_df.loc[count_index] = [f'{row["Context"]}\nIf it is possible to answer this question, answer it for me (else, reply \"unanswerable\"): {q_type["q"]}'] \
          + [str(row[q_type["a_key"]])] 
        count_index += 1
        result_df.loc[count_index] = [f'{row["Context"]}\n\nAnswer this question, if possible (if impossible, reply \"unanswerable\"): {q_type["q"]}'] \
          + [str(row[q_type["a_key"]])] 
        count_index += 1
        result_df.loc[count_index] = [f'Read this: {row["Context"]}\n\n{q_type["q"]}\nWhat is the answer? (If it cannot be answered, return \"unanswerable\")'] \
          + [str(row[q_type["a_key"]])] 
        count_index += 1
        result_df.loc[count_index] = [f'Read this: {row["Context"]}\nNow answer this question, if there is an answer (If it cannot be answered, return \"unanswerable\"): {q_type["q"]}'] \
          + [str(row[q_type["a_key"]])] 
        count_index += 1
        
        # Drop
        result_df.loc[count_index] = [f'Answer based on context:\n\n{row["Context"]}\n\n{q_type["q"]}'] \
          + [str(row[q_type["a_key"]])] 
        count_index += 1
        result_df.loc[count_index] = [f'{row["Context"]}\n\nAnswer this question based on the article: {q_type["q"]}'] \
          + [str(row[q_type["a_key"]])] 
        count_index += 1
        result_df.loc[count_index] = [f'{row["Context"]}\n\n{q_type["q"]}'] \
          + [str(row[q_type["a_key"]])] 
        count_index += 1
        result_df.loc[count_index] = [f'{row["Context"]}\nAnswer this question: {q_type["q"]}'] \
          + [str(row[q_type["a_key"]])] 
        count_index += 1
        result_df.loc[count_index] = [f'Read this article and answer this question {row["Context"]}\n{q_type["q"]}'] \
          + [str(row[q_type["a_key"]])] 
        count_index += 1
        result_df.loc[count_index] = [f'{row["Context"]}\n\nBased on the above article, answer a question. {q_type["q"]}'] \
          + [str(row[q_type["a_key"]])] 
        count_index += 1
        result_df.loc[count_index] = [f'Context: {row["Context"]}\n\nQuestion: {q_type["q"]}\n\nAnswer:'] \
          + [str(row[q_type["a_key"]])] 
        count_index += 1
         
            
  if verbose:
    # return (result_df,
    #         count_long,
    #         count_short)
    return (result_df)
  else:
    return result_df

In [24]:
df_train_f1_all_templates = create_pandas_dataset_from_pandas(df_train_f1) 
df_dev_f1_all_templates = create_pandas_dataset_from_pandas(df_dev_f1) 
df_train_f2_all_templates = create_pandas_dataset_from_pandas(df_train_f2) 
df_dev_f2_all_templates = create_pandas_dataset_from_pandas(df_dev_f2) 

  0%|          | 4/5512 [00:00<03:02, 30.26it/s]

100%|██████████| 5512/5512 [09:42<00:00,  9.46it/s]
100%|██████████| 2353/2353 [01:22<00:00, 28.38it/s]
100%|██████████| 5513/5513 [09:26<00:00,  9.72it/s]
100%|██████████| 2352/2352 [01:19<00:00, 29.60it/s]


In [25]:
df_train_f1_all_templates.describe()

Unnamed: 0,prompt,answer
count,82680,82680
unique,82335,3634
top,Title\tLaTeX Author Guidelines for CVPR Procee...,unanswerable
freq,5,28080


In [26]:
print("df_train_f1_all_templates describe: ")
display(df_train_f1_all_templates.describe())
print("df_dev_f1_all_templates describe: ")
display(df_dev_f1_all_templates.describe())

print("df_train_f2_all_templates describe: ")
display(df_train_f2_all_templates.describe())
print("df_dev_f2_all_templates describe: ")
display(df_dev_f2_all_templates.describe())

df_train_f1_all_templates describe: 


Unnamed: 0,prompt,answer
count,82680,82680
unique,82335,3634
top,Title\tLaTeX Author Guidelines for CVPR Procee...,unanswerable
freq,5,28080


df_dev_f1_all_templates describe: 


Unnamed: 0,prompt,answer
count,35295,35295
unique,35145,1549
top,Read this: Title\tLaTeX Author Guidelines for ...,unanswerable
freq,5,12060


df_train_f2_all_templates describe: 


Unnamed: 0,prompt,answer
count,82695,82695
unique,82260,3637
top,Title\tLaTeX Author Guidelines for CVPR Procee...,unanswerable
freq,6,28080


df_dev_f2_all_templates describe: 


Unnamed: 0,prompt,answer
count,35280,35280
unique,35205,1548
top,Title\tBare Advanced Demo of IEEEtran.cls for\...,unanswerable
freq,2,12060


In [27]:
df_train_f1_all_templates['answer'].apply(type).value_counts()


<class 'str'>    82680
Name: answer, dtype: int64

In [28]:
df_dev_f1_all_templates['answer'].apply(type).value_counts()


<class 'str'>    35295
Name: answer, dtype: int64

In [29]:
# df_train_f1_all_templates.to_parquet('../data/df_train_long_f1_all_templates.parquet')
# df_dev_f1_all_templates.to_parquet('../data/df_dev_long_f1_all_templates.parquet')
# df_train_f2_all_templates.to_parquet('../data/df_train_long_f2_all_templates.parquet')
# df_dev_f2_all_templates.to_parquet('../data/df_dev_long_f2_all_templates.parquet')

df_train_f1_all_templates.to_parquet('../data/df_train_tdms_long_summarized_f1_all_templates.parquet')
df_dev_f1_all_templates.to_parquet('../data/df_dev_tdms_long_summarized_f1_all_templates.parquet')
df_train_f2_all_templates.to_parquet('../data/df_train_tdms_long_summarized_f2_all_templates.parquet')
df_dev_f2_all_templates.to_parquet('../data/df_dev_tdms_long_summarized_f2_all_templates.parquet')

In [30]:
# df_train_f1_all_templates = pd.read_parquet('../data/df_train_f1_all_templates.parquet')
# df_dev_f1_all_templates = pd.read_parquet('../data/df_dev_f1_all_templates.parquet')
# df_train_f2_all_templates = pd.read_parquet('../data/df_train_f2_all_templates.parquet')
# df_dev_f2_all_templates = pd.read_parquet('../data/df_dev_f2_all_templates.parquet')

In [31]:
# dataset_train_f1_all_templates = Dataset.from_pandas(df_train_f1_all_templates)
# dataset_dev_f1_all_templates = Dataset.from_pandas(df_dev_f1_all_templates)
# dataset_train_f2_all_templates = Dataset.from_pandas(df_train_f2_all_templates)
# dataset_dev_f2_all_templates = Dataset.from_pandas(df_dev_f2_all_templates)



# f1 = DatasetDict({
#         "train": dataset_train_f1_all_templates,
#         "validation": dataset_dev_f1_all_templates
#     })
# f2 = DatasetDict({
#         "train": dataset_train_f2_all_templates,
#         "validation": dataset_dev_f2_all_templates
#     })

# Combine into a DatasetDict
# dataset = DatasetDict({
#     'fold1': DatasetDict({
#         "train": dataset_train_f1_all_templates,
#         "validation": dataset_dev_f1_all_templates
#     }),
#     'fold2': DatasetDict({
#         "train": dataset_train_f2_all_templates,
#         "validation": dataset_dev_f2_all_templates
#     })
# })


dataset = DatasetDict({
    'fold1': DatasetDict({
        # "train": Dataset.from_parquet('../data/df_train_long_f1_all_templates.parquet'),
        # "validation": Dataset.from_parquet('../data/df_dev_long_f1_all_templates.parquet'),
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f1_all_templates.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f1_all_templates.parquet'),
    }),
    'fold2': DatasetDict({
        # "train": Dataset.from_parquet('../data/df_train_long_f2_all_templates.parquet'),
        # "validation": Dataset.from_parquet('../data/df_dev_long_f2_all_templates.parquet'),
        "train": Dataset.from_parquet('../data/df_train_tdms_long_summarized_f2_all_templates.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_long_summarized_f2_all_templates.parquet'),
    })
})

# dataset_fold1 = DatasetDict({
#     "train": Dataset.from_parquet('../data/df_train_f1_all_templates.parquet'),
#     "validation": Dataset.from_parquet('../data/df_dev_f1_all_templates.parquet')
#     })

# dataset_fold2 =  DatasetDict({
#     "train": Dataset.from_parquet('../data/df_train_f2_all_templates.parquet'),
#     "validation": Dataset.from_parquet('../data/df_dev_f2_all_templates.parquet')
#     })

# dataset = DatasetDict({
#     'fold1': f1,
#     'fold2': f2
# })

# dataset = DatasetDict({
#     "train": dataset_train_f1_all_templates,
#     "validation": dataset_dev_f1_all_templates
#     })

print(dataset)
# print(dataset_fold1)

Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-ac107c7899e6dda8/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-ac107c7899e6dda8/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-019016c1ed339f15/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-019016c1ed339f15/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-cab000074a3fedd8/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-cab000074a3fedd8/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-cd61d162599e5da7/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-cd61d162599e5da7/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 82680
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 35295
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 82695
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 35280
        })
    })
})


In [32]:
type(dataset)

datasets.dataset_dict.DatasetDict

In [33]:
dataset.save_to_disk("../data/LLLM_LONG_SUMMARIZED_TDMS_ALL_TEMPLATE")

Saving the dataset (0/5 shards):   0%|          | 0/82680 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/35295 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/82695 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/35280 [00:00<?, ? examples/s]

In [4]:
# root_directory = "../data/LLLM_LONG_TDM_ALL_TEMPLATE"
# root_directory = "../data/LLLM_LONG_SUMMARIZED_TDMS_ALL_TEMPLATE"
# root_directory = "../data/LLLM_LONG_TDM_ALL_TEMPLATE"
root_directory = "../data/LLLM_LONG_SUMMARIZED_TDMS_ALL_TEMPLATE"

# reloaded_encoded_dataset = datasets.load_from_disk("../data/dataset/LLLM_TDMS_ALL_TEMPLATE")
# reloaded_encoded_dataset = DatasetDict.load_from_disk("../data/LLLM_TDMS_ALL_TEMPLATE")

dataset_fold1 = DatasetDict.load_from_disk(f"{root_directory}/fold1")
dataset_fold2 = DatasetDict.load_from_disk(f"{root_directory}/fold2")

In [10]:
dataset_fold1['train'][1900]

{'prompt': 'Local Class-Specific and Global Image-Level Generative Adversarial Networks for Semantic-Guided Scene Generation In this paper, we address the task of semantic-guided scene generation. One open challenge widely observed in global image-level generation methods is the difficulty of generating small objects and detailed local texture. To tackle this issue, in this work we consider learning the scene generation in a local context, and correspondingly design a local class-specific generative network with semantic maps as a guidance, which separately constructs and learns sub-generators concentrating on the generation of different classes, and is able to provide more scene details. To learn more discriminative class-specific feature representations for the local generation, a novel classification module is also proposed. To combine the advantage of both global image-level and the local class-specific generation, a joint generation network is designed with an attention fusion mod

In [6]:
len(dataset_fold1['train'])

82680