In [1]:
# !pip install --quiet  datasets #to access squad dataset
# !pip install --quiet pyarrow   #to deal with parquet files for saving dataset if required
# !pip install --quiet  tqdm     #for progress bars
# !pip install --quiet transformers # for t5 model
# !pip install --quiet tokenizers  #tokenizers from HuggingFace
# !pip install --quiet sentencepiece #subword tokenizer used by T5
# !pip install --quiet pytorch-lightning # pytorch wrapper 
# !pip install --quiet torchtext # text utilities

# Fetching Datasets

In [2]:
#imports
import pandas as pd
import torch
from tqdm import tqdm
from datasets import DatasetDict, Dataset, load_from_disk
# from torch.utils.data import Dataset, DataLoader
from pprint import pprint
import copy
import numpy as np
from collections import defaultdict
import ipdb

pd.options.display.max_rows , pd.options.display.max_columns  = 100,100  

In [3]:
device  = 'cuda' if torch.cuda.is_available() else "cpu"
device

'cpu'

In [4]:
# path_to_source = f"/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_full_5_10_10000_clone_latex_compare/10Neg10000unk/twofoldwithunk"
path_to_csv = f"/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk"

fold1 = "fold1"
train_f1_pd = pd.read_csv(f"{path_to_csv}/{fold1}/train.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])
dev_f1_pd = pd.read_csv(f"{path_to_csv}/{fold1}/dev.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])

fold2 = "fold2"
train_f2_pd = pd.read_csv(f"{path_to_csv}/{fold2}/train.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])
dev_f2_pd = pd.read_csv(f"{path_to_csv}/{fold2}/dev.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])

In [5]:
# no_leaderboard_pd = pd.read_csv(f"/nfs/home/kabenamualus/Research/T5-Leaderboard-QA/data_proccess/arxiv_no_leaderboard_links_pdf_short/DocTAET_full.tsv", 
#                     sep="\t", names=["title", "Context"])

no_leaderboard_pd = pd.read_csv(f"/nfs/home/kabenamualus/Research/T5-Leaderboard-QA/data_proccess/arxiv_no_leaderboard_links_pdf_short/DocTAET_150.tsv", 
                    sep="\t", names=["title", "Context"])

no_leaderboard_pd.describe()

Unnamed: 0,title,Context
count,4369,4369
unique,4369,4365
top,0912.4438.pdf,! !
freq,1,2


In [6]:
resultsAnnotation_pd = pd.read_csv(f"/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/annotations_final/resultsAnnotation.tsv",
                                   sep="\t", names=["Title", "TDMSs"])
resultsAnnotation_pd = resultsAnnotation_pd.fillna("NAN")
resultsAnnotation_pd

Unnamed: 0,Title,TDMSs
0,1704.03549v4.pdf,Optical Character Recognition#FSNS - Test#Sequ...
1,1712.05404.pdf,Optical Character Recognition#FSNS - Test#Sequ...
2,1702.03970v1.pdf,Optical Character Recognition#FSNS - Test#Sequ...
3,2104.02324v1.pdf,"Active Object Detection#COCO#AP#(7.3, 13.8, 16..."
4,2008.12995v3.pdf,Handwriting Recognition#BanglaLekha Isolated D...
...,...,...
5724,2104.01378v1.pdf,Phone-level pronunciation scoring#speechocean7...
5725,2104.10283v1.pdf,Graph Question Answering#GQA#Accuracy#96.30
5726,2104.11980v1.pdf,Trajectory Modeling#NBA SportVU#1x1 NLL#0.472
5727,1704.00077v1.pdf,Video Segmentation#SegTrack v2#Accuracy#86.86


In [7]:
"""
This will take care of papers with more than one learderboard 
"""
records = resultsAnnotation_pd.to_dict("records")

title_to_tdms_dict = defaultdict(
    lambda : 
        list()
    )

for i, row in tqdm(enumerate(records), total = len(records)):
    if row['TDMSs'] == 'NAN':
        continue

    for tdms in row['TDMSs'].split("$"):
        if len(tdms.split("#")) != 4:
            # ipdb.set_trace()
            continue 
        t, d, m, s = tdms.split("#")
        title_to_tdms_dict[row['Title']].append(
            {
                "LEADERBOARD": {
                    "Task": t,
                    "Dataset": d,
                    "Metric": m,
                    "Score": s,
                }
            }            
        )

100%|██████████| 5729/5729 [00:00<00:00, 77080.06it/s]


In [8]:
len(title_to_tdms_dict)

5725

In [9]:
# No need for negative instances, but will still have 'duplicate' for paper with more than one leaderboard
train_f1_pd = train_f1_pd[train_f1_pd.label==True]
print("train_f1_pd")
display(train_f1_pd.describe())

dev_f1_pd = dev_f1_pd[dev_f1_pd.label==True]
print("dev_f1_pd")
display(dev_f1_pd.describe())

train_f2_pd = train_f2_pd[train_f2_pd.label==True]
print("train_f2_pd")
display(train_f2_pd.describe())

dev_f2_pd = dev_f2_pd[dev_f2_pd.label==True]
print("dev_f2_pd")
display(dev_f2_pd.describe())

train_f1_pd


Unnamed: 0,label,title,TDM,Context
count,12613,12613,12613,12613
unique,1,3753,1792,3747
top,True,1803.00933v1.pdf,unknown,IMPALA: Scalable Distributed Deep-RL with Impo...
freq,12613,58,923,58


dev_f1_pd


Unnamed: 0,label,title,TDM,Context
count,5472,5472,5472,5472
unique,1,1608,1557,1606
top,True,1911.08265v2.pdf,unknown,"Mastering Atari, Go, Chess and Shogi by Planni..."
freq,5472,58,378,58


train_f2_pd


Unnamed: 0,label,title,TDM,Context
count,12677,12677,12677,12677
unique,1,3753,1821,3749
top,True,1911.08265v2.pdf,unknown,"Mastering Atari, Go, Chess and Shogi by Planni..."
freq,12677,58,920,58


dev_f2_pd


Unnamed: 0,label,title,TDM,Context
count,5408,5408,5408,5408
unique,1,1608,1542,1608
top,True,1802.01561v3.pdf,unknown,IMPALA: Scalable Distributed Deep-RL with Impo...
freq,5408,58,381,58


In [10]:
# len(train_pd.title.unique())
records_train_f1 = train_f1_pd.to_dict("records")
records_dev_f1 = dev_f1_pd.to_dict("records")
records_train_f2 = train_f2_pd.to_dict("records")
records_dev_f2 = dev_f2_pd.to_dict("records")

In [11]:
# title_to_tdms_dict = defaultdict(lambda : defaultdict(lambda : str("| ")))
title_to_content = {
    "train_f1":{},
    "dev_f1":{},
    "train_f2":{},
    "dev_f2":{},
    }

for i, row in tqdm(enumerate(records_train_f1), total = len(records_train_f1)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in title_to_content["train_f1"]:
        continue 
    else:
        title_to_content["train_f1"][row['title']] = row['Context']
    
for i, row in tqdm(enumerate(records_dev_f1), total = len(records_dev_f1)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in title_to_content["dev_f1"]:
        continue 
    else:
        title_to_content["dev_f1"][row['title']] = row['Context']
        
for i, row in tqdm(enumerate(records_train_f2), total = len(records_train_f2)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in title_to_content["train_f2"]:
        continue 
    else:
        title_to_content["train_f2"][row['title']] = row['Context']
    
for i, row in tqdm(enumerate(records_dev_f2), total = len(records_dev_f2)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in title_to_content["dev_f2"]:
        continue 
    else:
        title_to_content["dev_f2"][row['title']] = row['Context']

100%|██████████| 12613/12613 [00:00<00:00, 1012202.36it/s]
100%|██████████| 5472/5472 [00:00<00:00, 960865.42it/s]
100%|██████████| 12677/12677 [00:00<00:00, 992610.97it/s]
100%|██████████| 5408/5408 [00:00<00:00, 984710.05it/s]


In [12]:
type(records)

list

In [13]:
no_leaderboard_pourcentage_train_f1 = int(len(train_f1_pd.title.unique())*50/100)
no_leaderboard_pourcentage_dev_f1 = int(len(dev_f1_pd.title.unique())*50/100)
no_leaderboard_pourcentage_train_f2 = int(len(train_f2_pd.title.unique())*50/100)
no_leaderboard_pourcentage_dev_f2 = int(len(dev_f2_pd.title.unique())*50/100)

print(f"no_leaderboard_pourcentage_train_f1: {no_leaderboard_pourcentage_train_f1}")
print(f"no_leaderboard_pourcentage_dev_f1: {no_leaderboard_pourcentage_dev_f1}")
print(f"no_leaderboard_pourcentage_train_f2: {no_leaderboard_pourcentage_train_f2}")
print(f"no_leaderboard_pourcentage_dev_f2: {no_leaderboard_pourcentage_dev_f2}")

no_leaderboard_pourcentage_train_f1: 1876
no_leaderboard_pourcentage_dev_f1: 804
no_leaderboard_pourcentage_train_f2: 1876
no_leaderboard_pourcentage_dev_f2: 804


In [None]:
(1876 + 1876)/2

In [None]:
(804+804)/2

In [14]:
# no_leaderboard_pourcentage = int(len(train_pd.title.unique())*50/100)
# no_leaderboard_pourcentage

In [15]:
records = no_leaderboard_pd.to_dict("records")

# For train only F1
no_lead_papers_train_f1 = []

already_seen = no_lead_papers_train_f1
no_lead_papers_train_f1 = []
i = 0
for _, row in tqdm(enumerate(records), total = len(records)):
    
    if row['title'] in already_seen:
        continue 
        
    if i >= no_leaderboard_pourcentage_train_f1:
        break 
        
    title_to_content["train_f1"][row['title']] = row['Context']
    no_lead_papers_train_f1.append(row['title'])
    i += 1
    
no_lead_papers_dev_f1 = no_lead_papers_train_f1
already_seen = no_lead_papers_dev_f1
no_lead_papers_dev_f1 = []
i = 0
for _, row in tqdm(enumerate(records), total = len(records)):
    
    if row['title'] in already_seen:
        continue 
        
    if i >= no_leaderboard_pourcentage_dev_f1:
        break 
        
    title_to_content["dev_f1"][row['title']] = row['Context']
    no_lead_papers_dev_f1.append(row['title'])  
    i += 1
    
    
# For train only F2
no_lead_papers_train_f2 = []

already_seen = no_lead_papers_train_f2
no_lead_papers_train_f2 = []
j = 0
for _, row in tqdm(enumerate(records), total = len(records)):
    
    if row['title'] in already_seen:
        continue 
        
    if j >= no_leaderboard_pourcentage_train_f2:
        break 
        
    title_to_content["train_f2"][row['title']] = row['Context']
    no_lead_papers_train_f2.append(row['title'])
    j += 1
    
no_lead_papers_dev_f2 = no_lead_papers_train_f2
already_seen = no_lead_papers_dev_f2
no_lead_papers_dev_f2 = []
j =0
for _, row in tqdm(enumerate(records), total = len(records)):
    
    if row['title'] in already_seen:
        continue 
        
    if j >= no_leaderboard_pourcentage_dev_f2:
        break 
        
    title_to_content["dev_f2"][row['title']] = row['Context']
    no_lead_papers_dev_f2.append(row['title'])
    j += 1

 43%|████▎     | 1876/4369 [00:00<00:00, 317214.85it/s]
 61%|██████▏   | 2680/4369 [00:00<00:00, 46365.59it/s]
 43%|████▎     | 1876/4369 [00:00<00:00, 1259365.29it/s]
 61%|██████▏   | 2680/4369 [00:00<00:00, 79233.20it/s]


In [16]:
# train_f1_pd["Lenght context"] = train_f1_pd.Context.apply(lambda x: len(x.split()))
# dev_f1_pd["Lenght context"] = dev_f1_pd.Context.apply(lambda x: len(x.split()))
# train_f2_pd["Lenght context"] = train_f2_pd.Context.apply(lambda x: len(x.split()))
# dev_f2_pd["Lenght context"] = dev_f2_pd.Context.apply(lambda x: len(x.split()))

In [17]:
# train_pd[train_pd["Lenght context"] < 400]

In [18]:
# train_pd = train_pd[train_pd["Lenght context"] < 400]

In [19]:
# print("train_f1_pd describe: ")
# display(train_f1_pd.describe())
# print("dev_f1_pd describe: ")
# display(dev_f1_pd.describe())

# print("train_f2_pd describe: ")
# display(train_f2_pd.describe())
# print("dev_f2_pd describe: ")
# display(dev_f2_pd.describe())

In [20]:
df_train_f1 = pd.DataFrame(columns = ["Title", "TDMSs", "Context"])
for i, title in tqdm(enumerate(title_to_content["train_f1"].keys()), total = len(title_to_content["train_f1"].keys())):
    
    if (len(title_to_content["train_f1"][title]) < 10):
        continue 
    
    if (title not in no_lead_papers_train_f1) :
        if (title_to_tdms_dict[title] == []):
             continue

    df_train_f1 = pd.concat([df_train_f1, pd.DataFrame.from_records(
        [
            {
                'Title' : title, 
                'TDMSs' : title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable",
                'Context' : title_to_content["train_f1"][title],
                'Lenght Context': len(title_to_content["train_f1"][title].split()),
                'Lenght TDMSs': len(str(title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable").split())
            }
        ])], ignore_index = True)
print("df_train_f1 describe: ")
display(df_train_f1.describe())  

df_dev_f1 = pd.DataFrame(columns = ["Title", "TDMSs", "Context"])  
for i, title in tqdm(enumerate(title_to_content["dev_f1"].keys()), total = len(title_to_content["dev_f1"].keys())):
    
    if (len(title_to_content["dev_f1"][title]) < 10):
        continue 
    
    if (title not in no_lead_papers_dev_f1) :
        if (title_to_tdms_dict[title] == []):
             continue

    df_dev_f1 = pd.concat([df_dev_f1, pd.DataFrame.from_records(
        [
            {
                'Title' : title, 
                'TDMSs' : title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable",
                'Context' : title_to_content["dev_f1"][title],
                'Lenght Context': len(title_to_content["dev_f1"][title].split()),
                'Lenght TDMSs': len(str(title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable").split())
            }
        ])], ignore_index = True)
print("df_dev_f1 describe: ")
display(df_dev_f1.describe())  

df_train_f2 = pd.DataFrame(columns = ["Title", "TDMSs", "Context"])
for i, title in tqdm(enumerate(title_to_content["train_f2"].keys()), total = len(title_to_content["train_f2"].keys())):
    
    if (len(title_to_content["train_f2"][title]) < 10):
        continue 
    
    if (title not in no_lead_papers_train_f2) :
        if (title_to_tdms_dict[title] == []):
             continue

    df_train_f2 = pd.concat([df_train_f2, pd.DataFrame.from_records(
        [
            {
                'Title' : title, 
                'TDMSs' : title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable",
                'Context' : title_to_content["train_f2"][title],
                'Lenght Context': len(title_to_content["train_f2"][title].split()),
                'Lenght TDMSs': len(str(title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable").split())
            }
        ])], ignore_index = True)
print("df_train_f2 describe: ")
display(df_train_f2.describe())  
 
df_dev_f2 = pd.DataFrame(columns = ["Title", "TDMSs", "Context"])  
for i, title in tqdm(enumerate(title_to_content["dev_f2"].keys()), total = len(title_to_content["dev_f2"].keys())):
    
    if (len(title_to_content["dev_f2"][title]) < 10):
        continue 
    
    if (title not in no_lead_papers_dev_f2) :
        if (title_to_tdms_dict[title] == []):
             continue

    df_dev_f2 = pd.concat([df_dev_f2, pd.DataFrame.from_records(
        [
            {
                'Title' : title, 
                'TDMSs' : title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable",
                'Context' : title_to_content["dev_f2"][title],
                'Lenght Context': len(title_to_content["dev_f2"][title].split()),
                'Lenght TDMSs': len(str(title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable").split())
            }
        ])], ignore_index = True)
print("df_dev_f2 describe: ")
display(df_dev_f2.describe())  

 10%|█         | 572/5629 [00:00<00:01, 2782.31it/s]

100%|██████████| 5629/5629 [00:02<00:00, 2503.47it/s]

df_train_f1 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,5512.0,5512.0
mean,318.346154,46.065312
std,167.995792,102.111303
min,3.0,1.0
25%,194.0,1.0
50%,323.0,20.0
75%,419.0,52.0
max,2510.0,2963.0


100%|██████████| 2412/2412 [00:00<00:00, 2694.34it/s]

df_dev_f1 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,2353.0,2353.0
mean,321.895028,45.59541
std,160.476254,86.319714
min,4.0,1.0
25%,197.0,1.0
50%,329.0,20.0
75%,428.0,52.0
max,1750.0,1870.0


100%|██████████| 5629/5629 [00:02<00:00, 2482.42it/s]

df_train_f2 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,5513.0,5513.0
mean,319.830219,46.404498
std,169.799786,100.723495
min,3.0,1.0
25%,194.0,1.0
50%,324.0,20.0
75%,423.0,52.0
max,2510.0,2963.0


100%|██████████| 2412/2412 [00:00<00:00, 2754.31it/s]

df_dev_f2 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,2352.0,2352.0
mean,318.417942,44.80017
std,155.981723,90.045289
min,4.0,1.0
25%,197.0,1.0
50%,326.0,20.0
75%,417.0,53.0
max,1481.0,1829.0


In [3]:
(46.065312+46.404498)/2

46.234905

In [4]:
(45.595410+44.800170)/2

45.19779

In [21]:
template = [\
    'Please answer a question about this article. If the question is unanswerable, say \"unanswerable\"',
    'Read this and answer the question. If the question is unanswerable, say \"unanswerable\".',
    'If the question is unanswerable, say \"unanswerable\"',
    'Try to answer this question if possible (otherwise reply \"unanswerable\"',
    'If it is possible to answer this question, answer it for me (else, reply \"unanswerable\"',
    'Answer this question, if possible (if impossible, reply \"unanswerable\"',
    'Read this: What is the answer? (If it cannot be answered, return \"unanswerable\"',
    'Read this: Now answer this question, if there is an answer (If it cannot be answered, return \"unanswerable\"',
    'Answer based on context:',
    'Answer this question based on the article:',
    'Answer this question:',
    'Read this article and answer this question',
    'Based on the above article, answer a question.',
]

template

['Please answer a question about this article. If the question is unanswerable, say "unanswerable"',
 'Read this and answer the question. If the question is unanswerable, say "unanswerable".',
 'If the question is unanswerable, say "unanswerable"',
 'Try to answer this question if possible (otherwise reply "unanswerable"',
 'If it is possible to answer this question, answer it for me (else, reply "unanswerable"',
 'Answer this question, if possible (if impossible, reply "unanswerable"',
 'Read this: What is the answer? (If it cannot be answered, return "unanswerable"',
 'Read this: Now answer this question, if there is an answer (If it cannot be answered, return "unanswerable"',
 'Answer based on context:',
 'Answer this question based on the article:',
 'Answer this question:',
 'Read this article and answer this question',
 'Based on the above article, answer a question.']

In [22]:
# df.head(2)

In [21]:
def create_pandas_dataset_from_pandas(df,
                                      squad_1 = False,
                                      squad_2 = False,
                                      squad_3 = False,
                                      squad_4 = False,
                                      squad_5 = False,
                                      squad_6 = False,
                                      squad_7 = False,
                                      squad_8 = False,
                                      drop_1 = False,
                                      drop_2 = False,
                                      drop_3 = False,
                                      drop_4 = False,
                                      drop_5 = False,
                                      drop_6 = False,
                                      drop_7 = False
                         ):

  ''' Create a Pandas Dataframe from pandas.
  Params:
        answer_threshold: Only consider those Question Answer pairs where the Answer is short.
  '''
  count_index = 0
  result_df  = pd.DataFrame(columns = ['prompt', 'answer'])   
  # q_type_1 = "Which Tasks are addressed in this article"
  # q_type_2 = "Which Datasets are addressed in this article"
  # q_type_3 = "Which Metrics are addressed in this article"
  # q_type_4 = "Which Tasks, Datasets, Metrics are addressed in this article"
  # q_type_5 = "Which Tasks, Datasets, Metrics and Scores are addressed in this article" 
  
  # q_types = [
  #   {"q": "Which Tasks are addressed in this article", "a_key": "Tasks"}, 
  #   {"q": "Which Datasets are addressed in this article", "a_key": "Datasets"}, 
  #   {"q": "Which Metrics are addressed in this article", "a_key": "Metrics"},
  #   {"q": "Which Tasks, Datasets, Metrics are addressed in this article", "a_key": "TDMs"},
  #   {"q": "Which Tasks, Datasets, Metrics and Scores are addressed in this article", "a_key": "TDMSs"}
  #   ]
  
  q_types = [
    # {"q": "What are the values for the following properties to construct a Leaderboard for the model introduced in this article: task, dataset, and metric?", "a_key": "TDMSs"},
    {"q": "What are the values for the following properties to construct a Leaderboard for the model introduced in this article: task, dataset, metric, and score?", "a_key": "TDMSs"},
    ]
  
  records = df.to_dict("records")
  # db_dict = defaultdict(lambda : list())
  for i, row in tqdm(enumerate(records), total = len(records)):        
      for q_type in q_types:
        
        # Squad_v2 
        if squad_1:
          result_df.loc[count_index] = [f'{row["Context"]}\n\nPlease answer a question about this article. If the question is unanswerable, say \"unanswerable\". {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_2:
          result_df.loc[count_index] = [f'Read this and answer the question. If the question is unanswerable, say \"unanswerable\".\n\n{row["Context"]}\n\n{q_type["q"]}'
  ] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_3:
          result_df.loc[count_index] = [f'{row["Context"]}\n{q_type["q"]} (If the question is unanswerable, say \"unanswerable\"'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_4:
          result_df.loc[count_index] = [f'{row["Context"]}\nTry to answer this question if possible (otherwise reply \"unanswerable\"): {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_5:
          result_df.loc[count_index] = [f'{row["Context"]}\nIf it is possible to answer this question, answer it for me (else, reply \"unanswerable\"): {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_6:
          result_df.loc[count_index] = [f'{row["Context"]}\n\nAnswer this question, if possible (if impossible, reply \"unanswerable\"): {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_7:
          result_df.loc[count_index] = [f'Read this: {row["Context"]}\n\n{q_type["q"]}\nWhat is the answer? (If it cannot be answered, return \"unanswerable\")'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_8:
          result_df.loc[count_index] = [f'Read this: {row["Context"]}\nNow answer this question, if there is an answer (If it cannot be answered, return \"unanswerable\"): {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        
        # Drop
        if drop_1:
          result_df.loc[count_index] = [f'Answer based on context:\n\n{row["Context"]}\n\n{q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_2:
          result_df.loc[count_index] = [f'{row["Context"]}\n\nAnswer this question based on the article: {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_3:
          result_df.loc[count_index] = [f'{row["Context"]}\n\n{q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_4:
          result_df.loc[count_index] = [f'{row["Context"]}\nAnswer this question: {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_5:
          result_df.loc[count_index] = [f'Read this article and answer this question {row["Context"]}\n{q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_6:
          result_df.loc[count_index] = [f'{row["Context"]}\n\nBased on the above article, answer a question. {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_7:
          result_df.loc[count_index] = [f'Context: {row["Context"]}\n\nQuestion: {q_type["q"]}\n\nAnswer:'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
         
  return result_df

In [22]:
df_train_f1_all_templates = create_pandas_dataset_from_pandas(df_train_f1,
                                                                squad_1 = True,
                                                                squad_2 = True,
                                                                squad_3 = True,
                                                                squad_4 = True,
                                                                squad_5 = True,
                                                                squad_6 = True,
                                                                squad_7 = True,
                                                                squad_8 = True,
                                                                drop_1 = True,
                                                                drop_2 = True,
                                                                drop_3 = True,
                                                                drop_4 = True,
                                                                drop_5 = True,
                                                                drop_6 = True,
                                                                drop_7 = True
                                                              ) 
df_dev_f1_all_templates = create_pandas_dataset_from_pandas(df_dev_f1,
                                                            squad_1 = True,
                                                            squad_2 = True,
                                                            squad_3 = True,
                                                            squad_4 = True,
                                                            squad_5 = True,
                                                            squad_6 = True,
                                                            squad_7 = True,
                                                            squad_8 = True,
                                                            drop_1 = True,
                                                            drop_2 = True,
                                                            drop_3 = True,
                                                            drop_4 = True,
                                                            drop_5 = True,
                                                            drop_6 = True,
                                                            drop_7 = True
                                                            ) 
df_train_f2_all_templates = create_pandas_dataset_from_pandas(df_train_f2,
                                                            squad_1 = True,
                                                            squad_2 = True,
                                                            squad_3 = True,
                                                            squad_4 = True,
                                                            squad_5 = True,
                                                            squad_6 = True,
                                                            squad_7 = True,
                                                            squad_8 = True,
                                                            drop_1 = True,
                                                            drop_2 = True,
                                                            drop_3 = True,
                                                            drop_4 = True,
                                                            drop_5 = True,
                                                            drop_6 = True,
                                                            drop_7 = True
                                                              ) 
df_dev_f2_all_templates = create_pandas_dataset_from_pandas(df_dev_f2,
                                                            squad_1 = True,
                                                            squad_2 = True,
                                                            squad_3 = True,
                                                            squad_4 = True,
                                                            squad_5 = True,
                                                            squad_6 = True,
                                                            squad_7 = True,
                                                            squad_8 = True,
                                                            drop_1 = True,
                                                            drop_2 = True,
                                                            drop_3 = True,
                                                            drop_4 = True,
                                                            drop_5 = True,
                                                            drop_6 = True,
                                                            drop_7 = True 
                                                            ) 

print("df_train_f1_all_templates describe: ")
display(df_train_f1_all_templates.describe())
print("df_dev_f1_all_templates describe: ")
display(df_dev_f1_all_templates.describe())

print("df_train_f2_all_templates describe: ")
display(df_train_f2_all_templates.describe())
print("df_dev_f2_all_templates describe: ")
display(df_dev_f2_all_templates.describe())

  0%|          | 27/5512 [00:00<00:40, 134.48it/s]

100%|██████████| 5512/5512 [02:43<00:00, 33.71it/s]
100%|██████████| 2353/2353 [00:27<00:00, 85.49it/s]
100%|██████████| 5513/5513 [02:42<00:00, 34.01it/s]
100%|██████████| 2352/2352 [00:27<00:00, 87.04it/s]


df_train_f1_all_templates describe: 


Unnamed: 0,prompt,answer
count,82680,82680
unique,82650,3634
top,Read this and answer the question. If the ques...,unanswerable
freq,2,28080


df_dev_f1_all_templates describe: 


Unnamed: 0,prompt,answer
count,35295,35295
unique,35280,1549
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,12060


df_train_f2_all_templates describe: 


Unnamed: 0,prompt,answer
count,82695,82695
unique,82665,3637
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,28080


df_dev_f2_all_templates describe: 


Unnamed: 0,prompt,answer
count,35280,35280
unique,35280,1548
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,12060


In [1]:
(82680+82695)/2

82687.5

In [2]:
(35295+35280)/2

35287.5

In [23]:
df_train_f1_all_templates.describe()

Unnamed: 0,prompt,answer
count,82680,82680
unique,82650,3634
top,Read this and answer the question. If the ques...,unanswerable
freq,2,28080


In [24]:
df_train_f1_all_templates['answer'].apply(type).value_counts()


answer
<class 'str'>    82680
Name: count, dtype: int64

In [26]:
df_dev_f1_all_templates['answer'].apply(type).value_counts()

answer
<class 'str'>    35295
Name: count, dtype: int64

In [27]:
str(df_dev_f1_all_templates.at[5, 'answer'])

"[{'LEADERBOARD': {'Task': 'Semantic Segmentation', 'Dataset': 'Nighttime Driving', 'Metric': 'mIoU', 'Score': '36.1'}}]"

In [29]:
# df_train_f1_all_templates.to_parquet('../data/df_train_tdms_docteat_f1_all_templates.parquet')
# df_dev_f1_all_templates.to_parquet('../data/df_dev_tdms_docteat_f1_all_templates.parquet')
# df_train_f2_all_templates.to_parquet('../data/df_train_tdms_docteat_f2_all_templates.parquet')
# df_dev_f2_all_templates.to_parquet('../data/df_dev_tdms_docteat_f2_all_templates.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f1_all_templates.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f1_all_templates.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f2_all_templates.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f2_all_templates.parquet')
    })
})

print(dataset)

# dataset.save_to_disk("../data/LLLM_DOCTEAT_TDMS_ALL_TEMPLATE")

In [30]:
# df_train_f1_all_templates = pd.read_parquet('../data/df_train_f1_all_templates.parquet')
# df_dev_f1_all_templates = pd.read_parquet('../data/df_dev_f1_all_templates.parquet')
# df_train_f2_all_templates = pd.read_parquet('../data/df_train_f2_all_templates.parquet')
# df_dev_f2_all_templates = pd.read_parquet('../data/df_dev_f2_all_templates.parquet')

# Specific Template

In [28]:
df_train_f1_squad_1 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_1 = True,
                                                        ) 
df_dev_f1_squad_1 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_1 = True,
                                                        ) 
df_train_f2_squad_1 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_1 = True,
                                                        ) 
df_dev_f2_squad_1 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_1 = True,
                                                        ) 

print("df_train_f1_squad_1 describe: ")
display(df_train_f1_squad_1.describe())
print("df_dev_f1_squad_1 describe: ")
display(df_dev_f1_squad_1.describe())

print("df_train_f2_squad_1 describe: ")
display(df_train_f2_squad_1.describe())
print("df_dev_f2_squad_1 describe: ")
display(df_dev_f2_squad_1.describe())

df_train_f1_squad_1.to_parquet('../data/df_train_tdms_docteat_f1_squad_1.parquet')
df_dev_f1_squad_1.to_parquet('../data/df_dev_tdms_docteat_f1_squad_1.parquet')
df_train_f2_squad_1.to_parquet('../data/df_train_tdms_docteat_f2_squad_1.parquet')
df_dev_f2_squad_1.to_parquet('../data/df_dev_tdms_docteat_f2_squad_1.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f1_squad_1.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f1_squad_1.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f2_squad_1.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f2_squad_1.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDMS_SQUAD_1")

100%|██████████| 5512/5512 [00:02<00:00, 2125.78it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2289.29it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2132.79it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2304.66it/s]

df_train_f1_squad_1 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,3634
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


df_dev_f1_squad_1 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1549
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,804


df_train_f2_squad_1 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,3637
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,1872


df_dev_f2_squad_1 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1548
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [29]:
df_train_f1_squad_2 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_2 = True,
                                                        ) 
df_dev_f1_squad_2 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_2 = True,
                                                        ) 
df_train_f2_squad_2 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_2 = True,
                                                        ) 
df_dev_f2_squad_2 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_2 = True,
                                                        ) 

print("df_train_f1_squad_2 describe: ")
display(df_train_f1_squad_2.describe())
print("df_dev_f1_squad_2 describe: ")
display(df_dev_f1_squad_2.describe())

print("df_train_f2_squad_2 describe: ")
display(df_train_f2_squad_2.describe())
print("df_dev_f2_squad_2 describe: ")
display(df_dev_f2_squad_2.describe())

df_train_f1_squad_2.to_parquet('../data/df_train_tdms_docteat_f1_squad_2.parquet')
df_dev_f1_squad_2.to_parquet('../data/df_dev_tdms_docteat_f1_squad_2.parquet')
df_train_f2_squad_2.to_parquet('../data/df_train_tdms_docteat_f2_squad_2.parquet')
df_dev_f2_squad_2.to_parquet('../data/df_dev_tdms_docteat_f2_squad_2.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f1_squad_2.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f1_squad_2.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f2_squad_2.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f2_squad_2.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDMS_SQUAD_2")

100%|██████████| 5512/5512 [00:02<00:00, 2083.65it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2275.41it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2136.38it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2280.16it/s]

df_train_f1_squad_2 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,3634
top,Read this and answer the question. If the ques...,unanswerable
freq,2,1872


df_dev_f1_squad_2 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1549
top,Read this and answer the question. If the ques...,unanswerable
freq,2,804


df_train_f2_squad_2 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,3637
top,Read this and answer the question. If the ques...,unanswerable
freq,2,1872


df_dev_f2_squad_2 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1548
top,Read this and answer the question. If the ques...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [30]:
df_train_f1_squad_3 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_3 = True,
                                                        ) 
df_dev_f1_squad_3 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_3 = True,
                                                        ) 
df_train_f2_squad_3 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_3 = True,
                                                        ) 
df_dev_f2_squad_3 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_3 = True,
                                                        ) 

print("df_train_f1_squad_3 describe: ")
display(df_train_f1_squad_3.describe())
print("df_dev_f1_squad_3 describe: ")
display(df_dev_f1_squad_3.describe())

print("df_train_f2_squad_3 describe: ")
display(df_train_f2_squad_3.describe())
print("df_dev_f2_squad_3 describe: ")
display(df_dev_f2_squad_3.describe())

df_train_f1_squad_3.to_parquet('../data/df_train_tdms_docteat_f1_squad_3.parquet')
df_dev_f1_squad_3.to_parquet('../data/df_dev_tdms_docteat_f1_squad_3.parquet')
df_train_f2_squad_3.to_parquet('../data/df_train_tdms_docteat_f2_squad_3.parquet')
df_dev_f2_squad_3.to_parquet('../data/df_dev_tdms_docteat_f2_squad_3.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f1_squad_3.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f1_squad_3.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f2_squad_3.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f2_squad_3.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDMS_SQUAD_3")

100%|██████████| 5512/5512 [00:02<00:00, 2125.01it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2270.67it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2125.78it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2282.67it/s]

df_train_f1_squad_3 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,3634
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


df_dev_f1_squad_3 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1549
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,804


df_train_f2_squad_3 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,3637
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,1872


df_dev_f2_squad_3 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1548
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [31]:
df_train_f1_squad_4 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_4 = True,
                                                        ) 
df_dev_f1_squad_4 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_4 = True,
                                                        ) 
df_train_f2_squad_4 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_4 = True,
                                                        ) 
df_dev_f2_squad_4 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_4 = True,
                                                        ) 

print("df_train_f1_squad_4 describe: ")
display(df_train_f1_squad_4.describe())
print("df_dev_f1_squad_4 describe: ")
display(df_dev_f1_squad_4.describe())

print("df_train_f2_squad_4 describe: ")
display(df_train_f2_squad_4.describe())
print("df_dev_f2_squad_4 describe: ")
display(df_dev_f2_squad_4.describe())

df_train_f1_squad_4.to_parquet('../data/df_train_tdms_docteat_f1_squad_4.parquet')
df_dev_f1_squad_4.to_parquet('../data/df_dev_tdms_docteat_f1_squad_4.parquet')
df_train_f2_squad_4.to_parquet('../data/df_train_tdms_docteat_f2_squad_4.parquet')
df_dev_f2_squad_4.to_parquet('../data/df_dev_tdms_docteat_f2_squad_4.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f1_squad_4.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f1_squad_4.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f2_squad_4.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f2_squad_4.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDMS_SQUAD_4")

100%|██████████| 5512/5512 [00:02<00:00, 2104.84it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2218.74it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2095.07it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2272.84it/s]

df_train_f1_squad_4 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,3634
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


df_dev_f1_squad_4 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1549
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,804


df_train_f2_squad_4 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,3637
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,1872


df_dev_f2_squad_4 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1548
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [32]:
df_train_f1_squad_5 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_5 = True,
                                                        ) 
df_dev_f1_squad_5 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_5 = True,
                                                        ) 
df_train_f2_squad_5 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_5 = True,
                                                        ) 
df_dev_f2_squad_5 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_5 = True,
                                                        ) 

print("df_train_f1_squad_5 describe: ")
display(df_train_f1_squad_5.describe())
print("df_dev_f1_squad_5 describe: ")
display(df_dev_f1_squad_5.describe())

print("df_train_f2_squad_5 describe: ")
display(df_train_f2_squad_5.describe())
print("df_dev_f2_squad_5 describe: ")
display(df_dev_f2_squad_5.describe())

df_train_f1_squad_5.to_parquet('../data/df_train_tdms_docteat_f1_squad_5.parquet')
df_dev_f1_squad_5.to_parquet('../data/df_dev_tdms_docteat_f1_squad_5.parquet')
df_train_f2_squad_5.to_parquet('../data/df_train_tdms_docteat_f2_squad_5.parquet')
df_dev_f2_squad_5.to_parquet('../data/df_dev_tdms_docteat_f2_squad_5.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f1_squad_5.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f1_squad_5.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f2_squad_5.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f2_squad_5.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDMS_SQUAD_5")

100%|██████████| 5512/5512 [00:02<00:00, 2130.75it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2288.52it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2139.01it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2292.59it/s]

df_train_f1_squad_5 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,3634
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


df_dev_f1_squad_5 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1549
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,804


df_train_f2_squad_5 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,3637
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,1872


df_dev_f2_squad_5 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1548
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [33]:
df_train_f1_squad_6 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_6 = True,
                                                        ) 
df_dev_f1_squad_6 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_6 = True,
                                                        ) 
df_train_f2_squad_6 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_6 = True,
                                                        ) 
df_dev_f2_squad_6 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_6 = True,
                                                        ) 

print("df_train_f1_squad_6 describe: ")
display(df_train_f1_squad_6.describe())
print("df_dev_f1_squad_6 describe: ")
display(df_dev_f1_squad_6.describe())

print("df_train_f2_squad_6 describe: ")
display(df_train_f2_squad_6.describe())
print("df_dev_f2_squad_6 describe: ")
display(df_dev_f2_squad_6.describe())

df_train_f1_squad_6.to_parquet('../data/df_train_tdms_docteat_f1_squad_6.parquet')
df_dev_f1_squad_6.to_parquet('../data/df_dev_tdms_docteat_f1_squad_6.parquet')
df_train_f2_squad_6.to_parquet('../data/df_train_tdms_docteat_f2_squad_6.parquet')
df_dev_f2_squad_6.to_parquet('../data/df_dev_tdms_docteat_f2_squad_6.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f1_squad_6.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f1_squad_6.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f2_squad_6.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f2_squad_6.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDMS_SQUAD_6")

100%|██████████| 5512/5512 [00:02<00:00, 2139.09it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2251.28it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2118.51it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2294.09it/s]

df_train_f1_squad_6 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,3634
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


df_dev_f1_squad_6 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1549
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,804


df_train_f2_squad_6 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,3637
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,1872


df_dev_f2_squad_6 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1548
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [34]:
df_train_f1_squad_7 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_7 = True,
                                                        ) 
df_dev_f1_squad_7 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_7 = True,
                                                        ) 
df_train_f2_squad_7 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_7 = True,
                                                        ) 
df_dev_f2_squad_7 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_7 = True,
                                                        ) 

print("df_train_f1_squad_7 describe: ")
display(df_train_f1_squad_7.describe())
print("df_dev_f1_squad_7 describe: ")
display(df_dev_f1_squad_7.describe())

print("df_train_f2_squad_7 describe: ")
display(df_train_f2_squad_7.describe())
print("df_dev_f2_squad_7 describe: ")
display(df_dev_f2_squad_7.describe())

df_train_f1_squad_7.to_parquet('../data/df_train_tdms_docteat_f1_squad_7.parquet')
df_dev_f1_squad_7.to_parquet('../data/df_dev_tdms_docteat_f1_squad_7.parquet')
df_train_f2_squad_7.to_parquet('../data/df_train_tdms_docteat_f2_squad_7.parquet')
df_dev_f2_squad_7.to_parquet('../data/df_dev_tdms_docteat_f2_squad_7.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f1_squad_7.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f1_squad_7.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f2_squad_7.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f2_squad_7.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDMS_SQUAD_7")

100%|██████████| 5512/5512 [00:02<00:00, 2150.67it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2253.27it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2142.49it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2307.14it/s]

df_train_f1_squad_7 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,3634
top,Read this: FixMatch: Simplifying Semi-Supervis...,unanswerable
freq,2,1872


df_dev_f1_squad_7 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1549
top,Read this: PANDA: Adapting Pretrained Features...,unanswerable
freq,2,804


df_train_f2_squad_7 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,3637
top,Read this: IEEE TRANSACTIONS ON PATTERN ANALYS...,unanswerable
freq,2,1872


df_dev_f2_squad_7 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1548
top,Read this: Ocean: Object-aware Anchor-free Tra...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [35]:
df_train_f1_squad_8 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_8 = True,
                                                        ) 
df_dev_f1_squad_8 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_8 = True,
                                                        ) 
df_train_f2_squad_8 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_8 = True,
                                                        ) 
df_dev_f2_squad_8 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_8 = True,
                                                        ) 

print("df_train_f1_squad_8 describe: ")
display(df_train_f1_squad_8.describe())
print("df_dev_f1_squad_8 describe: ")
display(df_dev_f1_squad_8.describe())

print("df_train_f2_squad_8 describe: ")
display(df_train_f2_squad_8.describe())
print("df_dev_f2_squad_8 describe: ")
display(df_dev_f2_squad_8.describe())

df_train_f1_squad_8.to_parquet('../data/df_train_tdms_docteat_f1_squad_8.parquet')
df_dev_f1_squad_8.to_parquet('../data/df_dev_tdms_docteat_f1_squad_8.parquet')
df_train_f2_squad_8.to_parquet('../data/df_train_tdms_docteat_f2_squad_8.parquet')
df_dev_f2_squad_8.to_parquet('../data/df_dev_tdms_docteat_f2_squad_8.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f1_squad_8.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f1_squad_8.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f2_squad_8.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f2_squad_8.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDMS_SQUAD_8")

100%|██████████| 5512/5512 [00:02<00:00, 2176.77it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2281.51it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2115.45it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2301.36it/s]

df_train_f1_squad_8 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,3634
top,Read this: FixMatch: Simplifying Semi-Supervis...,unanswerable
freq,2,1872


df_dev_f1_squad_8 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1549
top,Read this: PANDA: Adapting Pretrained Features...,unanswerable
freq,2,804


df_train_f2_squad_8 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,3637
top,Read this: IEEE TRANSACTIONS ON PATTERN ANALYS...,unanswerable
freq,2,1872


df_dev_f2_squad_8 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1548
top,Read this: Ocean: Object-aware Anchor-free Tra...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [36]:
df_train_f1_drop_1 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_1 = True,
                                                        ) 
df_dev_f1_drop_1 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_1 = True,
                                                        ) 
df_train_f2_drop_1 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_1 = True,
                                                        ) 
df_dev_f2_drop_1 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_1 = True,
                                                        ) 

print("df_train_f1_drop_1 describe: ")
display(df_train_f1_drop_1.describe())
print("df_dev_f1_drop_1 describe: ")
display(df_dev_f1_drop_1.describe())

print("df_train_f2_drop_1 describe: ")
display(df_train_f2_drop_1.describe())
print("df_dev_f2_drop_1 describe: ")
display(df_dev_f2_drop_1.describe())

df_train_f1_drop_1.to_parquet('../data/df_train_tdms_docteat_f1_drop_1.parquet')
df_dev_f1_drop_1.to_parquet('../data/df_dev_tdms_docteat_f1_drop_1.parquet')
df_train_f2_drop_1.to_parquet('../data/df_train_tdms_docteat_f2_drop_1.parquet')
df_dev_f2_drop_1.to_parquet('../data/df_dev_tdms_docteat_f2_drop_1.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f1_drop_1.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f1_drop_1.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f2_drop_1.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f2_drop_1.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDMS_DROP_1")

100%|██████████| 5512/5512 [00:02<00:00, 2174.24it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2304.66it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2142.61it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2302.10it/s]

df_train_f1_drop_1 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,3634
top,Answer based on context:\n\nFixMatch: Simplify...,unanswerable
freq,2,1872


df_dev_f1_drop_1 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1549
top,Answer based on context:\n\nPANDA: Adapting Pr...,unanswerable
freq,2,804


df_train_f2_drop_1 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,3637
top,Answer based on context:\n\nIEEE TRANSACTIONS ...,unanswerable
freq,2,1872


df_dev_f2_drop_1 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1548
top,Answer based on context:\n\nOcean: Object-awar...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [37]:
df_train_f1_drop_2 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_2 = True,
                                                        ) 
df_dev_f1_drop_2 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_2 = True,
                                                        ) 
df_train_f2_drop_2 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_2 = True,
                                                        ) 
df_dev_f2_drop_2 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_2 = True,
                                                        ) 

print("df_train_f1_drop_2 describe: ")
display(df_train_f1_drop_2.describe())
print("df_dev_f1_drop_2 describe: ")
display(df_dev_f1_drop_2.describe())

print("df_train_f2_drop_2 describe: ")
display(df_train_f2_drop_2.describe())
print("df_dev_f2_drop_2 describe: ")
display(df_dev_f2_drop_2.describe())

df_train_f1_drop_2.to_parquet('../data/df_train_tdms_docteat_f1_drop_2.parquet')
df_dev_f1_drop_2.to_parquet('../data/df_dev_tdms_docteat_f1_drop_2.parquet')
df_train_f2_drop_2.to_parquet('../data/df_train_tdms_docteat_f2_drop_2.parquet')
df_dev_f2_drop_2.to_parquet('../data/df_dev_tdms_docteat_f2_drop_2.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f1_drop_2.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f1_drop_2.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f2_drop_2.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f2_drop_2.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDMS_DROP_2")

100%|██████████| 5512/5512 [00:02<00:00, 2165.93it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2308.81it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2169.10it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2310.23it/s]

df_train_f1_drop_2 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,3634
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


df_dev_f1_drop_2 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1549
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,804


df_train_f2_drop_2 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,3637
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,1872


df_dev_f2_drop_2 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1548
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [38]:
df_train_f1_drop_3 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_3 = True,
                                                        ) 
df_dev_f1_drop_3 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_3 = True,
                                                        ) 
df_train_f2_drop_3 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_3 = True,
                                                        ) 
df_dev_f2_drop_3 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_3 = True,
                                                        ) 

print("df_train_f1_drop_3 describe: ")
display(df_train_f1_drop_3.describe())
print("df_dev_f1_drop_3 describe: ")
display(df_dev_f1_drop_3.describe())

print("df_train_f2_drop_3 describe: ")
display(df_train_f2_drop_3.describe())
print("df_dev_f2_drop_3 describe: ")
display(df_dev_f2_drop_3.describe())

df_train_f1_drop_3.to_parquet('../data/df_train_tdms_docteat_f1_drop_3.parquet')
df_dev_f1_drop_3.to_parquet('../data/df_dev_tdms_docteat_f1_drop_3.parquet')
df_train_f2_drop_3.to_parquet('../data/df_train_tdms_docteat_f2_drop_3.parquet')
df_dev_f2_drop_3.to_parquet('../data/df_dev_tdms_docteat_f2_drop_3.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f1_drop_3.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f1_drop_3.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f2_drop_3.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f2_drop_3.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDMS_DROP_3")

100%|██████████| 5512/5512 [00:02<00:00, 2185.47it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2307.82it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2127.82it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2301.48it/s]

df_train_f1_drop_3 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,3634
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


df_dev_f1_drop_3 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1549
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,804


df_train_f2_drop_3 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,3637
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,1872


df_dev_f2_drop_3 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1548
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [39]:
df_train_f1_drop_4 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_4 = True,
                                                        ) 
df_dev_f1_drop_4 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_4 = True,
                                                        ) 
df_train_f2_drop_4 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_4 = True,
                                                        ) 
df_dev_f2_drop_4 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_4 = True,
                                                        ) 

print("df_train_f1_drop_4 describe: ")
display(df_train_f1_drop_4.describe())
print("df_dev_f1_drop_4 describe: ")
display(df_dev_f1_drop_4.describe())

print("df_train_f2_drop_4 describe: ")
display(df_train_f2_drop_4.describe())
print("df_dev_f2_drop_4 describe: ")
display(df_dev_f2_drop_4.describe())

df_train_f1_drop_4.to_parquet('../data/df_train_tdms_docteat_f1_drop_4.parquet')
df_dev_f1_drop_4.to_parquet('../data/df_dev_tdms_docteat_f1_drop_4.parquet')
df_train_f2_drop_4.to_parquet('../data/df_train_tdms_docteat_f2_drop_4.parquet')
df_dev_f2_drop_4.to_parquet('../data/df_dev_tdms_docteat_f2_drop_4.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f1_drop_4.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f1_drop_4.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f2_drop_4.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f2_drop_4.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDMS_DROP_4")

100%|██████████| 5512/5512 [00:02<00:00, 2133.01it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2286.51it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2130.71it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2289.56it/s]

df_train_f1_drop_4 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,3634
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


df_dev_f1_drop_4 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1549
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,804


df_train_f2_drop_4 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,3637
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,1872


df_dev_f2_drop_4 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1548
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [40]:
df_train_f1_drop_5 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_5 = True,
                                                        ) 
df_dev_f1_drop_5 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_5 = True,
                                                        ) 
df_train_f2_drop_5 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_5 = True,
                                                        ) 
df_dev_f2_drop_5 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_5 = True,
                                                        ) 

print("df_train_f1_drop_5 describe: ")
display(df_train_f1_drop_5.describe())
print("df_dev_f1_drop_5 describe: ")
display(df_dev_f1_drop_5.describe())

print("df_train_f2_drop_5 describe: ")
display(df_train_f2_drop_5.describe())
print("df_dev_f2_drop_5 describe: ")
display(df_dev_f2_drop_5.describe())

df_train_f1_drop_5.to_parquet('../data/df_train_tdms_docteat_f1_drop_5.parquet')
df_dev_f1_drop_5.to_parquet('../data/df_dev_tdms_docteat_f1_drop_5.parquet')
df_train_f2_drop_5.to_parquet('../data/df_train_tdms_docteat_f2_drop_5.parquet')
df_dev_f2_drop_5.to_parquet('../data/df_dev_tdms_docteat_f2_drop_5.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f1_drop_5.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f1_drop_5.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f2_drop_5.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f2_drop_5.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDMS_DROP_5")

100%|██████████| 5512/5512 [00:02<00:00, 2159.04it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2307.77it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2156.86it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2326.07it/s]

df_train_f1_drop_5 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,3634
top,Read this article and answer this question Fix...,unanswerable
freq,2,1872


df_dev_f1_drop_5 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1549
top,Read this article and answer this question PAN...,unanswerable
freq,2,804


df_train_f2_drop_5 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,3637
top,Read this article and answer this question IEE...,unanswerable
freq,2,1872


df_dev_f2_drop_5 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1548
top,Read this article and answer this question Oce...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [41]:
df_train_f1_drop_6 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_6 = True,
                                                        ) 
df_dev_f1_drop_6 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_6 = True,
                                                        ) 
df_train_f2_drop_6 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_6 = True,
                                                        ) 
df_dev_f2_drop_6 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_6 = True,
                                                        ) 

print("df_train_f1_drop_6 describe: ")
display(df_train_f1_drop_6.describe())
print("df_dev_f1_drop_6 describe: ")
display(df_dev_f1_drop_6.describe())

print("df_train_f2_drop_6 describe: ")
display(df_train_f2_drop_6.describe())
print("df_dev_f2_drop_6 describe: ")
display(df_dev_f2_drop_6.describe())

df_train_f1_drop_6.to_parquet('../data/df_train_tdms_docteat_f1_drop_6.parquet')
df_dev_f1_drop_6.to_parquet('../data/df_dev_tdms_docteat_f1_drop_6.parquet')
df_train_f2_drop_6.to_parquet('../data/df_train_tdms_docteat_f2_drop_6.parquet')
df_dev_f2_drop_6.to_parquet('../data/df_dev_tdms_docteat_f2_drop_6.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f1_drop_6.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f1_drop_6.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f2_drop_6.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f2_drop_6.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDMS_DROP_6")

100%|██████████| 5512/5512 [00:02<00:00, 2166.68it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2311.09it/s]
100%|██████████| 5513/5513 [00:02<00:00, 1949.34it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2298.17it/s]

df_train_f1_drop_6 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,3634
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


df_dev_f1_drop_6 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1549
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,804


df_train_f2_drop_6 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,3637
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,1872


df_dev_f2_drop_6 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1548
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [42]:
df_train_f1_drop_7 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_7 = True,
                                                        ) 
df_dev_f1_drop_7 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_7 = True,
                                                        ) 
df_train_f2_drop_7 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_7 = True,
                                                        ) 
df_dev_f2_drop_7 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_7 = True,
                                                        ) 

print("df_train_f1_drop_7 describe: ")
display(df_train_f1_drop_7.describe())
print("df_dev_f1_drop_7 describe: ")
display(df_dev_f1_drop_7.describe())

print("df_train_f2_drop_7 describe: ")
display(df_train_f2_drop_7.describe())
print("df_dev_f2_drop_7 describe: ")
display(df_dev_f2_drop_7.describe())

df_train_f1_drop_7.to_parquet('../data/df_train_tdms_docteat_f1_drop_7.parquet')
df_dev_f1_drop_7.to_parquet('../data/df_dev_tdms_docteat_f1_drop_7.parquet')
df_train_f2_drop_7.to_parquet('../data/df_train_tdms_docteat_f2_drop_7.parquet')
df_dev_f2_drop_7.to_parquet('../data/df_dev_tdms_docteat_f2_drop_7.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f1_drop_7.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f1_drop_7.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_docteat_f2_drop_7.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_docteat_f2_drop_7.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDMS_DROP_7")

100%|██████████| 5512/5512 [00:02<00:00, 2131.89it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2250.30it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2142.85it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2300.72it/s]

df_train_f1_drop_7 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,3634
top,Context: FixMatch: Simplifying Semi-Supervised...,unanswerable
freq,2,1872


df_dev_f1_drop_7 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1549
top,Context: PANDA: Adapting Pretrained Features f...,unanswerable
freq,2,804


df_train_f2_drop_7 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,3637
top,Context: IEEE TRANSACTIONS ON PATTERN ANALYSIS...,unanswerable
freq,2,1872


df_dev_f2_drop_7 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1548
top,Context: Ocean: Object-aware Anchor-free Track...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [43]:
type(dataset)

datasets.dataset_dict.DatasetDict

In [44]:
# root_directory = "../data/LLLM_DOCTEAT_TDMS_ALL_TEMPLATE"
root_directory = "../data/LLLM_DOCTEAT_TDMS_DROP_7"


# reloaded_encoded_dataset = datasets.load_from_disk("../data/dataset/LLLM_TDMS_ALL_TEMPLATE")
# reloaded_encoded_dataset = DatasetDict.load_from_disk("../data/LLLM_TDMS_ALL_TEMPLATE")

dataset_fold1 = DatasetDict.load_from_disk(f"{root_directory}/fold1")
dataset_fold2 = DatasetDict.load_from_disk(f"{root_directory}/fold2")

In [45]:
dataset_fold1['train'][0]

{'prompt': "Context: Value Prediction Network This paper proposes a novel deep reinforcement learning (RL) architecture, called Value Prediction Network (VPN), which integrates model-free and model-based RL methods into a single neural network. In contrast to typical model-based RL methods, VPN learns a dynamics model whose abstract states are trained to make option-conditional predictions of future values (discounted sum of rewards) rather than of future observations. Our experimental results show that VPN has several advantages over both model-free and model-based baselines in a stochastic environment where careful planning is required but building an accurate observation-prediction model is difficult. Furthermore, VPN outperforms Deep Q-Network (DQN) on several Atari games even with short-lookahead planning, demonstrating its potential as anew way of learning a good state representation. VPN has four more hyperparameters: 1) the number of predictions steps (k) during training, 2) th