In [1]:
# !pip install --quiet  datasets #to access squad dataset
# !pip install --quiet pyarrow   #to deal with parquet files for saving dataset if required
# !pip install --quiet  tqdm     #for progress bars
# !pip install --quiet transformers # for t5 model
# !pip install --quiet tokenizers  #tokenizers from HuggingFace
# !pip install --quiet sentencepiece #subword tokenizer used by T5
# !pip install --quiet pytorch-lightning # pytorch wrapper 
# !pip install --quiet torchtext # text utilities

# Fetching Datasets

In [2]:
#imports
import pandas as pd
import torch
from tqdm import tqdm
from datasets import DatasetDict, Dataset, load_from_disk
# from torch.utils.data import Dataset, DataLoader
from pprint import pprint
import copy
import numpy as np
from collections import defaultdict
import ipdb

pd.options.display.max_rows , pd.options.display.max_columns  = 100,100  

In [3]:
device  = 'cuda' if torch.cuda.is_available() else "cpu"
device

'cpu'

In [4]:
# path_to_source = f"/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_full_5_10_10000_clone_latex_compare/10Neg10000unk/twofoldwithunk"
path_to_csv = f"/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk"

fold1 = "fold1"
train_f1_pd = pd.read_csv(f"{path_to_csv}/{fold1}/train.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])
dev_f1_pd = pd.read_csv(f"{path_to_csv}/{fold1}/dev.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])

fold2 = "fold2"
train_f2_pd = pd.read_csv(f"{path_to_csv}/{fold2}/train.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])
dev_f2_pd = pd.read_csv(f"{path_to_csv}/{fold2}/dev.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])

In [5]:
# no_leaderboard_pd = pd.read_csv(f"/nfs/home/kabenamualus/Research/T5-Leaderboard-QA/data_proccess/arxiv_no_leaderboard_links_pdf_short/DocTAET_full.tsv", 
#                     sep="\t", names=["title", "Context"])

no_leaderboard_pd = pd.read_csv(f"/nfs/home/kabenamualus/Research/T5-Leaderboard-QA/data_proccess/arxiv_no_leaderboard_links_pdf_short/DocTAET_150.tsv", 
                    sep="\t", names=["title", "Context"])

no_leaderboard_pd.describe()

Unnamed: 0,title,Context
count,4369,4369
unique,4369,4365
top,0912.4438.pdf,! !
freq,1,2


In [6]:
resultsAnnotation_pd = pd.read_csv(f"/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/annotations_final/resultsAnnotation.tsv",
                                   sep="\t", names=["Title", "TDMSs"])
resultsAnnotation_pd = resultsAnnotation_pd.fillna("NAN")
resultsAnnotation_pd

Unnamed: 0,Title,TDMSs
0,1704.03549v4.pdf,Optical Character Recognition#FSNS - Test#Sequ...
1,1712.05404.pdf,Optical Character Recognition#FSNS - Test#Sequ...
2,1702.03970v1.pdf,Optical Character Recognition#FSNS - Test#Sequ...
3,2104.02324v1.pdf,"Active Object Detection#COCO#AP#(7.3, 13.8, 16..."
4,2008.12995v3.pdf,Handwriting Recognition#BanglaLekha Isolated D...
...,...,...
5724,2104.01378v1.pdf,Phone-level pronunciation scoring#speechocean7...
5725,2104.10283v1.pdf,Graph Question Answering#GQA#Accuracy#96.30
5726,2104.11980v1.pdf,Trajectory Modeling#NBA SportVU#1x1 NLL#0.472
5727,1704.00077v1.pdf,Video Segmentation#SegTrack v2#Accuracy#86.86


In [7]:
"""
This will take care of papers with more than one learderboard 
"""
records = resultsAnnotation_pd.to_dict("records")

title_to_tdms_dict = defaultdict(
    lambda : 
        list()
    )

for i, row in tqdm(enumerate(records), total = len(records)):
    if row['TDMSs'] == 'NAN':
        continue

    for tdms in row['TDMSs'].split("$"):
        if len(tdms.split("#")) != 4:
            # ipdb.set_trace()
            continue 
        t, d, m, s = tdms.split("#")
        title_to_tdms_dict[row['Title']].append(
            {
                "LEADERBOARD": {
                    "Task": t,
                    "Dataset": d,
                    "Metric": m,
                    # "Score": s,
                }
            }            
        )

100%|██████████| 5729/5729 [00:00<00:00, 119578.44it/s]


In [8]:
len(title_to_tdms_dict)

5725

In [9]:
# No need for negative instances, but will still have 'duplicate' for paper with more than one leaderboard
train_f1_pd = train_f1_pd[train_f1_pd.label==True]
print("train_f1_pd")
display(train_f1_pd.describe())

dev_f1_pd = dev_f1_pd[dev_f1_pd.label==True]
print("dev_f1_pd")
display(dev_f1_pd.describe())

train_f2_pd = train_f2_pd[train_f2_pd.label==True]
print("train_f2_pd")
display(train_f2_pd.describe())

dev_f2_pd = dev_f2_pd[dev_f2_pd.label==True]
print("dev_f2_pd")
display(dev_f2_pd.describe())

train_f1_pd


Unnamed: 0,label,title,TDM,Context
count,12613,12613,12613,12613
unique,1,3753,1792,3747
top,True,1803.00933v1.pdf,unknown,IMPALA: Scalable Distributed Deep-RL with Impo...
freq,12613,58,923,58


dev_f1_pd


Unnamed: 0,label,title,TDM,Context
count,5472,5472,5472,5472
unique,1,1608,1557,1606
top,True,1911.08265v2.pdf,unknown,"Mastering Atari, Go, Chess and Shogi by Planni..."
freq,5472,58,378,58


train_f2_pd


Unnamed: 0,label,title,TDM,Context
count,12677,12677,12677,12677
unique,1,3753,1821,3749
top,True,1911.08265v2.pdf,unknown,"Mastering Atari, Go, Chess and Shogi by Planni..."
freq,12677,58,920,58


dev_f2_pd


Unnamed: 0,label,title,TDM,Context
count,5408,5408,5408,5408
unique,1,1608,1542,1608
top,True,1802.01561v3.pdf,unknown,IMPALA: Scalable Distributed Deep-RL with Impo...
freq,5408,58,381,58


In [10]:
# len(train_pd.title.unique())
records_train_f1 = train_f1_pd.to_dict("records")
records_dev_f1 = dev_f1_pd.to_dict("records")
records_train_f2 = train_f2_pd.to_dict("records")
records_dev_f2 = dev_f2_pd.to_dict("records")

In [11]:
# title_to_tdms_dict = defaultdict(lambda : defaultdict(lambda : str("| ")))
title_to_content = {
    "train_f1":{},
    "dev_f1":{},
    "train_f2":{},
    "dev_f2":{},
    }

for i, row in tqdm(enumerate(records_train_f1), total = len(records_train_f1)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in title_to_content["train_f1"]:
        continue 
    else:
        title_to_content["train_f1"][row['title']] = row['Context']
    
for i, row in tqdm(enumerate(records_dev_f1), total = len(records_dev_f1)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in title_to_content["dev_f1"]:
        continue 
    else:
        title_to_content["dev_f1"][row['title']] = row['Context']
        
for i, row in tqdm(enumerate(records_train_f2), total = len(records_train_f2)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in title_to_content["train_f2"]:
        continue 
    else:
        title_to_content["train_f2"][row['title']] = row['Context']
    
for i, row in tqdm(enumerate(records_dev_f2), total = len(records_dev_f2)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in title_to_content["dev_f2"]:
        continue 
    else:
        title_to_content["dev_f2"][row['title']] = row['Context']

100%|██████████| 12613/12613 [00:00<00:00, 1842980.54it/s]
100%|██████████| 5472/5472 [00:00<00:00, 864322.95it/s]
100%|██████████| 12677/12677 [00:00<00:00, 857074.56it/s]
100%|██████████| 5408/5408 [00:00<00:00, 860467.97it/s]


In [12]:
type(records)

list

In [13]:
no_leaderboard_pourcentage_train_f1 = int(len(train_f1_pd.title.unique())*50/100)
no_leaderboard_pourcentage_dev_f1 = int(len(dev_f1_pd.title.unique())*50/100)
no_leaderboard_pourcentage_train_f2 = int(len(train_f2_pd.title.unique())*50/100)
no_leaderboard_pourcentage_dev_f2 = int(len(dev_f2_pd.title.unique())*50/100)

print(f"no_leaderboard_pourcentage_train_f1: {no_leaderboard_pourcentage_train_f1}")
print(f"no_leaderboard_pourcentage_dev_f1: {no_leaderboard_pourcentage_dev_f1}")
print(f"no_leaderboard_pourcentage_train_f2: {no_leaderboard_pourcentage_train_f2}")
print(f"no_leaderboard_pourcentage_dev_f2: {no_leaderboard_pourcentage_dev_f2}")

no_leaderboard_pourcentage_train_f1: 1876
no_leaderboard_pourcentage_dev_f1: 804
no_leaderboard_pourcentage_train_f2: 1876
no_leaderboard_pourcentage_dev_f2: 804


In [14]:
# no_leaderboard_pourcentage = int(len(train_pd.title.unique())*50/100)
# no_leaderboard_pourcentage

In [15]:
records = no_leaderboard_pd.to_dict("records")

# For train only F1
no_lead_papers_train_f1 = []

already_seen = no_lead_papers_train_f1
no_lead_papers_train_f1 = []
i = 0
for _, row in tqdm(enumerate(records), total = len(records)):
    
    if row['title'] in already_seen:
        continue 
        
    if i >= no_leaderboard_pourcentage_train_f1:
        break 
        
    title_to_content["train_f1"][row['title']] = row['Context']
    no_lead_papers_train_f1.append(row['title'])
    i += 1
    
no_lead_papers_dev_f1 = no_lead_papers_train_f1
already_seen = no_lead_papers_dev_f1
no_lead_papers_dev_f1 = []
i = 0
for _, row in tqdm(enumerate(records), total = len(records)):
    
    if row['title'] in already_seen:
        continue 
        
    if i >= no_leaderboard_pourcentage_dev_f1:
        break 
        
    title_to_content["dev_f1"][row['title']] = row['Context']
    no_lead_papers_dev_f1.append(row['title'])  
    i += 1
    
    
# For train only F2
no_lead_papers_train_f2 = []

already_seen = no_lead_papers_train_f2
no_lead_papers_train_f2 = []
j = 0
for _, row in tqdm(enumerate(records), total = len(records)):
    
    if row['title'] in already_seen:
        continue 
        
    if j >= no_leaderboard_pourcentage_train_f2:
        break 
        
    title_to_content["train_f2"][row['title']] = row['Context']
    no_lead_papers_train_f2.append(row['title'])
    j += 1
    
no_lead_papers_dev_f2 = no_lead_papers_train_f2
already_seen = no_lead_papers_dev_f2
no_lead_papers_dev_f2 = []
j =0
for _, row in tqdm(enumerate(records), total = len(records)):
    
    if row['title'] in already_seen:
        continue 
        
    if j >= no_leaderboard_pourcentage_dev_f2:
        break 
        
    title_to_content["dev_f2"][row['title']] = row['Context']
    no_lead_papers_dev_f2.append(row['title'])
    j += 1

 43%|████▎     | 1876/4369 [00:00<00:00, 718192.25it/s]
 61%|██████▏   | 2680/4369 [00:00<00:00, 42593.87it/s]
 43%|████▎     | 1876/4369 [00:00<00:00, 1412152.60it/s]
 61%|██████▏   | 2680/4369 [00:00<00:00, 79355.14it/s]


In [16]:
df_train_f1 = pd.DataFrame(columns = ["Title", "TDMSs", "Context"])
for i, title in tqdm(enumerate(title_to_content["train_f1"].keys()), total = len(title_to_content["train_f1"].keys())):
    
    if (len(title_to_content["train_f1"][title]) < 10):
        continue 
    
    if (title not in no_lead_papers_train_f1) :
        if (title_to_tdms_dict[title] == []):
             continue

    df_train_f1 = pd.concat([df_train_f1, pd.DataFrame.from_records(
        [
            {
                'Title' : title, 
                'TDMSs' : title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable",
                'Context' : title_to_content["train_f1"][title],
                'Lenght Context': len(title_to_content["train_f1"][title].split()),
                'Lenght TDMSs': len(str(title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable").split())
            }
        ])], ignore_index = True)
print("df_train_f1 describe: ")
display(df_train_f1.describe())  

df_dev_f1 = pd.DataFrame(columns = ["Title", "TDMSs", "Context"])  
for i, title in tqdm(enumerate(title_to_content["dev_f1"].keys()), total = len(title_to_content["dev_f1"].keys())):
    
    if (len(title_to_content["dev_f1"][title]) < 10):
        continue 
    
    if (title not in no_lead_papers_dev_f1) :
        if (title_to_tdms_dict[title] == []):
             continue

    df_dev_f1 = pd.concat([df_dev_f1, pd.DataFrame.from_records(
        [
            {
                'Title' : title, 
                'TDMSs' : title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable",
                'Context' : title_to_content["dev_f1"][title],
                'Lenght Context': len(title_to_content["dev_f1"][title].split()),
                'Lenght TDMSs': len(str(title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable").split())
            }
        ])], ignore_index = True)
print("df_dev_f1 describe: ")
display(df_dev_f1.describe())  

df_train_f2 = pd.DataFrame(columns = ["Title", "TDMSs", "Context"])
for i, title in tqdm(enumerate(title_to_content["train_f2"].keys()), total = len(title_to_content["train_f2"].keys())):
    
    if (len(title_to_content["train_f2"][title]) < 10):
        continue 
    
    if (title not in no_lead_papers_train_f2) :
        if (title_to_tdms_dict[title] == []):
             continue

    df_train_f2 = pd.concat([df_train_f2, pd.DataFrame.from_records(
        [
            {
                'Title' : title, 
                'TDMSs' : title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable",
                'Context' : title_to_content["train_f2"][title],
                'Lenght Context': len(title_to_content["train_f2"][title].split()),
                'Lenght TDMSs': len(str(title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable").split())
            }
        ])], ignore_index = True)
print("df_train_f2 describe: ")
display(df_train_f2.describe())  
 
df_dev_f2 = pd.DataFrame(columns = ["Title", "TDMSs", "Context"])  
for i, title in tqdm(enumerate(title_to_content["dev_f2"].keys()), total = len(title_to_content["dev_f2"].keys())):
    
    if (len(title_to_content["dev_f2"][title]) < 10):
        continue 
    
    if (title not in no_lead_papers_dev_f2) :
        if (title_to_tdms_dict[title] == []):
             continue

    df_dev_f2 = pd.concat([df_dev_f2, pd.DataFrame.from_records(
        [
            {
                'Title' : title, 
                'TDMSs' : title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable",
                'Context' : title_to_content["dev_f2"][title],
                'Lenght Context': len(title_to_content["dev_f2"][title].split()),
                'Lenght TDMSs': len(str(title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else "unanswerable").split())
            }
        ])], ignore_index = True)
print("df_dev_f2 describe: ")
display(df_dev_f2.describe())  

 13%|█▎        | 716/5629 [00:00<00:01, 2543.70it/s]

100%|██████████| 5629/5629 [00:02<00:00, 2508.48it/s]


df_train_f1 describe: 


Unnamed: 0,Lenght Context,Lenght TDMSs
count,5512.0,5512.0
mean,318.346154,38.429064
std,167.995792,85.362865
min,3.0,1.0
25%,194.0,1.0
50%,323.0,16.0
75%,419.0,44.0
max,2510.0,2455.0


100%|██████████| 2412/2412 [00:00<00:00, 2851.54it/s]

df_dev_f1 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,2353.0,2353.0
mean,321.895028,38.00765
std,160.476254,71.784381
min,4.0,1.0
25%,197.0,1.0
50%,329.0,16.0
75%,428.0,43.0
max,1750.0,1530.0


100%|██████████| 5629/5629 [00:02<00:00, 2582.04it/s]

df_train_f2 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,5513.0,5513.0
mean,319.830219,38.690731
std,169.799786,83.986774
min,3.0,1.0
25%,194.0,1.0
50%,324.0,16.0
75%,423.0,44.0
max,2510.0,2455.0


100%|██████████| 2412/2412 [00:00<00:00, 2774.64it/s]

df_dev_f2 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,2352.0,2352.0
mean,318.417942,37.394133
std,155.981723,75.481169
min,4.0,1.0
25%,197.0,1.0
50%,326.0,16.0
75%,417.0,45.0
max,1481.0,1537.0


In [17]:
def create_pandas_dataset_from_pandas(df,
                                      squad_1 = False,
                                      squad_2 = False,
                                      squad_3 = False,
                                      squad_4 = False,
                                      squad_5 = False,
                                      squad_6 = False,
                                      squad_7 = False,
                                      squad_8 = False,
                                      drop_1 = False,
                                      drop_2 = False,
                                      drop_3 = False,
                                      drop_4 = False,
                                      drop_5 = False,
                                      drop_6 = False,
                                      drop_7 = False
                         ):

  ''' Create a Pandas Dataframe from pandas.
  Params:
        answer_threshold: Only consider those Question Answer pairs where the Answer is short.
  '''
  count_index = 0
  result_df  = pd.DataFrame(columns = ['prompt', 'answer'])   
  
  q_types = [
    {"q": "What are the values for the following properties to construct a Leaderboard for the model introduced in this article: task, dataset, and metric?", "a_key": "TDMSs"},
    ]
  
  records = df.to_dict("records")
  # db_dict = defaultdict(lambda : list())
  for i, row in tqdm(enumerate(records), total = len(records)):        
      for q_type in q_types:
        
        # Squad_v2 
        if squad_1:
          result_df.loc[count_index] = [f'{row["Context"]}\n\nPlease answer a question about this article. If the question is unanswerable, say \"unanswerable\". {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_2:
          result_df.loc[count_index] = [f'Read this and answer the question. If the question is unanswerable, say \"unanswerable\".\n\n{row["Context"]}\n\n{q_type["q"]}'
  ] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_3:
          result_df.loc[count_index] = [f'{row["Context"]}\n{q_type["q"]} (If the question is unanswerable, say \"unanswerable\"'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_4:
          result_df.loc[count_index] = [f'{row["Context"]}\nTry to answer this question if possible (otherwise reply \"unanswerable\"): {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_5:
          result_df.loc[count_index] = [f'{row["Context"]}\nIf it is possible to answer this question, answer it for me (else, reply \"unanswerable\"): {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_6:
          result_df.loc[count_index] = [f'{row["Context"]}\n\nAnswer this question, if possible (if impossible, reply \"unanswerable\"): {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_7:
          result_df.loc[count_index] = [f'Read this: {row["Context"]}\n\n{q_type["q"]}\nWhat is the answer? (If it cannot be answered, return \"unanswerable\")'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_8:
          result_df.loc[count_index] = [f'Read this: {row["Context"]}\nNow answer this question, if there is an answer (If it cannot be answered, return \"unanswerable\"): {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        
        # Drop
        if drop_1:
          result_df.loc[count_index] = [f'Answer based on context:\n\n{row["Context"]}\n\n{q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_2:
          result_df.loc[count_index] = [f'{row["Context"]}\n\nAnswer this question based on the article: {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_3:
          result_df.loc[count_index] = [f'{row["Context"]}\n\n{q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_4:
          result_df.loc[count_index] = [f'{row["Context"]}\nAnswer this question: {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_5:
          result_df.loc[count_index] = [f'Read this article and answer this question {row["Context"]}\n{q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_6:
          result_df.loc[count_index] = [f'{row["Context"]}\n\nBased on the above article, answer a question. {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_7:
          result_df.loc[count_index] = [f'Context: {row["Context"]}\n\nQuestion: {q_type["q"]}\n\nAnswer:'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
         
  return result_df

In [18]:
df_train_f1_all_templates = create_pandas_dataset_from_pandas(df_train_f1,
                                                                squad_1 = True,
                                                                squad_2 = True,
                                                                squad_3 = True,
                                                                squad_4 = True,
                                                                squad_5 = True,
                                                                squad_6 = True,
                                                                squad_7 = True,
                                                                squad_8 = True,
                                                                drop_1 = True,
                                                                drop_2 = True,
                                                                drop_3 = True,
                                                                drop_4 = True,
                                                                drop_5 = True,
                                                                drop_6 = True,
                                                                drop_7 = True
                                                              ) 
df_dev_f1_all_templates = create_pandas_dataset_from_pandas(df_dev_f1,
                                                            squad_1 = True,
                                                            squad_2 = True,
                                                            squad_3 = True,
                                                            squad_4 = True,
                                                            squad_5 = True,
                                                            squad_6 = True,
                                                            squad_7 = True,
                                                            squad_8 = True,
                                                            drop_1 = True,
                                                            drop_2 = True,
                                                            drop_3 = True,
                                                            drop_4 = True,
                                                            drop_5 = True,
                                                            drop_6 = True,
                                                            drop_7 = True
                                                            ) 
df_train_f2_all_templates = create_pandas_dataset_from_pandas(df_train_f2,
                                                            squad_1 = True,
                                                            squad_2 = True,
                                                            squad_3 = True,
                                                            squad_4 = True,
                                                            squad_5 = True,
                                                            squad_6 = True,
                                                            squad_7 = True,
                                                            squad_8 = True,
                                                            drop_1 = True,
                                                            drop_2 = True,
                                                            drop_3 = True,
                                                            drop_4 = True,
                                                            drop_5 = True,
                                                            drop_6 = True,
                                                            drop_7 = True
                                                              ) 
df_dev_f2_all_templates = create_pandas_dataset_from_pandas(df_dev_f2,
                                                            squad_1 = True,
                                                            squad_2 = True,
                                                            squad_3 = True,
                                                            squad_4 = True,
                                                            squad_5 = True,
                                                            squad_6 = True,
                                                            squad_7 = True,
                                                            squad_8 = True,
                                                            drop_1 = True,
                                                            drop_2 = True,
                                                            drop_3 = True,
                                                            drop_4 = True,
                                                            drop_5 = True,
                                                            drop_6 = True,
                                                            drop_7 = True 
                                                            ) 

print("df_train_f1_all_templates describe: ")
display(df_train_f1_all_templates.describe())
print("df_dev_f1_all_templates describe: ")
display(df_dev_f1_all_templates.describe())

print("df_train_f2_all_templates describe: ")
display(df_train_f2_all_templates.describe())
print("df_dev_f2_all_templates describe: ")
display(df_dev_f2_all_templates.describe())

  0%|          | 14/5512 [00:00<00:40, 134.57it/s]

100%|██████████| 5512/5512 [03:10<00:00, 28.98it/s]
100%|██████████| 2353/2353 [00:27<00:00, 84.94it/s]
100%|██████████| 5513/5513 [03:28<00:00, 26.45it/s]
100%|██████████| 2352/2352 [00:27<00:00, 86.49it/s]


df_train_f1_all_templates describe: 


Unnamed: 0,prompt,answer
count,82680,82680
unique,82650,2821
top,Read this and answer the question. If the ques...,unanswerable
freq,2,28080


df_dev_f1_all_templates describe: 


Unnamed: 0,prompt,answer
count,35295,35295
unique,35280,1338
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,12060


df_train_f2_all_templates describe: 


Unnamed: 0,prompt,answer
count,82695,82695
unique,82665,2849
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,28080


df_dev_f2_all_templates describe: 


Unnamed: 0,prompt,answer
count,35280,35280
unique,35280,1326
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,12060


In [None]:
# df_train_f1_all_templates.to_parquet('../data/df_train_tdm_docteat_f1_all_templates.parquet')
# df_dev_f1_all_templates.to_parquet('../data/df_dev_tdm_docteat_f1_all_templates.parquet')
# df_train_f2_all_templates.to_parquet('../data/df_train_tdm_docteat_f2_all_templates.parquet')
# df_dev_f2_all_templates.to_parquet('../data/df_dev_tdm_docteat_f2_all_templates.parquet')

# dataset_train_f1_all_templates = Dataset.from_pandas(df_train_f1_all_templates)
# dataset_dev_f1_all_templates = Dataset.from_pandas(df_dev_f1_all_templates)
# dataset_train_f2_all_templates = Dataset.from_pandas(df_train_f2_all_templates)
# dataset_dev_f2_all_templates = Dataset.from_pandas(df_dev_f2_all_templates)


dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f1_all_templates.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f1_all_templates.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f2_all_templates.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f2_all_templates.parquet')
    })
})

print(dataset)

# dataset.save_to_disk("../data/LLLM_DOCTEAT_TDM_ALL_TEMPLATE")

# Specific Template

In [19]:
df_train_f1_squad_1 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_1 = True,
                                                        ) 
df_dev_f1_squad_1 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_1 = True,
                                                        ) 
df_train_f2_squad_1 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_1 = True,
                                                        ) 
df_dev_f2_squad_1 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_1 = True,
                                                        ) 

print("df_train_f1_squad_1 describe: ")
display(df_train_f1_squad_1.describe())
print("df_dev_f1_squad_1 describe: ")
display(df_dev_f1_squad_1.describe())

print("df_train_f2_squad_1 describe: ")
display(df_train_f2_squad_1.describe())
print("df_dev_f2_squad_1 describe: ")
display(df_dev_f2_squad_1.describe())

df_train_f1_squad_1.to_parquet('../data/df_train_tdm_docteat_f1_squad_1.parquet')
df_dev_f1_squad_1.to_parquet('../data/df_dev_tdm_docteat_f1_squad_1.parquet')
df_train_f2_squad_1.to_parquet('../data/df_train_tdm_docteat_f2_squad_1.parquet')
df_dev_f2_squad_1.to_parquet('../data/df_dev_tdm_docteat_f2_squad_1.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f1_squad_1.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f1_squad_1.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f2_squad_1.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f2_squad_1.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDM_SQUAD_1")

  0%|          | 0/5512 [00:00<?, ?it/s]

100%|██████████| 5512/5512 [00:02<00:00, 2133.77it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2285.96it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2142.10it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2280.85it/s]

df_train_f1_squad_1 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,2821
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


df_dev_f1_squad_1 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1338
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,804


df_train_f2_squad_1 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,2849
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,1872


df_dev_f2_squad_1 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1326
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [20]:
df_train_f1_squad_2 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_2 = True,
                                                        ) 
df_dev_f1_squad_2 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_2 = True,
                                                        ) 
df_train_f2_squad_2 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_2 = True,
                                                        ) 
df_dev_f2_squad_2 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_2 = True,
                                                        ) 

print("df_train_f1_squad_2 describe: ")
display(df_train_f1_squad_2.describe())
print("df_dev_f1_squad_2 describe: ")
display(df_dev_f1_squad_2.describe())

print("df_train_f2_squad_2 describe: ")
display(df_train_f2_squad_2.describe())
print("df_dev_f2_squad_2 describe: ")
display(df_dev_f2_squad_2.describe())

df_train_f1_squad_2.to_parquet('../data/df_train_tdm_docteat_f1_squad_2.parquet')
df_dev_f1_squad_2.to_parquet('../data/df_dev_tdm_docteat_f1_squad_2.parquet')
df_train_f2_squad_2.to_parquet('../data/df_train_tdm_docteat_f2_squad_2.parquet')
df_dev_f2_squad_2.to_parquet('../data/df_dev_tdm_docteat_f2_squad_2.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f1_squad_2.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f1_squad_2.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f2_squad_2.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f2_squad_2.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDM_SQUAD_2")

  7%|▋         | 408/5512 [00:00<00:02, 2089.05it/s]

100%|██████████| 5512/5512 [00:02<00:00, 2083.18it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2230.65it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2125.37it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2298.99it/s]

df_train_f1_squad_2 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,2821
top,Read this and answer the question. If the ques...,unanswerable
freq,2,1872


df_dev_f1_squad_2 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1338
top,Read this and answer the question. If the ques...,unanswerable
freq,2,804


df_train_f2_squad_2 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,2849
top,Read this and answer the question. If the ques...,unanswerable
freq,2,1872


df_dev_f2_squad_2 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1326
top,Read this and answer the question. If the ques...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [21]:
df_train_f1_squad_3 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_3 = True,
                                                        ) 
df_dev_f1_squad_3 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_3 = True,
                                                        ) 
df_train_f2_squad_3 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_3 = True,
                                                        ) 
df_dev_f2_squad_3 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_3 = True,
                                                        ) 

print("df_train_f1_squad_3 describe: ")
display(df_train_f1_squad_3.describe())
print("df_dev_f1_squad_3 describe: ")
display(df_dev_f1_squad_3.describe())

print("df_train_f2_squad_3 describe: ")
display(df_train_f2_squad_3.describe())
print("df_dev_f2_squad_3 describe: ")
display(df_dev_f2_squad_3.describe())

df_train_f1_squad_3.to_parquet('../data/df_train_tdm_docteat_f1_squad_3.parquet')
df_dev_f1_squad_3.to_parquet('../data/df_dev_tdm_docteat_f1_squad_3.parquet')
df_train_f2_squad_3.to_parquet('../data/df_train_tdm_docteat_f2_squad_3.parquet')
df_dev_f2_squad_3.to_parquet('../data/df_dev_tdm_docteat_f2_squad_3.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f1_squad_3.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f1_squad_3.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f2_squad_3.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f2_squad_3.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDM_SQUAD_3")

  3%|▎         | 171/5512 [00:00<00:03, 1704.65it/s]

100%|██████████| 5512/5512 [00:02<00:00, 2150.18it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2248.43it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2116.75it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2262.22it/s]

df_train_f1_squad_3 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,2821
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


df_dev_f1_squad_3 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1338
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,804


df_train_f2_squad_3 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,2849
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,1872


df_dev_f2_squad_3 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1326
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [22]:
df_train_f1_squad_4 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_4 = True,
                                                        ) 
df_dev_f1_squad_4 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_4 = True,
                                                        ) 
df_train_f2_squad_4 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_4 = True,
                                                        ) 
df_dev_f2_squad_4 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_4 = True,
                                                        ) 

print("df_train_f1_squad_4 describe: ")
display(df_train_f1_squad_4.describe())
print("df_dev_f1_squad_4 describe: ")
display(df_dev_f1_squad_4.describe())

print("df_train_f2_squad_4 describe: ")
display(df_train_f2_squad_4.describe())
print("df_dev_f2_squad_4 describe: ")
display(df_dev_f2_squad_4.describe())

df_train_f1_squad_4.to_parquet('../data/df_train_tdm_docteat_f1_squad_4.parquet')
df_dev_f1_squad_4.to_parquet('../data/df_dev_tdm_docteat_f1_squad_4.parquet')
df_train_f2_squad_4.to_parquet('../data/df_train_tdm_docteat_f2_squad_4.parquet')
df_dev_f2_squad_4.to_parquet('../data/df_dev_tdm_docteat_f2_squad_4.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f1_squad_4.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f1_squad_4.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f2_squad_4.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f2_squad_4.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDM_SQUAD_4")

  0%|          | 0/5512 [00:00<?, ?it/s]

100%|██████████| 5512/5512 [00:02<00:00, 2142.20it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2289.03it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2142.31it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2285.57it/s]

df_train_f1_squad_4 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,2821
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


df_dev_f1_squad_4 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1338
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,804


df_train_f2_squad_4 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,2849
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,1872


df_dev_f2_squad_4 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1326
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [23]:
df_train_f1_squad_5 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_5 = True,
                                                        ) 
df_dev_f1_squad_5 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_5 = True,
                                                        ) 
df_train_f2_squad_5 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_5 = True,
                                                        ) 
df_dev_f2_squad_5 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_5 = True,
                                                        ) 

print("df_train_f1_squad_5 describe: ")
display(df_train_f1_squad_5.describe())
print("df_dev_f1_squad_5 describe: ")
display(df_dev_f1_squad_5.describe())

print("df_train_f2_squad_5 describe: ")
display(df_train_f2_squad_5.describe())
print("df_dev_f2_squad_5 describe: ")
display(df_dev_f2_squad_5.describe())

df_train_f1_squad_5.to_parquet('../data/df_train_tdm_docteat_f1_squad_5.parquet')
df_dev_f1_squad_5.to_parquet('../data/df_dev_tdm_docteat_f1_squad_5.parquet')
df_train_f2_squad_5.to_parquet('../data/df_train_tdm_docteat_f2_squad_5.parquet')
df_dev_f2_squad_5.to_parquet('../data/df_dev_tdm_docteat_f2_squad_5.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f1_squad_5.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f1_squad_5.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f2_squad_5.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f2_squad_5.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDM_SQUAD_5")

  3%|▎         | 179/5512 [00:00<00:02, 1785.77it/s]

100%|██████████| 5512/5512 [00:02<00:00, 2139.65it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2287.03it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2136.34it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2301.49it/s]

df_train_f1_squad_5 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,2821
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


df_dev_f1_squad_5 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1338
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,804


df_train_f2_squad_5 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,2849
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,1872


df_dev_f2_squad_5 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1326
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [24]:
df_train_f1_squad_6 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_6 = True,
                                                        ) 
df_dev_f1_squad_6 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_6 = True,
                                                        ) 
df_train_f2_squad_6 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_6 = True,
                                                        ) 
df_dev_f2_squad_6 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_6 = True,
                                                        ) 

print("df_train_f1_squad_6 describe: ")
display(df_train_f1_squad_6.describe())
print("df_dev_f1_squad_6 describe: ")
display(df_dev_f1_squad_6.describe())

print("df_train_f2_squad_6 describe: ")
display(df_train_f2_squad_6.describe())
print("df_dev_f2_squad_6 describe: ")
display(df_dev_f2_squad_6.describe())

df_train_f1_squad_6.to_parquet('../data/df_train_tdm_docteat_f1_squad_6.parquet')
df_dev_f1_squad_6.to_parquet('../data/df_dev_tdm_docteat_f1_squad_6.parquet')
df_train_f2_squad_6.to_parquet('../data/df_train_tdm_docteat_f2_squad_6.parquet')
df_dev_f2_squad_6.to_parquet('../data/df_dev_tdm_docteat_f2_squad_6.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f1_squad_6.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f1_squad_6.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f2_squad_6.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f2_squad_6.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDM_SQUAD_6")

  7%|▋         | 391/5512 [00:00<00:02, 1969.01it/s]

100%|██████████| 5512/5512 [00:02<00:00, 2129.20it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2303.60it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2145.02it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2308.02it/s]

df_train_f1_squad_6 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,2821
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


df_dev_f1_squad_6 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1338
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,804


df_train_f2_squad_6 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,2849
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,1872


df_dev_f2_squad_6 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1326
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [25]:
df_train_f1_squad_7 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_7 = True,
                                                        ) 
df_dev_f1_squad_7 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_7 = True,
                                                        ) 
df_train_f2_squad_7 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_7 = True,
                                                        ) 
df_dev_f2_squad_7 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_7 = True,
                                                        ) 

print("df_train_f1_squad_7 describe: ")
display(df_train_f1_squad_7.describe())
print("df_dev_f1_squad_7 describe: ")
display(df_dev_f1_squad_7.describe())

print("df_train_f2_squad_7 describe: ")
display(df_train_f2_squad_7.describe())
print("df_dev_f2_squad_7 describe: ")
display(df_dev_f2_squad_7.describe())

df_train_f1_squad_7.to_parquet('../data/df_train_tdm_docteat_f1_squad_7.parquet')
df_dev_f1_squad_7.to_parquet('../data/df_dev_tdm_docteat_f1_squad_7.parquet')
df_train_f2_squad_7.to_parquet('../data/df_train_tdm_docteat_f2_squad_7.parquet')
df_dev_f2_squad_7.to_parquet('../data/df_dev_tdm_docteat_f2_squad_7.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f1_squad_7.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f1_squad_7.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f2_squad_7.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f2_squad_7.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDM_SQUAD_7")

100%|██████████| 5512/5512 [00:02<00:00, 2172.32it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2314.03it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2165.31it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2314.05it/s]

df_train_f1_squad_7 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,2821
top,Read this: FixMatch: Simplifying Semi-Supervis...,unanswerable
freq,2,1872


df_dev_f1_squad_7 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1338
top,Read this: PANDA: Adapting Pretrained Features...,unanswerable
freq,2,804


df_train_f2_squad_7 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,2849
top,Read this: IEEE TRANSACTIONS ON PATTERN ANALYS...,unanswerable
freq,2,1872


df_dev_f2_squad_7 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1326
top,Read this: Ocean: Object-aware Anchor-free Tra...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [26]:
df_train_f1_squad_8 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_8 = True,
                                                        ) 
df_dev_f1_squad_8 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_8 = True,
                                                        ) 
df_train_f2_squad_8 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_8 = True,
                                                        ) 
df_dev_f2_squad_8 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_8 = True,
                                                        ) 

print("df_train_f1_squad_8 describe: ")
display(df_train_f1_squad_8.describe())
print("df_dev_f1_squad_8 describe: ")
display(df_dev_f1_squad_8.describe())

print("df_train_f2_squad_8 describe: ")
display(df_train_f2_squad_8.describe())
print("df_dev_f2_squad_8 describe: ")
display(df_dev_f2_squad_8.describe())

df_train_f1_squad_8.to_parquet('../data/df_train_tdm_docteat_f1_squad_8.parquet')
df_dev_f1_squad_8.to_parquet('../data/df_dev_tdm_docteat_f1_squad_8.parquet')
df_train_f2_squad_8.to_parquet('../data/df_train_tdm_docteat_f2_squad_8.parquet')
df_dev_f2_squad_8.to_parquet('../data/df_dev_tdm_docteat_f2_squad_8.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f1_squad_8.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f1_squad_8.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f2_squad_8.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f2_squad_8.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDM_SQUAD_8")

  3%|▎         | 185/5512 [00:00<00:02, 1844.71it/s]

100%|██████████| 5512/5512 [00:02<00:00, 2182.31it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2281.16it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2152.93it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2313.46it/s]

df_train_f1_squad_8 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,2821
top,Read this: FixMatch: Simplifying Semi-Supervis...,unanswerable
freq,2,1872


df_dev_f1_squad_8 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1338
top,Read this: PANDA: Adapting Pretrained Features...,unanswerable
freq,2,804


df_train_f2_squad_8 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,2849
top,Read this: IEEE TRANSACTIONS ON PATTERN ANALYS...,unanswerable
freq,2,1872


df_dev_f2_squad_8 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1326
top,Read this: Ocean: Object-aware Anchor-free Tra...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [27]:
df_train_f1_drop_1 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_1 = True,
                                                        ) 
df_dev_f1_drop_1 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_1 = True,
                                                        ) 
df_train_f2_drop_1 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_1 = True,
                                                        ) 
df_dev_f2_drop_1 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_1 = True,
                                                        ) 

print("df_train_f1_drop_1 describe: ")
display(df_train_f1_drop_1.describe())
print("df_dev_f1_drop_1 describe: ")
display(df_dev_f1_drop_1.describe())

print("df_train_f2_drop_1 describe: ")
display(df_train_f2_drop_1.describe())
print("df_dev_f2_drop_1 describe: ")
display(df_dev_f2_drop_1.describe())

df_train_f1_drop_1.to_parquet('../data/df_train_tdm_docteat_f1_drop_1.parquet')
df_dev_f1_drop_1.to_parquet('../data/df_dev_tdm_docteat_f1_drop_1.parquet')
df_train_f2_drop_1.to_parquet('../data/df_train_tdm_docteat_f2_drop_1.parquet')
df_dev_f2_drop_1.to_parquet('../data/df_dev_tdm_docteat_f2_drop_1.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f1_drop_1.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f1_drop_1.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f2_drop_1.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f2_drop_1.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDM_DROP_1")

  7%|▋         | 401/5512 [00:00<00:02, 2058.20it/s]

100%|██████████| 5512/5512 [00:02<00:00, 2144.73it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2280.32it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2131.69it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2288.67it/s]

df_train_f1_drop_1 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,2821
top,Answer based on context:\n\nFixMatch: Simplify...,unanswerable
freq,2,1872


df_dev_f1_drop_1 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1338
top,Answer based on context:\n\nPANDA: Adapting Pr...,unanswerable
freq,2,804


df_train_f2_drop_1 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,2849
top,Answer based on context:\n\nIEEE TRANSACTIONS ...,unanswerable
freq,2,1872


df_dev_f2_drop_1 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1326
top,Answer based on context:\n\nOcean: Object-awar...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [28]:
df_train_f1_drop_2 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_2 = True,
                                                        ) 
df_dev_f1_drop_2 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_2 = True,
                                                        ) 
df_train_f2_drop_2 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_2 = True,
                                                        ) 
df_dev_f2_drop_2 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_2 = True,
                                                        ) 

print("df_train_f1_drop_2 describe: ")
display(df_train_f1_drop_2.describe())
print("df_dev_f1_drop_2 describe: ")
display(df_dev_f1_drop_2.describe())

print("df_train_f2_drop_2 describe: ")
display(df_train_f2_drop_2.describe())
print("df_dev_f2_drop_2 describe: ")
display(df_dev_f2_drop_2.describe())

df_train_f1_drop_2.to_parquet('../data/df_train_tdm_docteat_f1_drop_2.parquet')
df_dev_f1_drop_2.to_parquet('../data/df_dev_tdm_docteat_f1_drop_2.parquet')
df_train_f2_drop_2.to_parquet('../data/df_train_tdm_docteat_f2_drop_2.parquet')
df_dev_f2_drop_2.to_parquet('../data/df_dev_tdm_docteat_f2_drop_2.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f1_drop_2.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f1_drop_2.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f2_drop_2.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f2_drop_2.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDM_DROP_2")

100%|██████████| 5512/5512 [00:02<00:00, 2161.06it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2286.31it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2131.21it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2260.77it/s]

df_train_f1_drop_2 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,2821
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


df_dev_f1_drop_2 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1338
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,804


df_train_f2_drop_2 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,2849
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,1872


df_dev_f2_drop_2 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1326
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [29]:
df_train_f1_drop_3 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_3 = True,
                                                        ) 
df_dev_f1_drop_3 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_3 = True,
                                                        ) 
df_train_f2_drop_3 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_3 = True,
                                                        ) 
df_dev_f2_drop_3 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_3 = True,
                                                        ) 

print("df_train_f1_drop_3 describe: ")
display(df_train_f1_drop_3.describe())
print("df_dev_f1_drop_3 describe: ")
display(df_dev_f1_drop_3.describe())

print("df_train_f2_drop_3 describe: ")
display(df_train_f2_drop_3.describe())
print("df_dev_f2_drop_3 describe: ")
display(df_dev_f2_drop_3.describe())

df_train_f1_drop_3.to_parquet('../data/df_train_tdm_docteat_f1_drop_3.parquet')
df_dev_f1_drop_3.to_parquet('../data/df_dev_tdm_docteat_f1_drop_3.parquet')
df_train_f2_drop_3.to_parquet('../data/df_train_tdm_docteat_f2_drop_3.parquet')
df_dev_f2_drop_3.to_parquet('../data/df_dev_tdm_docteat_f2_drop_3.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f1_drop_3.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f1_drop_3.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f2_drop_3.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f2_drop_3.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDM_DROP_3")

  3%|▎         | 181/5512 [00:00<00:02, 1804.71it/s]

100%|██████████| 5512/5512 [00:02<00:00, 2168.80it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2247.82it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2117.07it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2298.80it/s]

df_train_f1_drop_3 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,2821
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


df_dev_f1_drop_3 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1338
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,804


df_train_f2_drop_3 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,2849
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,1872


df_dev_f2_drop_3 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1326
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [30]:
df_train_f1_drop_4 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_4 = True,
                                                        ) 
df_dev_f1_drop_4 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_4 = True,
                                                        ) 
df_train_f2_drop_4 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_4 = True,
                                                        ) 
df_dev_f2_drop_4 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_4 = True,
                                                        ) 

print("df_train_f1_drop_4 describe: ")
display(df_train_f1_drop_4.describe())
print("df_dev_f1_drop_4 describe: ")
display(df_dev_f1_drop_4.describe())

print("df_train_f2_drop_4 describe: ")
display(df_train_f2_drop_4.describe())
print("df_dev_f2_drop_4 describe: ")
display(df_dev_f2_drop_4.describe())

df_train_f1_drop_4.to_parquet('../data/df_train_tdm_docteat_f1_drop_4.parquet')
df_dev_f1_drop_4.to_parquet('../data/df_dev_tdm_docteat_f1_drop_4.parquet')
df_train_f2_drop_4.to_parquet('../data/df_train_tdm_docteat_f2_drop_4.parquet')
df_dev_f2_drop_4.to_parquet('../data/df_dev_tdm_docteat_f2_drop_4.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f1_drop_4.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f1_drop_4.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f2_drop_4.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f2_drop_4.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDM_DROP_4")

  3%|▎         | 170/5512 [00:00<00:03, 1693.00it/s]

100%|██████████| 5512/5512 [00:02<00:00, 2134.26it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2280.49it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2136.43it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2282.71it/s]

df_train_f1_drop_4 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,2821
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


df_dev_f1_drop_4 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1338
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,804


df_train_f2_drop_4 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,2849
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,1872


df_dev_f2_drop_4 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1326
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [31]:
df_train_f1_drop_5 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_5 = True,
                                                        ) 
df_dev_f1_drop_5 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_5 = True,
                                                        ) 
df_train_f2_drop_5 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_5 = True,
                                                        ) 
df_dev_f2_drop_5 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_5 = True,
                                                        ) 

print("df_train_f1_drop_5 describe: ")
display(df_train_f1_drop_5.describe())
print("df_dev_f1_drop_5 describe: ")
display(df_dev_f1_drop_5.describe())

print("df_train_f2_drop_5 describe: ")
display(df_train_f2_drop_5.describe())
print("df_dev_f2_drop_5 describe: ")
display(df_dev_f2_drop_5.describe())

df_train_f1_drop_5.to_parquet('../data/df_train_tdm_docteat_f1_drop_5.parquet')
df_dev_f1_drop_5.to_parquet('../data/df_dev_tdm_docteat_f1_drop_5.parquet')
df_train_f2_drop_5.to_parquet('../data/df_train_tdm_docteat_f2_drop_5.parquet')
df_dev_f2_drop_5.to_parquet('../data/df_dev_tdm_docteat_f2_drop_5.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f1_drop_5.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f1_drop_5.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f2_drop_5.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f2_drop_5.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDM_DROP_5")

  7%|▋         | 408/5512 [00:00<00:02, 2082.24it/s]

100%|██████████| 5512/5512 [00:02<00:00, 2143.81it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2232.04it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2078.44it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2171.29it/s]

df_train_f1_drop_5 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,2821
top,Read this article and answer this question Fix...,unanswerable
freq,2,1872


df_dev_f1_drop_5 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1338
top,Read this article and answer this question PAN...,unanswerable
freq,2,804


df_train_f2_drop_5 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,2849
top,Read this article and answer this question IEE...,unanswerable
freq,2,1872


df_dev_f2_drop_5 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1326
top,Read this article and answer this question Oce...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [32]:
df_train_f1_drop_6 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_6 = True,
                                                        ) 
df_dev_f1_drop_6 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_6 = True,
                                                        ) 
df_train_f2_drop_6 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_6 = True,
                                                        ) 
df_dev_f2_drop_6 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_6 = True,
                                                        ) 

print("df_train_f1_drop_6 describe: ")
display(df_train_f1_drop_6.describe())
print("df_dev_f1_drop_6 describe: ")
display(df_dev_f1_drop_6.describe())

print("df_train_f2_drop_6 describe: ")
display(df_train_f2_drop_6.describe())
print("df_dev_f2_drop_6 describe: ")
display(df_dev_f2_drop_6.describe())

df_train_f1_drop_6.to_parquet('../data/df_train_tdm_docteat_f1_drop_6.parquet')
df_dev_f1_drop_6.to_parquet('../data/df_dev_tdm_docteat_f1_drop_6.parquet')
df_train_f2_drop_6.to_parquet('../data/df_train_tdm_docteat_f2_drop_6.parquet')
df_dev_f2_drop_6.to_parquet('../data/df_dev_tdm_docteat_f2_drop_6.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f1_drop_6.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f1_drop_6.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f2_drop_6.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f2_drop_6.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDM_DROP_6")

  3%|▎         | 186/5512 [00:00<00:02, 1853.74it/s]

100%|██████████| 5512/5512 [00:02<00:00, 2141.42it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2262.34it/s]
100%|██████████| 5513/5513 [00:02<00:00, 1887.18it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2270.19it/s]

df_train_f1_drop_6 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,2821
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


df_dev_f1_drop_6 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1338
top,PANDA: Adapting Pretrained Features for Anomal...,unanswerable
freq,2,804


df_train_f2_drop_6 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,2849
top,IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACH...,unanswerable
freq,2,1872


df_dev_f2_drop_6 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1326
top,Ocean: Object-aware Anchor-free Tracking Ancho...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [33]:
df_train_f1_drop_7 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_7 = True,
                                                        ) 
df_dev_f1_drop_7 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_7 = True,
                                                        ) 
df_train_f2_drop_7 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_7 = True,
                                                        ) 
df_dev_f2_drop_7 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_7 = True,
                                                        ) 

print("df_train_f1_drop_7 describe: ")
display(df_train_f1_drop_7.describe())
print("df_dev_f1_drop_7 describe: ")
display(df_dev_f1_drop_7.describe())

print("df_train_f2_drop_7 describe: ")
display(df_train_f2_drop_7.describe())
print("df_dev_f2_drop_7 describe: ")
display(df_dev_f2_drop_7.describe())

df_train_f1_drop_7.to_parquet('../data/df_train_tdm_docteat_f1_drop_7.parquet')
df_dev_f1_drop_7.to_parquet('../data/df_dev_tdm_docteat_f1_drop_7.parquet')
df_train_f2_drop_7.to_parquet('../data/df_train_tdm_docteat_f2_drop_7.parquet')
df_dev_f2_drop_7.to_parquet('../data/df_dev_tdm_docteat_f2_drop_7.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f1_drop_7.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f1_drop_7.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdm_docteat_f2_drop_7.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdm_docteat_f2_drop_7.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_DOCTEAT_TDM_DROP_7")

  4%|▎         | 193/5512 [00:00<00:02, 1923.83it/s]

100%|██████████| 5512/5512 [00:02<00:00, 2135.27it/s]
100%|██████████| 2353/2353 [00:01<00:00, 2281.14it/s]
100%|██████████| 5513/5513 [00:02<00:00, 2134.69it/s]
100%|██████████| 2352/2352 [00:01<00:00, 2288.81it/s]

df_train_f1_drop_7 describe: 





Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,2821
top,Context: FixMatch: Simplifying Semi-Supervised...,unanswerable
freq,2,1872


df_dev_f1_drop_7 describe: 


Unnamed: 0,prompt,answer
count,2353,2353
unique,2352,1338
top,Context: PANDA: Adapting Pretrained Features f...,unanswerable
freq,2,804


df_train_f2_drop_7 describe: 


Unnamed: 0,prompt,answer
count,5513,5513
unique,5511,2849
top,Context: IEEE TRANSACTIONS ON PATTERN ANALYSIS...,unanswerable
freq,2,1872


df_dev_f2_drop_7 describe: 


Unnamed: 0,prompt,answer
count,2352,2352
unique,2352,1326
top,Context: Ocean: Object-aware Anchor-free Track...,unanswerable
freq,1,804


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5512
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2353
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 5513
        })
        validation: Dataset({
            features: ['prompt', 'answer', '__index_level_0__'],
            num_rows: 2352
        })
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/5512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2353 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2352 [00:00<?, ? examples/s]

In [46]:
# df_train_f1_all_templates = pd.read_parquet('../data/df_train_f1_all_templates.parquet')
# df_dev_f1_all_templates = pd.read_parquet('../data/df_dev_f1_all_templates.parquet')
# df_train_f2_all_templates = pd.read_parquet('../data/df_train_f2_all_templates.parquet')
# df_dev_f2_all_templates = pd.read_parquet('../data/df_dev_f2_all_templates.parquet')

In [34]:
1+3

4

In [48]:
type(dataset)

datasets.dataset_dict.DatasetDict

In [35]:
# root_directory = "../data/LLLM_DOCTEAT_TDM_ALL_TEMPLATE"
root_directory = "../data/LLLM_DOCTEAT_TDM_DROP_7"

# reloaded_encoded_dataset = datasets.load_from_disk("../data/dataset/LLLM_TDMS_ALL_TEMPLATE")
# reloaded_encoded_dataset = DatasetDict.load_from_disk("../data/LLLM_TDMS_ALL_TEMPLATE")

dataset_fold1 = DatasetDict.load_from_disk(f"{root_directory}/fold1")
dataset_fold2 = DatasetDict.load_from_disk(f"{root_directory}/fold2")

In [36]:
dataset_fold1['train'][0]

{'prompt': "Context: Value Prediction Network This paper proposes a novel deep reinforcement learning (RL) architecture, called Value Prediction Network (VPN), which integrates model-free and model-based RL methods into a single neural network. In contrast to typical model-based RL methods, VPN learns a dynamics model whose abstract states are trained to make option-conditional predictions of future values (discounted sum of rewards) rather than of future observations. Our experimental results show that VPN has several advantages over both model-free and model-based baselines in a stochastic environment where careful planning is required but building an accurate observation-prediction model is difficult. Furthermore, VPN outperforms Deep Q-Network (DQN) on several Atari games even with short-lookahead planning, demonstrating its potential as anew way of learning a good state representation. VPN has four more hyperparameters: 1) the number of predictions steps (k) during training, 2) th

In [None]:
# 

# Contructed random 50% dataset

In [1]:
#imports
import pandas as pd
import torch
from tqdm import tqdm
from datasets import DatasetDict, Dataset, load_from_disk
# from torch.utils.data import Dataset, DataLoader
from pprint import pprint
import copy
import numpy as np
from collections import defaultdict
import ipdb, re

pd.options.display.max_rows , pd.options.display.max_columns  = 100,100  

In [2]:
df_train_tdm_docteat_f1_all_templates = pd.read_parquet('../data/df_train_tdm_docteat_f1_all_templates.parquet')
df_dev_tdm_docteat_f1_all_templates = pd.read_parquet('../data/df_dev_tdm_docteat_f1_all_templates.parquet')

df_train_tdm_docteat_f2_all_templates = pd.read_parquet('../data/df_train_tdm_docteat_f2_all_templates.parquet')
df_dev_tdm_docteat_f2_all_templates = pd.read_parquet('../data/df_dev_tdm_docteat_f2_all_templates.parquet')

df_train_tdm_docteat_f1_all_templates.head()

Unnamed: 0,prompt,answer
0,Value Prediction Network This paper proposes a...,"[{'LEADERBOARD': {'Task': 'Atari Games', 'Data..."
1,Read this and answer the question. If the ques...,"[{'LEADERBOARD': {'Task': 'Atari Games', 'Data..."
2,Value Prediction Network This paper proposes a...,"[{'LEADERBOARD': {'Task': 'Atari Games', 'Data..."
3,Value Prediction Network This paper proposes a...,"[{'LEADERBOARD': {'Task': 'Atari Games', 'Data..."
4,Value Prediction Network This paper proposes a...,"[{'LEADERBOARD': {'Task': 'Atari Games', 'Data..."


In [3]:
df_train_tdm_docteat_f1_all_templates.describe()

Unnamed: 0,prompt,answer
count,82680,82680
unique,82650,2821
top,Read this and answer the question. If the ques...,unanswerable
freq,2,28080


In [4]:
# substring_to_match_squad_1 = re.escape('Please answer a question about this article. If the question is unanswerable, say "unanswerable".')
# substring_to_match_squad_1

In [5]:
# Filter rows where the 'text' column contains the given substring
# substring_to_match_squad_1 = re.escape('Please answer a question about this article. If the question is unanswerable, say "unanswerable".')
substring_to_match_squad_1 = 'Please answer a question about this article. If the question is unanswerable, say "unanswerable".'
substring_to_match_squad_2 = 'Read this and answer the question. If the question is unanswerable, say \"unanswerable\".'
substring_to_match_squad_3 = '(If the question is unanswerable, say \"unanswerable\"'
substring_to_match_squad_4 = 'Try to answer this question if possible (otherwise reply \"unanswerable\")'
substring_to_match_squad_5 = 'If it is possible to answer this question, answer it for me (else, reply \"unanswerable\"):'
substring_to_match_squad_6 = 'Answer this question, if possible (if impossible, reply \"unanswerable\"):'
substring_to_match_squad_7 = 'What is the answer? (If it cannot be answered, return \"unanswerable\")'
substring_to_match_squad_8 = 'Now answer this question, if there is an answer (If it cannot be answered, return \"unanswerable\"):'

substring_to_match_drop_1 = 'Answer based on context:'
substring_to_match_drop_2 = 'Answer this question based on the article:'
# substring_to_match_drop_3 = ''
substring_to_match_drop_4 = 'Answer this question: '
substring_to_match_drop_5 = 'Read this article and answer this question'
substring_to_match_drop_6 = 'Based on the above article, answer a question.'
substring_to_match_drop_7 = 'Context: '


# df_train_tdm_docteat_f1_all_templates_squad_1 = df_train_tdm_docteat_f1_all_templates[
#     df_train_tdm_docteat_f1_all_templates['prompt'].str.contains(substring_to_match_squad_1, case=False, na=False)]

# df_train_tdm_docteat_f1_all_templates['template'] = df_train_tdm_docteat_f1_all_templates['prompt'].apply(
#     lambda x : "squad_1" if substring_to_match_squad_1 in x else "None")

df_train_tdm_docteat_f1_all_templates['template'] = df_train_tdm_docteat_f1_all_templates['prompt'].apply(
    lambda x : "squad_1" if substring_to_match_squad_1 in x else \
        "squad_2" if substring_to_match_squad_2 in x else \
        "squad_3" if substring_to_match_squad_3 in x else \
        "squad_4" if substring_to_match_squad_4 in x else \
        "squad_5" if substring_to_match_squad_5 in x else \
        "squad_6" if substring_to_match_squad_6 in x else \
        "squad_7" if substring_to_match_squad_7 in x else \
        "squad_8" if substring_to_match_squad_8 in x else \
        "drop_1" if substring_to_match_drop_1 in x else \
        "drop_2" if substring_to_match_drop_2 in x else \
        "drop_4" if substring_to_match_drop_4 in x else \
        "drop_5" if substring_to_match_drop_5 in x else \
        "drop_6" if substring_to_match_drop_6 in x else \
        "drop_7" if substring_to_match_drop_7 in x else "drop_3"
        )

# df_train_tdm_docteat_f1_all_templates['template'] = df_train_tdm_docteat_f1_all_templates['prompt'].apply(
#     lambda x : "squad_4" if re.search(f"{substring_to_match_squad_4}", x) else "None")

# df_train_tdm_docteat_f1_all_templates['template'] = df_train_tdm_docteat_f1_all_templates['prompt'].apply(
#     lambda x : "squad_1" if re.search(f"\b{substring_to_match_squad_1}\b", x) else \
#                 "squad_2" if re.search(f"\b{substring_to_match_squad_2}\b", x) else \
#                 "squad_3" if re.search(f"\b{substring_to_match_squad_3}\b", x) else "drop_3")


df_train_tdm_docteat_f1_all_templates.describe()

Unnamed: 0,prompt,answer,template
count,82680,82680,82680
unique,82650,2821,15
top,Read this and answer the question. If the ques...,unanswerable,squad_1
freq,2,28080,5512


In [7]:
display(df_train_tdm_docteat_f1_all_templates[df_train_tdm_docteat_f1_all_templates['template']=='squad_1'].describe())
display(df_train_tdm_docteat_f1_all_templates[df_train_tdm_docteat_f1_all_templates['template']=='squad_2'].describe())
display(df_train_tdm_docteat_f1_all_templates[df_train_tdm_docteat_f1_all_templates['template']=='squad_3'].describe())
display(df_train_tdm_docteat_f1_all_templates[df_train_tdm_docteat_f1_all_templates['template']=='squad_4'].describe())
display(df_train_tdm_docteat_f1_all_templates[df_train_tdm_docteat_f1_all_templates['template']=='squad_5'].describe())
display(df_train_tdm_docteat_f1_all_templates[df_train_tdm_docteat_f1_all_templates['template']=='squad_6'].describe())
display(df_train_tdm_docteat_f1_all_templates[df_train_tdm_docteat_f1_all_templates['template']=='squad_7'].describe())
display(df_train_tdm_docteat_f1_all_templates[df_train_tdm_docteat_f1_all_templates['template']=='squad_8'].describe())
display(df_train_tdm_docteat_f1_all_templates[df_train_tdm_docteat_f1_all_templates['template']=='drop_1'].describe())
display(df_train_tdm_docteat_f1_all_templates[df_train_tdm_docteat_f1_all_templates['template']=='drop_2'].describe())
display(df_train_tdm_docteat_f1_all_templates[df_train_tdm_docteat_f1_all_templates['template']=='drop_3'].describe())
display(df_train_tdm_docteat_f1_all_templates[df_train_tdm_docteat_f1_all_templates['template']=='drop_4'].describe())
display(df_train_tdm_docteat_f1_all_templates[df_train_tdm_docteat_f1_all_templates['template']=='drop_5'].describe())
display(df_train_tdm_docteat_f1_all_templates[df_train_tdm_docteat_f1_all_templates['template']=='drop_5'].describe())
display(df_train_tdm_docteat_f1_all_templates[df_train_tdm_docteat_f1_all_templates['template']=='drop_7'].describe())

Unnamed: 0,prompt,answer,template
count,5512,5512,5512
unique,5510,2821,1
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable,squad_1
freq,2,1872,5512


Unnamed: 0,prompt,answer,template
count,5512,5512,5512
unique,5510,2821,1
top,Read this and answer the question. If the ques...,unanswerable,squad_2
freq,2,1872,5512


Unnamed: 0,prompt,answer,template
count,5512,5512,5512
unique,5510,2821,1
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable,squad_3
freq,2,1872,5512


Unnamed: 0,prompt,answer,template
count,5512,5512,5512
unique,5510,2821,1
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable,squad_4
freq,2,1872,5512


Unnamed: 0,prompt,answer,template
count,5512,5512,5512
unique,5510,2821,1
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable,squad_5
freq,2,1872,5512


Unnamed: 0,prompt,answer,template
count,5512,5512,5512
unique,5510,2821,1
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable,squad_6
freq,2,1872,5512


Unnamed: 0,prompt,answer,template
count,5512,5512,5512
unique,5510,2821,1
top,Read this: FixMatch: Simplifying Semi-Supervis...,unanswerable,squad_7
freq,2,1872,5512


Unnamed: 0,prompt,answer,template
count,5512,5512,5512
unique,5510,2821,1
top,Read this: FixMatch: Simplifying Semi-Supervis...,unanswerable,squad_8
freq,2,1872,5512


Unnamed: 0,prompt,answer,template
count,5512,5512,5512
unique,5510,2821,1
top,Answer based on context:\n\nFixMatch: Simplify...,unanswerable,drop_1
freq,2,1872,5512


Unnamed: 0,prompt,answer,template
count,5512,5512,5512
unique,5510,2821,1
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable,drop_2
freq,2,1872,5512


Unnamed: 0,prompt,answer,template
count,5512,5512,5512
unique,5510,2821,1
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable,drop_3
freq,2,1872,5512


Unnamed: 0,prompt,answer,template
count,5512,5512,5512
unique,5510,2821,1
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable,drop_4
freq,2,1872,5512


Unnamed: 0,prompt,answer,template
count,5512,5512,5512
unique,5510,2821,1
top,Read this article and answer this question Fix...,unanswerable,drop_5
freq,2,1872,5512


Unnamed: 0,prompt,answer,template
count,5512,5512,5512
unique,5510,2821,1
top,Read this article and answer this question Fix...,unanswerable,drop_5
freq,2,1872,5512


Unnamed: 0,prompt,answer,template
count,5512,5512,5512
unique,5510,2821,1
top,Context: FixMatch: Simplifying Semi-Supervised...,unanswerable,drop_7
freq,2,1872,5512


In [None]:
result_df  = pd.DataFrame(columns = ['prompt', 'answer'])   
  
  q_types = [
    {"q": "What are the values for the following properties to construct a Leaderboard for the model introduced in this article: task, dataset, and metric?", "a_key": "TDMSs"},
    ]
  
  records = df.to_dict("records")

In [9]:
df_train_tdm_docteat_f1_squad_1 = pd.read_parquet('../data/df_train_tdm_docteat_f1_squad_1.parquet')
df_train_tdm_docteat_f1_all_templates_squad_1.describe()

Unnamed: 0,prompt,answer
count,5512,5512
unique,5510,2821
top,FixMatch: Simplifying Semi-Supervised Learning...,unanswerable
freq,2,1872


In [None]:
# Squad_v2 
if squad_1:
    result_df.loc[count_index] = [f'{row["Context"]}\n\nPlease answer a question about this article. If the question is unanswerable, say \"unanswerable\". {q_type["q"]}'] \
    + [str(row[q_type["a_key"]])] 
    count_index += 1

if squad_2:
    result_df.loc[count_index] = [f'Read this and answer the question. If the question is unanswerable, say \"unanswerable\".\n\n{row["Context"]}\n\n{q_type["q"]}'
] \
    + [str(row[q_type["a_key"]])] 
    count_index += 1

if squad_3:
    result_df.loc[count_index] = [f'{row["Context"]}\n{q_type["q"]} (If the question is unanswerable, say \"unanswerable\"'] \
    + [str(row[q_type["a_key"]])] 
    count_index += 1

if squad_4:
    result_df.loc[count_index] = [f'{row["Context"]}\nTry to answer this question if possible (otherwise reply \"unanswerable\"): {q_type["q"]}'] \
    + [str(row[q_type["a_key"]])] 
    count_index += 1

if squad_5:
    result_df.loc[count_index] = [f'{row["Context"]}\nIf it is possible to answer this question, answer it for me (else, reply \"unanswerable\"): {q_type["q"]}'] \
    + [str(row[q_type["a_key"]])] 
    count_index += 1

if squad_6:
    result_df.loc[count_index] = [f'{row["Context"]}\n\nAnswer this question, if possible (if impossible, reply \"unanswerable\"): {q_type["q"]}'] \
    + [str(row[q_type["a_key"]])] 
    count_index += 1

if squad_7:
    result_df.loc[count_index] = [f'Read this: {row["Context"]}\n\n{q_type["q"]}\nWhat is the answer? (If it cannot be answered, return \"unanswerable\")'] \
    + [str(row[q_type["a_key"]])] 
    count_index += 1

if squad_8:
    result_df.loc[count_index] = [f'Read this: {row["Context"]}\nNow answer this question, if there is an answer (If it cannot be answered, return \"unanswerable\"): {q_type["q"]}'] \
    + [str(row[q_type["a_key"]])] 
    count_index += 1


# Drop
if drop_1:
    result_df.loc[count_index] = [f'Answer based on context:\n\n{row["Context"]}\n\n{q_type["q"]}'] \
    + [str(row[q_type["a_key"]])] 
    count_index += 1

if drop_2:
    result_df.loc[count_index] = [f'{row["Context"]}\n\nAnswer this question based on the article: {q_type["q"]}'] \
    + [str(row[q_type["a_key"]])] 
    count_index += 1

if drop_3:
    result_df.loc[count_index] = [f'{row["Context"]}\n\n{q_type["q"]}'] \
    + [str(row[q_type["a_key"]])] 
    count_index += 1

if drop_4:
    result_df.loc[count_index] = [f'{row["Context"]}\nAnswer this question: {q_type["q"]}'] \
    + [str(row[q_type["a_key"]])] 
    count_index += 1

if drop_5:
    result_df.loc[count_index] = [f'Read this article and answer this question {row["Context"]}\n{q_type["q"]}'] \
    + [str(row[q_type["a_key"]])] 
    count_index += 1

if drop_6:
    result_df.loc[count_index] = [f'{row["Context"]}\n\nBased on the above article, answer a question. {q_type["q"]}'] \
    + [str(row[q_type["a_key"]])] 
    count_index += 1

if drop_7:
    result_df.loc[count_index] = [f'Context: {row["Context"]}\n\nQuestion: {q_type["q"]}\n\nAnswer:'] \
    + [str(row[q_type["a_key"]])] 
    count_index += 1

In [None]:


df_train_f1_drop_1.to_parquet('../data/df_train_tdm_docteat_f1_drop_1.parquet')
df_dev_f1_drop_1.to_parquet('../data/df_dev_tdm_docteat_f1_drop_1.parquet')
df_train_f2_drop_1.to_parquet('../data/df_train_tdm_docteat_f2_drop_1.parquet')
df_dev_f2_drop_1.to_parquet('../data/df_dev_tdm_docteat_f2_drop_1.parquet')