In [1]:
# !pip install --quiet  datasets #to access squad dataset
# !pip install --quiet pyarrow   #to deal with parquet files for saving dataset if required
# !pip install --quiet  tqdm     #for progress bars
# !pip install --quiet transformers # for t5 model
# !pip install --quiet tokenizers  #tokenizers from HuggingFace
# !pip install --quiet sentencepiece #subword tokenizer used by T5
# !pip install --quiet pytorch-lightning # pytorch wrapper /
# !pip install --quiet torchtext # text utilities

# Fetching Datasets

In [2]:
#imports
import pandas as pd
import torch
from tqdm import tqdm
from datasets import DatasetDict, Dataset, load_from_disk
# from torch.utils.data import Dataset, DataLoader
from pprint import pprint
import copy, os, json
import numpy as np
from collections import defaultdict
import ipdb
import shutil
from fuzzywuzzy import fuzz
import random

random_seed = 47 

torch.manual_seed(random_seed)
np.random.seed(random_seed)
random.seed(random_seed)

if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)



In [3]:
device  = 'cuda' if torch.cuda.is_available() else "cpu"
device

'cpu'

In [4]:
# path_to_source = f"/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_full_5_10_10000_clone_latex_compare/10Neg10000unk/twofoldwithunk"
path_to_csv = f"/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk"

fold1 = "fold1"
train_f1_pd = pd.read_csv(f"{path_to_csv}/{fold1}/train.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])
dev_f1_pd = pd.read_csv(f"{path_to_csv}/{fold1}/dev.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])

fold2 = "fold2"
train_f2_pd = pd.read_csv(f"{path_to_csv}/{fold2}/train.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])
dev_f2_pd = pd.read_csv(f"{path_to_csv}/{fold2}/dev.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])

In [5]:
pwd

'/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/notebooks'

In [6]:
# # no_leaderboard_pd = pd.read_csv(f"/nfs/home/kabenamualus/Research/T5-Leaderboard-QA/data_proccess/arxiv_no_leaderboard_links_pdf_short/DocTAET_full.tsv", 
# #                     sep="\t", names=["title", "Context"])

# no_leaderboard_pd = pd.read_csv(f"/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_no_leaderboard_links_pdf_short/DocTAET_150.tsv", 
#                     sep="\t", names=["title", "Context"])

# no_leaderboard_pd.describe()

In [7]:
resultsAnnotation_pd = pd.read_csv(f"/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/annotations_dec092023/resultsAnnotation.tsv",
                                   sep="\t", names=["Title", "TDMSs"])
resultsAnnotation_pd = resultsAnnotation_pd.fillna("NAN")
resultsAnnotation_pd

Unnamed: 0,Title,TDMSs
0,1704.03549v4.pdf,Optical Character Recognition (OCR)#FSNS - Tes...
1,1712.05404.pdf,Optical Character Recognition (OCR)#FSNS - Tes...
2,1702.03970v1.pdf,Optical Character Recognition (OCR)#FSNS - Tes...
3,1802.05415v2.pdf,Optical Character Recognition (OCR)#im2latex-1...
4,2308.15996v1.pdf,Optical Character Recognition (OCR)#Benchmarki...
...,...,...
12728,2112.07910v2.pdf,Open Vocabulary Semantic Segmentation#COCO-Stu...
12729,1709.08011v3.pdf,Japanese Word Segmentation#BCCWJ#F1-score (Wor...
12730,2203.04616v2.pdf,Binary Condescension Detection#DPM#F1-score#63...
12731,2208.01312v1.pdf,Binary Condescension Detection#DPM#F1-score#61...


In [8]:
resultsAnnotation_pd.describe()

Unnamed: 0,Title,TDMSs
count,12733,12733
unique,12729,12707
top,(RTX3090),NAN
freq,4,6


In [9]:
"""
This will take care of papers with more than one learderboard 
"""
records = resultsAnnotation_pd.to_dict("records")

global_title_to_tdms_dict = defaultdict(
    lambda : 
        list()
    )
ccount = 0
for i, row in tqdm(enumerate(records), total = len(records)):
    if row['TDMSs'] == 'NAN':
        continue

    for tdms in row['TDMSs'].split("$"):
        if len(tdms.split("#")) != 4:
            # print(tdms)
            ccount += 1
            continue 
        t, d, m, s = tdms.split("#")
        global_title_to_tdms_dict[row['Title']].append(
            {
                "LEADERBOARD": {
                    "Task": t,
                    "Dataset": d,
                    "Metric": m,
                    "Score": s,
                }
            }            
        )
ccount

  9%|▉         | 1144/12733 [00:00<00:01, 9746.73it/s]

100%|██████████| 12733/12733 [00:00<00:00, 35202.02it/s]


138

In [10]:
len(global_title_to_tdms_dict)

12718

In [11]:
np.unique(list(global_title_to_tdms_dict)).shape

(12718,)

In [12]:
tdms_test = []
for file_pdf in list(global_title_to_tdms_dict):
    tdms_test.append(global_title_to_tdms_dict[file_pdf])

# np.unique(tdms_test).shape

In [13]:
len(tdms_test)

12718

In [14]:
# No need for negative instances, but will still have 'duplicate' for paper with more than one leaderboard
train_f1_pd = train_f1_pd[train_f1_pd.label==True]
print("train_f1_pd")
display(train_f1_pd.describe())

dev_f1_pd = dev_f1_pd[dev_f1_pd.label==True]
print("dev_f1_pd")
display(dev_f1_pd.describe())

train_f2_pd = train_f2_pd[train_f2_pd.label==True]
print("train_f2_pd")
display(train_f2_pd.describe())

dev_f2_pd = dev_f2_pd[dev_f2_pd.label==True]
print("dev_f2_pd")
display(dev_f2_pd.describe())

train_f1_pd


Unnamed: 0,label,title,TDM,Context
count,12613,12613,12613,12613
unique,1,3753,1792,3747
top,True,1803.00933v1.pdf,unknown,IMPALA: Scalable Distributed Deep-RL with Impo...
freq,12613,58,923,58


dev_f1_pd


Unnamed: 0,label,title,TDM,Context
count,5472,5472,5472,5472
unique,1,1608,1557,1606
top,True,1911.08265v2.pdf,unknown,"Mastering Atari, Go, Chess and Shogi by Planni..."
freq,5472,58,378,58


train_f2_pd


Unnamed: 0,label,title,TDM,Context
count,12677,12677,12677,12677
unique,1,3753,1821,3749
top,True,1911.08265v2.pdf,unknown,"Mastering Atari, Go, Chess and Shogi by Planni..."
freq,12677,58,920,58


dev_f2_pd


Unnamed: 0,label,title,TDM,Context
count,5408,5408,5408,5408
unique,1,1608,1542,1608
top,True,1802.01561v3.pdf,unknown,IMPALA: Scalable Distributed Deep-RL with Impo...
freq,5408,58,381,58


In [15]:
# len(train_pd.title.unique())
records_train_f1 = train_f1_pd.to_dict("records")
records_dev_f1 = dev_f1_pd.to_dict("records")
records_train_f2 = train_f2_pd.to_dict("records")
records_dev_f2 = dev_f2_pd.to_dict("records")

In [16]:
title_id = records_train_f1[0]["title"].split(".pdf")[0]
title_id

'1707.03497v2'

In [17]:
global_title_to_content = {}

title_to_content = {
    "train_f1":{},
    "dev_f1":{},
    "train_f2":{},
    "dev_f2":{},
    "zero_shot_f1":{},
    "zero_shot_f2":{}
    }

# arxiv_leaderboard_full_txt = 
# "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_leaderboard_full_txt"
arxiv_leaderboard_full_txt = "\
/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_txt_summarised_dec092023"

missed_parsing = 0
missed_short_context = 0
copied = 0
for i, row in tqdm(enumerate(records_train_f1), total = len(records_train_f1)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in title_to_content["train_f1"]:
        continue 
    else:
        try:
            with open(f'{arxiv_leaderboard_full_txt}/{title_id}_summarised.txt', 'r') as file:
                # Read the file
                data = file.read()
                
        except :
            missed_parsing += 1
            continue

        if len(data.split()) < 10:
            missed_short_context += 1
            continue
        else:
            title_to_content["train_f1"][row['title']] = data
            global_title_to_content[row['title']] = data
            copied += 1
        # title_to_content["train_f1"][row['title']] = row['Context'] if len(data.split()) < 100 else data
        # title_to_content["train_f1"][row['title']] = row['Context']

print(f"TRAIN F1:\nCopied: {copied}\nMissed Parsing: {missed_parsing}\
\nMissed Short Context: {missed_short_context}\nAll items: {len(records_train_f1)}\n")

missed_parsing = 0
missed_short_context = 0
copied = 0
for i, row in tqdm(enumerate(records_dev_f1), total = len(records_dev_f1)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in title_to_content["dev_f1"]:
        continue 
    else:
        try:
            with open(f'{arxiv_leaderboard_full_txt}/{title_id}_summarised.txt', 'r') as file:
                # Read the file
                data = file.read()
                
        except :
            # print(f"Error on file {row['title']}")
            data = ""
            missed_parsing += 1
            continue 
        
        # title_to_content["dev_f1"][row['title']] = row['Context'] if len(data.split()) < 100 else data
        if len(data.split()) < 10:
            missed_short_context += 1
            continue
        else:
            title_to_content["dev_f1"][row['title']] = data
            global_title_to_content[row['title']] = data
            copied += 1

print(f"DEV F1:\nCopied: {copied}\nMissed Parsing: {missed_parsing}\
\nMissed Short Context: {missed_short_context}\nAll items: {len(records_dev_f1)}\n")  

missed_parsing = 0
missed_short_context = 0
copied = 0
for i, row in tqdm(enumerate(records_train_f2), total = len(records_train_f2)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in title_to_content["train_f2"]:
        continue 
    else:
        try:
            with open(f'{arxiv_leaderboard_full_txt}/{title_id}_summarised.txt', 'r') as file:
                # Read the file
                data = file.read()
                
        except :
            # print(f"Error on file {row['title']}")
            missed_parsing += 1
            continue 
        
        # title_to_content["train_f2"][row['title']] = row['Context'] if len(data.split()) < 100 else data
        if len(data.split()) < 10:
            missed_short_context += 1
            continue
        else:
            title_to_content["train_f2"][row['title']] = data
            global_title_to_content[row['title']] = data
            copied += 1
            
print(f"TRAIN F2:\nCopied: {copied}\nMissed Parsing: {missed_parsing}\
\nMissed Short Context: {missed_short_context}\nAll items: {len(records_train_f2)}\n") 

missed_parsing = 0
missed_short_context = 0
copied = 0
for i, row in tqdm(enumerate(records_dev_f2), total = len(records_dev_f2)):
    title_id = row['title'].split(".pdf")[0]
    if row['title'] in title_to_content["dev_f2"]:
        continue 
    else:
        try:
            with open(f'{arxiv_leaderboard_full_txt}/{title_id}_summarised.txt', 'r') as file:
                # Read the file
                data = file.read()
                
        except :
            # print(f"Error on file {row['title']}")
            missed_parsing += 1
            continue 
            
        # title_to_content["dev_f2"][row['title']] = row['Context'] if len(data.split()) < 100 else data
        # title_to_content["dev_f2"][row['title']] = row['Context']
        if len(data.split()) < 10:
            missed_short_context += 1
            continue
        else:
            title_to_content["dev_f2"][row['title']] = data
            global_title_to_content[row['title']] = data
            copied += 1
            
print(f"DEV F2:\nCopied: {copied}\nMissed Parsing: {missed_parsing}\
\nMissed Short Context: {missed_short_context}\nAll items: {len(records_dev_f2)}\n") 

100%|██████████| 12613/12613 [00:04<00:00, 3006.70it/s]


TRAIN F1:
Copied: 2628
Missed Parsing: 3977
Missed Short Context: 3
All items: 12613



100%|██████████| 5472/5472 [00:01<00:00, 4850.75it/s]


DEV F1:
Copied: 1094
Missed Parsing: 1824
Missed Short Context: 2
All items: 5472



100%|██████████| 12677/12677 [00:02<00:00, 4521.91it/s]


TRAIN F2:
Copied: 2622
Missed Parsing: 4104
Missed Short Context: 3
All items: 12677



100%|██████████| 5408/5408 [00:01<00:00, 3465.36it/s]

DEV F2:
Copied: 1100
Missed Parsing: 1697
Missed Short Context: 2
All items: 5408






In [18]:
# q

### Add additional Training data

In [19]:
list(title_to_content["train_f1"].keys())[:4]

['1707.03497v2.pdf',
 '2006.10721v2.pdf',
 '1901.10995v4.pdf',
 '1801.01315v1.pdf']

In [20]:
len(title_to_content["train_f1"].keys())

2628

In [21]:
list(title_to_content["dev_f1"].keys())[:4]

['1810.02575v1.pdf',
 '1909.00794v1.pdf',
 '1807.10066v1.pdf',
 '1805.04554v4.pdf']

In [22]:
len(title_to_content["dev_f1"].keys())

1094

In [23]:
print(len(os.listdir(arxiv_leaderboard_full_txt)))

9354


In [24]:
# title_to_content["dev_f1"].keys()
# global_title_to_tdms_dict[row['Title']]

In [25]:
arxiv_leaderboard_full_txt = "\
/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_txt_summarised_dec092023"

print(f"Total available papers: {len(os.listdir(arxiv_leaderboard_full_txt))}\n\n")

print(f'Total train f1 before adding {len(title_to_content["train_f1"].keys())}')
print(f'Total dev f1 {len(title_to_content["dev_f1"].keys())}')

missed = 0

for file_id_sum_txt in os.listdir(arxiv_leaderboard_full_txt):
    
    file_id = file_id_sum_txt.rsplit("_", 1)[0]

    if f"{file_id}.pdf" in title_to_content["dev_f1"]:
        # ipdb.set_trace()
        continue 
    
    try:
        with open(f'{arxiv_leaderboard_full_txt}/{file_id}_summarised.txt', 'r') as file:
            # Read the file
            data = file.read()

        # ipdb.set_trace()
        
        title_to_content["train_f1"][f"{file_id}.pdf"] = data
        global_title_to_content[f"{file_id}.pdf"] = data
        
    except :
        # print(f"Error on file {row['title']}")
        missed += 1
        ipdb.set_trace()
        # continue

print(f"Missed reading F1 {missed}")
print(f'Total train f1 after adding {len(title_to_content["train_f1"].keys())}\n')


missed = 0
print(f'Total train f2 before adding {len(title_to_content["train_f2"].keys())}')
print(f'Total dev f2 {len(title_to_content["dev_f2"].keys())}')

for file_id_sum_txt in os.listdir(arxiv_leaderboard_full_txt):
    
    file_id = file_id_sum_txt.rsplit("_", 1)[0]

    if f"{file_id}.pdf" in title_to_content["dev_f2"]:
        # ipdb.set_trace()
        continue 
    
    try:
        with open(f'{arxiv_leaderboard_full_txt}/{file_id}_summarised.txt', 'r') as file:
            # Read the file
            data = file.read()

        # ipdb.set_trace()
        
        title_to_content["train_f2"][f"{file_id}.pdf"] = data
        global_title_to_content[f"{file_id}.pdf"] = data
        
    except :
        # print(f"Error on file {row['title']}")
        missed += 1
        ipdb.set_trace()
        # continue

print(f"Missed reading F2 {missed}")
print(f'Total train f2 after adding {len(title_to_content["train_f2"].keys())}')

Total available papers: 9354


Total train f1 before adding 2628
Total dev f1 1094


Missed reading F1 0
Total train f1 after adding 8260

Total train f2 before adding 2622
Total dev f2 1100
Missed reading F2 0
Total train f2 after adding 8254


In [26]:
len(train_f1_pd.title.unique())

3753

In [27]:
no_leaderboard_pourcentage_train_f1 = int(len(title_to_content["train_f1"].keys())*50/100)
no_leaderboard_pourcentage_dev_f1 = int(len(title_to_content["dev_f1"].keys())*50/100)
no_leaderboard_pourcentage_train_f2 = int(len(title_to_content["train_f2"].keys())*50/100)
no_leaderboard_pourcentage_dev_f2 = int(len(title_to_content["dev_f2"].keys())*50/100)

print(f"no_leaderboard_pourcentage_train_f1: {no_leaderboard_pourcentage_train_f1}")
print(f"no_leaderboard_pourcentage_dev_f1: {no_leaderboard_pourcentage_dev_f1}")
print(f"no_leaderboard_pourcentage_train_f2: {no_leaderboard_pourcentage_train_f2}")
print(f"no_leaderboard_pourcentage_dev_f2: {no_leaderboard_pourcentage_dev_f2}")

no_leaderboard_pourcentage_train_f1: 4130
no_leaderboard_pourcentage_dev_f1: 547
no_leaderboard_pourcentage_train_f2: 4127
no_leaderboard_pourcentage_dev_f2: 550


In [28]:
(no_leaderboard_pourcentage_train_f1+no_leaderboard_pourcentage_train_f2)//2

4128

In [29]:
(no_leaderboard_pourcentage_dev_f1+no_leaderboard_pourcentage_dev_f2)//2

548

In [30]:
# no_leaderboard_pourcentage = int(len(train_pd.title.unique())*50/100)
# no_leaderboard_pourcentage

In [31]:
papers_in_training = []
no_leaderboard_papers = []

i = 0

arxiv_no_leaderboard_full_txt = "\
/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_no_leaderboard_txt_25_000_summarised_dec092023"

print(f"Total available no leaderboard papers: {len(os.listdir(arxiv_no_leaderboard_full_txt))}\n\n")


for file_id_sum_txt in tqdm(os.listdir(arxiv_no_leaderboard_full_txt), total = len(os.listdir(arxiv_no_leaderboard_full_txt))):
    
    file_id = file_id_sum_txt.rsplit("_", 1)[0]

    
    if f"{file_id}.pdf" in title_to_content["train_f1"] or f"{file_id}.pdf" in title_to_content["train_f2"]:
        # ipdb.set_trace()
        continue 

    if i >= (no_leaderboard_pourcentage_train_f1+no_leaderboard_pourcentage_train_f2)//2:
        break 
        
    
    try:
        with open(f'{arxiv_no_leaderboard_full_txt}/{file_id}_summarised.txt', 'r') as file:
            # Read the file
            data = file.read()
        
    except :
        missed_parsing += 1
        continue 

    if len(data.split()) < 10:
        missed_short_context += 1
        continue
    else:
        title_to_content["train_f1"][f"{file_id}.pdf"] = data
        title_to_content["train_f2"][f"{file_id}.pdf"] = data
        global_title_to_content[f"{file_id}.pdf"] = data
        papers_in_training.append(f"{file_id}.pdf")
        no_leaderboard_papers.append(f"{file_id}.pdf")
        i += 1

print(f"TRAIN F1&F2:\Added No_LB_Papers: {i}\nMissed Parsing: {missed_parsing}\
\nMissed Short Context: {missed_short_context}\n") 

i = 0

for file_id_sum_txt in tqdm(os.listdir(arxiv_no_leaderboard_full_txt), total = len(os.listdir(arxiv_no_leaderboard_full_txt))):
    
    file_id = file_id_sum_txt.rsplit("_", 1)[0]

    
    if f"{file_id}.pdf" in title_to_content["dev_f1"] or f"{file_id}.pdf" in title_to_content["dev_f2"]  or f"{file_id}.pdf" in papers_in_training:
        # ipdb.set_trace()
        continue 

    if i >= (no_leaderboard_pourcentage_dev_f1+no_leaderboard_pourcentage_dev_f2)//2:
        break 
        
    
    try:
        with open(f'{arxiv_no_leaderboard_full_txt}/{file_id}_summarised.txt', 'r') as file:
            # Read the file
            data = file.read()
        
    except :
        missed_parsing += 1
        continue 

    if len(data.split()) < 10:
        missed_short_context += 1
        continue
    else:
        title_to_content["dev_f1"][f"{file_id}.pdf"] = data
        title_to_content["dev_f2"][f"{file_id}.pdf"] = data
        global_title_to_content[f"{file_id}.pdf"] = data
        no_leaderboard_papers.append(f"{file_id}.pdf")
        i += 1

print(f"DEV F1&F2:\Added No_LB_Papers: {i}\nMissed Parsing: {missed_parsing}\
\nMissed Short Context: {missed_short_context}\n") 

Total available no leaderboard papers: 11258




  1%|          | 105/11258 [00:00<00:10, 1045.96it/s]

 37%|███▋      | 4138/11258 [00:03<00:06, 1119.66it/s]


TRAIN F1&F2:\Added No_LB_Papers: 4128
Missed Parsing: 1697
Missed Short Context: 12



 42%|████▏     | 4686/11258 [00:00<00:00, 7222.57it/s] 

DEV F1&F2:\Added No_LB_Papers: 548
Missed Parsing: 1697
Missed Short Context: 22






In [32]:
print(f'Final total train f1 after adding {len(title_to_content["train_f1"].keys())}')
print(f'Final total dev f1 after adding {len(title_to_content["dev_f1"].keys())}')
print(f'Final total train f2 after adding {len(title_to_content["train_f2"].keys())}')
print(f'Final total dev f1 after adding {len(title_to_content["dev_f2"].keys())}')

Final total train f1 after adding 12388
Final total dev f1 after adding 1642
Final total train f2 after adding 12382
Final total dev f1 after adding 1648


In [33]:
12388*10/100

1238.8

In [34]:
# train_f1_pd["Lenght context"] = train_f1_pd.Context.apply(lambda x: len(x.split()))
# dev_f1_pd["Lenght context"] = dev_f1_pd.Context.apply(lambda x: len(x.split()))
# train_f2_pd["Lenght context"] = train_f2_pd.Context.apply(lambda x: len(x.split()))
# dev_f2_pd["Lenght context"] = dev_f2_pd.Context.apply(lambda x: len(x.split()))

In [35]:
# train_pd[train_pd["Lenght context"] < 400]

In [36]:
# train_pd = train_pd[train_pd["Lenght context"] < 400]

In [37]:
# print("train_f1_pd describe: ")
# display(train_f1_pd.describe())
# print("dev_f1_pd describe: ")
# display(dev_f1_pd.describe())

# print("train_f2_pd describe: ")
# display(train_f2_pd.describe())
# print("dev_f2_pd describe: ")
# display(dev_f2_pd.describe())

In [38]:
# Title' : title, 
# 'TDMSs' : global_title_to_tdms_dict[title] if title in global_title_to_tdms_dict.keys() else "unanswerable",
# 'Context' : title_to_content["train_f1"][title],
# 'Lenght Context': len(title_to_content["train_f1"][title].split()),
# 'Lenght TDMSs': len(str(global_title_to_tdms_dict[title] if title in global_title_to_tdms_dict.keys() else "unanswerable").split())



# shutil.copyfile(src, dst)

## Create data for codalab 

In [39]:
root_data_folder = "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess"

arxiv_leaderboard_folder = f"{root_data_folder}/arxiv_tex_dec092023"

arxiv_no_leaderboard_fold = f"{root_data_folder}/arxiv_no_leaderboard_tex_25_000"

target_folder = f"{root_data_folder}/codalab/fewshot"

# zero_shot_file = "/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/zero-shot/new_output/fold1/zero_shot_papers_with_true_only.tsv"
zero_shot_file = f"/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/zero-shot/new_output/fold1/arxiv_pdf_zero_shot_1000/test.tsv"

In [40]:
print(f'Final total train f1 after adding {len(title_to_content["train_f1"].keys())}')
print(f'Final total dev f1 after adding {len(title_to_content["dev_f1"].keys())}')
print(f'Final total train f2 after adding {len(title_to_content["train_f2"].keys())}')
print(f'Final total dev f1 after adding {len(title_to_content["dev_f2"].keys())}')

Final total train f1 after adding 12388
Final total dev f1 after adding 1642
Final total train f2 after adding 12382
Final total dev f1 after adding 1648


In [41]:
test_f1_pd = pd.read_csv(f"{zero_shot_file}", 
                    sep="\t", names=["label", "title", "TDM", "Context"])

test_f1_pd.tail()

Unnamed: 0,label,title,TDM,Context
1937995,False,2007.01548v2.pdf,Relation Extraction; ACE 2005; RE Micro F1,Multiple Instance-Based Video Anomaly Detectio...
1937996,False,2007.01548v2.pdf,Named Entity Recognition; BC2GM; F1,Multiple Instance-Based Video Anomaly Detectio...
1937997,False,2007.01548v2.pdf,Action Classification; Kinetics-600; GFLOPs,Multiple Instance-Based Video Anomaly Detectio...
1937998,False,2007.01548v2.pdf,Visual Question Answering; A-OKVQA; MC Accuracy,Multiple Instance-Based Video Anomaly Detectio...
1937999,False,2007.01548v2.pdf,unknown,Multiple Instance-Based Video Anomaly Detectio...


In [42]:
test_f1_pd = test_f1_pd[test_f1_pd.label==True]
test_f1_pd

Unnamed: 0,label,title,TDM,Context
163,True,2209.03182v1.pdf,Named Entity Recognition; BC5CDR-disease; F1,On the Effectiveness of Compact Biomedical Tra...
1283,True,2209.03182v1.pdf,Named Entity Recognition; BC5CDR-chemical; F1,On the Effectiveness of Compact Biomedical Tra...
1934,True,2209.03182v1.pdf,Named Entity Recognition; BC2GM; F1,On the Effectiveness of Compact Biomedical Tra...
2748,True,2102.06108v1.pdf,Image Generation; FFHQ 1024 x 1024; FID,SWAGAN: A Style-based WAvelet-driven Generativ...
3958,True,1911.03911v2.pdf,Semantic Retrieval; Contract Discovery; Soft-F1,Contract Discovery: Dataset and a Few-Shot Sem...
...,...,...,...,...
1933078,True,2205.13271v2.pdf,Unsupervised Object Segmentation; ClevrTex; MSE,Unsupervised multi-object segmentation using a...
1933964,True,2205.13271v2.pdf,Unsupervised Object Segmentation; ObjectsRoom;...,Unsupervised multi-object segmentation using a...
1935769,True,2002.12177v1.pdf,Self-Supervised Action Recognition; HMDB51; Fr...,Evolving Losses for Unsupervised Video Represe...
1937416,True,2007.01548v2.pdf,Anomaly Detection In Surveillance Videos; Shan...,Multiple Instance-Based Video Anomaly Detectio...


In [43]:
test_f1_pd.describe()

Unnamed: 0,label,title,TDM,Context
count,3167,3167,3167,3167
unique,1,1000,1371,1000
top,True,2207.07115v2.pdf,Image Classification; ImageNet; GFLOPs,XMem: Long-Term Video Object Segmentation with...
freq,3167,40,33,40


In [44]:
test_f1_pd.to_csv(f"{root_data_folder}/test_1000_f1_jcdl.tsv", 
                       header=False, index=False, sep="\t")

In [45]:
# path_to_source = f"/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_full_5_10_10000_clone_latex_compare/10Neg10000unk/twofoldwithunk"
path_to_csv = f"/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk"

fold1 = "fold1"
train_f1_pd = pd.read_csv(f"{path_to_csv}/{fold1}/train.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])
dev_f1_pd = pd.read_csv(f"{path_to_csv}/{fold1}/dev.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])

fold2 = "fold2"
train_f2_pd = pd.read_csv(f"{path_to_csv}/{fold2}/train.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])
dev_f2_pd = pd.read_csv(f"{path_to_csv}/{fold2}/dev.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])

# No need for negative instances, but will still have 'duplicate' for paper with more than one leaderboard
train_f1_pd = train_f1_pd[train_f1_pd.label==True]
print("train_f1_pd")
display(train_f1_pd.describe())

dev_f1_pd = dev_f1_pd[dev_f1_pd.label==True]
print("dev_f1_pd")
display(dev_f1_pd.describe())

train_f2_pd = train_f2_pd[train_f2_pd.label==True]
print("train_f2_pd")
display(train_f2_pd.describe())

dev_f2_pd = dev_f2_pd[dev_f2_pd.label==True]
print("dev_f2_pd")
display(dev_f2_pd.describe())

train_f1_pd


Unnamed: 0,label,title,TDM,Context
count,12613,12613,12613,12613
unique,1,3753,1792,3747
top,True,1803.00933v1.pdf,unknown,IMPALA: Scalable Distributed Deep-RL with Impo...
freq,12613,58,923,58


dev_f1_pd


Unnamed: 0,label,title,TDM,Context
count,5472,5472,5472,5472
unique,1,1608,1557,1606
top,True,1911.08265v2.pdf,unknown,"Mastering Atari, Go, Chess and Shogi by Planni..."
freq,5472,58,378,58


train_f2_pd


Unnamed: 0,label,title,TDM,Context
count,12677,12677,12677,12677
unique,1,3753,1821,3749
top,True,1911.08265v2.pdf,unknown,"Mastering Atari, Go, Chess and Shogi by Planni..."
freq,12677,58,920,58


dev_f2_pd


Unnamed: 0,label,title,TDM,Context
count,5408,5408,5408,5408
unique,1,1608,1542,1608
top,True,1802.01561v3.pdf,unknown,IMPALA: Scalable Distributed Deep-RL with Impo...
freq,5408,58,381,58


In [46]:
dev_f2_pd.TDM.unique().shape

(1542,)

In [47]:
i = 0
for tdm in train_f2_pd.TDM.unique():
    if tdm in test_f1_pd.TDM.unique():
        # print(tdm)
        i+=1
        
print(i)

21


In [48]:
i = 0
for tdm in test_f1_pd.TDM.unique():
    if tdm in train_f2_pd.TDM.unique():
        # print(tdm)
        i+=1
        
print(i)

21


In [49]:

def calculate_fuzz_ratio(text1, text2):
    return fuzz.ratio(str(text1).strip().lower(), str(text2).strip().lower())

In [50]:
calculate_fuzz_ratio("BG; 1-2", "BG;1-2")

92

In [51]:
for ppaer in global_title_to_tdms_dict:
    # ipdb.set_trace()
    if global_title_to_tdms_dict[ppaer] ==[]:
        ipdb.set_trace()
        
# tdms = global_title_to_tdms_dict[k] if k in global_title_to_tdms_dict.keys() else "unanswerable"
    
# if tdms == "unanswerable":
#     unanswerable_count += 1
# else:
#     answerable_count += 1
    
# if tdms == []:
#     ipdb.set_trace()

In [52]:
papers_to_drop = []
missed_parsing_zero = 0
THRESHOLD = 90

for paper, contents in tqdm(title_to_content["train_f1"].items(), total=len(title_to_content["train_f1"].items())):
    tdms = global_title_to_tdms_dict[paper] if paper in global_title_to_tdms_dict.keys() else "unanswerable" if paper in no_leaderboard_papers else "missed"
    
    if tdms == "unanswerable" or tdms == "missed":
        # ipdb.set_trace()
        continue 
    
    for tdm in test_f1_pd.TDM.unique():
        
        try:
            test_task, test_dataset, test_metric = tdm.split(";", 2)
            test_task, test_dataset, test_metric = test_task.strip(), test_dataset.strip(), test_metric.strip()
        except :
            missed_parsing_zero += 1
            
            continue 
        
        if paper in papers_to_drop:
            break
        
        for leaderboard in tdms:
            train_task = leaderboard["LEADERBOARD"]["Task"]
            train_dataset = leaderboard["LEADERBOARD"]["Dataset"]
            train_metric = leaderboard["LEADERBOARD"]["Metric"]
            train_score = leaderboard["LEADERBOARD"]["Score"]
            
            train_task, train_dataset, train_metric = train_task.strip(), train_dataset.strip(), train_metric.strip()
            
            # # A) Exact 
            # if train_task == test_task and train_dataset == test_dataset and train_metric == test_metric:
                
            # B) Partial 
            if calculate_fuzz_ratio(train_task, test_task) >= THRESHOLD and calculate_fuzz_ratio(train_dataset, test_dataset) >= THRESHOLD and calculate_fuzz_ratio(train_metric, test_metric) >= THRESHOLD:
            
                papers_to_drop.append(paper)
                # ipdb.set_trace()
                break 
        
print(f"missed_parsing_zero {missed_parsing_zero}")       
len(papers_to_drop)

100%|██████████| 12388/12388 [52:09<00:00,  3.96it/s]  

missed_parsing_zero 0





2663

In [53]:
# 2564


In [54]:
papers_to_drop[:5]

['2006.10721v2.pdf',
 '1901.10995v4.pdf',
 '2105.02209v1.pdf',
 '1901.00392v2.pdf',
 '2009.04703v2.pdf']

In [55]:
len(papers_to_drop)

2663

In [56]:
test_f1_pd.title.unique().shape

(1000,)

In [57]:
arxiv_leaderboard_folder = "\
/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_tex_dec092023"

arxiv_no_leaderboard_fold = "\
/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_no_leaderboard_tex_25_000"

target_folder = "\
/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/codalab"

# zero_shot_test_data = ""

unanswerable_count = 0
answerable_count = 0
missed_count = 0

for k, v in tqdm(title_to_content["train_f1"].items(), total=len(title_to_content["train_f1"].items())):
    
    if k in papers_to_drop:
        # ipdb.set_trace()
        continue 
    
    tdms = global_title_to_tdms_dict[k] if k in global_title_to_tdms_dict.keys() else "unanswerable" if k in no_leaderboard_papers else "missed"
    
    if tdms == "unanswerable":
        unanswerable_count += 1
    elif tdms == "missed":
        missed_count += 1
        continue
    else:
        answerable_count += 1
        
    if tdms == []:
        ipdb.set_trace()
    
    id_k = k.rsplit(".", 1)[0]
    if not os.path.exists(f"{target_folder}/train/{id_k}"):
        os.makedirs(f"{target_folder}/train/{id_k}")

    src_lb = f"{arxiv_leaderboard_folder}/{id_k}.tex"
    dst = f"{target_folder}/train/{id_k}/{id_k}.tex"

    src_no_lb = f"{arxiv_no_leaderboard_fold}/{id_k}.tex"
    
    if os.path.exists(src_lb):
        shutil.copyfile(src_lb, dst)

    if os.path.exists(src_no_lb):
        shutil.copyfile(src_no_lb, dst)
        
    with open(f'{target_folder}/train/{id_k}/annotations.txt', 'w') as file:
        file.write(f"{''.join(str(tdms))}\n")
          
print(f"unanswerable_count : {unanswerable_count}")
print(f"missed_count : {missed_count}")
print(f"answerable_count : {answerable_count}")

100%|██████████| 12388/12388 [04:42<00:00, 43.79it/s]

unanswerable_count : 4128
missed_count : 273
answerable_count : 5324





In [58]:
# 100%|██████████| 12388/12388 [01:03<00:00, 193.73it/s]

# unanswerable_count : 4128
# missed_count : 273
# answerable_count : 5423

# 100%|██████████| 12388/12388 [01:06<00:00, 185.92it/s]
# unanswerable_count : 4128
# missed_count : 273
# answerable_count : 5324




In [59]:
5324+4128

9452

In [60]:
# fewshot

unanswerable_count = 0
answerable_count = 0
missed_count = 0

for k, v in tqdm(title_to_content["dev_f1"].items(), total=len(title_to_content["dev_f1"].items())):
    
    tdms = global_title_to_tdms_dict[k] if k in global_title_to_tdms_dict.keys() else "unanswerable" if k in no_leaderboard_papers else "missed"
    
    if tdms == "unanswerable":
        unanswerable_count += 1
    elif tdms == "missed":
        missed_count += 1
        continue
    else:
        answerable_count += 1
        
                
    id_k = k.rsplit(".", 1)[0]
    if not os.path.exists(f"{target_folder}/fewshot/test/{id_k}"):
        os.makedirs(f"{target_folder}/fewshot/test/{id_k}")

    src_lb = f"{arxiv_leaderboard_folder}/{id_k}.tex"
    dst = f"{target_folder}/fewshot/test/{id_k}/{id_k}.tex"

    src_no_lb = f"{arxiv_no_leaderboard_fold}/{id_k}.tex"
    
    if os.path.exists(src_lb):
        shutil.copyfile(src_lb, dst)

    if os.path.exists(src_no_lb):
        shutil.copyfile(src_no_lb, dst)
                

    with open(f'{target_folder}/fewshot/test/{id_k}/annotations.txt', 'w') as file:
        file.write(f"{''.join(str(tdms))}\n")

print(f"unanswerable_count : {unanswerable_count}")
print(f"missed_count : {missed_count}")
print(f"answerable_count : {answerable_count}")

 82%|████████▏ | 1343/1642 [00:34<00:10, 29.30it/s]

100%|██████████| 1642/1642 [00:43<00:00, 37.36it/s]

unanswerable_count : 548
missed_count : 100
answerable_count : 994





In [61]:
# 100%|██████████| 1642/1642 [00:10<00:00, 155.69it/s]
# unanswerable_count : 548
# missed_count : 100
# answerable_count : 994

#  75%|███████▍  | 1229/1642 [00:07<00:02, 142.20it/s]
# 100%|██████████| 1642/1642 [00:10<00:00, 153.60it/s]
# unanswerable_count : 548
# missed_count : 100
# answerable_count : 994

In [62]:
# Zeroshot

unanswerable_count = 0
answerable_count = 0
missed_count = 0

for k in tqdm(test_f1_pd.title.unique(), total=len(test_f1_pd.title.unique())):
    
    tdms = global_title_to_tdms_dict[k] if k in global_title_to_tdms_dict.keys() else "unanswerable" if k in no_leaderboard_papers else "missed"
    
    if tdms == "unanswerable":
        unanswerable_count += 1
    elif tdms == "missed":
        missed_count += 1
        continue
    else:
        answerable_count += 1
        
    if tdms == []:
        ipdb.set_trace()
        
    id_k = k.rsplit(".", 1)[0]
    if not os.path.exists(f"{target_folder}/zeroshot/test/{id_k}"):
        os.makedirs(f"{target_folder}/zeroshot/test/{id_k}")

    src_lb = f"{arxiv_leaderboard_folder}/{id_k}.tex"
    dst = f"{target_folder}/zeroshot/test/{id_k}/{id_k}.tex"

    src_no_lb = f"{arxiv_no_leaderboard_fold}/{id_k}.tex"
    
    if os.path.exists(src_lb):
        shutil.copyfile(src_lb, dst)

    if os.path.exists(src_no_lb):
        shutil.copyfile(src_no_lb, dst)
        

    with open(f'{target_folder}/zeroshot/test/{id_k}/annotations.txt', 'w') as file:
        file.write(f"{''.join(str(tdms))}\n")


# additional no learderboard papers 
additional_no_lb = 0
for k, v in tqdm(title_to_content["dev_f1"].items(), total=len(title_to_content["dev_f1"].items())):
    
    if k not in no_leaderboard_papers:
        continue 
    
    if additional_no_lb >= 400:
        break
    
    tdms = global_title_to_tdms_dict[k] if k in global_title_to_tdms_dict.keys() else "unanswerable" if k in no_leaderboard_papers else "missed"
    
    if tdms == "unanswerable":
        unanswerable_count += 1
        # ipdb.set_trace()
    elif tdms == "missed":
        missed_count += 1
        continue
    else:
        answerable_count += 1
        
        
    id_k = k.rsplit(".", 1)[0]
    if not os.path.exists(f"{target_folder}/zeroshot/test/{id_k}"):
        os.makedirs(f"{target_folder}/zeroshot/test/{id_k}")

    src_lb = f"{arxiv_leaderboard_folder}/{id_k}.tex"
    dst = f"{target_folder}/zeroshot/test/{id_k}/{id_k}.tex"

    src_no_lb = f"{arxiv_no_leaderboard_fold}/{id_k}.tex"
    
    if os.path.exists(src_lb):
        shutil.copyfile(src_lb, dst)

    if os.path.exists(src_no_lb):
        shutil.copyfile(src_no_lb, dst)        

    with open(f'{target_folder}/zeroshot/test/{id_k}/annotations.txt', 'w') as file:
        file.write(f"{''.join(str(tdms))}\n")
    
    additional_no_lb+=1
        
print(f"unanswerable_count : {unanswerable_count}")
print(f"missed_count : {missed_count}")
print(f"answerable_count : {answerable_count}")

100%|██████████| 1000/1000 [00:27<00:00, 35.75it/s]
 91%|█████████ | 1494/1642 [00:12<00:01, 121.59it/s] 

unanswerable_count : 400
missed_count : 57
answerable_count : 943





In [63]:
# 100%|██████████| 1000/1000 [00:06<00:00, 149.34it/s]
#  91%|█████████ | 1494/1642 [00:02<00:00, 607.99it/s]  
#  unanswerable_count : 400
# missed_count : 57
# answerable_count : 943


# unanswerable_count : 400
# missed_count : 57
# answerable_count : 943

In [64]:
unanswerable_count

400

In [65]:
# q

In [66]:
# Zeroshot

unanswerable_count = 0
answerable_count = 0
missed_count = 0
missed_parsing = 0

for k in tqdm(test_f1_pd.title.unique(), total=len(test_f1_pd.title.unique())):
    
    tdms = global_title_to_tdms_dict[k] if k in global_title_to_tdms_dict.keys() else "unanswerable" if k in no_leaderboard_papers else "missed"
    
    if tdms == "unanswerable":
        unanswerable_count += 1
    elif tdms == "missed":
        missed_count += 1
        continue
    else:
        answerable_count += 1
        
    
    try:
        data = global_title_to_content[k]
        title_to_content["zero_shot_f1"][k] = data
    except :
        missed_parsing += 1
        continue 
    

# additional no learderboard papers 
additional_no_lb = 0
for k, v in tqdm(title_to_content["dev_f1"].items(), total=len(title_to_content["dev_f1"].items())):
    
    if k not in no_leaderboard_papers:
        continue 
    
    if additional_no_lb >= 400:
        break
    
    tdms = global_title_to_tdms_dict[k] if k in global_title_to_tdms_dict.keys() else "unanswerable" if k in no_leaderboard_papers else "missed"
    
    if tdms == "unanswerable":
        unanswerable_count += 1
        # ipdb.set_trace()
    elif tdms == "missed":
        missed_count += 1
        continue
    else:
        answerable_count += 1
        
        
    id_k = k.rsplit(".", 1)[0]
    
    try:
        data = global_title_to_content[k]
        title_to_content["zero_shot_f1"][k] = data
    except :
        missed_parsing += 1
        continue 
    
        
print(f"unanswerable_count : {unanswerable_count}")
print(f"missed_count : {missed_count}")
print(f"answerable_count : {answerable_count}")

100%|██████████| 1000/1000 [00:00<00:00, 194072.92it/s]
100%|██████████| 1642/1642 [00:00<00:00, 11551.20it/s]

unanswerable_count : 548
missed_count : 57
answerable_count : 943





In [67]:
len(global_title_to_content)

14030

In [68]:
global_content_to_title = {v: k for k,v in global_title_to_content.items()}

In [69]:
len(global_content_to_title)

13935

In [70]:
path_save = "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess"
# save to json
with open(f'{path_save}/global_title_to_summ_content.json', 'w') as fp:
    json.dump(global_title_to_content, fp, indent=4)

# save to json
with open(f'{path_save}/global_summ_content_to_title.json', 'w') as fp:
    json.dump(global_content_to_title, fp, indent=4)


In [82]:
# Opening JSON file
with open(f'{path_save}/global_content_to_title.json') as json_file:
    global_content_to_title = json.load(json_file)

In [74]:
len(global_title_to_content_)

14030

In [71]:
# # Zeroshot

# unanswerable_count = 0
# answerable_count = 0
# missed_count = 0
# missed_parsing = 0

# for k in tqdm(test_f1_pd.title.unique(), total=len(test_f1_pd.title.unique())):
    
#     tdms = global_title_to_tdms_dict[k] if k in global_title_to_tdms_dict.keys() else "unanswerable" if k in no_leaderboard_papers else "missed"
    
#     if tdms == "unanswerable":
#         unanswerable_count += 1
#     elif tdms == "missed":
#         missed_count += 1
#         continue
#     else:
#         answerable_count += 1
        
    
#     try:
#         data = global_title_to_content[k]
#         title_to_content["zero_shot_f1"][k] = data
#     except :
#         missed_parsing += 1
#         continue 
    

# # additional no learderboard papers 
# additional_no_lb = 0
# for k, v in tqdm(title_to_content["dev_f2"].items(), total=len(title_to_content["dev_f2"].items())):
    
#     if k not in no_leaderboard_papers:
#         continue 
    
#     if additional_no_lb >= 400:
#         break
    
#     tdms = global_title_to_tdms_dict[k] if k in global_title_to_tdms_dict.keys() else "unanswerable" if k in no_leaderboard_papers else "missed"
    
#     if tdms == "unanswerable":
#         unanswerable_count += 1
#         # ipdb.set_trace()
#     elif tdms == "missed":
#         missed_count += 1
#         continue
#     else:
#         answerable_count += 1
        
        
#     id_k = k.rsplit(".", 1)[0]
    
#     try:
#         data = global_title_to_content[k]
#         title_to_content["zero_shot_f2"][k] = data
#     except :
#         missed_parsing += 1
#         continue 
    
        
# print(f"unanswerable_count : {unanswerable_count}")
# print(f"missed_count : {missed_count}")
# print(f"answerable_count : {answerable_count}")

In [72]:
df_train_f1 = pd.DataFrame(columns = ["Title", "TDMSs", "Context"])
for i, title in tqdm(enumerate(title_to_content["train_f1"].keys()), total = len(title_to_content["train_f1"].keys())):
    
    # if (len(title_to_content["train_f1"][title]) < 10):
    #     continue 
    # if (title not in no_lead_papers_train_f1) :
    
    # if (global_title_to_tdms_dict[title] == []):
    #     ipdb.set_trace()
    #     continue

    df_train_f1 = pd.concat([df_train_f1, pd.DataFrame.from_records(
        [
            {
                'Title' : title, 
                'TDMSs' : global_title_to_tdms_dict[title] if title in global_title_to_tdms_dict.keys() else "unanswerable",
                'Context' : title_to_content["train_f1"][title],
                'Lenght Context': len(title_to_content["train_f1"][title].split()),
                'Lenght TDMSs': len(str(global_title_to_tdms_dict[title] if title in global_title_to_tdms_dict.keys() else "unanswerable").split())
            }
        ])], ignore_index = True)
print("df_train_f1 describe: ")
display(df_train_f1.describe())  

df_dev_f1 = pd.DataFrame(columns = ["Title", "TDMSs", "Context"])  
for i, title in tqdm(enumerate(title_to_content["dev_f1"].keys()), total = len(title_to_content["dev_f1"].keys())):
    
    # if (len(title_to_content["dev_f1"][title]) < 10):
    #     continue 
    
    # if (title not in no_lead_papers_dev_f1) :
    # if (global_title_to_tdms_dict[title] == []):
    #      continue

    df_dev_f1 = pd.concat([df_dev_f1, pd.DataFrame.from_records(
        [
            {
                'Title' : title, 
                'TDMSs' : global_title_to_tdms_dict[title] if title in global_title_to_tdms_dict.keys() else "unanswerable",
                'Context' : title_to_content["dev_f1"][title],
                'Lenght Context': len(title_to_content["dev_f1"][title].split()),
                'Lenght TDMSs': len(str(global_title_to_tdms_dict[title] if title in global_title_to_tdms_dict.keys() else "unanswerable").split())
            }
        ])], ignore_index = True)
print("df_dev_f1 describe: ")
display(df_dev_f1.describe())  

df_train_f2 = pd.DataFrame(columns = ["Title", "TDMSs", "Context"])
for i, title in tqdm(enumerate(title_to_content["train_f2"].keys()), total = len(title_to_content["train_f2"].keys())):
    
    # if (len(title_to_content["train_f2"][title]) < 10):
    #     continue 
    
    # if (title not in no_lead_papers_train_f2) :
    # if (global_title_to_tdms_dict[title] == []):
    #      continue

    df_train_f2 = pd.concat([df_train_f2, pd.DataFrame.from_records(
        [
            {
                'Title' : title, 
                'TDMSs' : global_title_to_tdms_dict[title] if title in global_title_to_tdms_dict.keys() else "unanswerable",
                'Context' : title_to_content["train_f2"][title],
                'Lenght Context': len(title_to_content["train_f2"][title].split()),
                'Lenght TDMSs': len(str(global_title_to_tdms_dict[title] if title in global_title_to_tdms_dict.keys() else "unanswerable").split())
            }
        ])], ignore_index = True)
print("df_train_f2 describe: ")
display(df_train_f2.describe())  
 
df_dev_f2 = pd.DataFrame(columns = ["Title", "TDMSs", "Context"])  
for i, title in tqdm(enumerate(title_to_content["dev_f2"].keys()), total = len(title_to_content["dev_f2"].keys())):
    
    # if (len(title_to_content["dev_f2"][title]) < 10):
    #     continue 
    
    # if (title not in no_lead_papers_dev_f2) :
    # if (global_title_to_tdms_dict[title] == []):
    #          continue

    df_dev_f2 = pd.concat([df_dev_f2, pd.DataFrame.from_records(
        [
            {
                'Title' : title, 
                'TDMSs' : global_title_to_tdms_dict[title] if title in global_title_to_tdms_dict.keys() else "unanswerable",
                'Context' : title_to_content["dev_f2"][title],
                'Lenght Context': len(title_to_content["dev_f2"][title].split()),
                'Lenght TDMSs': len(str(global_title_to_tdms_dict[title] if title in global_title_to_tdms_dict.keys() else "unanswerable").split())
            }
        ])], ignore_index = True)
print("df_dev_f2 describe: ")
display(df_dev_f2.describe())  

 99%|█████████▉| 12237/12388 [00:11<00:00, 960.69it/s]

100%|██████████| 12388/12388 [00:11<00:00, 1038.51it/s]

df_train_f1 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,12388.0,12388.0
mean,1544.759122,54.195835
std,1710.013467,127.5645
min,3.0,1.0
25%,518.0,1.0
50%,1454.5,20.0
75%,2150.0,60.0
max,127647.0,5866.0


100%|██████████| 1642/1642 [00:01<00:00, 1265.14it/s]

df_dev_f1 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,1642.0,1642.0
mean,1534.893423,45.068819
std,1297.672284,90.46478
min,13.0,1.0
25%,515.0,1.0
50%,1495.5,13.0
75%,2153.0,54.0
max,25520.0,1870.0


100%|██████████| 12382/12382 [00:11<00:00, 1050.81it/s]

df_train_f2 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,12382.0,12382.0
mean,1544.901712,54.058149
std,1706.252195,126.479566
min,3.0,1.0
25%,523.0,1.0
50%,1459.0,20.0
75%,2149.75,60.0
max,127647.0,5866.0


100%|██████████| 1648/1648 [00:01<00:00, 1260.41it/s]

df_dev_f2 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,1648.0,1648.0
mean,1533.85801,46.136529
std,1336.054999,101.504608
min,12.0,1.0
25%,485.25,1.0
50%,1459.0,14.0
75%,2166.25,52.0
max,25520.0,1829.0


In [73]:
df_zeroshot_f1 = pd.DataFrame(columns = ["Title", "TDMSs", "Context"])
for i, title in tqdm(enumerate(title_to_content["zero_shot_f1"].keys()), total = len(title_to_content["zero_shot_f1"].keys())):
    
    # if (len(title_to_content["train_f1"][title]) < 10):
    #     continue 
    # if (title not in no_lead_papers_train_f1) :
    
    # if (global_title_to_tdms_dict[title] == []):
    #     ipdb.set_trace()
    #     continue

    df_zeroshot_f1 = pd.concat([df_zeroshot_f1, pd.DataFrame.from_records(
        [
            {
                'Title' : title, 
                'TDMSs' : global_title_to_tdms_dict[title] if title in global_title_to_tdms_dict.keys() else "unanswerable",
                'Context' : global_title_to_content[title],
                'Lenght Context': len(global_title_to_content[title].split()),
                'Lenght TDMSs': len(str(global_title_to_tdms_dict[title] if title in global_title_to_tdms_dict.keys() else "unanswerable").split())
            }
        ])], ignore_index = True)
print("df_zeroshot_f1 describe: ")
display(df_zeroshot_f1.describe())  

100%|██████████| 1200/1200 [00:00<00:00, 1280.33it/s]

df_zeroshot_f1 describe: 





Unnamed: 0,Lenght Context,Lenght TDMSs
count,1200.0,1200.0
mean,1454.064167,64.499167
std,1394.411923,201.081189
min,3.0,1.0
25%,417.0,1.0
50%,1308.0,12.0
75%,2113.0,73.0
max,25520.0,5866.0


In [74]:
df_train_f1.drop(["Lenght Context", "Lenght TDMSs"], axis=1).describe()

Unnamed: 0,Title,TDMSs,Context
count,12388,12388,12388
unique,12388,7976,12312
top,1707.03497v2.pdf,unanswerable,Title:\t\n\nAbstract:\t\n\n[image]\n
freq,1,4401,20


In [75]:
df_train_f1.describe()

Unnamed: 0,Lenght Context,Lenght TDMSs
count,12388.0,12388.0
mean,1544.759122,54.195835
std,1710.013467,127.5645
min,3.0,1.0
25%,518.0,1.0
50%,1454.5,20.0
75%,2150.0,60.0
max,127647.0,5866.0


In [76]:
7976+4401

12377

In [77]:
template = ['Please answer a question about this article. If the question is unanswerable, say \"unanswerable\"',
'Read this and answer the question. If the question is unanswerable, say \"unanswerable\".',
'If the question is unanswerable, say \"unanswerable\"',
'Try to answer this question if possible (otherwise reply \"unanswerable\"',
'If it is possible to answer this question, answer it for me (else, reply \"unanswerable\"',
'Answer this question, if possible (if impossible, reply \"unanswerable\"',
'Read this: What is the answer? (If it cannot be answered, return \"unanswerable\"',
'Read this: Now answer this question, if there is an answer (If it cannot be answered, return \"unanswerable\"',
'Answer based on context:',
'Answer this question based on the article:',
# ("{context}\n\n{question}", "{answer}"),
'Answer this question:',
'Read this article and answer this question',
'Based on the above article, answer a question.',
# 'Context: {context}\n\nQuestion: {question}\n\nAnswer:", "{answer}"),
]

template

['Please answer a question about this article. If the question is unanswerable, say "unanswerable"',
 'Read this and answer the question. If the question is unanswerable, say "unanswerable".',
 'If the question is unanswerable, say "unanswerable"',
 'Try to answer this question if possible (otherwise reply "unanswerable"',
 'If it is possible to answer this question, answer it for me (else, reply "unanswerable"',
 'Answer this question, if possible (if impossible, reply "unanswerable"',
 'Read this: What is the answer? (If it cannot be answered, return "unanswerable"',
 'Read this: Now answer this question, if there is an answer (If it cannot be answered, return "unanswerable"',
 'Answer based on context:',
 'Answer this question based on the article:',
 'Answer this question:',
 'Read this article and answer this question',
 'Based on the above article, answer a question.']

In [78]:
# df.head(2)

In [79]:
def create_pandas_dataset_from_pandas(df,
                                      squad_1 = False,
                                      squad_2 = False,
                                      squad_3 = False,
                                      squad_4 = False,
                                      squad_5 = False,
                                      squad_6 = False,
                                      squad_7 = False,
                                      squad_8 = False,
                                      drop_1 = False,
                                      drop_2 = False,
                                      drop_3 = False,
                                      drop_4 = False,
                                      drop_5 = False,
                                      drop_6 = False,
                                      drop_7 = False
                         ):

  ''' Create a Pandas Dataframe from pandas.
  Params:
        answer_threshold: Only consider those Question Answer pairs where the Answer is short.
  '''
  count_index = 0
  result_df  = pd.DataFrame(columns = ['id', 'prompt', 'answer'])   
  # q_type_1 = "Which Tasks are addressed in this article"
  # q_type_2 = "Which Datasets are addressed in this article"
  # q_type_3 = "Which Metrics are addressed in this article"
  # q_type_4 = "Which Tasks, Datasets, Metrics are addressed in this article"
  # q_type_5 = "Which Tasks, Datasets, Metrics and Scores are addressed in this article" 
  
  # q_types = [
  #   {"q": "Which Tasks are addressed in this article", "a_key": "Tasks"}, 
  #   {"q": "Which Datasets are addressed in this article", "a_key": "Datasets"}, 
  #   {"q": "Which Metrics are addressed in this article", "a_key": "Metrics"},
  #   {"q": "Which Tasks, Datasets, Metrics are addressed in this article", "a_key": "TDMs"},
  #   {"q": "Which Tasks, Datasets, Metrics and Scores are addressed in this article", "a_key": "TDMSs"}
  #   ]
  
  q_types = [
    # {"q": "What are the values for the following properties to construct a Leaderboard for the model introduced in this article: task, dataset, and metric?", "a_key": "TDMSs"},
    {"q": "What are the values for the following properties to construct a Leaderboard for the model introduced in this article: task, dataset, metric, and score?", "a_key": "TDMSs"},
    ]
  
  records = df.to_dict("records")
  # db_dict = defaultdict(lambda : list())
  for i, row in tqdm(enumerate(records), total = len(records)):        
      for q_type in q_types:
        
        # Squad_v2 
        if squad_1:
          result_df.loc[count_index] = [str(row["Title"])] + [f'{row["Context"]}\n\nPlease answer a question about this article. If the question is unanswerable, say \"unanswerable\". {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])]
          count_index += 1
        
        if squad_2:
          result_df.loc[count_index] = [str(row["Title"])] + [f'Read this and answer the question. If the question is unanswerable, say \"unanswerable\".\n\n{row["Context"]}\n\n{q_type["q"]}'
  ] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_3:
          result_df.loc[count_index] = [str(row["Title"])] + [f'{row["Context"]}\n{q_type["q"]} (If the question is unanswerable, say \"unanswerable\"'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_4:
          result_df.loc[count_index] = [str(row["Title"])] + [f'{row["Context"]}\nTry to answer this question if possible (otherwise reply \"unanswerable\"): {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_5:
          result_df.loc[count_index] = [str(row["Title"])] + [f'{row["Context"]}\nIf it is possible to answer this question, answer it for me (else, reply \"unanswerable\"): {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_6:
          result_df.loc[count_index] = [str(row["Title"])] + [f'{row["Context"]}\n\nAnswer this question, if possible (if impossible, reply \"unanswerable\"): {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_7:
          result_df.loc[count_index] = [str(row["Title"])] + [f'Read this: {row["Context"]}\n\n{q_type["q"]}\nWhat is the answer? (If it cannot be answered, return \"unanswerable\")'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if squad_8:
          result_df.loc[count_index] = [str(row["Title"])] + [f'Read this: {row["Context"]}\nNow answer this question, if there is an answer (If it cannot be answered, return \"unanswerable\"): {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        
        # Drop
        if drop_1:
          result_df.loc[count_index] = [str(row["Title"])] + [f'Answer based on context:\n\n{row["Context"]}\n\n{q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_2:
          result_df.loc[count_index] = [str(row["Title"])] + [f'{row["Context"]}\n\nAnswer this question based on the article: {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_3:
          result_df.loc[count_index] = [str(row["Title"])] + [f'{row["Context"]}\n\n{q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_4:
          result_df.loc[count_index] = [str(row["Title"])] + [f'{row["Context"]}\nAnswer this question: {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_5:
          result_df.loc[count_index] = [str(row["Title"])] + [f'Read this article and answer this question {row["Context"]}\n{q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_6:
          result_df.loc[count_index] = [str(row["Title"])] + [f'{row["Context"]}\n\nBased on the above article, answer a question. {q_type["q"]}'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
        
        if drop_7:
          result_df.loc[count_index] = [str(row["Title"])] + [f'Context: {row["Context"]}\n\nQuestion: {q_type["q"]}\n\nAnswer:'] \
            + [str(row[q_type["a_key"]])] 
          count_index += 1
         
  return result_df

In [80]:
df_train_f1_all_templates = create_pandas_dataset_from_pandas(df_train_f1,
                                                                squad_1 = True,
                                                                squad_2 = True,
                                                                squad_3 = True,
                                                                squad_4 = True,
                                                                squad_5 = True,
                                                                squad_6 = True,
                                                                squad_7 = True,
                                                                squad_8 = True,
                                                                drop_1 = True,
                                                                drop_2 = True,
                                                                drop_3 = True,
                                                                drop_4 = True,
                                                                drop_5 = True,
                                                                drop_6 = True,
                                                                drop_7 = True
                                                              ) 
df_dev_f1_all_templates = create_pandas_dataset_from_pandas(df_dev_f1,
                                                            squad_1 = True,
                                                            squad_2 = True,
                                                            squad_3 = True,
                                                            squad_4 = True,
                                                            squad_5 = True,
                                                            squad_6 = True,
                                                            squad_7 = True,
                                                            squad_8 = True,
                                                            drop_1 = True,
                                                            drop_2 = True,
                                                            drop_3 = True,
                                                            drop_4 = True,
                                                            drop_5 = True,
                                                            drop_6 = True,
                                                            drop_7 = True
                                                            ) 
df_train_f2_all_templates = create_pandas_dataset_from_pandas(df_train_f2,
                                                            squad_1 = True,
                                                            squad_2 = True,
                                                            squad_3 = True,
                                                            squad_4 = True,
                                                            squad_5 = True,
                                                            squad_6 = True,
                                                            squad_7 = True,
                                                            squad_8 = True,
                                                            drop_1 = True,
                                                            drop_2 = True,
                                                            drop_3 = True,
                                                            drop_4 = True,
                                                            drop_5 = True,
                                                            drop_6 = True,
                                                            drop_7 = True
                                                              ) 
df_dev_f2_all_templates = create_pandas_dataset_from_pandas(df_dev_f2,
                                                            squad_1 = True,
                                                            squad_2 = True,
                                                            squad_3 = True,
                                                            squad_4 = True,
                                                            squad_5 = True,
                                                            squad_6 = True,
                                                            squad_7 = True,
                                                            squad_8 = True,
                                                            drop_1 = True,
                                                            drop_2 = True,
                                                            drop_3 = True,
                                                            drop_4 = True,
                                                            drop_5 = True,
                                                            drop_6 = True,
                                                            drop_7 = True 
                                                            ) 

df_zeroshot_f1_all_templates = create_pandas_dataset_from_pandas(df_zeroshot_f1,
                                                            squad_1 = True,
                                                            squad_2 = True,
                                                            squad_3 = True,
                                                            squad_4 = True,
                                                            squad_5 = True,
                                                            squad_6 = True,
                                                            squad_7 = True,
                                                            squad_8 = True,
                                                            drop_1 = True,
                                                            drop_2 = True,
                                                            drop_3 = True,
                                                            drop_4 = True,
                                                            drop_5 = True,
                                                            drop_6 = True,
                                                            drop_7 = True 
                                                            ) 

print("df_train_f1_all_templates describe: ")
display(df_train_f1_all_templates.describe())
print("df_dev_f1_all_templates describe: ")
display(df_dev_f1_all_templates.describe())

print("df_train_f2_all_templates describe: ")
display(df_train_f2_all_templates.describe())
print("df_dev_f2_all_templates describe: ")
display(df_dev_f2_all_templates.describe())

print("df_zeroshot_f1_all_templates describe: ")
display(df_zeroshot_f1_all_templates.describe())

 12%|█▏        | 1529/12388 [00:36<05:33, 32.52it/s]

100%|██████████| 12388/12388 [30:02<00:00,  6.87it/s]
100%|██████████| 1642/1642 [00:39<00:00, 41.76it/s]
100%|██████████| 12382/12382 [30:00<00:00,  6.88it/s]
100%|██████████| 1648/1648 [00:39<00:00, 41.96it/s]
100%|██████████| 1200/1200 [00:26<00:00, 45.08it/s]


df_train_f1_all_templates describe: 


Unnamed: 0,id,prompt,answer
count,185820,185820,185820
unique,12388,184680,7976
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\nWhat are...,unanswerable
freq,15,20,66015


df_dev_f1_all_templates describe: 


Unnamed: 0,id,prompt,answer
count,24630,24630,24630
unique,1642,24615,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,15,2,9720


df_train_f2_all_templates describe: 


Unnamed: 0,id,prompt,answer
count,185730,185730,185730
unique,12382,184665,7988
top,1707.03497v2.pdf,Answer based on context:\n\nTitle:\t\n\nAbstra...,unanswerable
freq,15,20,65745


df_dev_f2_all_templates describe: 


Unnamed: 0,id,prompt,answer
count,24720,24720,24720
unique,1648,24690,983
top,2006.10721v2.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,15,2,9990


df_zeroshot_f1_all_templates describe: 


Unnamed: 0,id,prompt,answer
count,18000,18000,18000
unique,1200,17955,653
top,2209.03182v1.pdf,Context: Title:\t\n\nAbstract:\t\n\n[image]\n\...,unanswerable
freq,15,3,8220


In [81]:
df_train_f1_all_templates.describe()

Unnamed: 0,id,prompt,answer
count,185820,185820,185820
unique,12388,184680,7976
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\nWhat are...,unanswerable
freq,15,20,66015


In [82]:
df_train_f1_all_templates['answer'].apply(type).value_counts()


<class 'str'>    185820
Name: answer, dtype: int64

In [83]:
df_dev_f1_all_templates['answer'].apply(type).value_counts()

<class 'str'>    24630
Name: answer, dtype: int64

In [84]:
str(df_dev_f1_all_templates.at[5, 'answer'])

"[{'LEADERBOARD': {'Task': 'Semantic Segmentation', 'Dataset': 'Nighttime Driving', 'Metric': 'mIoU', 'Score': '36.1'}}]"

In [85]:
df_train_f1_all_templates.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_all_templates.parquet')
df_dev_f1_all_templates.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_all_templates.parquet')
df_zeroshot_f1_all_templates.to_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_all_templates.parquet')

df_train_f2_all_templates.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_all_templates.parquet')
df_dev_f2_all_templates.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_all_templates.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_all_templates.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_all_templates.parquet'),
        "zeroshot": Dataset.from_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_all_templates.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_all_templates.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_all_templates.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_ALL_TEMPLATE")

Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-c2d0a9ded7acf665/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-c2d0a9ded7acf665/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-8cefe49801560e68/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-8cefe49801560e68/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-3dca7b74a8bc3e05/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-3dca7b74a8bc3e05/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-d71d6ae30ef33d65/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-d71d6ae30ef33d65/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-b8031069f9da66d6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-b8031069f9da66d6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 185820
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 24630
        })
        zeroshot: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 18000
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 185730
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 24720
        })
 

Saving the dataset (0/5 shards):   0%|          | 0/185820 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/24630 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18000 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/185730 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/24720 [00:00<?, ? examples/s]

In [86]:
# df_train_f1_all_templates = pd.read_parquet('../data/df_train_f1_all_templates.parquet')
# df_dev_f1_all_templates = pd.read_parquet('../data/df_dev_f1_all_templates.parquet')
# df_train_f2_all_templates = pd.read_parquet('../data/df_train_f2_all_templates.parquet')
# df_dev_f2_all_templates = pd.read_parquet('../data/df_dev_f2_all_templates.parquet')

# Specific Template

In [87]:
df_train_f1_all_templates.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_all_templates.parquet')
df_dev_f1_all_templates.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_all_templates.parquet')
df_zeroshot_f1_all_templates.to_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_all_templates.parquet')

df_train_f2_all_templates.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_all_templates.parquet')
df_dev_f2_all_templates.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_all_templates.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_all_templates.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_all_templates.parquet'),
        "zeroshot": Dataset.from_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_all_templates.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_all_templates.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_all_templates.parquet')
    })
})

Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-51349780d6fd44c1/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-51349780d6fd44c1/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-43a24fd055a0041c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-43a24fd055a0041c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-77268f8c1d4b2c0c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-77268f8c1d4b2c0c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-feef6d6f081a635a/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-feef6d6f081a635a/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-84fb0c44a329e243/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-84fb0c44a329e243/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.


In [88]:
df_train_f1_squad_1 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_1 = True,
                                                        ) 
df_dev_f1_squad_1 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_1 = True,
                                                        ) 
df_zeroshot_f1_squad_1 = create_pandas_dataset_from_pandas(df_zeroshot_f1,
                                                        squad_1 = True,
                                                        ) 

df_train_f2_squad_1 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_1 = True,
                                                        ) 
df_dev_f2_squad_1 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_1 = True,
                                                        ) 

print("df_train_f1_squad_1 describe: ")
display(df_train_f1_squad_1.describe())
print("df_dev_f1_squad_1 describe: ")
display(df_dev_f1_squad_1.describe())
print("df_dev_f1_squad_1 describe: ")
display(df_dev_f1_squad_1.describe())

print("df_train_f2_squad_1 describe: ")
display(df_train_f2_squad_1.describe())
print("df_zeroshot_f1_squad_1 describe: ")
display(df_zeroshot_f1_squad_1.describe())

df_train_f1_squad_1.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_squad_1.parquet')
df_dev_f1_squad_1.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_squad_1.parquet')
df_zeroshot_f1_squad_1.to_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_squad_1.parquet')

df_train_f2_squad_1.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_squad_1.parquet')
df_dev_f2_squad_1.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_squad_1.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_squad_1.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_squad_1.parquet'),
        "zeroshot": Dataset.from_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_squad_1.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_squad_1.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_squad_1.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_SQUAD_1")

  1%|▏         | 157/12388 [00:00<00:15, 784.81it/s]

100%|██████████| 12388/12388 [00:17<00:00, 693.34it/s]
100%|██████████| 1642/1642 [00:02<00:00, 786.27it/s]
100%|██████████| 1200/1200 [00:01<00:00, 790.38it/s]
100%|██████████| 12382/12382 [00:17<00:00, 699.35it/s]
100%|██████████| 1648/1648 [00:02<00:00, 790.08it/s]


df_train_f1_squad_1 describe: 


Unnamed: 0,id,prompt,answer
count,12388,12388,12388
unique,12388,12312,7976
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\n\nPlease...,unanswerable
freq,1,20,4401


df_dev_f1_squad_1 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,1,2,648


df_dev_f1_squad_1 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,1,2,648


df_train_f2_squad_1 describe: 


Unnamed: 0,id,prompt,answer
count,12382,12382,12382
unique,12382,12311,7988
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\n\nPlease...,unanswerable
freq,1,20,4383


df_zeroshot_f1_squad_1 describe: 


Unnamed: 0,id,prompt,answer
count,1200,1200,1200
unique,1200,1197,653
top,2209.03182v1.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\n\nPlease...,unanswerable
freq,1,3,548


Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-e14ae07e5ddd036f/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-e14ae07e5ddd036f/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-bd4fe45c97fe1e40/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-bd4fe45c97fe1e40/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-ffd3f28481e0c4d9/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-ffd3f28481e0c4d9/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-9bd43d2155d294a9/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-9bd43d2155d294a9/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-22ac5dbcff0819bc/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-22ac5dbcff0819bc/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12388
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1642
        })
        zeroshot: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1200
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12382
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1648
        })
    })

Saving the dataset (0/1 shards):   0%|          | 0/12388 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1642 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1648 [00:00<?, ? examples/s]

In [89]:
df_train_f1_squad_2 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_2 = True,
                                                        ) 
df_dev_f1_squad_2 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_2 = True,
                                                        ) 
df_zeroshot_f1_squad_2 = create_pandas_dataset_from_pandas(df_zeroshot_f1,
                                                        squad_2 = True,
                                                        ) 

df_train_f2_squad_2 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_2 = True,
                                                        ) 
df_dev_f2_squad_2 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_2 = True,
                                                        ) 

print("df_train_f1_squad_2 describe: ")
display(df_train_f1_squad_2.describe())
print("df_dev_f1_squad_2 describe: ")
display(df_dev_f1_squad_2.describe())
print("df_dev_f1_squad_2 describe: ")
display(df_dev_f1_squad_2.describe())

print("df_train_f2_squad_2 describe: ")
display(df_train_f2_squad_2.describe())
print("df_zeroshot_f1_squad_2 describe: ")
display(df_zeroshot_f1_squad_2.describe())

df_train_f1_squad_2.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_squad_2.parquet')
df_dev_f1_squad_2.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_squad_2.parquet')
df_zeroshot_f1_squad_2.to_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_squad_2.parquet')

df_train_f2_squad_2.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_squad_2.parquet')
df_dev_f2_squad_2.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_squad_2.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_squad_2.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_squad_2.parquet'),
        "zeroshot": Dataset.from_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_squad_2.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_squad_2.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_squad_2.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_SQUAD_2")

100%|██████████| 12388/12388 [00:17<00:00, 699.95it/s]
100%|██████████| 1642/1642 [00:02<00:00, 794.51it/s]
100%|██████████| 1200/1200 [00:01<00:00, 794.30it/s]
100%|██████████| 12382/12382 [00:17<00:00, 702.06it/s]
100%|██████████| 1648/1648 [00:02<00:00, 792.26it/s]


df_train_f1_squad_2 describe: 


Unnamed: 0,id,prompt,answer
count,12388,12388,12388
unique,12388,12312,7976
top,1707.03497v2.pdf,Read this and answer the question. If the ques...,unanswerable
freq,1,20,4401


df_dev_f1_squad_2 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Read this and answer the question. If the ques...,unanswerable
freq,1,2,648


df_dev_f1_squad_2 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Read this and answer the question. If the ques...,unanswerable
freq,1,2,648


df_train_f2_squad_2 describe: 


Unnamed: 0,id,prompt,answer
count,12382,12382,12382
unique,12382,12311,7988
top,1707.03497v2.pdf,Read this and answer the question. If the ques...,unanswerable
freq,1,20,4383


df_zeroshot_f1_squad_2 describe: 


Unnamed: 0,id,prompt,answer
count,1200,1200,1200
unique,1200,1197,653
top,2209.03182v1.pdf,Read this and answer the question. If the ques...,unanswerable
freq,1,3,548


Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-baa2d5a078bfa390/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-baa2d5a078bfa390/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-dc004630c81c7bdf/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-dc004630c81c7bdf/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-31afba457076e280/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-31afba457076e280/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-f2f3dd240789ba1c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-f2f3dd240789ba1c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-e8ff040c3ec48c4e/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-e8ff040c3ec48c4e/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12388
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1642
        })
        zeroshot: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1200
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12382
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1648
        })
    })

Saving the dataset (0/1 shards):   0%|          | 0/12388 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1642 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1648 [00:00<?, ? examples/s]

In [90]:
df_train_f1_squad_3 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_3 = True,
                                                        ) 
df_dev_f1_squad_3 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_3 = True,
                                                        ) 
df_zeroshot_f1_squad_3 = create_pandas_dataset_from_pandas(df_zeroshot_f1,
                                                        squad_3 = True,
                                                        ) 

df_train_f2_squad_3 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_3 = True,
                                                        ) 
df_dev_f2_squad_3 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_3 = True,
                                                        ) 

print("df_train_f1_squad_3 describe: ")
display(df_train_f1_squad_3.describe())
print("df_dev_f1_squad_3 describe: ")
display(df_dev_f1_squad_3.describe())
print("df_dev_f1_squad_3 describe: ")
display(df_dev_f1_squad_3.describe())

print("df_train_f2_squad_3 describe: ")
display(df_train_f2_squad_3.describe())
print("df_zeroshot_f1_squad_3 describe: ")
display(df_zeroshot_f1_squad_3.describe())

df_train_f1_squad_3.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_squad_3.parquet')
df_dev_f1_squad_3.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_squad_3.parquet')
df_zeroshot_f1_squad_3.to_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_squad_3.parquet')

df_train_f2_squad_3.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_squad_3.parquet')
df_dev_f2_squad_3.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_squad_3.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_squad_3.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_squad_3.parquet'),
        "zeroshot": Dataset.from_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_squad_3.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_squad_3.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_squad_3.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_SQUAD_3")

100%|██████████| 12388/12388 [00:17<00:00, 710.50it/s]
100%|██████████| 1642/1642 [00:02<00:00, 799.84it/s]
100%|██████████| 1200/1200 [00:01<00:00, 798.22it/s]
100%|██████████| 12382/12382 [00:17<00:00, 709.14it/s]
100%|██████████| 1648/1648 [00:02<00:00, 799.28it/s]


df_train_f1_squad_3 describe: 


Unnamed: 0,id,prompt,answer
count,12388,12388,12388
unique,12388,12312,7976
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\nWhat are...,unanswerable
freq,1,20,4401


df_dev_f1_squad_3 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,1,2,648


df_dev_f1_squad_3 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,1,2,648


df_train_f2_squad_3 describe: 


Unnamed: 0,id,prompt,answer
count,12382,12382,12382
unique,12382,12311,7988
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\nWhat are...,unanswerable
freq,1,20,4383


df_zeroshot_f1_squad_3 describe: 


Unnamed: 0,id,prompt,answer
count,1200,1200,1200
unique,1200,1197,653
top,2209.03182v1.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\nWhat are...,unanswerable
freq,1,3,548


Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-6fd7f46d34d8722d/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-6fd7f46d34d8722d/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-c13023d763a1ec41/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-c13023d763a1ec41/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-82f956c64eeb4382/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-82f956c64eeb4382/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-0c8923c4953649c3/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-0c8923c4953649c3/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-07cf05dea504a784/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-07cf05dea504a784/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12388
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1642
        })
        zeroshot: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1200
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12382
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1648
        })
    })

Saving the dataset (0/1 shards):   0%|          | 0/12388 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1642 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1648 [00:00<?, ? examples/s]

In [91]:
df_train_f1_squad_4 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_4 = True,
                                                        ) 
df_dev_f1_squad_4 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_4 = True,
                                                        ) 
df_zeroshot_f1_squad_4 = create_pandas_dataset_from_pandas(df_zeroshot_f1,
                                                        squad_4 = True,
                                                        ) 

df_train_f2_squad_4 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_4 = True,
                                                        ) 
df_dev_f2_squad_4 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_4 = True,
                                                        ) 

print("df_train_f1_squad_4 describe: ")
display(df_train_f1_squad_4.describe())
print("df_dev_f1_squad_4 describe: ")
display(df_dev_f1_squad_4.describe())
print("df_dev_f1_squad_4 describe: ")
display(df_dev_f1_squad_4.describe())

print("df_train_f2_squad_4 describe: ")
display(df_train_f2_squad_4.describe())
print("df_zeroshot_f1_squad_4 describe: ")
display(df_zeroshot_f1_squad_4.describe())

df_train_f1_squad_4.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_squad_4.parquet')
df_dev_f1_squad_4.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_squad_4.parquet')
df_zeroshot_f1_squad_4.to_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_squad_4.parquet')

df_train_f2_squad_4.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_squad_4.parquet')
df_dev_f2_squad_4.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_squad_4.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_squad_4.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_squad_4.parquet'),
        "zeroshot": Dataset.from_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_squad_4.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_squad_4.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_squad_4.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_SQUAD_4")

  3%|▎         | 411/12388 [00:00<00:15, 787.53it/s]

100%|██████████| 12388/12388 [00:17<00:00, 697.43it/s]
100%|██████████| 1642/1642 [00:02<00:00, 796.03it/s]
100%|██████████| 1200/1200 [00:01<00:00, 796.83it/s]
100%|██████████| 12382/12382 [00:17<00:00, 702.97it/s]
100%|██████████| 1648/1648 [00:02<00:00, 797.14it/s]


df_train_f1_squad_4 describe: 


Unnamed: 0,id,prompt,answer
count,12388,12388,12388
unique,12388,12312,7976
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\nTry to a...,unanswerable
freq,1,20,4401


df_dev_f1_squad_4 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,1,2,648


df_dev_f1_squad_4 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,1,2,648


df_train_f2_squad_4 describe: 


Unnamed: 0,id,prompt,answer
count,12382,12382,12382
unique,12382,12311,7988
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\nTry to a...,unanswerable
freq,1,20,4383


df_zeroshot_f1_squad_4 describe: 


Unnamed: 0,id,prompt,answer
count,1200,1200,1200
unique,1200,1197,653
top,2209.03182v1.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\nTry to a...,unanswerable
freq,1,3,548


Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-8bfbc344f5e131e2/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-8bfbc344f5e131e2/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-ef529d02d87011bc/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-ef529d02d87011bc/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-4400f2b025cfa661/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-4400f2b025cfa661/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-046dbfecec008957/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-046dbfecec008957/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-4fc7288b975450a3/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-4fc7288b975450a3/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12388
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1642
        })
        zeroshot: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1200
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12382
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1648
        })
    })

Saving the dataset (0/1 shards):   0%|          | 0/12388 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1642 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1648 [00:00<?, ? examples/s]

In [92]:
df_train_f1_squad_5 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_5 = True,
                                                        ) 
df_dev_f1_squad_5 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_5 = True,
                                                        ) 
df_zeroshot_f1_squad_5 = create_pandas_dataset_from_pandas(df_zeroshot_f1,
                                                        squad_5 = True,
                                                        ) 

df_train_f2_squad_5 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_5 = True,
                                                        ) 
df_dev_f2_squad_5 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_5 = True,
                                                        ) 

print("df_train_f1_squad_5 describe: ")
display(df_train_f1_squad_5.describe())
print("df_dev_f1_squad_5 describe: ")
display(df_dev_f1_squad_5.describe())
print("df_dev_f1_squad_5 describe: ")
display(df_dev_f1_squad_5.describe())

print("df_train_f2_squad_5 describe: ")
display(df_train_f2_squad_5.describe())
print("df_zeroshot_f1_squad_5 describe: ")
display(df_zeroshot_f1_squad_5.describe())

df_train_f1_squad_5.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_squad_5.parquet')
df_dev_f1_squad_5.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_squad_5.parquet')
df_zeroshot_f1_squad_5.to_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_squad_5.parquet')

df_train_f2_squad_5.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_squad_5.parquet')
df_dev_f2_squad_5.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_squad_5.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_squad_5.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_squad_5.parquet'),
        "zeroshot": Dataset.from_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_squad_5.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_squad_5.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_squad_5.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_SQUAD_5")

  1%|          | 80/12388 [00:00<00:15, 799.50it/s]

100%|██████████| 12388/12388 [00:17<00:00, 691.62it/s]
100%|██████████| 1642/1642 [00:02<00:00, 784.77it/s]
100%|██████████| 1200/1200 [00:01<00:00, 784.41it/s]
100%|██████████| 12382/12382 [00:17<00:00, 691.10it/s]
100%|██████████| 1648/1648 [00:02<00:00, 778.71it/s]


df_train_f1_squad_5 describe: 


Unnamed: 0,id,prompt,answer
count,12388,12388,12388
unique,12388,12312,7976
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\nIf it is...,unanswerable
freq,1,20,4401


df_dev_f1_squad_5 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,1,2,648


df_dev_f1_squad_5 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,1,2,648


df_train_f2_squad_5 describe: 


Unnamed: 0,id,prompt,answer
count,12382,12382,12382
unique,12382,12311,7988
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\nIf it is...,unanswerable
freq,1,20,4383


df_zeroshot_f1_squad_5 describe: 


Unnamed: 0,id,prompt,answer
count,1200,1200,1200
unique,1200,1197,653
top,2209.03182v1.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\nIf it is...,unanswerable
freq,1,3,548


Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-b857ef88461ed576/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-b857ef88461ed576/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-318231ee7e7f0285/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-318231ee7e7f0285/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-197347fb5e0e23fc/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-197347fb5e0e23fc/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-23cff2a2f7803abc/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-23cff2a2f7803abc/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-bcc178c719799679/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-bcc178c719799679/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12388
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1642
        })
        zeroshot: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1200
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12382
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1648
        })
    })

Saving the dataset (0/1 shards):   0%|          | 0/12388 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1642 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1648 [00:00<?, ? examples/s]

In [93]:
df_train_f1_squad_6 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_6 = True,
                                                        ) 
df_dev_f1_squad_6 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_6 = True,
                                                        ) 
df_zeroshot_f1_squad_6 = create_pandas_dataset_from_pandas(df_zeroshot_f1,
                                                        squad_6 = True,
                                                        ) 

df_train_f2_squad_6 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_6 = True,
                                                        ) 
df_dev_f2_squad_6 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_6 = True,
                                                        ) 

print("df_train_f1_squad_6 describe: ")
display(df_train_f1_squad_6.describe())
print("df_dev_f1_squad_6 describe: ")
display(df_dev_f1_squad_6.describe())
print("df_dev_f1_squad_6 describe: ")
display(df_dev_f1_squad_6.describe())

print("df_train_f2_squad_6 describe: ")
display(df_train_f2_squad_6.describe())
print("df_zeroshot_f1_squad_6 describe: ")
display(df_zeroshot_f1_squad_6.describe())

df_train_f1_squad_6.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_squad_6.parquet')
df_dev_f1_squad_6.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_squad_6.parquet')
df_zeroshot_f1_squad_6.to_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_squad_6.parquet')

df_train_f2_squad_6.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_squad_6.parquet')
df_dev_f2_squad_6.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_squad_6.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_squad_6.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_squad_6.parquet'),
        "zeroshot": Dataset.from_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_squad_6.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_squad_6.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_squad_6.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_SQUAD_6")

100%|██████████| 12388/12388 [00:17<00:00, 702.69it/s]
100%|██████████| 1642/1642 [00:02<00:00, 792.03it/s]
100%|██████████| 1200/1200 [00:01<00:00, 796.28it/s]
100%|██████████| 12382/12382 [00:17<00:00, 701.40it/s]
100%|██████████| 1648/1648 [00:02<00:00, 793.09it/s]


df_train_f1_squad_6 describe: 


Unnamed: 0,id,prompt,answer
count,12388,12388,12388
unique,12388,12312,7976
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\n\nAnswer...,unanswerable
freq,1,20,4401


df_dev_f1_squad_6 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,1,2,648


df_dev_f1_squad_6 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,1,2,648


df_train_f2_squad_6 describe: 


Unnamed: 0,id,prompt,answer
count,12382,12382,12382
unique,12382,12311,7988
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\n\nAnswer...,unanswerable
freq,1,20,4383


df_zeroshot_f1_squad_6 describe: 


Unnamed: 0,id,prompt,answer
count,1200,1200,1200
unique,1200,1197,653
top,2209.03182v1.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\n\nAnswer...,unanswerable
freq,1,3,548


Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-4983c89c5e4db66c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-4983c89c5e4db66c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-019445dea7892eed/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-019445dea7892eed/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-6c95b9c8f02b63aa/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-6c95b9c8f02b63aa/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-3d5d10d615a4d93b/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-3d5d10d615a4d93b/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-014048db185496c6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-014048db185496c6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12388
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1642
        })
        zeroshot: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1200
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12382
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1648
        })
    })

Saving the dataset (0/1 shards):   0%|          | 0/12388 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1642 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1648 [00:00<?, ? examples/s]

In [94]:
df_train_f1_squad_7 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_7 = True,
                                                        ) 
df_dev_f1_squad_7 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_7 = True,
                                                        ) 
df_zeroshot_f1_squad_7 = create_pandas_dataset_from_pandas(df_zeroshot_f1,
                                                        squad_7 = True,
                                                        ) 

df_train_f2_squad_7 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_7 = True,
                                                        ) 
df_dev_f2_squad_7 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_7 = True,
                                                        ) 

print("df_train_f1_squad_7 describe: ")
display(df_train_f1_squad_7.describe())
print("df_dev_f1_squad_7 describe: ")
display(df_dev_f1_squad_7.describe())
print("df_dev_f1_squad_7 describe: ")
display(df_dev_f1_squad_7.describe())

print("df_train_f2_squad_7 describe: ")
display(df_train_f2_squad_7.describe())
print("df_zeroshot_f1_squad_7 describe: ")
display(df_zeroshot_f1_squad_7.describe())

df_train_f1_squad_7.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_squad_7.parquet')
df_dev_f1_squad_7.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_squad_7.parquet')
df_zeroshot_f1_squad_7.to_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_squad_7.parquet')

df_train_f2_squad_7.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_squad_7.parquet')
df_dev_f2_squad_7.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_squad_7.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_squad_7.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_squad_7.parquet'),
        "zeroshot": Dataset.from_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_squad_7.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_squad_7.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_squad_7.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_SQUAD_7")

  8%|▊         | 1050/12388 [00:01<00:14, 786.27it/s]

100%|██████████| 12388/12388 [00:17<00:00, 702.76it/s]
100%|██████████| 1642/1642 [00:02<00:00, 792.12it/s]
100%|██████████| 1200/1200 [00:01<00:00, 794.69it/s]
100%|██████████| 12382/12382 [00:17<00:00, 701.58it/s]
100%|██████████| 1648/1648 [00:02<00:00, 793.40it/s]


df_train_f1_squad_7 describe: 


Unnamed: 0,id,prompt,answer
count,12388,12388,12388
unique,12388,12312,7976
top,1707.03497v2.pdf,Read this: Title:\t\n\nAbstract:\t\n\n[image]\...,unanswerable
freq,1,20,4401


df_dev_f1_squad_7 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Read this: Title:\tA Sample ACM SIG Proceeding...,unanswerable
freq,1,2,648


df_dev_f1_squad_7 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Read this: Title:\tA Sample ACM SIG Proceeding...,unanswerable
freq,1,2,648


df_train_f2_squad_7 describe: 


Unnamed: 0,id,prompt,answer
count,12382,12382,12382
unique,12382,12311,7988
top,1707.03497v2.pdf,Read this: Title:\t\n\nAbstract:\t\n\n[image]\...,unanswerable
freq,1,20,4383


df_zeroshot_f1_squad_7 describe: 


Unnamed: 0,id,prompt,answer
count,1200,1200,1200
unique,1200,1197,653
top,2209.03182v1.pdf,Read this: Title:\t\n\nAbstract:\t\n\n[image]\...,unanswerable
freq,1,3,548


Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-dec483056c3f9249/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-dec483056c3f9249/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-44aaf699b5d106a5/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-44aaf699b5d106a5/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-0fe75c188a3f72f1/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-0fe75c188a3f72f1/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-9766f0dc85ec529d/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-9766f0dc85ec529d/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-6450d36e22fda91b/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-6450d36e22fda91b/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12388
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1642
        })
        zeroshot: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1200
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12382
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1648
        })
    })

Saving the dataset (0/1 shards):   0%|          | 0/12388 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1642 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1648 [00:00<?, ? examples/s]

In [95]:
df_train_f1_squad_8 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        squad_8 = True,
                                                        ) 
df_dev_f1_squad_8 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        squad_8 = True,
                                                        ) 
df_zeroshot_f1_squad_8 = create_pandas_dataset_from_pandas(df_zeroshot_f1,
                                                        squad_8 = True,
                                                        ) 

df_train_f2_squad_8 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        squad_8 = True,
                                                        ) 
df_dev_f2_squad_8 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        squad_8 = True,
                                                        ) 

print("df_train_f1_squad_8 describe: ")
display(df_train_f1_squad_8.describe())
print("df_dev_f1_squad_8 describe: ")
display(df_dev_f1_squad_8.describe())
print("df_dev_f1_squad_8 describe: ")
display(df_dev_f1_squad_8.describe())

print("df_train_f2_squad_8 describe: ")
display(df_train_f2_squad_8.describe())
print("df_zeroshot_f1_squad_8 describe: ")
display(df_zeroshot_f1_squad_8.describe())

df_train_f1_squad_8.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_squad_8.parquet')
df_dev_f1_squad_8.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_squad_8.parquet')
df_zeroshot_f1_squad_8.to_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_squad_8.parquet')

df_train_f2_squad_8.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_squad_8.parquet')
df_dev_f2_squad_8.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_squad_8.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_squad_8.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_squad_8.parquet'),
        "zeroshot": Dataset.from_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_squad_8.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_squad_8.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_squad_8.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_SQUAD_8")

  0%|          | 0/12388 [00:00<?, ?it/s]

100%|██████████| 12388/12388 [00:17<00:00, 691.62it/s]
100%|██████████| 1642/1642 [00:02<00:00, 782.75it/s]
100%|██████████| 1200/1200 [00:01<00:00, 788.04it/s]
100%|██████████| 12382/12382 [00:17<00:00, 695.02it/s]
100%|██████████| 1648/1648 [00:02<00:00, 783.42it/s]


df_train_f1_squad_8 describe: 


Unnamed: 0,id,prompt,answer
count,12388,12388,12388
unique,12388,12312,7976
top,1707.03497v2.pdf,Read this: Title:\t\n\nAbstract:\t\n\n[image]\...,unanswerable
freq,1,20,4401


df_dev_f1_squad_8 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Read this: Title:\tA Sample ACM SIG Proceeding...,unanswerable
freq,1,2,648


df_dev_f1_squad_8 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Read this: Title:\tA Sample ACM SIG Proceeding...,unanswerable
freq,1,2,648


df_train_f2_squad_8 describe: 


Unnamed: 0,id,prompt,answer
count,12382,12382,12382
unique,12382,12311,7988
top,1707.03497v2.pdf,Read this: Title:\t\n\nAbstract:\t\n\n[image]\...,unanswerable
freq,1,20,4383


df_zeroshot_f1_squad_8 describe: 


Unnamed: 0,id,prompt,answer
count,1200,1200,1200
unique,1200,1197,653
top,2209.03182v1.pdf,Read this: Title:\t\n\nAbstract:\t\n\n[image]\...,unanswerable
freq,1,3,548


Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-70482aebda0b8678/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-70482aebda0b8678/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-b617964440d86e1a/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-b617964440d86e1a/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-8c8fd836908d40a5/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-8c8fd836908d40a5/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-c9ddd73080f7c4d3/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-c9ddd73080f7c4d3/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-83b25a376de6a8ee/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-83b25a376de6a8ee/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12388
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1642
        })
        zeroshot: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1200
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12382
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1648
        })
    })

Saving the dataset (0/1 shards):   0%|          | 0/12388 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1642 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1648 [00:00<?, ? examples/s]

## DROP

In [96]:
df_train_f1_drop_1 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_1 = True,
                                                        ) 
df_dev_f1_drop_1 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_1 = True,
                                                        ) 
df_zeroshot_f1_drop_1 = create_pandas_dataset_from_pandas(df_zeroshot_f1,
                                                        drop_1 = True,
                                                        ) 

df_train_f2_drop_1 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_1 = True,
                                                        ) 
df_dev_f2_drop_1 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_1 = True,
                                                        ) 

print("df_train_f1_drop_1 describe: ")
display(df_train_f1_drop_1.describe())
print("df_dev_f1_drop_1 describe: ")
display(df_dev_f1_drop_1.describe())
print("df_dev_f1_drop_1 describe: ")
display(df_dev_f1_drop_1.describe())

print("df_train_f2_drop_1 describe: ")
display(df_train_f2_drop_1.describe())
print("df_zeroshot_f1_drop_1 describe: ")
display(df_zeroshot_f1_drop_1.describe())

df_train_f1_drop_1.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_drop_1.parquet')
df_dev_f1_drop_1.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_drop_1.parquet')
df_zeroshot_f1_drop_1.to_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_drop_1.parquet')

df_train_f2_drop_1.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_drop_1.parquet')
df_dev_f2_drop_1.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_drop_1.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_drop_1.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_drop_1.parquet'),
        "zeroshot": Dataset.from_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_drop_1.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_drop_1.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_drop_1.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_DROP_1")

  1%|          | 81/12388 [00:00<00:15, 809.43it/s]

100%|██████████| 12388/12388 [00:17<00:00, 704.55it/s]
100%|██████████| 1642/1642 [00:02<00:00, 790.66it/s]
100%|██████████| 1200/1200 [00:01<00:00, 798.41it/s]
100%|██████████| 12382/12382 [00:17<00:00, 703.88it/s]
100%|██████████| 1648/1648 [00:02<00:00, 794.40it/s]


df_train_f1_drop_1 describe: 


Unnamed: 0,id,prompt,answer
count,12388,12388,12388
unique,12388,12312,7976
top,1707.03497v2.pdf,Answer based on context:\n\nTitle:\t\n\nAbstra...,unanswerable
freq,1,20,4401


df_dev_f1_drop_1 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Answer based on context:\n\nTitle:\tA Sample A...,unanswerable
freq,1,2,648


df_dev_f1_drop_1 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Answer based on context:\n\nTitle:\tA Sample A...,unanswerable
freq,1,2,648


df_train_f2_drop_1 describe: 


Unnamed: 0,id,prompt,answer
count,12382,12382,12382
unique,12382,12311,7988
top,1707.03497v2.pdf,Answer based on context:\n\nTitle:\t\n\nAbstra...,unanswerable
freq,1,20,4383


df_zeroshot_f1_drop_1 describe: 


Unnamed: 0,id,prompt,answer
count,1200,1200,1200
unique,1200,1197,653
top,2209.03182v1.pdf,Answer based on context:\n\nTitle:\t\n\nAbstra...,unanswerable
freq,1,3,548


Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-de2249b9a918c1ef/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-de2249b9a918c1ef/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-1bc2e87e72cd61e9/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-1bc2e87e72cd61e9/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-84edeb3e5191a9b1/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-84edeb3e5191a9b1/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-968b8842df0d2b8d/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-968b8842df0d2b8d/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-83feaf1f8f2a313c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-83feaf1f8f2a313c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12388
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1642
        })
        zeroshot: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1200
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12382
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1648
        })
    })

Saving the dataset (0/1 shards):   0%|          | 0/12388 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1642 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1648 [00:00<?, ? examples/s]

In [97]:
df_train_f1_drop_2 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_2 = True,
                                                        ) 
df_dev_f1_drop_2 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_2 = True,
                                                        ) 
df_zeroshot_f1_drop_2 = create_pandas_dataset_from_pandas(df_zeroshot_f1,
                                                        drop_2 = True,
                                                        ) 

df_train_f2_drop_2 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_2 = True,
                                                        ) 
df_dev_f2_drop_2 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_2 = True,
                                                        ) 

print("df_train_f1_drop_2 describe: ")
display(df_train_f1_drop_2.describe())
print("df_dev_f1_drop_2 describe: ")
display(df_dev_f1_drop_2.describe())
print("df_dev_f1_drop_2 describe: ")
display(df_dev_f1_drop_2.describe())

print("df_train_f2_drop_2 describe: ")
display(df_train_f2_drop_2.describe())
print("df_zeroshot_f1_drop_2 describe: ")
display(df_zeroshot_f1_drop_2.describe())

df_train_f1_drop_2.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_drop_2.parquet')
df_dev_f1_drop_2.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_drop_2.parquet')
df_zeroshot_f1_drop_2.to_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_drop_2.parquet')

df_train_f2_drop_2.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_drop_2.parquet')
df_dev_f2_drop_2.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_drop_2.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_drop_2.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_drop_2.parquet'),
        "zeroshot": Dataset.from_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_drop_2.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_drop_2.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_drop_2.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_DROP_2")

  0%|          | 0/12388 [00:00<?, ?it/s]

100%|██████████| 12388/12388 [00:17<00:00, 696.14it/s]
100%|██████████| 1642/1642 [00:02<00:00, 787.45it/s]
100%|██████████| 1200/1200 [00:01<00:00, 792.75it/s]
100%|██████████| 12382/12382 [00:17<00:00, 697.07it/s]
100%|██████████| 1648/1648 [00:02<00:00, 790.01it/s]


df_train_f1_drop_2 describe: 


Unnamed: 0,id,prompt,answer
count,12388,12388,12388
unique,12388,12312,7976
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\n\nAnswer...,unanswerable
freq,1,20,4401


df_dev_f1_drop_2 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,1,2,648


df_dev_f1_drop_2 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,1,2,648


df_train_f2_drop_2 describe: 


Unnamed: 0,id,prompt,answer
count,12382,12382,12382
unique,12382,12311,7988
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\n\nAnswer...,unanswerable
freq,1,20,4383


df_zeroshot_f1_drop_2 describe: 


Unnamed: 0,id,prompt,answer
count,1200,1200,1200
unique,1200,1197,653
top,2209.03182v1.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\n\nAnswer...,unanswerable
freq,1,3,548


Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-9dd1839fd6a8f8bf/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-9dd1839fd6a8f8bf/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-4a273ce0cedbd2c0/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-4a273ce0cedbd2c0/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-1cfd3d6fff1f7485/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-1cfd3d6fff1f7485/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-6d5818fa064e7543/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-6d5818fa064e7543/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-d0120036ee804298/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-d0120036ee804298/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12388
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1642
        })
        zeroshot: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1200
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12382
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1648
        })
    })

Saving the dataset (0/1 shards):   0%|          | 0/12388 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1642 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1648 [00:00<?, ? examples/s]

In [98]:
df_train_f1_drop_3 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_3 = True,
                                                        ) 
df_dev_f1_drop_3 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_3 = True,
                                                        ) 
df_zeroshot_f1_drop_3 = create_pandas_dataset_from_pandas(df_zeroshot_f1,
                                                        drop_3 = True,
                                                        ) 

df_train_f2_drop_3 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_3 = True,
                                                        ) 
df_dev_f2_drop_3 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_3 = True,
                                                        ) 

print("df_train_f1_drop_3 describe: ")
display(df_train_f1_drop_3.describe())
print("df_dev_f1_drop_3 describe: ")
display(df_dev_f1_drop_3.describe())
print("df_dev_f1_drop_3 describe: ")
display(df_dev_f1_drop_3.describe())

print("df_train_f2_drop_3 describe: ")
display(df_train_f2_drop_3.describe())
print("df_zeroshot_f1_drop_3 describe: ")
display(df_zeroshot_f1_drop_3.describe())

df_train_f1_drop_3.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_drop_3.parquet')
df_dev_f1_drop_3.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_drop_3.parquet')
df_zeroshot_f1_drop_3.to_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_drop_3.parquet')

df_train_f2_drop_3.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_drop_3.parquet')
df_dev_f2_drop_3.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_drop_3.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_drop_3.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_drop_3.parquet'),
        "zeroshot": Dataset.from_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_drop_3.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_drop_3.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_drop_3.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_DROP_3")

  1%|          | 82/12388 [00:00<00:15, 818.12it/s]

100%|██████████| 12388/12388 [00:17<00:00, 699.95it/s]
100%|██████████| 1642/1642 [00:02<00:00, 789.87it/s]
100%|██████████| 1200/1200 [00:01<00:00, 794.74it/s]
100%|██████████| 12382/12382 [00:17<00:00, 702.96it/s]
100%|██████████| 1648/1648 [00:02<00:00, 794.83it/s]


df_train_f1_drop_3 describe: 


Unnamed: 0,id,prompt,answer
count,12388,12388,12388
unique,12388,12312,7976
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\n\nWhat a...,unanswerable
freq,1,20,4401


df_dev_f1_drop_3 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,1,2,648


df_dev_f1_drop_3 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,1,2,648


df_train_f2_drop_3 describe: 


Unnamed: 0,id,prompt,answer
count,12382,12382,12382
unique,12382,12311,7988
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\n\nWhat a...,unanswerable
freq,1,20,4383


df_zeroshot_f1_drop_3 describe: 


Unnamed: 0,id,prompt,answer
count,1200,1200,1200
unique,1200,1197,653
top,2209.03182v1.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\n\nWhat a...,unanswerable
freq,1,3,548


Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-1e887097126e2fc6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-1e887097126e2fc6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-345e64dba63cf209/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-345e64dba63cf209/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-3d57c4fc9092f3f6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-3d57c4fc9092f3f6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-27d6f9748ccf79ca/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-27d6f9748ccf79ca/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-2b80392ceb7041d9/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-2b80392ceb7041d9/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12388
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1642
        })
        zeroshot: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1200
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12382
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1648
        })
    })

Saving the dataset (0/1 shards):   0%|          | 0/12388 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1642 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1648 [00:00<?, ? examples/s]

In [99]:
df_train_f1_drop_4 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_4 = True,
                                                        ) 
df_dev_f1_drop_4 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_4 = True,
                                                        ) 
df_zeroshot_f1_drop_4 = create_pandas_dataset_from_pandas(df_zeroshot_f1,
                                                        drop_4 = True,
                                                        ) 

df_train_f2_drop_4 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_4 = True,
                                                        ) 
df_dev_f2_drop_4 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_4 = True,
                                                        ) 

print("df_train_f1_drop_4 describe: ")
display(df_train_f1_drop_4.describe())
print("df_dev_f1_drop_4 describe: ")
display(df_dev_f1_drop_4.describe())
print("df_dev_f1_drop_4 describe: ")
display(df_dev_f1_drop_4.describe())

print("df_train_f2_drop_4 describe: ")
display(df_train_f2_drop_4.describe())
print("df_zeroshot_f1_drop_4 describe: ")
display(df_zeroshot_f1_drop_4.describe())

df_train_f1_drop_4.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_drop_4.parquet')
df_dev_f1_drop_4.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_drop_4.parquet')
df_zeroshot_f1_drop_4.to_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_drop_4.parquet')

df_train_f2_drop_4.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_drop_4.parquet')
df_dev_f2_drop_4.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_drop_4.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_drop_4.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_drop_4.parquet'),
        "zeroshot": Dataset.from_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_drop_4.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_drop_4.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_drop_4.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_DROP_4")

  6%|▋         | 802/12388 [00:01<00:14, 779.80it/s]

100%|██████████| 12388/12388 [00:17<00:00, 694.01it/s]
100%|██████████| 1642/1642 [00:02<00:00, 783.72it/s]
100%|██████████| 1200/1200 [00:01<00:00, 789.15it/s]
100%|██████████| 12382/12382 [00:17<00:00, 693.93it/s]
100%|██████████| 1648/1648 [00:02<00:00, 785.91it/s]


df_train_f1_drop_4 describe: 


Unnamed: 0,id,prompt,answer
count,12388,12388,12388
unique,12388,12312,7976
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\nAnswer t...,unanswerable
freq,1,20,4401


df_dev_f1_drop_4 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,1,2,648


df_dev_f1_drop_4 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,1,2,648


df_train_f2_drop_4 describe: 


Unnamed: 0,id,prompt,answer
count,12382,12382,12382
unique,12382,12311,7988
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\nAnswer t...,unanswerable
freq,1,20,4383


df_zeroshot_f1_drop_4 describe: 


Unnamed: 0,id,prompt,answer
count,1200,1200,1200
unique,1200,1197,653
top,2209.03182v1.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\nAnswer t...,unanswerable
freq,1,3,548


Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-55886ba5612e421c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-55886ba5612e421c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-f1622ff6a942563b/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-f1622ff6a942563b/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-a5334dc444c47e35/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-a5334dc444c47e35/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-be77d99bb0cf5de7/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-be77d99bb0cf5de7/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-f41f023753d850e7/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-f41f023753d850e7/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12388
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1642
        })
        zeroshot: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1200
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12382
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1648
        })
    })

Saving the dataset (0/1 shards):   0%|          | 0/12388 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1642 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1648 [00:00<?, ? examples/s]

In [100]:
df_train_f1_drop_5 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_5 = True,
                                                        ) 
df_dev_f1_drop_5 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_5 = True,
                                                        ) 
df_zeroshot_f1_drop_5 = create_pandas_dataset_from_pandas(df_zeroshot_f1,
                                                        drop_5 = True,
                                                        ) 

df_train_f2_drop_5 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_5 = True,
                                                        ) 
df_dev_f2_drop_5 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_5 = True,
                                                        ) 

print("df_train_f1_drop_5 describe: ")
display(df_train_f1_drop_5.describe())
print("df_dev_f1_drop_5 describe: ")
display(df_dev_f1_drop_5.describe())
print("df_dev_f1_drop_5 describe: ")
display(df_dev_f1_drop_5.describe())

print("df_train_f2_drop_5 describe: ")
display(df_train_f2_drop_5.describe())
print("df_zeroshot_f1_drop_5 describe: ")
display(df_zeroshot_f1_drop_5.describe())

df_train_f1_drop_5.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_drop_5.parquet')
df_dev_f1_drop_5.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_drop_5.parquet')
df_zeroshot_f1_drop_5.to_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_drop_5.parquet')

df_train_f2_drop_5.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_drop_5.parquet')
df_dev_f2_drop_5.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_drop_5.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_drop_5.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_drop_5.parquet'),
        "zeroshot": Dataset.from_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_drop_5.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_drop_5.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_drop_5.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_DROP_5")

  0%|          | 0/12388 [00:00<?, ?it/s]

100%|██████████| 12388/12388 [00:17<00:00, 701.26it/s]
100%|██████████| 1642/1642 [00:02<00:00, 788.09it/s]
100%|██████████| 1200/1200 [00:01<00:00, 796.01it/s]
100%|██████████| 12382/12382 [00:17<00:00, 698.52it/s]
100%|██████████| 1648/1648 [00:02<00:00, 791.49it/s]


df_train_f1_drop_5 describe: 


Unnamed: 0,id,prompt,answer
count,12388,12388,12388
unique,12388,12312,7976
top,1707.03497v2.pdf,Read this article and answer this question Tit...,unanswerable
freq,1,20,4401


df_dev_f1_drop_5 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Read this article and answer this question Tit...,unanswerable
freq,1,2,648


df_dev_f1_drop_5 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Read this article and answer this question Tit...,unanswerable
freq,1,2,648


df_train_f2_drop_5 describe: 


Unnamed: 0,id,prompt,answer
count,12382,12382,12382
unique,12382,12311,7988
top,1707.03497v2.pdf,Read this article and answer this question Tit...,unanswerable
freq,1,20,4383


df_zeroshot_f1_drop_5 describe: 


Unnamed: 0,id,prompt,answer
count,1200,1200,1200
unique,1200,1197,653
top,2209.03182v1.pdf,Read this article and answer this question Tit...,unanswerable
freq,1,3,548


Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-27cdda53e44f08b2/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-27cdda53e44f08b2/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-8f8b06cfcb4f4402/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-8f8b06cfcb4f4402/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-ffa7a47cdf65bf57/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-ffa7a47cdf65bf57/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-f98edd716ca7181f/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-f98edd716ca7181f/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-024de107f1aeda8b/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-024de107f1aeda8b/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12388
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1642
        })
        zeroshot: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1200
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12382
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1648
        })
    })

Saving the dataset (0/1 shards):   0%|          | 0/12388 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1642 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1648 [00:00<?, ? examples/s]

In [101]:
df_train_f1_drop_6 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_6 = True,
                                                        ) 
df_dev_f1_drop_6 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_6 = True,
                                                        ) 
df_zeroshot_f1_drop_6 = create_pandas_dataset_from_pandas(df_zeroshot_f1,
                                                        drop_6 = True,
                                                        ) 

df_train_f2_drop_6 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_6 = True,
                                                        ) 
df_dev_f2_drop_6 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_6 = True,
                                                        ) 

print("df_train_f1_drop_6 describe: ")
display(df_train_f1_drop_6.describe())
print("df_dev_f1_drop_6 describe: ")
display(df_dev_f1_drop_6.describe())
print("df_dev_f1_drop_6 describe: ")
display(df_dev_f1_drop_6.describe())

print("df_train_f2_drop_6 describe: ")
display(df_train_f2_drop_6.describe())
print("df_zeroshot_f1_drop_6 describe: ")
display(df_zeroshot_f1_drop_6.describe())

df_train_f1_drop_6.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_drop_6.parquet')
df_dev_f1_drop_6.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_drop_6.parquet')
df_zeroshot_f1_drop_6.to_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_drop_6.parquet')

df_train_f2_drop_6.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_drop_6.parquet')
df_dev_f2_drop_6.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_drop_6.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_drop_6.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_drop_6.parquet'),
        "zeroshot": Dataset.from_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_drop_6.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_drop_6.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_drop_6.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_DROP_6")

  1%|          | 82/12388 [00:00<00:15, 812.74it/s]

100%|██████████| 12388/12388 [00:17<00:00, 700.32it/s]
100%|██████████| 1642/1642 [00:02<00:00, 789.42it/s]
100%|██████████| 1200/1200 [00:01<00:00, 794.17it/s]
100%|██████████| 12382/12382 [00:17<00:00, 696.02it/s]
100%|██████████| 1648/1648 [00:02<00:00, 785.20it/s]


df_train_f1_drop_6 describe: 


Unnamed: 0,id,prompt,answer
count,12388,12388,12388
unique,12388,12312,7976
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\n\nBased ...,unanswerable
freq,1,20,4401


df_dev_f1_drop_6 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,1,2,648


df_dev_f1_drop_6 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Title:\tA Sample ACM SIG Proceedings Paper in ...,unanswerable
freq,1,2,648


df_train_f2_drop_6 describe: 


Unnamed: 0,id,prompt,answer
count,12382,12382,12382
unique,12382,12311,7988
top,1707.03497v2.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\n\nBased ...,unanswerable
freq,1,20,4383


df_zeroshot_f1_drop_6 describe: 


Unnamed: 0,id,prompt,answer
count,1200,1200,1200
unique,1200,1197,653
top,2209.03182v1.pdf,Title:\t\n\nAbstract:\t\n\n[image]\n\n\nBased ...,unanswerable
freq,1,3,548


Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-1ea35f01197685fd/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-1ea35f01197685fd/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-8fc7b286c48b950b/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-8fc7b286c48b950b/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-edfaee265ce90e2d/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-edfaee265ce90e2d/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-6c2baffec2ae5889/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-6c2baffec2ae5889/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-cb26de7e610d3091/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-cb26de7e610d3091/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12388
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1642
        })
        zeroshot: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1200
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12382
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1648
        })
    })

Saving the dataset (0/1 shards):   0%|          | 0/12388 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1642 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1648 [00:00<?, ? examples/s]

In [102]:
df_train_f1_drop_7 = create_pandas_dataset_from_pandas(df_train_f1,
                                                        drop_7 = True,
                                                        ) 
df_dev_f1_drop_7 = create_pandas_dataset_from_pandas(df_dev_f1,
                                                        drop_7 = True,
                                                        ) 
df_zeroshot_f1_drop_7 = create_pandas_dataset_from_pandas(df_zeroshot_f1,
                                                        drop_7 = True,
                                                        ) 

df_train_f2_drop_7 = create_pandas_dataset_from_pandas(df_train_f2,
                                                        drop_7 = True,
                                                        ) 
df_dev_f2_drop_7 = create_pandas_dataset_from_pandas(df_dev_f2,
                                                        drop_7 = True,
                                                        ) 

print("df_train_f1_drop_7 describe: ")
display(df_train_f1_drop_7.describe())
print("df_dev_f1_drop_7 describe: ")
display(df_dev_f1_drop_7.describe())
print("df_dev_f1_drop_7 describe: ")
display(df_dev_f1_drop_7.describe())

print("df_train_f2_drop_7 describe: ")
display(df_train_f2_drop_7.describe())
print("df_zeroshot_f1_drop_7 describe: ")
display(df_zeroshot_f1_drop_7.describe())

df_train_f1_drop_7.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_drop_7.parquet')
df_dev_f1_drop_7.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_drop_7.parquet')
df_zeroshot_f1_drop_7.to_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_drop_7.parquet')

df_train_f2_drop_7.to_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_drop_7.parquet')
df_dev_f2_drop_7.to_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_drop_7.parquet')

dataset = DatasetDict({
    'fold1': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f1_drop_7.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f1_drop_7.parquet'),
        "zeroshot": Dataset.from_parquet('../data/df_zeroshot_tdms_augmented_summarized_with_id_f1_drop_7.parquet')
    }),
    'fold2': DatasetDict({
        "train": Dataset.from_parquet('../data/df_train_tdms_augmented_summarized_with_id_f2_drop_7.parquet'),
        "validation": Dataset.from_parquet('../data/df_dev_tdms_augmented_summarized_with_id_f2_drop_7.parquet')
    })
})

print(dataset)

dataset.save_to_disk("../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_DROP_7")

  1%|          | 82/12388 [00:00<00:15, 819.56it/s]

100%|██████████| 12388/12388 [00:17<00:00, 702.80it/s]
100%|██████████| 1642/1642 [00:02<00:00, 791.31it/s]
100%|██████████| 1200/1200 [00:01<00:00, 797.40it/s]
100%|██████████| 12382/12382 [00:17<00:00, 699.10it/s]
100%|██████████| 1648/1648 [00:02<00:00, 791.02it/s]


df_train_f1_drop_7 describe: 


Unnamed: 0,id,prompt,answer
count,12388,12388,12388
unique,12388,12312,7976
top,1707.03497v2.pdf,Context: Title:\t\n\nAbstract:\t\n\n[image]\n\...,unanswerable
freq,1,20,4401


df_dev_f1_drop_7 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Context: Title:\tA Sample ACM SIG Proceedings ...,unanswerable
freq,1,2,648


df_dev_f1_drop_7 describe: 


Unnamed: 0,id,prompt,answer
count,1642,1642,1642
unique,1642,1641,994
top,1810.02575v1.pdf,Context: Title:\tA Sample ACM SIG Proceedings ...,unanswerable
freq,1,2,648


df_train_f2_drop_7 describe: 


Unnamed: 0,id,prompt,answer
count,12382,12382,12382
unique,12382,12311,7988
top,1707.03497v2.pdf,Context: Title:\t\n\nAbstract:\t\n\n[image]\n\...,unanswerable
freq,1,20,4383


df_zeroshot_f1_drop_7 describe: 


Unnamed: 0,id,prompt,answer
count,1200,1200,1200
unique,1200,1197,653
top,2209.03182v1.pdf,Context: Title:\t\n\nAbstract:\t\n\n[image]\n\...,unanswerable
freq,1,3,548


Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-3eb837a333cfc933/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-3eb837a333cfc933/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-17e46d64b06d8b5c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-17e46d64b06d8b5c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-67ca9e6892994fe9/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-67ca9e6892994fe9/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-50b2abe59e79624b/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-50b2abe59e79624b/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
Downloading and preparing dataset parquet/default to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-b4020d14b46e9075/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /nfs/home/kabenamualus/.cache/huggingface/datasets/parquet/default-b4020d14b46e9075/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.
DatasetDict({
    fold1: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12388
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1642
        })
        zeroshot: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1200
        })
    })
    fold2: DatasetDict({
        train: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 12382
        })
        validation: Dataset({
            features: ['id', 'prompt', 'answer', '__index_level_0__'],
            num_rows: 1648
        })
    })

Saving the dataset (0/1 shards):   0%|          | 0/12388 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1642 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12382 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1648 [00:00<?, ? examples/s]

In [103]:
type(dataset)

datasets.dataset_dict.DatasetDict

In [104]:
# root_directory = "../data/LLLM_LONG_SUMMARIZED_TDMS_ALL_TEMPLATE"
root_directory = "../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_DROP_7"


# reloaded_encoded_dataset = datasets.load_from_disk("../data/dataset/LLLM_TDMS_ALL_TEMPLATE")
# reloaded_encoded_dataset = DatasetDict.load_from_disk("../data/LLLM_TDMS_ALL_TEMPLATE")

dataset_fold1 = DatasetDict.load_from_disk(f"{root_directory}/fold1")
dataset_fold2 = DatasetDict.load_from_disk(f"{root_directory}/fold2")

In [105]:
dataset_fold1['train'][0]

{'id': '1707.03497v2.pdf',
 'prompt': 'Context: Title:\tValue Prediction Network\n\nAbstract:\tThis paper proposes a novel deep reinforcement learning (RL) architecture, called Value Prediction Network (VPN), which integrates model-free and model-based RL methods into a single neural network. In contrast to typical model-based RL methods, VPN learns a dynamics model whose abstract states are trained to make option-conditional predictions of future values (discounted sum of rewards) rather than of future observations. Our experimental results show that VPN has several advantages over both model-free and model-based baselines in a stochastic environment where careful planning is required but building an accurate observation-prediction model is difficult. Furthermore, VPN outperforms Deep Q-Network (DQN) on several Atari games even with short-lookahead planning, demonstrating its potential as a new way of learning a good state representation.\n\nExperiments\n\nOur experiments investigated